Remove old manifests. They are still available on the legacy tag

CarlosEDP 2019-03-25 19:53:45 -03:00
parent 4ee0c86a03
commit de1c46dd63
58 changed files with 0 additions and 9193 deletions


@ -1,6 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-main
data:
alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
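For reference, the alertmanager.yaml value above is base64-encoded; it decodes to the default "null"-receiver routing configuration:

global:
  resolve_timeout: 5m
route:
  group_by: ['job']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 12h
  receiver: 'null'
  routes:
  - match:
      alertname: DeadMansSwitch
    receiver: 'null'
receivers:
- name: 'null'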


@ -1,15 +0,0 @@
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: alertmanager
namespace: monitoring
spec:
rules:
- host: alertmanager.internal.carlosedp.com
http:
paths:
- path: /
backend:
serviceName: alertmanager-main
servicePort: web


@ -1,16 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
alertmanager: main
name: alertmanager-main
spec:
type: NodePort
ports:
- name: web
nodePort: 30903
port: 9093
protocol: TCP
targetPort: web
selector:
alertmanager: main


@ -1,11 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
name: main
labels:
alertmanager: main
spec:
replicas: 1
baseImage: carlosedp/alertmanager
version: v0.14.0


@ -1,22 +0,0 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: arm-exporter
namespace: monitoring
labels:
k8s-app: arm-exporter
spec:
template:
metadata:
name: arm-exporter
labels:
k8s-app: arm-exporter
spec:
hostNetwork: true
containers:
- image: carlosedp/arm_exporter
name: arm-exporter
ports:
- name: http
containerPort: 9243
hostPort: 9243


@ -1,18 +0,0 @@
apiVersion: v1
kind: Service
metadata:
namespace: monitoring
labels:
app: arm-exporter
k8s-app: arm-exporter
name: arm-exporter
spec:
type: ClusterIP
clusterIP: None
ports:
- name: http
port: 9243
protocol: TCP
selector:
k8s-app: arm-exporter


@ -1,12 +0,0 @@
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: grafana-claim
namespace: monitoring
spec:
accessModes:
- ReadWriteMany
resources:
requests:
storage: 5Gi


@ -1,32 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-config
namespace: monitoring
data:
config.ini: |
[database]
path = /data/grafana.db
[paths]
data = /data
logs = /data/log
plugins = /data/plugins
[session]
provider = memory
[auth.basic]
enabled = false
[auth.anonymous]
enabled = false
[smtp]
enabled = true
host = smtp-server.monitoring.svc:25
user =
password =
from_address = 'carlosedp@gmail.com'
from_name = Grafana Alert
skip_verify = true


@ -1,7 +0,0 @@
apiVersion: v1
kind: Secret
metadata:
name: grafana-credentials
data:
user: YWRtaW4=
password: YWRtaW4=
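For reference, both user and password above are the base64 encoding of "admin".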

File diff suppressed because it is too large.


@ -1,12 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-dashboards
data:
dashboards.yaml: |+
- name: '0'
org_id: 1
folder: ''
type: file
options:
folder: /grafana-dashboard-definitions/0


@ -1,15 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: grafana-datasources
data:
prometheus.yaml: |+
datasources:
- name: prometheus
type: prometheus
access: proxy
org_id: 1
url: http://prometheus-k8s.monitoring.svc:9090
version: 1
editable: false


@ -1,60 +0,0 @@
apiVersion: apps/v1beta1
kind: Deployment
metadata:
name: grafana
spec:
replicas: 1
template:
metadata:
labels:
app: grafana
spec:
securityContext:
runAsNonRoot: true
runAsUser: 65534
containers:
- name: grafana
image: carlosedp/monitoring-grafana:v5.1.3
volumeMounts:
- name: grafana-config
mountPath: /grafana/conf/config.ini
subPath: config.ini
- name: grafana-storage
mountPath: /data
- name: grafana-datasources
mountPath: /grafana/conf/provisioning/datasources
- name: grafana-dashboards
mountPath: /grafana/conf/provisioning/dashboards
- name: grafana-dashboard-definitions-0
mountPath: /grafana-dashboard-definitions/0
ports:
- name: web
containerPort: 3000
env:
- name: GF_INSTALL_PLUGINS
value: "grafana-clock-panel,grafana-piechart-panel"
- name: GF_PATHS_PLUGINS
value: "/data/plugins"
resources:
requests:
memory: 100Mi
cpu: 100m
limits:
memory: 200Mi
cpu: 200m
volumes:
- name: grafana-config
configMap:
name: grafana-config
- name: grafana-storage
persistentVolumeClaim:
claimName: grafana-claim
- name: grafana-datasources
configMap:
name: grafana-datasources
- name: grafana-dashboards
configMap:
name: grafana-dashboards
- name: grafana-dashboard-definitions-0
configMap:
name: grafana-dashboard-definitions-0


@ -1,20 +0,0 @@
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: grafana-external
namespace: monitoring
labels:
traffic-type: external
annotations:
traefik.frontend.rule.type: PathPrefix
#traefik.frontend.redirect.regex: ^http://(.*)
#traefik.frontend.redirect.replacement: https://$1
spec:
rules:
- host: grafana.cloud.carlosedp.com
http:
paths:
- path: /
backend:
serviceName: grafana
servicePort: 3000


@ -1,15 +0,0 @@
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: grafana
namespace: monitoring
spec:
rules:
- host: grafana.internal.carlosedp.com
http:
paths:
- path: /
backend:
serviceName: grafana
servicePort: 3000


@ -1,15 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: grafana
labels:
app: grafana
spec:
type: NodePort
ports:
- port: 3000
protocol: TCP
nodePort: 30902
targetPort: web
selector:
app: grafana


@ -1,17 +0,0 @@
apiVersion: v1
kind: Service
metadata:
namespace: kube-system
name: kube-controller-manager-prometheus-discovery
labels:
k8s-app: kube-controller-manager
spec:
selector:
component: kube-controller-manager
type: ClusterIP
clusterIP: None
ports:
- name: http-metrics
port: 10252
targetPort: 10252
protocol: TCP


@ -1,17 +0,0 @@
apiVersion: v1
kind: Service
metadata:
namespace: kube-system
name: kube-scheduler-prometheus-discovery
labels:
k8s-app: kube-scheduler
spec:
selector:
component: kube-scheduler
type: ClusterIP
clusterIP: None
ports:
- name: http-metrics
port: 10251
targetPort: 10251
protocol: TCP


@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics
namespace: monitoring


@ -1,45 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: kube-state-metrics
rules:
- apiGroups: [""]
resources:
- nodes
- pods
- services
- resourcequotas
- replicationcontrollers
- limitranges
- persistentvolumeclaims
- persistentvolumes
- namespaces
- endpoints
verbs: ["list", "watch"]
- apiGroups: ["extensions"]
resources:
- daemonsets
- deployments
- replicasets
verbs: ["list", "watch"]
- apiGroups: ["apps"]
resources:
- statefulsets
verbs: ["list", "watch"]
- apiGroups: ["batch"]
resources:
- cronjobs
- jobs
verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
resources:
- horizontalpodautoscalers
verbs: ["list", "watch"]
- apiGroups: ["authentication.k8s.io"]
resources:
- tokenreviews
verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
resources:
- subjectaccessreviews
verbs: ["create"]


@ -1,55 +0,0 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
name: kube-state-metrics
spec:
replicas: 1
template:
metadata:
labels:
app: kube-state-metrics
spec:
serviceAccountName: kube-state-metrics
securityContext:
runAsNonRoot: true
runAsUser: 65534
containers:
- name: kube-state-metrics
image: carlosedp/kube-state-metrics:v1.2.0
args:
- "--host=0.0.0.0"
- "--port=8443"
- "--telemetry-host=0.0.0.0"
- "--telemetry-port=9443"
ports:
- name: http-main
containerPort: 8443
- name: http-self
containerPort: 9443
- name: addon-resizer
image: carlosedp/addon-resizer:2.1
resources:
limits:
cpu: 100m
memory: 30Mi
requests:
cpu: 100m
memory: 30Mi
env:
- name: MY_POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
command:
- /pod_nanny
- --container=kube-state-metrics
- --cpu=100m
- --extra-cpu=2m
- --memory=150Mi
- --extra-memory=30Mi
#- --threshold=5
- --deployment=kube-state-metrics


@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: kube-state-metrics
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
name: kube-state-metrics


@ -1,15 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: kube-state-metrics-resizer
rules:
- apiGroups: [""]
resources:
- pods
verbs: ["get"]
- apiGroups: ["extensions"]
resources:
- deployments
resourceNames: ["kube-state-metrics"]
verbs: ["get", "update"]


@ -1,4 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: kube-state-metrics


@ -1,21 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: kube-state-metrics
k8s-app: kube-state-metrics
name: kube-state-metrics
spec:
clusterIP: None
ports:
- name: http-main
port: 8443
targetPort: http-main
protocol: TCP
- name: http-self
port: 9443
targetPort: http-self
protocol: TCP
selector:
app: kube-state-metrics


@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: node-exporter
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: node-exporter
subjects:
- kind: ServiceAccount
name: node-exporter
namespace: monitoring


@ -1,13 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: node-exporter
rules:
- apiGroups: ["authentication.k8s.io"]
resources:
- tokenreviews
verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
resources:
- subjectaccessreviews
verbs: ["create"]


@ -1,59 +0,0 @@
apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
name: node-exporter
spec:
updateStrategy:
rollingUpdate:
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
labels:
app: node-exporter
name: node-exporter
spec:
serviceAccountName: node-exporter
securityContext:
runAsNonRoot: true
runAsUser: 65534
hostNetwork: true
hostPID: true
containers:
- image: carlosedp/node_exporter:v0.15.2
args:
#- "--web.listen-address=0.0.0.1:9100"
- "--path.procfs=/host/proc"
- "--path.sysfs=/host/sys"
- "--collector.filesystem.ignored-mount-points"
- '^(\/(host|root)\/sys\/kernel\/debug\/).*'
name: node-exporter
ports:
- containerPort: 9100
hostPort: 9100
name: http
resources:
requests:
memory: 30Mi
cpu: 100m
limits:
memory: 50Mi
cpu: 200m
volumeMounts:
- name: proc
readOnly: true
mountPath: /host/proc
- name: sys
readOnly: true
mountPath: /host/sys
tolerations:
- effect: NoSchedule
operator: Exists
volumes:
- name: proc
hostPath:
path: /proc
- name: sys
hostPath:
path: /sys


@ -1,4 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: node-exporter


@ -1,17 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: node-exporter
k8s-app: node-exporter
name: node-exporter
spec:
type: ClusterIP
clusterIP: None
ports:
- name: http
port: 9100
protocol: TCP
selector:
app: node-exporter


@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: prometheus-operator
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-operator
subjects:
- kind: ServiceAccount
name: prometheus-operator
namespace: monitoring


@ -1,54 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: prometheus-operator
rules:
- apiGroups:
- extensions
resources:
- thirdpartyresources
verbs:
- "*"
- apiGroups:
- apiextensions.k8s.io
resources:
- customresourcedefinitions
verbs:
- "*"
- apiGroups:
- monitoring.coreos.com
resources:
- alertmanagers
- prometheuses
- prometheuses/finalizers
- alertmanagers/finalizers
- servicemonitors
verbs:
- "*"
- apiGroups:
- apps
resources:
- statefulsets
verbs: ["*"]
- apiGroups: [""]
resources:
- configmaps
- secrets
verbs: ["*"]
- apiGroups: [""]
resources:
- pods
verbs: ["list", "delete"]
- apiGroups: [""]
resources:
- services
- endpoints
verbs: ["get", "create", "update"]
- apiGroups: [""]
resources:
- nodes
verbs: ["list", "watch"]
- apiGroups: [""]
resources:
- namespaces
verbs: ["list"]


@ -1,4 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus-operator


@ -1,15 +0,0 @@
apiVersion: v1
kind: Service
metadata:
name: prometheus-operator
labels:
k8s-app: prometheus-operator
spec:
type: ClusterIP
ports:
- name: http
port: 8080
targetPort: http
protocol: TCP
selector:
k8s-app: prometheus-operator


@ -1,34 +0,0 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
labels:
k8s-app: prometheus-operator
name: prometheus-operator
spec:
replicas: 1
template:
metadata:
labels:
k8s-app: prometheus-operator
spec:
containers:
- args:
- --kubelet-service=kube-system/kubelet
- --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.0.2
- --config-reloader-image=carlosedp/configmap-reload:v0.2.2-arm
image: carlosedp/prometheus-operator:v0.17.0
name: prometheus-operator
ports:
- containerPort: 8080
name: http
resources:
limits:
cpu: 200m
memory: 100Mi
requests:
cpu: 100m
memory: 50Mi
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: prometheus-operator


@ -1,14 +0,0 @@
apiVersion: extensions/v1beta1
kind: Ingress
metadata:
name: prometheus
namespace: monitoring
spec:
rules:
- host: prometheus.internal.carlosedp.com
http:
paths:
- path: /
backend:
serviceName: prometheus-k8s
servicePort: web


@ -1,54 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: kube-system
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
name: prometheus-k8s
namespace: default
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
name: prometheus-k8s
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: prometheus-k8s
subjects:
- kind: ServiceAccount
name: prometheus-k8s
namespace: monitoring


@ -1,55 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: monitoring
rules:
- apiGroups: [""]
resources:
- nodes
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
- apiGroups: [""]
resources:
- configmaps
verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: kube-system
rules:
- apiGroups: [""]
resources:
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
name: prometheus-k8s
namespace: default
rules:
- apiGroups: [""]
resources:
- services
- endpoints
- pods
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
name: prometheus-k8s
rules:
- apiGroups: [""]
resources:
- nodes/metrics
verbs: ["get"]
- nonResourceURLs: ["/metrics"]
verbs: ["get"]


@ -1,602 +0,0 @@
apiVersion: v1
kind: ConfigMap
metadata:
name: prometheus-k8s-rules
labels:
role: prometheus-rulefiles
prometheus: k8s
data:
alertmanager.rules.yaml: |+
groups:
- name: alertmanager.rules
rules:
- alert: AlertmanagerConfigInconsistent
expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
"alertmanager-$1", "alertmanager", "(.*)") != 1
for: 5m
labels:
severity: critical
annotations:
description: The configurations of the instances of the Alertmanager cluster
`{{$labels.service}}` are out of sync.
- alert: AlertmanagerDownOrMissing
expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
"alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
for: 5m
labels:
severity: warning
annotations:
description: An unexpected number of Alertmanagers are scraped or Alertmanagers
disappeared from discovery.
- alert: AlertmanagerFailedReload
expr: alertmanager_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
}}/{{ $labels.pod}}.
etcd3.rules.yaml: |+
groups:
- name: ./etcd3.rules
rules:
- alert: InsufficientMembers
expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
for: 3m
labels:
severity: critical
annotations:
description: If one more etcd member goes down, the cluster will be unavailable
summary: etcd cluster insufficient members
- alert: NoLeader
expr: etcd_server_has_leader{job="etcd"} == 0
for: 1m
labels:
severity: critical
annotations:
description: etcd member {{ $labels.instance }} has no leader
summary: etcd member has no leader
- alert: HighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
changes within the last hour
summary: a high number of leader changes within the etcd cluster are happening
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: HighNumberOfFailedGRPCRequests
expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
/ sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
on etcd instance {{ $labels.instance }}'
summary: a high number of gRPC requests are failing
- alert: GRPCRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
> 0.15
for: 10m
labels:
severity: critical
annotations:
description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
}} are slow
summary: slow gRPC requests
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.01
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HighNumberOfFailedHTTPRequests
expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
BY (method) > 0.05
for: 5m
labels:
severity: critical
annotations:
description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
instance {{ $labels.instance }}'
summary: a high number of HTTP requests are failing
- alert: HTTPRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
}} are slow
summary: slow HTTP requests
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
> 0.15
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} member communication with
{{ $labels.To }} is slow
summary: etcd member communication is slow
- alert: HighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
failures within the last hour
summary: a high number of proposals within the etcd cluster are failing
- alert: HighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
> 0.5
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} fsync durations are high
summary: high fsync durations
- alert: HighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
> 0.25
for: 10m
labels:
severity: warning
annotations:
description: etcd instance {{ $labels.instance }} commit durations are high
summary: high commit durations
general.rules.yaml: |+
groups:
- name: general.rules
rules:
- alert: TargetDown
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
for: 10m
labels:
severity: warning
annotations:
description: '{{ $value }}% of {{ $labels.job }} targets are down.'
summary: Targets are down
- alert: DeadMansSwitch
expr: vector(1)
labels:
severity: none
annotations:
description: This is a DeadMansSwitch meant to ensure that the entire Alerting
pipeline is functional.
summary: Alerting DeadMansSwitch
- record: fd_utilization
expr: process_open_fds / process_max_fds
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
for: 10m
labels:
severity: warning
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its file/socket descriptors within the next 4 hours'
summary: file descriptors soon exhausted
- alert: FdExhaustionClose
expr: predict_linear(fd_utilization[10m], 3600) > 1
for: 10m
labels:
severity: critical
annotations:
description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
will exhaust its file/socket descriptors within the next hour'
summary: file descriptors soon exhausted
kube-controller-manager.rules.yaml: |+
groups:
- name: kube-controller-manager.rules
rules:
- alert: K8SControllerManagerDown
expr: absent(up{job="kube-controller-manager"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S controller manager. Deployments and replication
controllers are not making progress.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
summary: Controller manager is down
kube-scheduler.rules.yaml: |+
groups:
- name: kube-scheduler.rules
rules:
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.99"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.9"
- record: cluster:scheduler_binding_latency_seconds:quantile
expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
BY (le, cluster)) / 1e+06
labels:
quantile: "0.5"
- alert: K8SSchedulerDown
expr: absent(up{job="kube-scheduler"} == 1)
for: 5m
labels:
severity: critical
annotations:
description: There is no running K8S scheduler. New pods are not being assigned
to nodes.
runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
summary: Scheduler is down
kube-state-metrics.rules.yaml: |+
groups:
- name: kube-state-metrics.rules
rules:
- alert: DeploymentGenerationMismatch
expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
for: 15m
labels:
severity: warning
annotations:
description: Observed deployment generation does not match expected one for
deployment {{$labels.namespaces}}/{{$labels.deployment}}
summary: Deployment is outdated
- alert: DeploymentReplicasNotUpdated
expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
unless (kube_deployment_spec_paused == 1)
for: 15m
labels:
severity: warning
annotations:
description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
summary: Deployment replicas are outdated
- alert: DaemonSetRolloutStuck
expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
* 100 < 100
for: 15m
labels:
severity: warning
annotations:
description: Only {{$value}}% of desired pods scheduled and ready for daemon
set {{$labels.namespaces}}/{{$labels.daemonset}}
summary: DaemonSet is missing pods
- alert: K8SDaemonSetsNotScheduled
expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
> 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are not scheduled.
summary: Daemonsets are not scheduled correctly
- alert: DaemonSetsMissScheduled
expr: kube_daemonset_status_number_misscheduled > 0
for: 10m
labels:
severity: warning
annotations:
description: A number of daemonsets are running where they are not supposed
to run.
summary: Daemonsets are not scheduled correctly
- alert: PodFrequentlyRestarting
expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
for: 10m
labels:
severity: warning
annotations:
description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
times within the last hour
summary: Pod is restarting frequently
kubelet.rules.yaml: |+
groups:
- name: kubelet.rules
rules:
- alert: K8SNodeNotReady
expr: kube_node_status_condition{condition="Ready",status="true"} == 0
for: 1h
labels:
severity: warning
annotations:
description: The Kubelet on {{ $labels.node }} has not checked in with the API,
or has set itself to NotReady, for more than an hour
summary: Node status is NotReady
- alert: K8SManyNodesNotReady
expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
> 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
for: 1m
labels:
severity: critical
annotations:
description: '{{ $value }}% of Kubernetes nodes are not ready'
- alert: K8SKubeletDown
expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
for: 1h
labels:
severity: warning
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets.
- alert: K8SKubeletDown
expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
* 100 > 1
for: 1h
labels:
severity: critical
annotations:
description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
have disappeared from service discovery.
summary: Many Kubelets cannot be scraped
- alert: K8SKubeletTooManyPods
expr: kubelet_running_pod_count > 100
for: 10m
labels:
severity: warning
annotations:
description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
to the limit of 110
summary: Kubelet is close to pod limit
kubernetes.rules.yaml: |+
groups:
- name: kubernetes.rules
rules:
- record: pod_name:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(pod_name)
- record: pod_name:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: pod_name:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
BY (pod_name)
- record: pod_name:container_fs_usage_bytes:sum
expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
- record: namespace:container_memory_usage_bytes:sum
expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
- record: namespace:container_spec_cpu_shares:sum
expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
- record: namespace:container_cpu_usage:sum
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
BY (namespace)
- record: cluster:memory_usage:ratio
expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
(cluster) / sum(machine_memory_bytes) BY (cluster)
- record: cluster:container_spec_cpu_shares:ratio
expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
/ sum(machine_cpu_cores)
- record: cluster:container_cpu_usage:ratio
expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
/ sum(machine_cpu_cores)
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.99"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.9"
- record: apiserver_latency_seconds:quantile
expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
1e+06
labels:
quantile: "0.5"
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 1
for: 10m
labels:
severity: warning
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
- alert: APIServerLatencyHigh
expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
> 4
for: 10m
labels:
severity: critical
annotations:
description: the API server has a 99th percentile latency of {{ $value }} seconds
for {{$labels.verb}} {{$labels.resource}}
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 2
for: 10m
labels:
severity: warning
annotations:
description: API server returns errors for {{ $value }}% of requests
- alert: APIServerErrorsHigh
expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
* 100 > 5
for: 10m
labels:
severity: critical
annotations:
description: API server returns errors for {{ $value }}% of requests
- alert: K8SApiserverDown
expr: absent(up{job="apiserver"} == 1)
for: 20m
labels:
severity: critical
annotations:
description: No API servers are reachable or all have disappeared from service
discovery
- alert: K8sCertificateExpirationNotice
labels:
severity: warning
annotations:
description: Kubernetes API Certificate is expiring soon (less than 7 days)
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0
- alert: K8sCertificateExpirationNotice
labels:
severity: critical
annotations:
description: Kubernetes API Certificate is expiring in less than 1 day
expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
node.rules.yaml: |+
groups:
- name: node.rules
rules:
- record: instance:node_cpu:rate:sum
expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
BY (instance)
- record: instance:node_filesystem_usage:sum
expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
BY (instance)
- record: instance:node_network_receive_bytes:rate:sum
expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
- record: instance:node_network_transmit_bytes:rate:sum
expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
- record: instance:node_cpu:ratio
expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
- record: cluster:node_cpu:sum_rate5m
expr: sum(rate(node_cpu{mode!="idle"}[5m]))
- record: cluster:node_cpu:ratio
expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
- alert: NodeExporterDown
expr: absent(up{job="node-exporter"} == 1)
for: 10m
labels:
severity: warning
annotations:
description: Prometheus could not scrape a node-exporter for more than 10m,
or node-exporters have disappeared from discovery
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
for: 30m
labels:
severity: warning
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 24 hours (mounted at {{$labels.mountpoint}})
- alert: NodeDiskRunningFull
expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
for: 10m
labels:
severity: critical
annotations:
description: device {{$labels.device}} on node {{$labels.instance}} is running
full within the next 2 hours (mounted at {{$labels.mountpoint}})
prometheus.rules.yaml: |+
groups:
- name: prometheus.rules
rules:
- alert: PrometheusConfigReloadFailed
expr: prometheus_config_last_reload_successful == 0
for: 10m
labels:
severity: warning
annotations:
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
- alert: PrometheusNotificationQueueRunningFull
expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
for: 10m
labels:
severity: warning
annotations:
description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
$labels.pod}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.01
for: 10m
labels:
severity: warning
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
- alert: PrometheusErrorSendingAlerts
expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
> 0.03
for: 10m
labels:
severity: critical
annotations:
description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
- alert: PrometheusNotConnectedToAlertmanagers
expr: prometheus_notifications_alertmanagers_discovered < 1
for: 10m
labels:
severity: warning
annotations:
description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
to any Alertmanagers
- alert: PrometheusTSDBReloadsFailing
expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
reload failures over the last two hours.'
summary: Prometheus has issues reloading data blocks from disk
- alert: PrometheusTSDBCompactionsFailing
expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
for: 12h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
compaction failures over the last two hours.'
summary: Prometheus has issues compacting sample blocks
- alert: PrometheusTSDBWALCorruptions
expr: tsdb_wal_corruptions_total > 0
for: 4h
labels:
severity: warning
annotations:
description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
log (WAL).'
summary: Prometheus write-ahead log is corrupted


@ -1,4 +0,0 @@
apiVersion: v1
kind: ServiceAccount
metadata:
name: prometheus-k8s


@ -1,16 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: alertmanager
labels:
k8s-app: alertmanager
spec:
selector:
matchLabels:
alertmanager: main
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: web
interval: 30s


@ -1,23 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-apiserver
labels:
k8s-app: apiserver
spec:
jobLabel: component
selector:
matchLabels:
component: apiserver
provider: kubernetes
namespaceSelector:
matchNames:
- default
endpoints:
- port: https
interval: 30s
scheme: https
tlsConfig:
caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
serverName: kubernetes
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token


@ -1,18 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: arm-exporter
namespace: monitoring
labels:
k8s-app: arm-exporter
spec:
jobLabel: k8s-app
selector:
matchLabels:
k8s-app: arm-exporter
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: http
interval: 30s


@ -1,17 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-controller-manager
labels:
k8s-app: kube-controller-manager
spec:
jobLabel: k8s-app
endpoints:
- port: http-metrics
interval: 30s
selector:
matchLabels:
k8s-app: kube-controller-manager
namespaceSelector:
matchNames:
- kube-system


@ -1,17 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-scheduler
labels:
k8s-app: kube-scheduler
spec:
jobLabel: k8s-app
endpoints:
- port: http-metrics
interval: 30s
selector:
matchLabels:
k8s-app: kube-scheduler
namespaceSelector:
matchNames:
- kube-system


@ -1,28 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kube-state-metrics
labels:
k8s-app: kube-state-metrics
spec:
jobLabel: k8s-app
selector:
matchLabels:
k8s-app: kube-state-metrics
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: http-main
#scheme: https
interval: 30s
honorLabels: true
#bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
#tlsConfig:
# insecureSkipVerify: true
- port: http-self
# scheme: https
interval: 30s
# bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
# tlsConfig:
# insecureSkipVerify: true


@ -1,29 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: kubelet
labels:
k8s-app: kubelet
spec:
jobLabel: k8s-app
endpoints:
- port: https-metrics
scheme: https
interval: 30s
tlsConfig:
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
- port: https-metrics
scheme: https
path: /metrics/cadvisor
interval: 30s
honorLabels: true
tlsConfig:
insecureSkipVerify: true
bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
selector:
matchLabels:
k8s-app: kubelet
namespaceSelector:
matchNames:
- kube-system


@ -1,21 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: node-exporter
labels:
k8s-app: node-exporter
spec:
jobLabel: k8s-app
selector:
matchLabels:
k8s-app: node-exporter
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: http
#scheme: http
interval: 30s
#bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
#tlsConfig:
# insecureSkipVerify: true


@ -1,12 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: prometheus-operator
labels:
k8s-app: prometheus-operator
spec:
endpoints:
- port: http
selector:
matchLabels:
k8s-app: prometheus-operator


@ -1,16 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: prometheus
labels:
k8s-app: prometheus
spec:
selector:
matchLabels:
prometheus: k8s
namespaceSelector:
matchNames:
- monitoring
endpoints:
- port: web
interval: 30s


@ -1,17 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
name: traefik-ingress-lb
labels:
k8s-app: traefik-ingress-lb
spec:
jobLabel: k8s-app
endpoints:
- port: admin
interval: 30s
selector:
matchLabels:
k8s-app: traefik-ingress-lb
namespaceSelector:
matchNames:
- kube-system


@ -1,16 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
prometheus: k8s
name: prometheus-k8s
spec:
type: NodePort
ports:
- name: web
nodePort: 30900
port: 9090
protocol: TCP
targetPort: web
selector:
prometheus: k8s


@ -1,39 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
name: k8s
labels:
prometheus: k8s
spec:
replicas: 1
baseImage: carlosedp/prometheus
version: v2.2.1
retention: "168h"
serviceAccountName: prometheus-k8s
serviceMonitorSelector:
matchExpressions:
- {key: k8s-app, operator: Exists}
ruleSelector:
matchLabels:
role: prometheus-rulefiles
prometheus: k8s
resources:
requests:
# 2Gi is default, but won't schedule if you don't have a node with >2Gi
# memory. Modify based on your target and time-series count for
# production use. This value is mainly meant for demonstration/testing
# purposes.
memory: 512Mi
alerting:
alertmanagers:
- namespace: monitoring
name: alertmanager-main
port: web
storage:
volumeClaimTemplate:
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 50Gi


@ -1,10 +0,0 @@
#!/bin/bash
echo "Please enter your Gmail account";
read username;
echo "Please enter your Gmail password";
read -s password;
echo "Creating secret"
kubectl create secret generic smtp-account -n monitoring --from-literal=username=${username} --from-literal=password=${password}
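For reference, a sketch of the Secret this script creates (the data values shown here are placeholders for the base64-encoded credentials):

apiVersion: v1
kind: Secret
metadata:
  name: smtp-account
  namespace: monitoring
type: Opaque
data:
  username: <base64-encoded Gmail account>
  password: <base64-encoded Gmail password>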


@ -1,68 +0,0 @@
apiVersion: extensions/v1beta1
kind: Deployment
metadata:
annotations:
deployment.kubernetes.io/revision: "1"
generation: 1
labels:
run: smtp-server
name: smtp-server
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
run: smtp-server
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
type: RollingUpdate
template:
metadata:
creationTimestamp: null
labels:
run: smtp-server
spec:
containers:
- image: carlosedp/docker-smtp
imagePullPolicy: Always
name: smtp-server
ports:
- containerPort: 25
protocol: TCP
env:
- name: GMAIL_USER
valueFrom:
secretKeyRef:
name: smtp-account
key: username
- name: GMAIL_PASSWORD
valueFrom:
secretKeyRef:
name: smtp-account
key: password
- name: DISABLE_IPV6
value: "True"
- name: RELAY_DOMAINS
value: ":192.168.0.0/24:10.0.0.0/16"
resources: {}
restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
labels:
run: smtp-server
name: smtp-server
namespace: monitoring
spec:
ports:
- nodePort: 30025
port: 25
protocol: TCP
targetPort: 25
selector:
run: smtp-server
type: NodePort


@ -1,20 +0,0 @@
apiVersion: apps/v1beta1
kind: Deployment
metadata:
name: snmp-exporter
spec:
replicas: 1
selector:
matchLabels:
app: snmp-exporter
template:
metadata:
labels:
app: snmp-exporter
spec:
containers:
- image: carlosedp/snmp_exporter:v0.9.0
name: snmp-exporter
ports:
- containerPort: 9116
name: metrics


@ -1,15 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
app: snmp-exporter
name: snmp-exporter
spec:
ports:
- name: http-metrics
port: 9116
protocol: TCP
targetPort: metrics
selector:
app: snmp-exporter


@ -1,24 +0,0 @@
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
labels:
k8s-app: snmp-exporter
name: snmp-exporter
spec:
jobLabel: k8s-app
selector:
  matchLabels:
    app: snmp-exporter
namespaceSelector:
matchNames:
- monitoring
endpoints:
- interval: 60s
port: http-metrics
params:
module:
- ddwrt
target:
- 192.168.1.1
path: "/snmp"
targetPort: 9116