Update libs

This commit is contained in:
Carlos de Paula 2019-10-09 17:33:29 -03:00
parent 5fd83a4ef2
commit 6213f12199
13 changed files with 7446 additions and 171 deletions

View File

@ -69,6 +69,7 @@ local utils = import 'utils.libsonnet';
local pvc = k.core.v1.persistentVolumeClaim, local pvc = k.core.v1.persistentVolumeClaim,
prometheus+: { prometheus+: {
spec+: { spec+: {
replicas: $._config.prometheus.replicas,
retention: '15d', retention: '15d',
externalUrl: 'http://' + $._config.urls.prom_ingress, externalUrl: 'http://' + $._config.urls.prom_ingress,
} }

View File

@ -8,7 +8,7 @@
"subdir": "jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "da959c643657c7d2aac6f5ddd68582a949283c49" "version": "8405360a467a34fca34735d92c763ae38bfe5917"
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@ -28,7 +28,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "193d4934f85c9ff596d1f3e4dce7bd2da62a4d5e" "version": "acac6762be477a7ec16a96ae5222dd365568d87d"
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@ -48,7 +48,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "565bf6b51d636e0efe4add39f2ab8e2b1abb731f" "version": "1e4da2e3aa19d9880cf5ae07a5220bbd10c11083"
}, },
{ {
"name": "grafana", "name": "grafana",
@ -58,7 +58,7 @@
"subdir": "grafana" "subdir": "grafana"
} }
}, },
"version": "7fadaf2274d5cbe4ac6fbaf8786e4b7ecf3c1713" "version": "c79fdb1f6cbd14a003f73d9a6f571f1cefc6638d"
}, },
{ {
"name": "prometheus-operator", "name": "prometheus-operator",
@ -68,7 +68,7 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "6efd4e5e12213021516c10b3ebd0699260ddd804" "version": "908ee0372a9ac2c6574d589fdc56a4f3cb5f12d1"
}, },
{ {
"name": "etcd-mixin", "name": "etcd-mixin",
@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "8037e6e08727d4a17649f782cb4dbc482b8fe780" "version": "f197ab45785dfe81a9e412affbdca7ff733dd5d0"
}, },
{ {
"name": "prometheus", "name": "prometheus",
@ -88,7 +88,27 @@
"subdir": "documentation/prometheus-mixin" "subdir": "documentation/prometheus-mixin"
} }
}, },
"version": "f0bb8129c3e6ffc6906bdc130f5625110643f168" "version": "81d284f806ef828e8f323088e00871fedd5a77c2"
},
{
"name": "node-mixin",
"source": {
"git": {
"remote": "https://github.com/prometheus/node_exporter",
"subdir": "docs/node-mixin"
}
},
"version": "fb54f7f2e0c8e154768e63efffd9a4c161ed6931"
},
{
"name": "promgrafonnet",
"source": {
"git": {
"remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
"subdir": "lib/promgrafonnet"
}
},
"version": "acac6762be477a7ec16a96ae5222dd365568d87d"
} }
] ]
} }

View File

@ -206,6 +206,13 @@ spec:
value in the first response, unless you have received value in the first response, unless you have received
this token from an error message. this token from an error message.
type: string type: string
remainingItemCount:
description: |-
remainingItemCount is the number of subsequent items in the list which are not included in this list response. If the list request contained label or field selectors, then the number of remaining items is unknown and the field will be left unset and omitted during serialization. If the list is complete (either because it is not chunking or because this is the last chunk), then there are no more remaining items and this field will be left unset and omitted during serialization. Servers older than v1.15 do not set this field. The intended use of the remainingItemCount is *estimating* the size of a collection. Clients should not rely on the remainingItemCount to be set or to be exact.
This field is alpha and can be changed or removed without notice.
format: int64
type: integer
resourceVersion: resourceVersion:
description: 'String that identifies the server''s internal description: 'String that identifies the server''s internal
version of this object that can be used by clients to version of this object that can be used by clients to

View File

@ -47,7 +47,7 @@ spec:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
type: string type: string
optional: optional:
description: Specify whether the Secret or it's key must description: Specify whether the Secret or its key must
be defined be defined
type: boolean type: boolean
required: required:
@ -64,7 +64,7 @@ spec:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
type: string type: string
optional: optional:
description: Specify whether the Secret or it's key must description: Specify whether the Secret or its key must
be defined be defined
type: boolean type: boolean
required: required:
@ -141,7 +141,7 @@ spec:
to proxy through this endpoint. to proxy through this endpoint.
type: string type: string
relabelings: relabelings:
description: 'RelabelConfigs to apply to samples before ingestion. description: 'RelabelConfigs to apply to samples before scraping.
More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config' More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
items: items:
description: 'RelabelConfig allows dynamic rewriting of the description: 'RelabelConfig allows dynamic rewriting of the

File diff suppressed because it is too large Load Diff

View File

@ -72,6 +72,15 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/node-rsrc-use
name: grafana-dashboard-node-rsrc-use
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/nodes
name: grafana-dashboard-nodes
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage
readOnly: false readOnly: false
@ -144,6 +153,15 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard
- configMap:
name: grafana-dashboard-node-cluster-rsrc-use
name: grafana-dashboard-node-cluster-rsrc-use
- configMap:
name: grafana-dashboard-node-rsrc-use
name: grafana-dashboard-node-rsrc-use
- configMap:
name: grafana-dashboard-nodes
name: grafana-dashboard-nodes
- configMap: - configMap:
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage
name: grafana-dashboard-persistentvolumesusage name: grafana-dashboard-persistentvolumesusage

View File

@ -64,35 +64,6 @@ spec:
requests: requests:
cpu: 100m cpu: 100m
memory: 150Mi memory: 150Mi
- command:
- /pod_nanny
- --container=kube-state-metrics
- --cpu=100m
- --extra-cpu=2m
- --memory=150Mi
- --extra-memory=30Mi
- --threshold=5
- --deployment=kube-state-metrics
env:
- name: MY_POD_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: MY_POD_NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
image: carlosedp/addon-resizer:v1.8.4
name: addon-resizer
resources:
limits:
cpu: 50m
memory: 30Mi
requests:
cpu: 10m
memory: 30Mi
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
securityContext: securityContext:

View File

@ -11,6 +11,9 @@ spec:
honorLabels: true honorLabels: true
interval: 30s interval: 30s
port: https-main port: https-main
relabelings:
- action: labeldrop
regex: (pod|service|endpoint|namespace)
scheme: https scheme: https
scrapeTimeout: 30s scrapeTimeout: 30s
tlsConfig: tlsConfig:

View File

@ -13,7 +13,7 @@ spec:
relabelings: relabelings:
- action: replace - action: replace
regex: (.*) regex: (.*)
replacment: $1 replacement: $1
sourceLabels: sourceLabels:
- __meta_kubernetes_pod_node_name - __meta_kubernetes_pod_node_name
targetLabel: instance targetLabel: instance

View File

@ -3,30 +3,30 @@ data:
config.yaml: | config.yaml: |
resourceRules: resourceRules:
cpu: cpu:
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>) containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources: resources:
overrides: overrides:
node: node:
resource: node resource: node
namespace: namespace:
resource: namespace resource: namespace
pod_name: pod:
resource: pod resource: pod
containerLabel: container_name containerLabel: container
memory: memory:
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>) containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)
nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) nodeQuery: sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources: resources:
overrides: overrides:
node: instance:
resource: node resource: node
namespace: namespace:
resource: namespace resource: namespace
pod_name: pod:
resource: pod resource: pod
containerLabel: container_name containerLabel: container
window: 1m window: 5m
kind: ConfigMap kind: ConfigMap
metadata: metadata:
name: adapter-config name: adapter-config

View File

@ -8,6 +8,67 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
groups: groups:
- name: node-exporter.rules
rules:
- expr: |
count without (cpu) (
count without (mode) (
node_cpu_seconds_total{job="node-exporter"}
)
)
record: instance:node_num_cpu:sum
- expr: |
1 - avg without (cpu, mode) (
rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
)
record: instance:node_cpu_utilisation:rate1m
- expr: |
(
node_load1{job="node-exporter"}
/
instance:node_num_cpu:sum{job="node-exporter"}
)
record: instance:node_load1_per_cpu:ratio
- expr: |
1 - (
node_memory_MemAvailable_bytes{job="node-exporter"}
/
node_memory_MemTotal_bytes{job="node-exporter"}
)
record: instance:node_memory_utilisation:ratio
- expr: |
(
rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+
rate(node_vmstat_pgpgout{job="node-exporter"}[1m])
)
record: instance:node_memory_swap_io_pages:rate1m
- expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m
- expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_bytes_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_receive_drop_excluding_lo:rate1m
- expr: |
sum without (device) (
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
)
record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: k8s.rules - name: k8s.rules
rules: rules:
- expr: | - expr: |
@ -21,13 +82,6 @@ spec:
- expr: | - expr: |
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace) sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
record: namespace:container_memory_usage_bytes:sum record: namespace:container_memory_usage_bytes:sum
- expr: |
sum by (namespace, label_name) (
sum(container_memory_usage_bytes{job="kubelet",image!="", container!="POD"}) by (pod, namespace)
* on (namespace, pod)
group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
)
record: namespace:container_memory_usage_bytes:sum
- expr: | - expr: |
sum by (namespace, label_name) ( sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
@ -177,6 +231,162 @@ spec:
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance, cpu))
record: cluster:node_cpu:ratio record: cluster:node_cpu:ratio
- name: node-exporter
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 24 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left and is filling
up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 5% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available space left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
summary: Filesystem has less than 3% space left.
expr: |
(
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left and is filling
up fast.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
and
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 5% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
has only {{ printf "%.2f" $value }}% available inodes left.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
summary: Filesystem has less than 3% inodes left.
expr: |
(
node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
and
node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
)
for: 1h
labels:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: |
increase(node_network_receive_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: |
increase(node_network_transmit_errs_total[2m]) > 10
for: 1h
labels:
severity: warning
- name: kubernetes-absent - name: kubernetes-absent
rules: rules:
- alert: AlertmanagerDown - alert: AlertmanagerDown
@ -287,7 +497,7 @@ spec:
state for longer than 15 minutes. state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0 sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -363,13 +573,13 @@ spec:
severity: critical severity: critical
- alert: KubeDaemonSetRolloutStuck - alert: KubeDaemonSetRolloutStuck
annotations: annotations:
message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
}}/{{ $labels.daemonset }} are scheduled and ready. {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
expr: | expr: |
kube_daemonset_status_number_ready{job="kube-state-metrics"} kube_daemonset_status_number_ready{job="kube-state-metrics"}
/ /
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -420,7 +630,33 @@ spec:
message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
expr: | expr: |
kube_job_status_failed{job="kube-state-metrics"} > 0 kube_job_failed{job="kube-state-metrics"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
desired number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
expr: |
(kube_hpa_status_desired_replicas{job="kube-state-metrics"}
!=
kube_hpa_status_current_replicas{job="kube-state-metrics"})
and
changes(kube_hpa_status_current_replicas[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
max replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
expr: |
kube_hpa_status_current_replicas{job="kube-state-metrics"}
==
kube_hpa_spec_max_replicas{job="kube-state-metrics"}
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -482,25 +718,28 @@ spec:
severity: warning severity: warning
- alert: KubeQuotaExceeded - alert: KubeQuotaExceeded
annotations: annotations:
message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}}% of its {{ $labels.resource }} quota. }} of its {{ $labels.resource }} quota.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
expr: | expr: |
100 * kube_resourcequota{job="kube-state-metrics", type="used"} kube_resourcequota{job="kube-state-metrics", type="used"}
/ ignoring(instance, job, type) / ignoring(instance, job, type)
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
> 90 > 0.90
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
- alert: CPUThrottlingHigh - alert: CPUThrottlingHigh
annotations: annotations:
message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
}} for container {{ $labels.container }} in pod {{ $labels.pod }}.' {{ $labels.namespace }} for container {{ $labels.container }} in pod {{
$labels.pod }}.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\", expr: |
}[5m])) by (container, pod, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
by (container, pod, namespace)\n > 25 \n" /
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
> ( 25 / 100 )
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -509,14 +748,14 @@ spec:
- alert: KubePersistentVolumeUsageCritical - alert: KubePersistentVolumeUsageCritical
annotations: annotations:
message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
}}% free. }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: | expr: |
100 * kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet"}
/ /
kubelet_volume_stats_capacity_bytes{job="kubelet"} kubelet_volume_stats_capacity_bytes{job="kubelet"}
< 3 < 0.03
for: 1m for: 1m
labels: labels:
severity: critical severity: critical
@ -524,14 +763,14 @@ spec:
annotations: annotations:
message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} is expected to fill up within four }} in Namespace {{ $labels.namespace }} is expected to fill up within four
days. Currently {{ printf "%0.2f" $value }}% is available. days. Currently {{ $value | humanizePercentage }} is available.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: | expr: |
100 * ( (
kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet"}
/ /
kubelet_volume_stats_capacity_bytes{job="kubelet"} kubelet_volume_stats_capacity_bytes{job="kubelet"}
) < 15 ) < 0.15
and and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
for: 5m for: 5m
@ -551,7 +790,7 @@ spec:
rules: rules:
- alert: KubeNodeNotReady - alert: KubeNodeNotReady
annotations: annotations:
message: '{{ $labels.node }} has been unready for more than an hour.' message: '{{ $labels.node }} has been unready for more than 15 minutes.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
expr: | expr: |
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@ -571,33 +810,23 @@ spec:
- alert: KubeClientErrors - alert: KubeClientErrors
annotations: annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }}% errors.' }}' is experiencing {{ $value | humanizePercentage }} errors.'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: | expr: |
(sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
/ /
sum(rate(rest_client_requests_total[5m])) by (instance, job)) sum(rate(rest_client_requests_total[5m])) by (instance, job))
* 100 > 1 > 0.01
for: 15m
labels:
severity: warning
- alert: KubeClientErrors
annotations:
message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
}}' is experiencing {{ printf "%0.0f" $value }} errors / second.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
expr: |
sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods
annotations: annotations:
message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
to the limit of 110. }} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: | expr: |
kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -623,47 +852,51 @@ spec:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is returning errors for {{ $value }}% of requests. message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
/ /
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3 sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is returning errors for {{ $value }}% of requests. message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
/ /
sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1 sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is returning errors for {{ $value }}% of requests for message: API server is returning errors for {{ $value | humanizePercentage
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/ /
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10 sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is returning errors for {{ $value }}% of requests for message: API server is returning errors for {{ $value | humanizePercentage
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}. }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
}}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: | expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb) sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
/ /
sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5 sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -794,7 +1027,8 @@ spec:
- alert: PrometheusDuplicateTimestamps - alert: PrometheusDuplicateTimestamps
annotations: annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{$value | humanize}} samples/s with different values but duplicated timestamp. {{ printf "%.4g" $value }} samples/s with different values but duplicated
timestamp.
summary: Prometheus is dropping samples with duplicate timestamps. summary: Prometheus is dropping samples with duplicate timestamps.
expr: | expr: |
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@ -804,7 +1038,7 @@ spec:
- alert: PrometheusOutOfOrderTimestamps - alert: PrometheusOutOfOrderTimestamps
annotations: annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
{{$value | humanize}} samples/s with timestamps arriving out of order. {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
summary: Prometheus drops samples with out-of-order timestamps. summary: Prometheus drops samples with out-of-order timestamps.
expr: | expr: |
rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@ -848,6 +1082,25 @@ spec:
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
- alert: PrometheusRemoteWriteDesiredShards
annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
desired shards calculation wants to run {{ printf $value }} shards, which
is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
$labels.instance | query | first | value }}.
summary: Prometheus remote write desired shards calculation wants to run more
than configured max shards.
expr: |
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
> on(job, instance) group_right
max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
)
for: 15m
labels:
severity: warning
- alert: PrometheusRuleFailures - alert: PrometheusRuleFailures
annotations: annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
@ -902,8 +1155,10 @@ spec:
rules: rules:
- alert: TargetDown - alert: TargetDown
annotations: annotations:
message: '{{ $value }}% of the {{ $labels.job }} targets are down.' message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }} targets are
expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 down.'
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10
for: 10m for: 10m
labels: labels:
severity: warning severity: warning
@ -918,26 +1173,6 @@ spec:
expr: vector(1) expr: vector(1)
labels: labels:
severity: none severity: none
- name: kube-prometheus-node-alerting.rules
rules:
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
full within the next 24 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
for: 30m
labels:
severity: warning
- alert: NodeDiskRunningFull
annotations:
message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
full within the next 2 hours.
expr: |
(node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
for: 10m
labels:
severity: critical
- name: node-time - name: node-time
rules: rules:
- alert: ClockSkewDetected - alert: ClockSkewDetected
@ -951,24 +1186,6 @@ spec:
severity: warning severity: warning
- name: node-network - name: node-network
rules: rules:
- alert: NetworkReceiveErrors
annotations:
message: Network interface "{{ $labels.device }}" showing receive errors on
node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
expr: |
rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
for: 2m
labels:
severity: warning
- alert: NetworkTransmitErrors
annotations:
message: Network interface "{{ $labels.device }}" showing transmit errors
on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
expr: |
rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
for: 2m
labels:
severity: warning
- alert: NodeNetworkInterfaceFlapping - alert: NodeNetworkInterfaceFlapping
annotations: annotations:
message: Network interface "{{ $labels.device }}" changing it's up status message: Network interface "{{ $labels.device }}" changing it's up status