# Mirror of https://github.com/carlosedp/cluster-monitoring.git, synced 2024-11-20 19:07:17 +01:00
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-k8s-rules
  namespace: monitoring
spec:
  groups:
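  # Recording rules that pre-aggregate container CPU/memory usage, resource requests, and pod-to-workload ownership by namespace.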
  - name: k8s.rules
    rules:
    - expr: |
        sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace)
      record: namespace:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum by (namespace, pod, container) (
          rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])
        )
      record: namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
    - expr: |
        sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(container_memory_usage_bytes{job="kubelet",image!="", container!="POD"}) by (pod, namespace)
          * on (namespace, pod)
          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
        )
      record: namespace:container_memory_usage_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod)
          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
        )
      record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
    - expr: |
        sum by (namespace, label_name) (
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
          * on (namespace, pod)
          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
        )
      record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
    - expr: |
        sum(
          label_replace(
            label_replace(
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
              "replicaset", "$1", "owner_name", "(.*)"
            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: deployment
      record: mixin_pod_workload
    - expr: |
        sum(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: daemonset
      record: mixin_pod_workload
    - expr: |
        sum(
          label_replace(
            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
            "workload", "$1", "owner_name", "(.*)"
          )
        ) by (namespace, workload, pod)
      labels:
        workload_type: statefulset
      record: mixin_pod_workload
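  # Pre-computed 99th/90th/50th percentile latencies for the scheduler's e2e scheduling, algorithm, and binding phases.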
  - name: kube-scheduler.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_e2e_scheduling_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_scheduling_algorithm_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(scheduler_binding_duration_seconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
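  # Pre-computed API server request-duration quantiles, consumed by the KubeAPILatencyHigh alerts further down.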
  - name: kube-apiserver.rules
    rules:
    - expr: |
        histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
      labels:
        quantile: "0.99"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
      labels:
        quantile: "0.9"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
    - expr: |
        histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod))
      labels:
        quantile: "0.5"
      record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
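  # Node-level utilisation/saturation recording rules (CPU, memory, swap I/O, disk, filesystem, network, inodes) joined to nodes via node_namespace_pod:kube_pod_info:.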
  - name: node.rules
    rules:
    - expr: sum(min(kube_pod_info) by (node))
      record: ':kube_pod_info_node_count:'
    - expr: |
        max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod)
      record: 'node_namespace_pod:kube_pod_info:'
    - expr: |
        count by (node) (sum by (node, cpu) (
          node_cpu_seconds_total{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        ))
      record: node:node_num_cpu:sum
    - expr: |
        1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
      record: :node_cpu_utilisation:avg1m
    - expr: |
        1 - avg by (node) (
          rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:)
      record: node:node_cpu_utilisation:avg1m
    - expr: |
        node:node_cpu_utilisation:avg1m
          *
        node:node_num_cpu:sum
          /
        scalar(sum(node:node_num_cpu:sum))
      record: node:cluster_cpu_utilisation:ratio
    - expr: |
        sum(node_load1{job="node-exporter"})
        /
        sum(node:node_num_cpu:sum)
      record: ':node_cpu_saturation_load1:'
    - expr: |
        sum by (node) (
          node_load1{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        node:node_num_cpu:sum
      record: 'node:node_cpu_saturation_load1:'
    - expr: |
        1 -
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
        /
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: ':node_memory_utilisation:'
    - expr: |
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
      record: :node_memory_MemFreeCachedBuffers_bytes:sum
    - expr: |
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
      record: :node_memory_MemTotal_bytes:sum
    - expr: |
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_available:sum
    - expr: |
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_bytes_total:sum
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        node:node_memory_bytes_total:sum
      record: node:node_memory_utilisation:ratio
    - expr: |
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
        /
        scalar(sum(node:node_memory_bytes_total:sum))
      record: node:cluster_memory_utilisation:ratio
    - expr: |
        1e3 * sum(
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
        )
      record: :node_memory_swap_io_bytes:sum_rate
    - expr: |
        1 -
        sum by (node) (
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
        /
        sum by (node) (
          node_memory_MemTotal_bytes{job="node-exporter"}
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: 'node:node_memory_utilisation:'
    - expr: |
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
      record: 'node:node_memory_utilisation_2:'
    - expr: |
        1e3 * sum by (node) (
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
          + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_memory_swap_io_bytes:sum_rate
    - expr: |
        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      record: :node_disk_utilisation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_utilisation:avg_irate
    - expr: |
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
      record: :node_disk_saturation:avg_irate
    - expr: |
        avg by (node) (
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_disk_saturation:avg_irate
    - expr: |
        max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_usage:'
    - expr: |
        max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
      record: 'node:node_filesystem_avail:'
    - expr: |
        sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
      record: :node_net_utilisation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
          irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_utilisation:sum_irate
    - expr: |
        sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
        sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
      record: :node_net_saturation:sum_irate
    - expr: |
        sum by (node) (
          (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
          irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
          * on (namespace, pod) group_left(node)
          node_namespace_pod:kube_pod_info:
        )
      record: node:node_net_saturation:sum_irate
    - expr: |
        max(
          max(
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
          ) by (node, host_ip)
          * on (host_ip) group_right (node)
          label_replace(
            (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
          )
        ) by (node)
      record: 'node:node_inodes_total:'
    - expr: |
        max(
          max(
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
          ) by (node, host_ip)
          * on (host_ip) group_right (node)
          label_replace(
            (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
          )
        ) by (node)
      record: 'node:node_inodes_free:'
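  # Per-instance and cluster-wide node CPU, filesystem, and network recording rules.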
  - name: kube-prometheus-node-recording.rules
    rules:
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
      record: instance:node_cpu:rate:sum
    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance)
      record: instance:node_filesystem_usage:sum
    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
      record: instance:node_network_transmit_bytes:rate:sum
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) BY (instance, cpu)) BY (instance)
      record: instance:node_cpu:ratio
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
      record: cluster:node_cpu:sum_rate5m
    - expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu_seconds_total) BY (instance, cpu))
      record: cluster:node_cpu:ratio
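  # "Target disappeared" alerts: each fires when a scrape job has had no up==1 series for 15 minutes.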
  - name: kubernetes-absent
    rules:
    - alert: AlertmanagerDown
      annotations:
        message: Alertmanager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
      expr: |
        absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: CoreDNSDown
      annotations:
        message: CoreDNS has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown
      expr: |
        absent(up{job="kube-dns"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeAPIDown
      annotations:
        message: KubeAPI has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown
      expr: |
        absent(up{job="apiserver"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeControllerManagerDown
      annotations:
        message: KubeControllerManager has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown
      expr: |
        absent(up{job="kube-controller-manager"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeSchedulerDown
      annotations:
        message: KubeScheduler has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown
      expr: |
        absent(up{job="kube-scheduler"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeStateMetricsDown
      annotations:
        message: KubeStateMetrics has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown
      expr: |
        absent(up{job="kube-state-metrics"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: KubeletDown
      annotations:
        message: Kubelet has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
      expr: |
        absent(up{job="kubelet"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: NodeExporterDown
      annotations:
        message: NodeExporter has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown
      expr: |
        absent(up{job="node-exporter"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusDown
      annotations:
        message: Prometheus has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
      expr: |
        absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusOperatorDown
      annotations:
        message: PrometheusOperator has disappeared from Prometheus target discovery.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
      expr: |
        absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
      for: 15m
      labels:
        severity: critical
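  # Workload health alerts: crash loops, not-ready pods, replica/generation mismatches, stuck rollouts, and slow or failed Jobs/CronJobs.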
  - name: kubernetes-apps
    rules:
    - alert: KubePodCrashLooping
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
      expr: |
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubePodNotReady
      annotations:
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
      for: 1h
      labels:
        severity: critical
    - alert: KubeDeploymentGenerationMismatch
      annotations:
        message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment }} does not match; this indicates that the Deployment has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch
      expr: |
        kube_deployment_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_deployment_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeDeploymentReplicasMismatch
      annotations:
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not matched the expected number of replicas for longer than an hour.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
      expr: |
        kube_deployment_spec_replicas{job="kube-state-metrics"}
        !=
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
      for: 1h
      labels:
        severity: critical
    - alert: KubeStatefulSetReplicasMismatch
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has not matched the expected number of replicas for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
      expr: |
        kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
        !=
        kube_statefulset_status_replicas{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetGenerationMismatch
      annotations:
        message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset }} does not match; this indicates that the StatefulSet has failed but has not been rolled back.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch
      expr: |
        kube_statefulset_status_observed_generation{job="kube-state-metrics"}
        !=
        kube_statefulset_metadata_generation{job="kube-state-metrics"}
      for: 15m
      labels:
        severity: critical
    - alert: KubeStatefulSetUpdateNotRolledOut
      annotations:
        message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update has not been rolled out.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout
      expr: |
        max without (revision) (
          kube_statefulset_status_current_revision{job="kube-state-metrics"}
            unless
          kube_statefulset_status_update_revision{job="kube-state-metrics"}
        )
          *
        (
          kube_statefulset_replicas{job="kube-state-metrics"}
            !=
          kube_statefulset_status_replicas_updated{job="kube-state-metrics"}
        )
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetRolloutStuck
      annotations:
        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
      expr: |
        kube_daemonset_status_number_ready{job="kube-state-metrics"}
        /
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
      for: 15m
      labels:
        severity: critical
    - alert: KubeDaemonSetNotScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are not scheduled.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled
      expr: |
        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"}
        -
        kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeDaemonSetMisScheduled
      annotations:
        message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} are running where they are not supposed to run.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled
      expr: |
        kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0
      for: 10m
      labels:
        severity: warning
    - alert: KubeCronJobRunning
      annotations:
        message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning
      expr: |
        time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobCompletion
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more than one hour to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion
      expr: |
        kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeJobFailed
      annotations:
        message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
      expr: |
        kube_job_status_failed{job="kube-state-metrics"} > 0
      for: 1h
      labels:
        severity: warning
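  # Cluster CPU/memory overcommit, ResourceQuota usage, and CPU throttling alerts.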
  - name: kubernetes-resources
    rules:
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum)
        /
        sum(kube_node_status_allocatable_cpu_cores)
        >
        (count(kube_node_status_allocatable_cpu_cores)-1) / count(kube_node_status_allocatable_cpu_cores)
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Pods and cannot tolerate node failure.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum)
        /
        sum(kube_node_status_allocatable_memory_bytes)
        >
        (count(kube_node_status_allocatable_memory_bytes)-1)
        /
        count(kube_node_status_allocatable_memory_bytes)
      for: 5m
      labels:
        severity: warning
    - alert: KubeCPUOvercommit
      annotations:
        message: Cluster has overcommitted CPU resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"})
        /
        sum(kube_node_status_allocatable_cpu_cores)
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeMemOvercommit
      annotations:
        message: Cluster has overcommitted memory resource requests for Namespaces.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
      expr: |
        sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"})
        /
        sum(kube_node_status_allocatable_memory_bytes{job="node-exporter"})
        > 1.5
      for: 5m
      labels:
        severity: warning
    - alert: KubeQuotaExceeded
      annotations:
        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value }}% of its {{ $labels.resource }} quota.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
      expr: |
        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
          / ignoring(instance, job, type)
        (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
          > 90
      for: 15m
      labels:
        severity: warning
    - alert: CPUThrottlingHigh
      annotations:
        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
      expr: |
        100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
          /
        sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
          > 25
      for: 15m
      labels:
        severity: warning
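  # PersistentVolume capacity and status alerts based on kubelet volume stats.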
  - name: kubernetes-storage
    rules:
    - alert: KubePersistentVolumeUsageCritical
      annotations:
        message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value }}% free.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
      expr: |
        100 * kubelet_volume_stats_available_bytes{job="kubelet"}
          /
        kubelet_volume_stats_capacity_bytes{job="kubelet"}
          < 3
      for: 1m
      labels:
        severity: critical
    - alert: KubePersistentVolumeFullInFourDays
      annotations:
        message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} is expected to fill up within four days. Currently {{ printf "%0.2f" $value }}% is available.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
      expr: |
        100 * (
          kubelet_volume_stats_available_bytes{job="kubelet"}
            /
          kubelet_volume_stats_capacity_bytes{job="kubelet"}
        ) < 15
        and
        predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
      for: 5m
      labels:
        severity: critical
    - alert: KubePersistentVolumeErrors
      annotations:
        message: The persistent volume {{ $labels.persistentvolume }} has status {{ $labels.phase }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors
      expr: |
        kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
      for: 5m
      labels:
        severity: critical
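  # Control-plane health alerts: node readiness, version skew, client errors, kubelet pod count, API latency/errors, certificate expiry.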
  - name: kubernetes-system
    rules:
    - alert: KubeNodeNotReady
      annotations:
        message: '{{ $labels.node }} has been unready for more than an hour.'
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
      expr: |
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
      for: 1h
      labels:
        severity: warning
    - alert: KubeVersionMismatch
      annotations:
        message: There are {{ $value }} different semantic versions of Kubernetes components running.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
      expr: |
        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
      for: 1h
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }}% errors.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
          /
        sum(rate(rest_client_requests_total[5m])) by (instance, job))
        * 100 > 1
      for: 15m
      labels:
        severity: warning
    - alert: KubeClientErrors
      annotations:
        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
      expr: |
        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
      for: 15m
      labels:
        severity: warning
    - alert: KubeletTooManyPods
      annotations:
        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close to the limit of 110.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
      expr: |
        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
      for: 15m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPILatencyHigh
      annotations:
        message: The API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
      expr: |
        cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
      for: 10m
      labels:
        severity: warning
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
      for: 10m
      labels:
        severity: critical
    - alert: KubeAPIErrorsHigh
      annotations:
        message: API server is returning errors for {{ $value }}% of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
      expr: |
        sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
          /
        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
      for: 10m
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 7.0 days.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
      labels:
        severity: warning
    - alert: KubeClientCertificateExpiration
      annotations:
        message: A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
      expr: |
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
      labels:
        severity: critical
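  # Self-monitoring alerts for the prometheus-k8s instances: config reloads, notification queue, TSDB, remote write, and rule evaluation.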
  - name: prometheus
    rules:
    - alert: PrometheusBadConfig
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to reload its configuration.
        summary: Failed Prometheus configuration reload.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"}[5m]) == 0
      for: 10m
      labels:
        severity: critical
    - alert: PrometheusNotificationQueueRunningFull
      annotations:
        description: Alert notification queue of Prometheus {{$labels.namespace}}/{{$labels.pod}} is running full.
        summary: Prometheus alert notification queue predicted to run full in less than 30m.
      expr: |
        # Without min_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30)
        >
          min_over_time(prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlertsToSomeAlertmanagers
      annotations:
        description: '{{ printf "%.1f" $value }}% errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to Alertmanager {{$labels.alertmanager}}.'
        summary: Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.
      expr: |
        (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: warning
    - alert: PrometheusErrorSendingAlertsToAnyAlertmanager
      annotations:
        description: '{{ printf "%.1f" $value }}% minimum errors while sending alerts from Prometheus {{$labels.namespace}}/{{$labels.pod}} to any Alertmanager.'
        summary: Prometheus encounters more than 3% errors sending alerts to any Alertmanager.
      expr: |
        min without(alertmanager) (
          rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        * 100
        > 3
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusNotConnectedToAlertmanagers
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not connected to any Alertmanagers.
        summary: Prometheus is not connected to any Alertmanagers.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        max_over_time(prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"}[5m]) < 1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusTSDBReloadsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} reload failures over the last 3h.
        summary: Prometheus has issues reloading blocks from disk.
      expr: |
        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusTSDBCompactionsFailing
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} compaction failures over the last 3h.
        summary: Prometheus has issues compacting blocks.
      expr: |
        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusTSDBWALCorruptions
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.
        summary: Prometheus is detecting WAL corruptions.
      expr: |
        increase(tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
      for: 4h
      labels:
        severity: warning
    - alert: PrometheusNotIngestingSamples
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting samples.
        summary: Prometheus is not ingesting samples.
      expr: |
        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusDuplicateTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.
        summary: Prometheus is dropping samples with duplicate timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOutOfOrderTimestamps
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.
        summary: Prometheus drops samples with out-of-order timestamps.
      expr: |
        rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusRemoteStorageFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send {{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}.
        summary: Prometheus fails to send samples to remote storage.
      expr: |
        (
          rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
        /
          (
            rate(prometheus_remote_storage_failed_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          +
            rate(prometheus_remote_storage_succeeded_samples_total{job="prometheus-k8s",namespace="monitoring"}[5m])
          )
        )
        * 100
        > 1
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusRemoteWriteBehind
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}.
        summary: Prometheus remote write is behind.
      expr: |
        # Without max_over_time, failed scrapes could create false negatives, see
        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
        (
          max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        - on(job, instance) group_right
          max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{job="prometheus-k8s",namespace="monitoring"}[5m])
        )
        > 120
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusRuleFailures
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to evaluate {{ printf "%.0f" $value }} rules in the last 5m.
        summary: Prometheus is failing rule evaluations.
      expr: |
        increase(prometheus_rule_evaluation_failures_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: critical
    - alert: PrometheusMissingRuleEvaluations
      annotations:
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has missed {{ printf "%.0f" $value }} rule group evaluations in the last 5m.
        summary: Prometheus is missing rule evaluations due to slow rule group evaluation.
      expr: |
        increase(prometheus_rule_group_iterations_missed_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
      for: 15m
      labels:
        severity: warning
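  # Alertmanager configuration consistency, reload, and cluster-membership alerts.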
  - name: alertmanager.rules
    rules:
    - alert: AlertmanagerConfigInconsistent
      annotations:
        message: The configurations of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync.
      expr: |
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
      for: 5m
      labels:
        severity: critical
    - alert: AlertmanagerFailedReload
      annotations:
        message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: |
        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
      for: 10m
      labels:
        severity: warning
    - alert: AlertmanagerMembersInconsistent
      annotations:
        message: Alertmanager has not found all other members of the cluster.
      expr: |
        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
          != on (service) GROUP_LEFT()
        count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
      for: 5m
      labels:
        severity: critical
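  # Generic target-down alert plus the always-firing Watchdog used to verify the alerting pipeline end to end.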
  - name: general.rules
    rules:
    - alert: TargetDown
      annotations:
        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
      for: 10m
      labels:
        severity: warning
    - alert: Watchdog
      annotations:
        message: |
          This is an alert meant to ensure that the entire alerting pipeline is functional.
          This alert is always firing, therefore it should always be firing in Alertmanager
          and always fire against a receiver. There are integrations with various notification
          mechanisms that send a notification when this alert is not firing. For example the
          "DeadMansSnitch" integration in PagerDuty.
      expr: vector(1)
      labels:
        severity: none
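  # Disk-full prediction alerts built on the node:node_filesystem_usage:/node:node_filesystem_avail: recording rules above.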
  - name: kube-prometheus-node-alerting.rules
    rules:
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 24 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
      for: 30m
      labels:
        severity: warning
    - alert: NodeDiskRunningFull
      annotations:
        message: Device {{ $labels.device }} on node {{ $labels.instance }} will be full within the next 2 hours.
      expr: |
        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
      for: 10m
      labels:
        severity: critical
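  # Clock-skew alert based on node_timex_offset_seconds from node-exporter.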
  - name: node-time
    rules:
    - alert: ClockSkewDetected
      annotations:
        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}. Ensure NTP is configured correctly on this host.
      expr: |
        abs(node_timex_offset_seconds{job="node-exporter"}) > 0.05
      for: 2m
      labels:
        severity: warning
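  # Network interface error and link-flapping alerts from node-exporter.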
  - name: node-network
    rules:
    - alert: NetworkReceiveErrors
      annotations:
        message: Network interface "{{ $labels.device }}" showing receive errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: |
        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
      for: 2m
      labels:
        severity: warning
    - alert: NetworkTransmitErrors
      annotations:
        message: Network interface "{{ $labels.device }}" showing transmit errors on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: |
        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
      for: 2m
      labels:
        severity: warning
    - alert: NodeNetworkInterfaceFlapping
      annotations:
        message: Network interface "{{ $labels.device }}" changing its up status often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}.
      expr: |
        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
      for: 2m
      labels:
        severity: warning
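  # Prometheus Operator reconciliation and node lookup error alerts.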
  - name: prometheus-operator
    rules:
    - alert: PrometheusOperatorReconcileErrors
      annotations:
        message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning
    - alert: PrometheusOperatorNodeLookupErrors
      annotations:
        message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
      expr: |
        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
      for: 10m
      labels:
        severity: warning