# Mirror of https://github.com/carlosedp/cluster-monitoring.git (synced 2024-11-25)
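# Prometheus recording and alerting rules for the cluster-monitoring stack. Each key under
# `data:` below is a complete Prometheus 2.x rule file; the `role: prometheus-rulefiles` and
# `prometheus: k8s` labels are presumably what the Prometheus Operator's rule-file selector
# matches in this stack when mounting the files into the prometheus-k8s pods.
# A single rule file can be checked offline with promtool, for example (the namespace is an
# assumption; adjust to your deployment):
#   kubectl -n monitoring get configmap prometheus-k8s-rules -o jsonpath='{.data.node\.rules\.yaml}' > node.rules.yaml
#   promtool check rules node.rules.yaml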
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
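      # Note: the expression below counts how many Alertmanager instances report each distinct
      # config hash and compares that against the replica count the operator expects
      # (label_replace rewrites the `alertmanager` label into the matching `service` label);
      # any ratio other than 1 means the replicas are not all running the same configuration.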
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` is out of sync.
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
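  # The etcd rules below assume the etcd members are scraped under a job named "etcd"
  # (every expression filters on job="etcd"); adjust the job label if your scrape config differs.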
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down, the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
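      # DeadMansSwitch intentionally evaluates vector(1), so it is always firing; routing it to
      # an external heartbeat or paging service lets you detect a broken alerting pipeline,
      # because the heartbeat stops only when Prometheus or Alertmanager stops delivering it.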
      - alert: DeadMansSwitch
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: This is a DeadMansSwitch meant to ensure that the entire Alerting
            pipeline is functional.
          summary: Alerting DeadMansSwitch
      - record: fd_utilization
        expr: process_open_fds / process_max_fds
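      # fd_utilization is the fraction of a process's file-descriptor limit currently in use;
      # the two alerts below use predict_linear to extrapolate that fraction 4 hours and 1 hour
      # ahead and fire when the projection crosses 1 (i.e. descriptor exhaustion is predicted).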
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust its file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
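      # The scheduler latency metrics are exposed in microseconds, so each recording rule below
      # divides by 1e+06 to store the quantile in seconds, following the
      # level:metric:operations naming convention for recorded series.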
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon
            set {{$labels.namespaces}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 1
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
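      # apiserver_request_latencies_bucket is reported in microseconds as well; the quantile
      # records below convert it to seconds, and they are the series the APIServerLatencyHigh
      # alerts query.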
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery
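      # apiserver_client_certificate_expiration_seconds_bucket is a histogram of remaining
      # client-certificate lifetimes; a non-zero count in the le="604800" (7 day) or
      # le="86400" (1 day) bucket means at least one certificate expires within that window.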
      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
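      # instance:node_cpu:ratio divides the non-idle CPU rate per instance by that instance's
      # core count (the number of per-cpu series), giving a 0-1 utilization ratio; the
      # cluster-level records below apply the same idea across all nodes.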
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
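  # Self-monitoring rules: these watch Prometheus's own config reloads, notification queue,
  # alert delivery to Alertmanager, and TSDB health.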
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last two hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last two hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted