mirror of
				https://github.com/carlosedp/cluster-monitoring.git
				synced 2025-10-26 10:23:04 +01:00 
			
		
		
		
	Update libs and regenerate manifests
This commit is contained in:
		
							parent
							
								
									19bd000f3e
								
							
						
					
					
						commit
						8ef44ef1ce
					
				@ -8,7 +8,7 @@
 | 
			
		||||
                    "subdir": "jsonnet/kube-prometheus"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "4adb70b017e9a4ecb884a636dfef6fcae7d4bed8"
 | 
			
		||||
            "version": "da959c643657c7d2aac6f5ddd68582a949283c49"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "ksonnet",
 | 
			
		||||
@ -28,7 +28,7 @@
 | 
			
		||||
                    "subdir": ""
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "0afc72e70df6048c6b65fd3e4968e53b0812b30c"
 | 
			
		||||
            "version": "193d4934f85c9ff596d1f3e4dce7bd2da62a4d5e"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "grafonnet",
 | 
			
		||||
@ -48,7 +48,7 @@
 | 
			
		||||
                    "subdir": "grafana-builder"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "3c44dfa9bfe2b66985733d4b16e0afd29094b4a0"
 | 
			
		||||
            "version": "565bf6b51d636e0efe4add39f2ab8e2b1abb731f"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "grafana",
 | 
			
		||||
@ -58,7 +58,7 @@
 | 
			
		||||
                    "subdir": "grafana"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "c27d2792764867cdaf6484f067cc875cb8aef2f6"
 | 
			
		||||
            "version": "7fadaf2274d5cbe4ac6fbaf8786e4b7ecf3c1713"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "prometheus-operator",
 | 
			
		||||
@ -78,7 +78,7 @@
 | 
			
		||||
                    "subdir": "Documentation/etcd-mixin"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "43ce2eefaa0a4bdd5c1e825ff08a32e6e46f3343"
 | 
			
		||||
            "version": "8037e6e08727d4a17649f782cb4dbc482b8fe780"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "prometheus",
 | 
			
		||||
@ -88,7 +88,7 @@
 | 
			
		||||
                    "subdir": "documentation/prometheus-mixin"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "ff40de7ca6084f5aab1f3971025c00c217615589"
 | 
			
		||||
            "version": "f0bb8129c3e6ffc6906bdc130f5625110643f168"
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@ -1,4 +1,4 @@
 | 
			
		||||
apiVersion: apps/v1beta2
 | 
			
		||||
apiVersion: apps/v1
 | 
			
		||||
kind: Deployment
 | 
			
		||||
metadata:
 | 
			
		||||
  labels:
 | 
			
		||||
@ -51,12 +51,6 @@ spec:
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/coredns-dashboard
 | 
			
		||||
          name: grafana-dashboard-coredns-dashboard
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use
 | 
			
		||||
          name: grafana-dashboard-k8s-cluster-rsrc-use
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-node-rsrc-use
 | 
			
		||||
          name: grafana-dashboard-k8s-node-rsrc-use
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-cluster
 | 
			
		||||
          readOnly: false
 | 
			
		||||
@ -78,9 +72,6 @@ spec:
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
 | 
			
		||||
          name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/nodes
 | 
			
		||||
          name: grafana-dashboard-nodes
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
 | 
			
		||||
          name: grafana-dashboard-persistentvolumesusage
 | 
			
		||||
          readOnly: false
 | 
			
		||||
@ -135,12 +126,6 @@ spec:
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-coredns-dashboard
 | 
			
		||||
        name: grafana-dashboard-coredns-dashboard
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-cluster-rsrc-use
 | 
			
		||||
        name: grafana-dashboard-k8s-cluster-rsrc-use
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-node-rsrc-use
 | 
			
		||||
        name: grafana-dashboard-k8s-node-rsrc-use
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-cluster
 | 
			
		||||
        name: grafana-dashboard-k8s-resources-cluster
 | 
			
		||||
@ -162,9 +147,6 @@ spec:
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
        name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-nodes
 | 
			
		||||
        name: grafana-dashboard-nodes
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-persistentvolumesusage
 | 
			
		||||
        name: grafana-dashboard-persistentvolumesusage
 | 
			
		||||
 | 
			
		||||
@ -153,177 +153,9 @@ spec:
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        ))
 | 
			
		||||
      record: node:node_num_cpu:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
 | 
			
		||||
      record: :node_cpu_utilisation:avg1m
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1 - avg by (node) (
 | 
			
		||||
          rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:)
 | 
			
		||||
      record: node:node_cpu_utilisation:avg1m
 | 
			
		||||
    - expr: |
 | 
			
		||||
        node:node_cpu_utilisation:avg1m
 | 
			
		||||
          *
 | 
			
		||||
        node:node_num_cpu:sum
 | 
			
		||||
          /
 | 
			
		||||
        scalar(sum(node:node_num_cpu:sum))
 | 
			
		||||
      record: node:cluster_cpu_utilisation:ratio
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(node_load1{job="node-exporter"})
 | 
			
		||||
        /
 | 
			
		||||
        sum(node:node_num_cpu:sum)
 | 
			
		||||
      record: ':node_cpu_saturation_load1:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          node_load1{job="node-exporter"}
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
        /
 | 
			
		||||
        node:node_num_cpu:sum
 | 
			
		||||
      record: 'node:node_cpu_saturation_load1:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1 -
 | 
			
		||||
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
 | 
			
		||||
        /
 | 
			
		||||
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
 | 
			
		||||
      record: ':node_memory_utilisation:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
 | 
			
		||||
      record: :node_memory_MemFreeCachedBuffers_bytes:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(node_memory_MemTotal_bytes{job="node-exporter"})
 | 
			
		||||
      record: :node_memory_MemTotal_bytes:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
 | 
			
		||||
          * on (namespace, pod) group_left(node)
 | 
			
		||||
            node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_memory_bytes_available:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          node_memory_MemTotal_bytes{job="node-exporter"}
 | 
			
		||||
          * on (namespace, pod) group_left(node)
 | 
			
		||||
            node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_memory_bytes_total:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
 | 
			
		||||
        /
 | 
			
		||||
        node:node_memory_bytes_total:sum
 | 
			
		||||
      record: node:node_memory_utilisation:ratio
 | 
			
		||||
    - expr: |
 | 
			
		||||
        (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
 | 
			
		||||
        /
 | 
			
		||||
        scalar(sum(node:node_memory_bytes_total:sum))
 | 
			
		||||
      record: node:cluster_memory_utilisation:ratio
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1e3 * sum(
 | 
			
		||||
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
 | 
			
		||||
         + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
 | 
			
		||||
        )
 | 
			
		||||
      record: :node_memory_swap_io_bytes:sum_rate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1 -
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
        /
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          node_memory_MemTotal_bytes{job="node-exporter"}
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: 'node:node_memory_utilisation:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
 | 
			
		||||
      record: 'node:node_memory_utilisation_2:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        1e3 * sum by (node) (
 | 
			
		||||
          (rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
 | 
			
		||||
         + rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
 | 
			
		||||
         * on (namespace, pod) group_left(node)
 | 
			
		||||
           node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_memory_swap_io_bytes:sum_rate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
 | 
			
		||||
      record: :node_disk_utilisation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg by (node) (
 | 
			
		||||
          irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_disk_utilisation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
 | 
			
		||||
      record: :node_disk_saturation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg by (node) (
 | 
			
		||||
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_disk_saturation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
 | 
			
		||||
        - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
 | 
			
		||||
        / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
 | 
			
		||||
      record: 'node:node_filesystem_usage:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
 | 
			
		||||
      record: 'node:node_filesystem_avail:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
 | 
			
		||||
        sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
 | 
			
		||||
      record: :node_net_utilisation:sum_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
 | 
			
		||||
          irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_net_utilisation:sum_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
 | 
			
		||||
        sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
 | 
			
		||||
      record: :node_net_saturation:sum_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (node) (
 | 
			
		||||
          (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
 | 
			
		||||
          irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_net_saturation:sum_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        max(
 | 
			
		||||
          max(
 | 
			
		||||
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
 | 
			
		||||
          ) by (node, host_ip)
 | 
			
		||||
          * on (host_ip) group_right (node)
 | 
			
		||||
          label_replace(
 | 
			
		||||
            (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
 | 
			
		||||
          )
 | 
			
		||||
        ) by (node)
 | 
			
		||||
      record: 'node:node_inodes_total:'
 | 
			
		||||
    - expr: |
 | 
			
		||||
        max(
 | 
			
		||||
          max(
 | 
			
		||||
            kube_pod_info{job="kube-state-metrics", host_ip!=""}
 | 
			
		||||
          ) by (node, host_ip)
 | 
			
		||||
          * on (host_ip) group_right (node)
 | 
			
		||||
          label_replace(
 | 
			
		||||
            (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
 | 
			
		||||
          )
 | 
			
		||||
        ) by (node)
 | 
			
		||||
      record: 'node:node_inodes_free:'
 | 
			
		||||
  - name: kube-prometheus-node-recording.rules
 | 
			
		||||
    rules:
 | 
			
		||||
    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
 | 
			
		||||
@ -446,17 +278,17 @@ spec:
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
 | 
			
		||||
      expr: |
 | 
			
		||||
        rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: KubePodNotReady
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
 | 
			
		||||
          state for longer than an hour.
 | 
			
		||||
          state for longer than 15 minutes.
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
 | 
			
		||||
      expr: |
 | 
			
		||||
        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: KubeDeploymentGenerationMismatch
 | 
			
		||||
@ -475,13 +307,13 @@ spec:
 | 
			
		||||
    - alert: KubeDeploymentReplicasMismatch
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
 | 
			
		||||
          matched the expected number of replicas for longer than an hour.
 | 
			
		||||
          matched the expected number of replicas for longer than 15 minutes.
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
 | 
			
		||||
      expr: |
 | 
			
		||||
        kube_deployment_spec_replicas{job="kube-state-metrics"}
 | 
			
		||||
          !=
 | 
			
		||||
        kube_deployment_status_replicas_available{job="kube-state-metrics"}
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
    - alert: KubeStatefulSetReplicasMismatch
 | 
			
		||||
@ -589,7 +421,7 @@ spec:
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
 | 
			
		||||
      expr: |
 | 
			
		||||
        kube_job_status_failed{job="kube-state-metrics"}  > 0
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
  - name: kubernetes-resources
 | 
			
		||||
@ -723,7 +555,7 @@ spec:
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
 | 
			
		||||
      expr: |
 | 
			
		||||
        kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: KubeVersionMismatch
 | 
			
		||||
@ -733,7 +565,7 @@ spec:
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
 | 
			
		||||
      expr: |
 | 
			
		||||
        count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
 | 
			
		||||
      for: 1h
 | 
			
		||||
      for: 15m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: KubeClientErrors
 | 
			
		||||
@ -949,17 +781,6 @@ spec:
 | 
			
		||||
      for: 4h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusTSDBWALCorruptions
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
 | 
			
		||||
          {{$value | humanize}} corruptions of the write-ahead log (WAL) over the
 | 
			
		||||
          last 3h.
 | 
			
		||||
        summary: Prometheus is detecting WAL corruptions.
 | 
			
		||||
      expr: |
 | 
			
		||||
        increase(tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
 | 
			
		||||
      for: 4h
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: PrometheusNotIngestingSamples
 | 
			
		||||
      annotations:
 | 
			
		||||
        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
 | 
			
		||||
@ -1054,7 +875,7 @@ spec:
 | 
			
		||||
        message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
 | 
			
		||||
          are out of sync.
 | 
			
		||||
      expr: |
 | 
			
		||||
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
 | 
			
		||||
        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
 | 
			
		||||
      for: 5m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user