mirror of
				https://github.com/carlosedp/cluster-monitoring.git
				synced 2025-10-26 10:23:04 +01:00 
			
		
		
		
	Change references to kube-prometheus library. Update libs and regenerate manifests
This commit is contained in:
		
							parent
							
								
									1f50c68326
								
							
						
					
					
						commit
						a2f54dddd0
					
				@ -4,8 +4,8 @@
 | 
			
		||||
            "name": "kube-prometheus",
 | 
			
		||||
            "source": {
 | 
			
		||||
                "git": {
 | 
			
		||||
                    "remote": "https://github.com/coreos/prometheus-operator",
 | 
			
		||||
                    "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
 | 
			
		||||
                    "remote": "https://github.com/coreos/kube-prometheus",
 | 
			
		||||
                    "subdir": "jsonnet/kube-prometheus"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "master"
 | 
			
		||||
 | 
			
		||||
@ -4,11 +4,11 @@
 | 
			
		||||
            "name": "kube-prometheus",
 | 
			
		||||
            "source": {
 | 
			
		||||
                "git": {
 | 
			
		||||
                    "remote": "https://github.com/coreos/prometheus-operator",
 | 
			
		||||
                    "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
 | 
			
		||||
                    "remote": "https://github.com/coreos/kube-prometheus",
 | 
			
		||||
                    "subdir": "jsonnet/kube-prometheus"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "650359b3e627ae97a1f18cbd10d7ed9b2293c240"
 | 
			
		||||
            "version": "7bd745ef78dce3c14ef9e315506d0c1c32fdcc9e"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "ksonnet",
 | 
			
		||||
@ -28,7 +28,7 @@
 | 
			
		||||
                    "subdir": ""
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "19db38fc449df024446059f21d5a329babaa3927"
 | 
			
		||||
            "version": "7360753d27aa428758c918434503c1c35afcd3bb"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "grafonnet",
 | 
			
		||||
@ -48,7 +48,7 @@
 | 
			
		||||
                    "subdir": "grafana-builder"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "e30a6040f3d7270655a980ab04d16142da4b429d"
 | 
			
		||||
            "version": "ecaeaed2e21c0db29098811e7826a9b923e706c5"
 | 
			
		||||
        },
 | 
			
		||||
        {
 | 
			
		||||
            "name": "grafana",
 | 
			
		||||
@ -78,7 +78,7 @@
 | 
			
		||||
                    "subdir": "Documentation/etcd-mixin"
 | 
			
		||||
                }
 | 
			
		||||
            },
 | 
			
		||||
            "version": "7a5acb4a43aa06bd9e32ab59a46271ab88d497e4"
 | 
			
		||||
            "version": "216808eab50f74e02410e96878bbf2175d2916cb"
 | 
			
		||||
        }
 | 
			
		||||
    ]
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
@ -13,3 +13,4 @@ spec:
 | 
			
		||||
  selector:
 | 
			
		||||
    alertmanager: main
 | 
			
		||||
    app: alertmanager
 | 
			
		||||
  sessionAffinity: ClientIP
 | 
			
		||||
 | 
			
		||||
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							@ -60,6 +60,12 @@ spec:
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-pod
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-workload
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-workloads-namespace
 | 
			
		||||
          readOnly: false
 | 
			
		||||
        - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
 | 
			
		||||
          name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
          readOnly: false
 | 
			
		||||
@ -114,6 +120,12 @@ spec:
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-pod
 | 
			
		||||
        name: grafana-dashboard-k8s-resources-pod
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-workload
 | 
			
		||||
        name: grafana-dashboard-k8s-resources-workload
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-k8s-resources-workloads-namespace
 | 
			
		||||
        name: grafana-dashboard-k8s-resources-workloads-namespace
 | 
			
		||||
      - configMap:
 | 
			
		||||
          name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
        name: grafana-dashboard-kubernetes-cluster-dashboard
 | 
			
		||||
 | 
			
		||||
@ -22,6 +22,7 @@ spec:
 | 
			
		||||
        - --path.rootfs=/host/root
 | 
			
		||||
        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
 | 
			
		||||
        - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
 | 
			
		||||
        - --collector.ntp
 | 
			
		||||
        image: carlosedp/node_exporter:v0.17.0
 | 
			
		||||
        name: node-exporter
 | 
			
		||||
        resources:
 | 
			
		||||
 | 
			
		||||
@ -3,7 +3,7 @@ data:
 | 
			
		||||
  config.yaml: |
 | 
			
		||||
    resourceRules:
 | 
			
		||||
      cpu:
 | 
			
		||||
        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
 | 
			
		||||
        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>)
 | 
			
		||||
        nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
 | 
			
		||||
        resources:
 | 
			
		||||
          overrides:
 | 
			
		||||
@ -15,7 +15,7 @@ data:
 | 
			
		||||
              resource: pod
 | 
			
		||||
        containerLabel: container_name
 | 
			
		||||
      memory:
 | 
			
		||||
        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
 | 
			
		||||
        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>)
 | 
			
		||||
        nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
 | 
			
		||||
        resources:
 | 
			
		||||
          overrides:
 | 
			
		||||
 | 
			
		||||
@ -44,11 +44,44 @@ spec:
 | 
			
		||||
      record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum by (namespace, label_name) (
 | 
			
		||||
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
 | 
			
		||||
          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
 | 
			
		||||
        * on (namespace, pod) group_left(label_name)
 | 
			
		||||
          label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
 | 
			
		||||
        )
 | 
			
		||||
      record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(
 | 
			
		||||
          label_replace(
 | 
			
		||||
            label_replace(
 | 
			
		||||
              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
 | 
			
		||||
              "replicaset", "$1", "owner_name", "(.*)"
 | 
			
		||||
            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
 | 
			
		||||
            "workload", "$1", "owner_name", "(.*)"
 | 
			
		||||
          )
 | 
			
		||||
        ) by (namespace, workload, pod)
 | 
			
		||||
      labels:
 | 
			
		||||
        workload_type: deployment
 | 
			
		||||
      record: mixin_pod_workload
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(
 | 
			
		||||
          label_replace(
 | 
			
		||||
            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
 | 
			
		||||
            "workload", "$1", "owner_name", "(.*)"
 | 
			
		||||
          )
 | 
			
		||||
        ) by (namespace, workload, pod)
 | 
			
		||||
      labels:
 | 
			
		||||
        workload_type: daemonset
 | 
			
		||||
      record: mixin_pod_workload
 | 
			
		||||
    - expr: |
 | 
			
		||||
        sum(
 | 
			
		||||
          label_replace(
 | 
			
		||||
            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
 | 
			
		||||
            "workload", "$1", "owner_name", "(.*)"
 | 
			
		||||
          )
 | 
			
		||||
        ) by (namespace, workload, pod)
 | 
			
		||||
      labels:
 | 
			
		||||
        workload_type: statefulset
 | 
			
		||||
      record: mixin_pod_workload
 | 
			
		||||
  - name: kube-scheduler.rules
 | 
			
		||||
    rules:
 | 
			
		||||
    - expr: |
 | 
			
		||||
@ -235,11 +268,11 @@ spec:
 | 
			
		||||
        )
 | 
			
		||||
      record: node:node_disk_utilisation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
 | 
			
		||||
        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
 | 
			
		||||
      record: :node_disk_saturation:avg_irate
 | 
			
		||||
    - expr: |
 | 
			
		||||
        avg by (node) (
 | 
			
		||||
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
 | 
			
		||||
          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
 | 
			
		||||
        * on (namespace, pod) group_left(node)
 | 
			
		||||
          node_namespace_pod:kube_pod_info:
 | 
			
		||||
        )
 | 
			
		||||
@ -813,7 +846,7 @@ spec:
 | 
			
		||||
    - alert: KubeClientCertificateExpiration
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: A client certificate used to authenticate to the apiserver is expiring
 | 
			
		||||
          in less than 7 days.
 | 
			
		||||
          in less than 7.0 days.
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
 | 
			
		||||
      expr: |
 | 
			
		||||
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
 | 
			
		||||
@ -822,7 +855,7 @@ spec:
 | 
			
		||||
    - alert: KubeClientCertificateExpiration
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: A client certificate used to authenticate to the apiserver is expiring
 | 
			
		||||
          in less than 24 hours.
 | 
			
		||||
          in less than 24.0 hours.
 | 
			
		||||
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
 | 
			
		||||
      expr: |
 | 
			
		||||
        apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
 | 
			
		||||
@ -898,6 +931,46 @@ spec:
 | 
			
		||||
      for: 10m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: critical
 | 
			
		||||
  - name: node-time
 | 
			
		||||
    rules:
 | 
			
		||||
    - alert: ClockSkewDetected
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
 | 
			
		||||
          }}. Ensure NTP is configured correctly on this host.
 | 
			
		||||
      expr: |
 | 
			
		||||
        node_ntp_offset_seconds{job="node-exporter"} < -0.03 or node_ntp_offset_seconds{job="node-exporter"} > 0.03
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
  - name: node-network
 | 
			
		||||
    rules:
 | 
			
		||||
    - alert: NetworkReceiveErrors
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Network interface "{{ $labels.device }}" showing receive errors on
 | 
			
		||||
          node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
 | 
			
		||||
      expr: |
 | 
			
		||||
        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: NetworkTransmitErrors
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Network interface "{{ $labels.device }}" showing transmit errors
 | 
			
		||||
          on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
 | 
			
		||||
      expr: |
 | 
			
		||||
        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
    - alert: NodeNetworkInterfaceFlapping
 | 
			
		||||
      annotations:
 | 
			
		||||
        message: Network interface "{{ $labels.device }}" changing it's up status
 | 
			
		||||
          often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
 | 
			
		||||
      expr: |
 | 
			
		||||
        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
 | 
			
		||||
      for: 2m
 | 
			
		||||
      labels:
 | 
			
		||||
        severity: warning
 | 
			
		||||
  - name: prometheus.rules
 | 
			
		||||
    rules:
 | 
			
		||||
    - alert: PrometheusConfigReloadFailed
 | 
			
		||||
 | 
			
		||||
@ -18,11 +18,6 @@ spec:
 | 
			
		||||
    honorLabels: true
 | 
			
		||||
    interval: 30s
 | 
			
		||||
    metricRelabelings:
 | 
			
		||||
    - action: drop
 | 
			
		||||
      regex: container_([a-z_]+);
 | 
			
		||||
      sourceLabels:
 | 
			
		||||
      - __name__
 | 
			
		||||
      - image
 | 
			
		||||
    - action: drop
 | 
			
		||||
      regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
 | 
			
		||||
      sourceLabels:
 | 
			
		||||
 | 
			
		||||
		Loading…
	
		Reference in New Issue
	
	Block a user