Mirror of https://github.com/carlosedp/cluster-monitoring.git, synced 2024-11-20 19:07:17 +01:00
Change references to kube-prometheus library. Update libs and regenerate manifests
parent 1f50c68326
commit a2f54dddd0
@@ -4,8 +4,8 @@
         "name": "kube-prometheus",
         "source": {
             "git": {
-                "remote": "https://github.com/coreos/prometheus-operator",
-                "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
+                "remote": "https://github.com/coreos/kube-prometheus",
+                "subdir": "jsonnet/kube-prometheus"
             }
         },
         "version": "master"
@@ -4,11 +4,11 @@
         "name": "kube-prometheus",
         "source": {
             "git": {
-                "remote": "https://github.com/coreos/prometheus-operator",
-                "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
+                "remote": "https://github.com/coreos/kube-prometheus",
+                "subdir": "jsonnet/kube-prometheus"
             }
         },
-        "version": "650359b3e627ae97a1f18cbd10d7ed9b2293c240"
+        "version": "7bd745ef78dce3c14ef9e315506d0c1c32fdcc9e"
     },
     {
         "name": "ksonnet",
@@ -28,7 +28,7 @@
                 "subdir": ""
             }
         },
-        "version": "19db38fc449df024446059f21d5a329babaa3927"
+        "version": "7360753d27aa428758c918434503c1c35afcd3bb"
     },
     {
         "name": "grafonnet",
@@ -48,7 +48,7 @@
                 "subdir": "grafana-builder"
             }
         },
-        "version": "e30a6040f3d7270655a980ab04d16142da4b429d"
+        "version": "ecaeaed2e21c0db29098811e7826a9b923e706c5"
     },
     {
         "name": "grafana",
@@ -78,7 +78,7 @@
                 "subdir": "Documentation/etcd-mixin"
             }
         },
-        "version": "7a5acb4a43aa06bd9e32ab59a46271ab88d497e4"
+        "version": "216808eab50f74e02410e96878bbf2175d2916cb"
     }
     ]
 }
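Note on the two hunks above: the kube-prometheus jsonnet bundle moved out of the prometheus-operator repository's contrib/ tree into its own coreos/kube-prometheus repository, which is what the remote/subdir changes track. The first hunk (floating "version": "master") is evidently the dependency file, the remaining hunks the generated lock file, whose pins for the other bundles (ksonnet, grafana-builder, etcd-mixin) were refreshed at the same time. A minimal sketch of the resulting dependency entry, matching the diff:

    {
        "name": "kube-prometheus",
        "source": {
            "git": {
                "remote": "https://github.com/coreos/kube-prometheus",
                "subdir": "jsonnet/kube-prometheus"
            }
        },
        "version": "master"
    }

Lock files like this are normally regenerated with jsonnet-bundler (jb update) rather than edited by hand, which is where the new commit hashes come from.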
@@ -13,3 +13,4 @@ spec:
   selector:
     alertmanager: main
+    app: alertmanager
   sessionAffinity: ClientIP
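This tracks upstream: the Alertmanager pods gained an app: alertmanager label, so the Service selector must include it to keep matching them. A sketch of the resulting Service spec, where only the selector and sessionAffinity lines come from the hunk above and the port block is illustrative:

    spec:
      ports:
      - name: web        # illustrative; not part of this hunk
        port: 9093
        targetPort: web
      selector:
        alertmanager: main
        app: alertmanager
      sessionAffinity: ClientIP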
(File diff suppressed because it is too large.)
@@ -60,6 +60,12 @@ spec:
         - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod
           name: grafana-dashboard-k8s-resources-pod
           readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
+          name: grafana-dashboard-k8s-resources-workload
+          readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
+          name: grafana-dashboard-k8s-resources-workloads-namespace
+          readOnly: false
         - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
           name: grafana-dashboard-kubernetes-cluster-dashboard
           readOnly: false
@@ -114,6 +120,12 @@ spec:
       - configMap:
           name: grafana-dashboard-k8s-resources-pod
         name: grafana-dashboard-k8s-resources-pod
+      - configMap:
+          name: grafana-dashboard-k8s-resources-workload
+        name: grafana-dashboard-k8s-resources-workload
+      - configMap:
+          name: grafana-dashboard-k8s-resources-workloads-namespace
+        name: grafana-dashboard-k8s-resources-workloads-namespace
       - configMap:
           name: grafana-dashboard-kubernetes-cluster-dashboard
         name: grafana-dashboard-kubernetes-cluster-dashboard
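Each dashboard wired into Grafana this way needs two pieces that must stay in sync: a volumeMount in the container (first hunk) and a configMap-backed volume (second hunk) whose volume name matches the mount. The pattern for a single dashboard, using a hypothetical my-dashboard as the name:

    containers:
    - name: grafana
      volumeMounts:
      - mountPath: /grafana-dashboard-definitions/0/my-dashboard  # hypothetical dashboard
        name: grafana-dashboard-my-dashboard
        readOnly: false
    volumes:
    - configMap:
        name: grafana-dashboard-my-dashboard   # ConfigMap holding the dashboard JSON
      name: grafana-dashboard-my-dashboard     # volume name referenced by the mount above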
@@ -22,6 +22,7 @@ spec:
         - --path.rootfs=/host/root
         - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
         - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
+        - --collector.ntp
         image: carlosedp/node_exporter:v0.17.0
         name: node-exporter
         resources:
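The new --collector.ntp flag enables node_exporter's NTP collector, which exposes the node_ntp_offset_seconds metric; the ClockSkewDetected alert added further down in this commit reads that metric, so the flag and the alert have to land together. A quick sanity check from the Prometheus console, as a sketch:

    # per-node clock offset; the new alert fires when this leaves the ±30 ms band for 2m
    abs(node_ntp_offset_seconds{job="node-exporter"}) > 0.03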
@@ -3,7 +3,7 @@ data:
   config.yaml: |
     resourceRules:
       cpu:
-        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
+        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>)
         nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
@@ -15,7 +15,7 @@ data:
           resource: pod
         containerLabel: container_name
       memory:
-        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
+        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>)
         nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
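The added matchers exclude cAdvisor series that do not belong to a real application container: container_name="POD" is the sandbox (pause) container, and an empty container_name or pod_name marks cgroup-level aggregate series. Without them the adapter double-counts, since the aggregates duplicate the per-container values. After the adapter substitutes <<.LabelMatchers>> and <<.GroupBy>> for an incoming request, the CPU query expands to roughly the following (the namespace value is a hypothetical example):

    sum(rate(container_cpu_usage_seconds_total{namespace="monitoring",container_name!="POD",container_name!="",pod_name!=""}[1m])) by (pod_name)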
@@ -44,11 +44,44 @@ spec:
       record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
+          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
         * on (namespace, pod) group_left(label_name)
           label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
       record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
+    - expr: |
+        sum(
+          label_replace(
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+              "replicaset", "$1", "owner_name", "(.*)"
+            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: deployment
+      record: mixin_pod_workload
+    - expr: |
+        sum(
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: daemonset
+      record: mixin_pod_workload
+    - expr: |
+        sum(
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: statefulset
+      record: mixin_pod_workload
   - name: kube-scheduler.rules
     rules:
     - expr: |
@@ -235,11 +268,11 @@
         )
       record: node:node_disk_utilisation:avg_irate
     - expr: |
-        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
       record: :node_disk_saturation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
+          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
           * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
@@ -813,7 +846,7 @@
     - alert: KubeClientCertificateExpiration
       annotations:
         message: A client certificate used to authenticate to the apiserver is expiring
-          in less than 7 days.
+          in less than 7.0 days.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -822,7 +855,7 @@
     - alert: KubeClientCertificateExpiration
       annotations:
         message: A client certificate used to authenticate to the apiserver is expiring
-          in less than 24 hours.
+          in less than 24.0 hours.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -898,6 +931,46 @@
       for: 10m
       labels:
         severity: critical
+  - name: node-time
+    rules:
+    - alert: ClockSkewDetected
+      annotations:
+        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
+          }}. Ensure NTP is configured correctly on this host.
+      expr: |
+        node_ntp_offset_seconds{job="node-exporter"} < -0.03 or node_ntp_offset_seconds{job="node-exporter"} > 0.03
+      for: 2m
+      labels:
+        severity: warning
+  - name: node-network
+    rules:
+    - alert: NetworkReceiveErrors
+      annotations:
+        message: Network interface "{{ $labels.device }}" showing receive errors on
+          node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+      for: 2m
+      labels:
+        severity: warning
+    - alert: NetworkTransmitErrors
+      annotations:
+        message: Network interface "{{ $labels.device }}" showing transmit errors
+          on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+      for: 2m
+      labels:
+        severity: warning
+    - alert: NodeNetworkInterfaceFlapping
+      annotations:
+        message: Network interface "{{ $labels.device }}" changing it's up status
+          often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
+      for: 2m
+      labels:
+        severity: warning
   - name: prometheus.rules
     rules:
     - alert: PrometheusConfigReloadFailed
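Two things happen in this rules file. First, the new mixin_pod_workload recording rules map every pod to its owning workload: DaemonSets and StatefulSets own pods directly via kube_pod_owner, while for Deployments the rule joins through kube_replicaset_owner to walk pod → ReplicaSet → Deployment. The k8s-resources-workload dashboards added above are built on this record. Second, the dropped / 1e3 in the disk-saturation rules corrects a scale bug: node_disk_io_time_weighted_seconds_total is already measured in seconds (node_exporter 0.16 renamed the old millisecond-based metric), so the division understated saturation by a factor of 1000. A sketch of querying the new record, with hypothetical names:

    # pods currently belonging to Deployment "myapp" in namespace "default"
    mixin_pod_workload{namespace="default", workload="myapp", workload_type="deployment"}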
@@ -18,11 +18,6 @@ spec:
     honorLabels: true
     interval: 30s
     metricRelabelings:
-    - action: drop
-      regex: container_([a-z_]+);
-      sourceLabels:
-      - __name__
-      - image
     - action: drop
       regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
       sourceLabels:
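Context for the removed rule: relabeling regexes are matched fully anchored against the sourceLabels values joined by ";", so container_([a-z_]+); matched exactly the container_* series whose image label is empty, i.e. the cgroup-aggregate series cAdvisor exports alongside the per-container ones. Removing the rule keeps those series; the prometheus-adapter queries updated earlier in this commit now filter them out explicitly instead. Annotated shape of the dropped rule:

    metricRelabelings:
    - action: drop                  # drop when the joined sourceLabels match regex
      regex: container_([a-z_]+);   # matches "<__name__>;<image>" only when image is empty
      sourceLabels:
      - __name__
      - image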