mirror of
https://github.com/carlosedp/cluster-monitoring.git
synced 2024-11-20 19:07:17 +01:00
Updated libraries
This commit is contained in:
parent
362478a928
commit
7c31f8e998
File diff suppressed because it is too large
Load Diff
@ -4,7 +4,7 @@ data:
|
|||||||
resourceRules:
|
resourceRules:
|
||||||
cpu:
|
cpu:
|
||||||
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
|
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
|
||||||
nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>)
|
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||||
resources:
|
resources:
|
||||||
overrides:
|
overrides:
|
||||||
node:
|
node:
|
||||||
@ -16,7 +16,7 @@ data:
|
|||||||
containerLabel: container_name
|
containerLabel: container_name
|
||||||
memory:
|
memory:
|
||||||
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||||
nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>)
|
nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||||
resources:
|
resources:
|
||||||
overrides:
|
overrides:
|
||||||
node:
|
node:
|
||||||
|
@ -225,21 +225,21 @@ spec:
|
|||||||
)
|
)
|
||||||
record: node:node_memory_swap_io_bytes:sum_rate
|
record: node:node_memory_swap_io_bytes:sum_rate
|
||||||
- expr: |
|
- expr: |
|
||||||
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
|
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
||||||
record: :node_disk_utilisation:avg_irate
|
record: :node_disk_utilisation:avg_irate
|
||||||
- expr: |
|
- expr: |
|
||||||
avg by (node) (
|
avg by (node) (
|
||||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
|
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||||
* on (namespace, pod) group_left(node)
|
* on (namespace, pod) group_left(node)
|
||||||
node_namespace_pod:kube_pod_info:
|
node_namespace_pod:kube_pod_info:
|
||||||
)
|
)
|
||||||
record: node:node_disk_utilisation:avg_irate
|
record: node:node_disk_utilisation:avg_irate
|
||||||
- expr: |
|
- expr: |
|
||||||
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
|
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
|
||||||
record: :node_disk_saturation:avg_irate
|
record: :node_disk_saturation:avg_irate
|
||||||
- expr: |
|
- expr: |
|
||||||
avg by (node) (
|
avg by (node) (
|
||||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
|
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
|
||||||
* on (namespace, pod) group_left(node)
|
* on (namespace, pod) group_left(node)
|
||||||
node_namespace_pod:kube_pod_info:
|
node_namespace_pod:kube_pod_info:
|
||||||
)
|
)
|
||||||
@ -795,9 +795,9 @@ spec:
|
|||||||
message: API server is returning errors for {{ $value }}% of requests.
|
message: API server is returning errors for {{ $value }}% of requests.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
|
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
|
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -806,9 +806,33 @@ spec:
|
|||||||
message: API server is returning errors for {{ $value }}% of requests.
|
message: API server is returning errors for {{ $value }}% of requests.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
expr: |
|
expr: |
|
||||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
|
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||||
/
|
/
|
||||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
|
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeAPIErrorsHigh
|
||||||
|
annotations:
|
||||||
|
message: API server is returning errors for {{ $value }}% of requests for
|
||||||
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||||
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
|
expr: |
|
||||||
|
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
- alert: KubeAPIErrorsHigh
|
||||||
|
annotations:
|
||||||
|
message: API server is returning errors for {{ $value }}% of requests for
|
||||||
|
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||||
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||||
|
expr: |
|
||||||
|
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||||
|
/
|
||||||
|
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -977,7 +1001,7 @@ spec:
|
|||||||
log (WAL).'
|
log (WAL).'
|
||||||
summary: Prometheus write-ahead log is corrupted
|
summary: Prometheus write-ahead log is corrupted
|
||||||
expr: |
|
expr: |
|
||||||
tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
|
prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
|
||||||
for: 4h
|
for: 4h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
Loading…
Reference in New Issue
Block a user