mirror of
https://github.com/carlosedp/cluster-monitoring.git
synced 2024-11-20 19:07:17 +01:00
Updated libraries
This commit is contained in:
parent
362478a928
commit
7c31f8e998
File diff suppressed because it is too large
Load Diff
@@ -4,7 +4,7 @@ data:
|
||||
resourceRules:
|
||||
cpu:
|
||||
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||
resources:
|
||||
overrides:
|
||||
node:
|
||||
@@ -16,7 +16,7 @@ data:
|
||||
containerLabel: container_name
|
||||
memory:
|
||||
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>)
|
||||
nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
|
||||
resources:
|
||||
overrides:
|
||||
node:
|
||||
|
@@ -225,21 +225,21 @@ spec:
|
||||
)
|
||||
record: node:node_memory_swap_io_bytes:sum_rate
|
||||
- expr: |
|
||||
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]))
|
||||
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
||||
record: :node_disk_utilisation:avg_irate
|
||||
- expr: |
|
||||
avg by (node) (
|
||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])
|
||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
record: node:node_disk_utilisation:avg_irate
|
||||
- expr: |
|
||||
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3)
|
||||
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
|
||||
record: :node_disk_saturation:avg_irate
|
||||
- expr: |
|
||||
avg by (node) (
|
||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3
|
||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
|
||||
* on (namespace, pod) group_left(node)
|
||||
node_namespace_pod:kube_pod_info:
|
||||
)
|
||||
@@ -795,9 +795,9 @@ spec:
|
||||
message: API server is returning errors for {{ $value }}% of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 3
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
@@ -806,9 +806,33 @@ spec:
|
||||
message: API server is returning errors for {{ $value }}% of requests.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod)
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m]))
|
||||
/
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) * 100 > 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value }}% of requests for
|
||||
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||
/
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubeAPIErrorsHigh
|
||||
annotations:
|
||||
message: API server is returning errors for {{ $value }}% of requests for
|
||||
{{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
|
||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
|
||||
expr: |
|
||||
sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
|
||||
/
|
||||
sum(rate(apiserver_request_count{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
@@ -977,7 +1001,7 @@ spec:
|
||||
log (WAL).'
|
||||
summary: Prometheus write-ahead log is corrupted
|
||||
expr: |
|
||||
tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
|
||||
prometheus_tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
|
||||
for: 4h
|
||||
labels:
|
||||
severity: warning
|
||||
|
Loading…
Reference in New Issue
Block a user