mirror of https://github.com/carlosedp/cluster-monitoring.git
synced 2024-11-20 19:07:17 +01:00

Update libs

This commit is contained in:
parent 5fd83a4ef2
commit 6213f12199
@@ -69,6 +69,7 @@ local utils = import 'utils.libsonnet';
 local pvc = k.core.v1.persistentVolumeClaim,
   prometheus+: {
     spec+: {
       replicas: $._config.prometheus.replicas,
       retention: '15d',
       externalUrl: 'http://' + $._config.urls.prom_ingress,
     }
@@ -8,7 +8,7 @@
                     "subdir": "jsonnet/kube-prometheus"
                 }
             },
-            "version": "da959c643657c7d2aac6f5ddd68582a949283c49"
+            "version": "8405360a467a34fca34735d92c763ae38bfe5917"
         },
         {
             "name": "ksonnet",
@@ -28,7 +28,7 @@
                     "subdir": ""
                 }
             },
-            "version": "193d4934f85c9ff596d1f3e4dce7bd2da62a4d5e"
+            "version": "acac6762be477a7ec16a96ae5222dd365568d87d"
         },
         {
             "name": "grafonnet",
@@ -48,7 +48,7 @@
                     "subdir": "grafana-builder"
                 }
             },
-            "version": "565bf6b51d636e0efe4add39f2ab8e2b1abb731f"
+            "version": "1e4da2e3aa19d9880cf5ae07a5220bbd10c11083"
         },
         {
             "name": "grafana",
@@ -58,7 +58,7 @@
                     "subdir": "grafana"
                 }
             },
-            "version": "7fadaf2274d5cbe4ac6fbaf8786e4b7ecf3c1713"
+            "version": "c79fdb1f6cbd14a003f73d9a6f571f1cefc6638d"
         },
         {
             "name": "prometheus-operator",
@@ -68,7 +68,7 @@
                     "subdir": "jsonnet/prometheus-operator"
                 }
             },
-            "version": "6efd4e5e12213021516c10b3ebd0699260ddd804"
+            "version": "908ee0372a9ac2c6574d589fdc56a4f3cb5f12d1"
         },
         {
             "name": "etcd-mixin",
@@ -78,7 +78,7 @@
                     "subdir": "Documentation/etcd-mixin"
                 }
             },
-            "version": "8037e6e08727d4a17649f782cb4dbc482b8fe780"
+            "version": "f197ab45785dfe81a9e412affbdca7ff733dd5d0"
         },
         {
             "name": "prometheus",
@@ -88,7 +88,27 @@
                     "subdir": "documentation/prometheus-mixin"
                 }
             },
-            "version": "f0bb8129c3e6ffc6906bdc130f5625110643f168"
+            "version": "81d284f806ef828e8f323088e00871fedd5a77c2"
         },
+        {
+            "name": "node-mixin",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/prometheus/node_exporter",
+                    "subdir": "docs/node-mixin"
+                }
+            },
+            "version": "fb54f7f2e0c8e154768e63efffd9a4c161ed6931"
+        },
+        {
+            "name": "promgrafonnet",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin",
+                    "subdir": "lib/promgrafonnet"
+                }
+            },
+            "version": "acac6762be477a7ec16a96ae5222dd365568d87d"
+        }
     ]
 }
File diff suppressed because it is too large
File diff suppressed because it is too large
@@ -206,6 +206,13 @@ spec:
                 value in the first response, unless you have received
                 this token from an error message.
               type: string
+            remainingItemCount:
+              description: |-
+                remainingItemCount is the number of subsequent items in the list which are not included in this list response. If the list request contained label or field selectors, then the number of remaining items is unknown and the field will be left unset and omitted during serialization. If the list is complete (either because it is not chunking or because this is the last chunk), then there are no more remaining items and this field will be left unset and omitted during serialization. Servers older than v1.15 do not set this field. The intended use of the remainingItemCount is *estimating* the size of a collection. Clients should not rely on the remainingItemCount to be set or to be exact.
+
+                This field is alpha and can be changed or removed without notice.
+              format: int64
+              type: integer
             resourceVersion:
               description: 'String that identifies the server''s internal
                 version of this object that can be used by clients to
@@ -47,7 +47,7 @@ spec:
                 description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
                 type: string
               optional:
-                description: Specify whether the Secret or it's key must
+                description: Specify whether the Secret or its key must
                   be defined
                 type: boolean
             required:
@@ -64,7 +64,7 @@
                 description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names'
                 type: string
               optional:
-                description: Specify whether the Secret or it's key must
+                description: Specify whether the Secret or its key must
                   be defined
                 type: boolean
             required:
@@ -141,7 +141,7 @@ spec:
                   to proxy through this endpoint.
                 type: string
               relabelings:
-                description: 'RelabelConfigs to apply to samples before ingestion.
+                description: 'RelabelConfigs to apply to samples before scraping.
                   More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#relabel_config'
                 items:
                   description: 'RelabelConfig allows dynamic rewriting of the
File diff suppressed because it is too large
@@ -72,6 +72,15 @@ spec:
+        - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
+          name: grafana-dashboard-kubernetes-cluster-dashboard
+          readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/node-cluster-rsrc-use
+          name: grafana-dashboard-node-cluster-rsrc-use
+          readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/node-rsrc-use
+          name: grafana-dashboard-node-rsrc-use
+          readOnly: false
         - mountPath: /grafana-dashboard-definitions/0/nodes
           name: grafana-dashboard-nodes
           readOnly: false
         - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
           name: grafana-dashboard-persistentvolumesusage
           readOnly: false
@@ -144,6 +153,15 @@ spec:
+      - configMap:
+          name: grafana-dashboard-kubernetes-cluster-dashboard
+        name: grafana-dashboard-kubernetes-cluster-dashboard
+      - configMap:
+          name: grafana-dashboard-node-cluster-rsrc-use
+        name: grafana-dashboard-node-cluster-rsrc-use
+      - configMap:
+          name: grafana-dashboard-node-rsrc-use
+        name: grafana-dashboard-node-rsrc-use
       - configMap:
           name: grafana-dashboard-nodes
         name: grafana-dashboard-nodes
       - configMap:
           name: grafana-dashboard-persistentvolumesusage
         name: grafana-dashboard-persistentvolumesusage
@@ -64,35 +64,6 @@ spec:
           requests:
             cpu: 100m
             memory: 150Mi
-      - command:
-        - /pod_nanny
-        - --container=kube-state-metrics
-        - --cpu=100m
-        - --extra-cpu=2m
-        - --memory=150Mi
-        - --extra-memory=30Mi
-        - --threshold=5
-        - --deployment=kube-state-metrics
-        env:
-        - name: MY_POD_NAME
-          valueFrom:
-            fieldRef:
-              apiVersion: v1
-              fieldPath: metadata.name
-        - name: MY_POD_NAMESPACE
-          valueFrom:
-            fieldRef:
-              apiVersion: v1
-              fieldPath: metadata.namespace
-        image: carlosedp/addon-resizer:v1.8.4
-        name: addon-resizer
-        resources:
-          limits:
-            cpu: 50m
-            memory: 30Mi
-          requests:
-            cpu: 10m
-            memory: 30Mi
       nodeSelector:
         kubernetes.io/os: linux
       securityContext:
@@ -11,6 +11,9 @@ spec:
     honorLabels: true
     interval: 30s
     port: https-main
+    relabelings:
+    - action: labeldrop
+      regex: (pod|service|endpoint|namespace)
     scheme: https
     scrapeTimeout: 30s
     tlsConfig:
@@ -13,7 +13,7 @@ spec:
     relabelings:
     - action: replace
       regex: (.*)
-      replacment: $1
+      replacement: $1
       sourceLabels:
       - __meta_kubernetes_pod_node_name
       targetLabel: instance
@@ -3,30 +3,30 @@ data:
   config.yaml: |
     resourceRules:
       cpu:
-        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>)
-        nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
+        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
+        nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
             node:
              resource: node
            namespace:
              resource: namespace
-            pod_name:
+            pod:
              resource: pod
-        containerLabel: container_name
+        containerLabel: container
       memory:
-        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>)
-        nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
+        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}) by (<<.GroupBy>>)
+        nodeQuery: sum(node_memory_MemTotal_bytes{job="node-exporter",<<.LabelMatchers>>} - node_memory_MemAvailable_bytes{job="node-exporter",<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
-            node:
+            instance:
              resource: node
            namespace:
              resource: namespace
-            pod_name:
+            pod:
              resource: pod
-        containerLabel: container_name
-        window: 1m
+        containerLabel: container
+        window: 5m
 kind: ConfigMap
 metadata:
   name: adapter-config
@@ -8,6 +8,67 @@ metadata:
   namespace: monitoring
 spec:
   groups:
+  - name: node-exporter.rules
+    rules:
+    - expr: |
+        count without (cpu) (
+          count without (mode) (
+            node_cpu_seconds_total{job="node-exporter"}
+          )
+        )
+      record: instance:node_num_cpu:sum
+    - expr: |
+        1 - avg without (cpu, mode) (
+          rate(node_cpu_seconds_total{job="node-exporter", mode="idle"}[1m])
+        )
+      record: instance:node_cpu_utilisation:rate1m
+    - expr: |
+        (
+          node_load1{job="node-exporter"}
+        /
+          instance:node_num_cpu:sum{job="node-exporter"}
+        )
+      record: instance:node_load1_per_cpu:ratio
+    - expr: |
+        1 - (
+          node_memory_MemAvailable_bytes{job="node-exporter"}
+        /
+          node_memory_MemTotal_bytes{job="node-exporter"}
+        )
+      record: instance:node_memory_utilisation:ratio
+    - expr: |
+        (
+          rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
+        +
+          rate(node_vmstat_pgpgout{job="node-exporter"}[1m])
+        )
+      record: instance:node_memory_swap_io_pages:rate1m
+    - expr: |
+        rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+      record: instance_device:node_disk_io_time_seconds:rate1m
+    - expr: |
+        rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
+      record: instance_device:node_disk_io_time_weighted_seconds:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_bytes_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_bytes_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_receive_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_receive_drop_excluding_lo:rate1m
+    - expr: |
+        sum without (device) (
+          rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
+        )
+      record: instance:node_network_transmit_drop_excluding_lo:rate1m
   - name: k8s.rules
     rules:
     - expr: |
@@ -21,13 +82,6 @@ spec:
     - expr: |
         sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace)
       record: namespace:container_memory_usage_bytes:sum
-    - expr: |
-        sum by (namespace, label_name) (
-          sum(container_memory_usage_bytes{job="kubelet",image!="", container!="POD"}) by (pod, namespace)
-          * on (namespace, pod)
-          group_left(label_name) kube_pod_labels{job="kube-state-metrics"}
-        )
-      record: namespace:container_memory_usage_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
           sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
@@ -177,6 +231,162 @@ spec:
     - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
         BY (instance, cpu))
       record: cluster:node_cpu:ratio
+  - name: node-exporter
+    rules:
+    - alert: NodeFilesystemSpaceFillingUp
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available space left and is filling
+          up.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
+        summary: Filesystem is predicted to run out of space within the next 24 hours.
+      expr: |
+        (
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 40
+        and
+          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemSpaceFillingUp
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available space left and is filling
+          up fast.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemspacefillingup
+        summary: Filesystem is predicted to run out of space within the next 4 hours.
+      expr: |
+        (
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20
+        and
+          predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeFilesystemAlmostOutOfSpace
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available space left.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
+        summary: Filesystem has less than 5% space left.
+      expr: |
+        (
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 5
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemAlmostOutOfSpace
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available space left.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutofspace
+        summary: Filesystem has less than 3% space left.
+      expr: |
+        (
+          node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 3
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeFilesystemFilesFillingUp
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available inodes left and is filling
+          up.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
+        summary: Filesystem is predicted to run out of inodes within the next 24 hours.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 40
+        and
+          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 24*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemFilesFillingUp
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available inodes left and is filling
+          up fast.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemfilesfillingup
+        summary: Filesystem is predicted to run out of inodes within the next 4 hours.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 20
+        and
+          predict_linear(node_filesystem_files_free{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeFilesystemAlmostOutOfFiles
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available inodes left.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
+        summary: Filesystem has less than 5% inodes left.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 5
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeFilesystemAlmostOutOfFiles
+      annotations:
+        description: Filesystem on {{ $labels.device }} at {{ $labels.instance }}
+          has only {{ printf "%.2f" $value }}% available inodes left.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodefilesystemalmostoutoffiles
+        summary: Filesystem has less than 3% inodes left.
+      expr: |
+        (
+          node_filesystem_files_free{job="node-exporter",fstype!=""} / node_filesystem_files{job="node-exporter",fstype!=""} * 100 < 3
+        and
+          node_filesystem_readonly{job="node-exporter",fstype!=""} == 0
+        )
+      for: 1h
+      labels:
+        severity: critical
+    - alert: NodeNetworkReceiveErrs
+      annotations:
+        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+          {{ printf "%.0f" $value }} receive errors in the last two minutes.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworkreceiveerrs
+        summary: Network interface is reporting many receive errors.
+      expr: |
+        increase(node_network_receive_errs_total[2m]) > 10
+      for: 1h
+      labels:
+        severity: warning
+    - alert: NodeNetworkTransmitErrs
+      annotations:
+        description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
+          {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodenetworktransmiterrs
+        summary: Network interface is reporting many transmit errors.
+      expr: |
+        increase(node_network_transmit_errs_total[2m]) > 10
+      for: 1h
+      labels:
+        severity: warning
   - name: kubernetes-absent
     rules:
     - alert: AlertmanagerDown
@@ -287,7 +497,7 @@ spec:
          state for longer than 15 minutes.
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
      expr: |
-        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
+        sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"} * on(namespace, pod) group_left(owner_kind) kube_pod_owner{owner_kind!="Job"}) > 0
      for: 15m
      labels:
        severity: critical
@@ -363,13 +573,13 @@ spec:
         severity: critical
     - alert: KubeDaemonSetRolloutStuck
       annotations:
-        message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace
-          }}/{{ $labels.daemonset }} are scheduled and ready.
+        message: Only {{ $value | humanizePercentage }} of the desired Pods of DaemonSet
+          {{ $labels.namespace }}/{{ $labels.daemonset }} are scheduled and ready.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck
       expr: |
         kube_daemonset_status_number_ready{job="kube-state-metrics"}
         /
-        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100
+        kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} < 1.00
       for: 15m
       labels:
         severity: critical
@@ -420,7 +630,33 @@ spec:
         message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
       expr: |
-        kube_job_status_failed{job="kube-state-metrics"} > 0
+        kube_job_failed{job="kube-state-metrics"} > 0
       for: 15m
       labels:
         severity: warning
+    - alert: KubeHpaReplicasMismatch
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has not matched the
+          desired number of replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpareplicasmismatch
+      expr: |
+        (kube_hpa_status_desired_replicas{job="kube-state-metrics"}
+          !=
+        kube_hpa_status_current_replicas{job="kube-state-metrics"})
+          and
+        changes(kube_hpa_status_current_replicas[15m]) == 0
+      for: 15m
+      labels:
+        severity: warning
+    - alert: KubeHpaMaxedOut
+      annotations:
+        message: HPA {{ $labels.namespace }}/{{ $labels.hpa }} has been running at
+          max replicas for longer than 15 minutes.
+        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubehpamaxedout
+      expr: |
+        kube_hpa_status_current_replicas{job="kube-state-metrics"}
+          ==
+        kube_hpa_spec_max_replicas{job="kube-state-metrics"}
+      for: 15m
+      labels:
+        severity: warning
@@ -482,25 +718,28 @@ spec:
         severity: warning
     - alert: KubeQuotaExceeded
       annotations:
-        message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value
-          }}% of its {{ $labels.resource }} quota.
+        message: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
+          }} of its {{ $labels.resource }} quota.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded
       expr: |
-        100 * kube_resourcequota{job="kube-state-metrics", type="used"}
+        kube_resourcequota{job="kube-state-metrics", type="used"}
           / ignoring(instance, job, type)
         (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
-          > 90
+          > 0.90
       for: 15m
       labels:
         severity: warning
     - alert: CPUThrottlingHigh
       annotations:
-        message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace
-          }} for container {{ $labels.container }} in pod {{ $labels.pod }}.'
+        message: '{{ $value | humanizePercentage }} throttling of CPU in namespace
+          {{ $labels.namespace }} for container {{ $labels.container }} in pod {{
+          $labels.pod }}.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
-      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container!=\"\",
-        }[5m])) by (container, pod, namespace)\n  /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
-        by (container, pod, namespace)\n  > 25 \n"
+      expr: |
+        sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (container, pod, namespace)
+          /
+        sum(increase(container_cpu_cfs_periods_total{}[5m])) by (container, pod, namespace)
+          > ( 25 / 100 )
       for: 15m
       labels:
         severity: warning
@@ -509,14 +748,14 @@ spec:
     - alert: KubePersistentVolumeUsageCritical
       annotations:
         message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
-          }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value
-          }}% free.
+          }} in Namespace {{ $labels.namespace }} is only {{ $value | humanizePercentage
+          }} free.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
       expr: |
-        100 * kubelet_volume_stats_available_bytes{job="kubelet"}
+        kubelet_volume_stats_available_bytes{job="kubelet"}
           /
         kubelet_volume_stats_capacity_bytes{job="kubelet"}
-          < 3
+          < 0.03
       for: 1m
       labels:
         severity: critical
@@ -524,14 +763,14 @@ spec:
       annotations:
         message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
           }} in Namespace {{ $labels.namespace }} is expected to fill up within four
-          days. Currently {{ printf "%0.2f" $value }}% is available.
+          days. Currently {{ $value | humanizePercentage }} is available.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
       expr: |
-        100 * (
+        (
           kubelet_volume_stats_available_bytes{job="kubelet"}
             /
           kubelet_volume_stats_capacity_bytes{job="kubelet"}
-        ) < 15
+        ) < 0.15
         and
         predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0
       for: 5m
@@ -551,7 +790,7 @@ spec:
     rules:
     - alert: KubeNodeNotReady
       annotations:
-        message: '{{ $labels.node }} has been unready for more than an hour.'
+        message: '{{ $labels.node }} has been unready for more than 15 minutes.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
       expr: |
         kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
@@ -571,33 +810,23 @@ spec:
     - alert: KubeClientErrors
       annotations:
         message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
-          }}' is experiencing {{ printf "%0.0f" $value }}% errors.'
+          }}' is experiencing {{ $value | humanizePercentage }} errors.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
       expr: |
         (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job)
           /
         sum(rate(rest_client_requests_total[5m])) by (instance, job))
-        * 100 > 1
-      for: 15m
-      labels:
-        severity: warning
-    - alert: KubeClientErrors
-      annotations:
-        message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance
-          }}' is experiencing {{ printf "%0.0f" $value }} errors / second.
-        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors
-      expr: |
-        sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1
+        > 0.01
       for: 15m
       labels:
         severity: warning
     - alert: KubeletTooManyPods
       annotations:
-        message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close
-          to the limit of 110.
+        message: Kubelet '{{ $labels.node }}' is running at {{ $value | humanizePercentage
+          }} of its Pod capacity.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
       expr: |
-        kubelet_running_pod_count{job="kubelet"} > 110 * 0.9
+        max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
       for: 15m
       labels:
         severity: warning
@@ -623,47 +852,51 @@ spec:
         severity: critical
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is returning errors for {{ $value }}% of requests.
+        message: API server is returning errors for {{ $value | humanizePercentage
+          }} of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 3
+        sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
       for: 10m
       labels:
         severity: critical
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is returning errors for {{ $value }}% of requests.
+        message: API server is returning errors for {{ $value | humanizePercentage
+          }} of requests.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m]))
           /
-        sum(rate(apiserver_request_total{job="apiserver"}[5m])) * 100 > 1
+        sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
       for: 10m
       labels:
         severity: warning
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is returning errors for {{ $value }}% of requests for
-          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
+        message: API server is returning errors for {{ $value | humanizePercentage
+          }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
+          }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
           /
-        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 10
+        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.10
       for: 10m
       labels:
         severity: critical
     - alert: KubeAPIErrorsHigh
       annotations:
-        message: API server is returning errors for {{ $value }}% of requests for
-          {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource }}.
+        message: API server is returning errors for {{ $value | humanizePercentage
+          }} of requests for {{ $labels.verb }} {{ $labels.resource }} {{ $labels.subresource
+          }}.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
       expr: |
         sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[5m])) by (resource,subresource,verb)
           /
-        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) * 100 > 5
+        sum(rate(apiserver_request_total{job="apiserver"}[5m])) by (resource,subresource,verb) > 0.05
       for: 10m
       labels:
         severity: warning
@@ -794,7 +1027,8 @@ spec:
     - alert: PrometheusDuplicateTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with different values but duplicated timestamp.
+          {{ printf "%.4g" $value }} samples/s with different values but duplicated
+          timestamp.
         summary: Prometheus is dropping samples with duplicate timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -804,7 +1038,7 @@ spec:
     - alert: PrometheusOutOfOrderTimestamps
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is dropping
-          {{$value | humanize}} samples/s with timestamps arriving out of order.
+          {{ printf "%.4g" $value }} samples/s with timestamps arriving out of order.
         summary: Prometheus drops samples with out-of-order timestamps.
       expr: |
         rate(prometheus_target_scrapes_sample_out_of_order_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
@@ -848,6 +1082,25 @@ spec:
       for: 15m
       labels:
         severity: critical
+    - alert: PrometheusRemoteWriteDesiredShards
+      annotations:
+        description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
+          desired shards calculation wants to run {{ printf $value }} shards, which
+          is more than the max of {{ printf `prometheus_remote_storage_shards_max{instance="%s",job="prometheus-k8s",namespace="monitoring"}`
+          $labels.instance | query | first | value }}.
+        summary: Prometheus remote write desired shards calculation wants to run more
+          than configured max shards.
+      expr: |
+        # Without max_over_time, failed scrapes could create false negatives, see
+        # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+        (
+          max_over_time(prometheus_remote_storage_shards_desired{job="prometheus-k8s",namespace="monitoring"}[5m])
+        > on(job, instance) group_right
+          max_over_time(prometheus_remote_storage_shards_max{job="prometheus-k8s",namespace="monitoring"}[5m])
+        )
+      for: 15m
+      labels:
+        severity: warning
     - alert: PrometheusRuleFailures
       annotations:
         description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has failed to
@@ -902,8 +1155,10 @@ spec:
     rules:
     - alert: TargetDown
       annotations:
-        message: '{{ $value }}% of the {{ $labels.job }} targets are down.'
-      expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
+        message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }} targets are
+          down.'
+      expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
+        namespace, service)) > 10
       for: 10m
       labels:
         severity: warning
@@ -918,26 +1173,6 @@ spec:
       expr: vector(1)
       labels:
         severity: none
-  - name: kube-prometheus-node-alerting.rules
-    rules:
-    - alert: NodeDiskRunningFull
-      annotations:
-        message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
-          full within the next 24 hours.
-      expr: |
-        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0)
-      for: 30m
-      labels:
-        severity: warning
-    - alert: NodeDiskRunningFull
-      annotations:
-        message: Device {{ $labels.device }} on node {{ $labels.instance }} will be
-          full within the next 2 hours.
-      expr: |
-        (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0)
-      for: 10m
-      labels:
-        severity: critical
   - name: node-time
     rules:
     - alert: ClockSkewDetected
@@ -951,24 +1186,6 @@ spec:
         severity: warning
   - name: node-network
     rules:
-    - alert: NetworkReceiveErrors
-      annotations:
-        message: Network interface "{{ $labels.device }}" showing receive errors on
-          node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
-      expr: |
-        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
-      for: 2m
-      labels:
-        severity: warning
-    - alert: NetworkTransmitErrors
-      annotations:
-        message: Network interface "{{ $labels.device }}" showing transmit errors
-          on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
-      expr: |
-        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
-      for: 2m
-      labels:
-        severity: warning
     - alert: NodeNetworkInterfaceFlapping
       annotations:
         message: Network interface "{{ $labels.device }}" changing it's up status