Mirror of https://github.com/carlosedp/cluster-monitoring.git (synced 2024-11-20 19:07:17 +01:00)

Updated vendor libraries

commit 5065d369e1
parent 53b31ec856

@@ -8,7 +8,7 @@
  "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
  }
  },
- "version": "9536d7787789b74b692cd8a5482a2801b1aba232"
+ "version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e"
  },
  {
  "name": "ksonnet",
@@ -28,7 +28,7 @@
  "subdir": ""
  }
  },
- "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144"
+ "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217"
  },
  {
  "name": "grafonnet",
@@ -38,7 +38,7 @@
  "subdir": "grafonnet"
  }
  },
- "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e"
+ "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5"
  },
  {
  "name": "grafana-builder",
@@ -48,7 +48,7 @@
  "subdir": "grafana-builder"
  }
  },
- "version": "c6932cf90bce4fef218b4308effc9f15c4219a01"
+ "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94"
  },
  {
  "name": "grafana",
@@ -58,7 +58,7 @@
  "subdir": "grafana"
  }
  },
- "version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee"
+ "version": "9ddf5a198b0f7c898dc061158ea427112acbae11"
  },
  {
  "name": "prometheus-operator",
@@ -68,7 +68,7 @@
  "subdir": "jsonnet/prometheus-operator"
  }
  },
- "version": "72ec4b9b16ef11700724dc71fec77112536eed40"
+ "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8"
  },
  {
  "name": "etcd-mixin",
@@ -78,7 +78,7 @@
  "subdir": "Documentation/etcd-mixin"
  }
  },
- "version": "15b6a17be48dea91a11497980b9adab541add7f0"
+ "version": "6070db22ed3d46372a5600fe8f35907f4d706bdb"
  }
  ]
  }

@@ -1378,6 +1378,12 @@ spec:
  under. This is necessary to generate correct URLs. This is necessary
  if Alertmanager is not served from root of a DNS name.
  type: string
+ image:
+ description: Image if specified has precedence over baseImage, tag and
+ sha combinations. Specifying the version is still necessary to ensure
+ the Prometheus Operator knows what version of Alertmanager is being
+ configured.
+ type: string
  imagePullSecrets:
  description: An optional list of references to secrets in the same namespace
  to use for pulling prometheus and alertmanager images from registries

@@ -1550,6 +1550,12 @@ spec:
  under. This is necessary to generate correct URLs. This is necessary
  if Prometheus is not served from root of a DNS name.
  type: string
+ image:
+ description: Image if specified has precedence over baseImage, tag and
+ sha combinations. Specifying the version is still necessary to ensure
+ the Prometheus Operator knows what version of Prometheus is being
+ configured.
+ type: string
  imagePullSecrets:
  description: An optional list of references to secrets in the same namespace
  to use for pulling prometheus and alertmanager images from registries

@@ -1863,6 +1869,21 @@ spec:
  priorityClassName:
  description: Priority class assigned to the Pods
  type: string
+ query:
+ description: QuerySpec defines the query command line flags when starting
+ Prometheus.
+ properties:
+ lookbackDelta:
+ description: The delta difference allowed for retrieving metrics
+ during expression evaluations.
+ type: string
+ maxConcurrency:
+ description: Number of concurrent queries that can be run at once.
+ format: int32
+ type: integer
+ timeout:
+ description: Maximum time a query may take before being aborted.
+ type: string
  remoteRead:
  description: If specified, the remote_read spec. This is an experimental
  feature, it may change in any upcoming release in a breaking way.

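The query block added to the Prometheus CRD above surfaces as spec.query on a Prometheus custom resource. A minimal sketch of how those flags could be set, using the field names from the schema above (the resource name and the values are illustrative, not from this commit):

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s                 # illustrative name
  namespace: monitoring
spec:
  query:
    lookbackDelta: 5m       # delta allowed when resolving instant vectors
    maxConcurrency: 20      # concurrent queries allowed at once
    timeout: 2m             # abort queries running longer than this
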
@@ -2943,6 +2964,12 @@ spec:
  type: boolean
  required:
  - key
+ image:
+ description: Image if specified has precedence over baseImage, tag
+ and sha combinations. Specifying the version is still necessary
+ to ensure the Prometheus Operator knows what version of Thanos
+ is being configured.
+ type: string
  peers:
  description: Peers is a DNS name for Thanos to discover peers through.
  type: string

@@ -4896,6 +4896,12 @@ items:
  data:
  nodes.json: |-
  {
+ "__inputs": [
+
+ ],
+ "__requires": [
+
+ ],
  "annotations": {
  "list": [

@@ -6208,6 +6214,12 @@ items:
  data:
  persistentvolumesusage.json: |-
  {
+ "__inputs": [
+
+ ],
+ "__requires": [
+
+ ],
  "annotations": {
  "list": [

@@ -6551,6 +6563,12 @@ items:
  data:
  pods.json: |-
  {
+ "__inputs": [
+
+ ],
+ "__requires": [
+
+ ],
  "annotations": {
  "list": [

@@ -6730,7 +6748,7 @@ items:
  "steppedLine": false,
  "targets": [
  {
- "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
+ "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
  "format": "time_series",
  "intervalFactor": 2,
  "legendFormat": "{{ container_name }}",

@@ -6833,7 +6851,7 @@ items:
  "steppedLine": false,
  "targets": [
  {
- "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))",
+ "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))",
  "format": "time_series",
  "intervalFactor": 2,
  "legendFormat": "{{ pod_name }}",

@@ -7035,6 +7053,12 @@ items:
  data:
  statefulset.json: |-
  {
+ "__inputs": [
+
+ ],
+ "__requires": [
+
+ ],
  "annotations": {
  "list": [

@@ -72,6 +72,8 @@ spec:
  - mountPath: /etc/grafana
  name: grafana-config
  readOnly: false
+ nodeSelector:
+ beta.kubernetes.io/os: linux
  securityContext:
  runAsNonRoot: true
  runAsUser: 65534

@@ -1,6 +1,8 @@
  apiVersion: v1
  kind: Service
  metadata:
+ labels:
+ app: grafana
  name: grafana
  namespace: monitoring
  spec:

manifests/grafana-serviceMonitor.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
+ apiVersion: monitoring.coreos.com/v1
+ kind: ServiceMonitor
+ metadata:
+ name: grafana
+ namespace: monitoring
+ spec:
+ endpoints:
+ - interval: 15s
+ port: http
+ selector:
+ matchLabels:
+ app: grafana

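For the Operator to turn this ServiceMonitor into a scrape config, the Grafana Service has to carry a matching app: grafana label and expose a port named http, which is what the Service hunk above appears to add. A minimal sketch of the Service shape this implies (the port number is illustrative, Grafana's usual default, not taken from this commit):

apiVersion: v1
kind: Service
metadata:
  labels:
    app: grafana          # must match spec.selector.matchLabels in the ServiceMonitor
  name: grafana
  namespace: monitoring
spec:
  ports:
  - name: http            # must match the endpoint port name above
    port: 3000            # illustrative value
    targetPort: http
  selector:
    app: grafana
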
@@ -67,3 +67,10 @@ rules:
  - subjectaccessreviews
  verbs:
  - create
+ - apiGroups:
+ - policy
+ resources:
+ - poddisruptionbudgets
+ verbs:
+ - list
+ - watch

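Granting list and watch on poddisruptionbudgets lets the exporter behind this ClusterRole (most likely kube-state-metrics) publish kube_poddisruptionbudget_* series. A hedged sketch of an alert that could then be written against them (group name, threshold, and wording are illustrative, not part of this commit):

groups:
- name: pdb.rules                       # illustrative group name
  rules:
  - alert: PodDisruptionBudgetAtZero
    expr: kube_poddisruptionbudget_status_pod_disruptions_allowed == 0
    for: 15m
    labels:
      severity: warning
    annotations:
      message: 'PDB {{ $labels.namespace }}/{{ $labels.poddisruptionbudget }} currently allows no voluntary disruptions.'
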
@@ -19,6 +19,7 @@ spec:
  - --web.listen-address=127.0.0.1:9100
  - --path.procfs=/host/proc
  - --path.sysfs=/host/sys
+ - --path.rootfs=/host/root
  - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
  - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
  image: carlosedp/node_exporter:v0.17.0

@@ -42,7 +43,9 @@ spec:
  name: root
  readOnly: true
  - args:
  - --logtostderr
  - --secure-listen-address=$(IP):9100
  - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
  - --upstream=http://127.0.0.1:9100/
  env:
  - name: IP

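The --path.rootfs=/host/root flag added above only helps if the host's root filesystem is actually mounted at that path inside the node-exporter container. A sketch of the hostPath volume and mount this typically implies, following the root volume name referenced in the hunk above (mountPropagation is a common choice here, not confirmed by this diff):

# container volumeMounts (sketch)
- mountPath: /host/root
  mountPropagation: HostToContainer
  name: root
  readOnly: true
# pod volumes (sketch)
- hostPath:
    path: /
  name: root
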
@@ -25,7 +25,7 @@ spec:
  - --metrics-relist-interval=1m
  - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
  - --secure-port=6443
- image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1
+ image: carlosedp/k8s-prometheus-adapter:v0.4.1
  name: prometheus-adapter
  ports:
  - containerPort: 6443

@@ -39,6 +39,8 @@ spec:
  - mountPath: /etc/adapter
  name: config
  readOnly: false
+ nodeSelector:
+ beta.kubernetes.io/os: linux
  serviceAccountName: prometheus-adapter
  volumes:
  - emptyDir: {}

@@ -39,4 +39,17 @@ items:
  - kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
+ - apiVersion: rbac.authorization.k8s.io/v1
+ kind: RoleBinding
+ metadata:
+ name: prometheus-k8s
+ namespace: logging
+ roleRef:
+ apiGroup: rbac.authorization.k8s.io
+ kind: Role
+ name: prometheus-k8s
+ subjects:
+ - kind: ServiceAccount
+ name: prometheus-k8s
+ namespace: monitoring
  kind: RoleBindingList

@@ -9,7 +9,6 @@ items:
  - apiGroups:
  - ""
  resources:
- - nodes
  - services
  - endpoints
  - pods

@@ -26,7 +25,6 @@ items:
  - apiGroups:
  - ""
  resources:
- - nodes
  - services
  - endpoints
  - pods

@@ -43,7 +41,22 @@ items:
  - apiGroups:
  - ""
  resources:
- - nodes
  - services
  - endpoints
  - pods
+ verbs:
+ - get
+ - list
+ - watch
+ - apiVersion: rbac.authorization.k8s.io/v1
+ kind: Role
+ metadata:
+ name: prometheus-k8s
+ namespace: logging
+ rules:
+ - apiGroups:
+ - ""
+ resources:
+ - services
+ - endpoints
+ - pods

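Dropping nodes from these namespaced Roles is consistent with nodes being a cluster-scoped resource: a Role (and the RoleBindings added above) can only grant access to namespaced objects, so node discovery has to come from a ClusterRole instead. A minimal sketch of such a ClusterRole (the name is illustrative; the actual kube-prometheus ClusterRole may list additional rules):

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus-k8s        # illustrative name
rules:
- apiGroups:
  - ""
  resources:
  - nodes
  - nodes/metrics
  verbs:
  - get
  - list
  - watch
- nonResourceURLs:
  - /metrics
  verbs:
  - get
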
@@ -288,21 +288,24 @@ spec:
  record: 'node:node_inodes_free:'
  - name: kube-prometheus-node-recording.rules
  rules:
- - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
+ (instance)
  record: instance:node_cpu:rate:sum
- - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+ - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
  BY (instance)
  record: instance:node_filesystem_usage:sum
- - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+ - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
  record: instance:node_network_receive_bytes:rate:sum
- - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+ - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
  record: instance:node_network_transmit_bytes:rate:sum
- - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
- / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
+ (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+ BY (instance, cpu)) BY (instance)
  record: instance:node_cpu:ratio
- - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
+ - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
  record: cluster:node_cpu:sum_rate5m
- - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
+ - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+ BY (instance, cpu))
  record: cluster:node_cpu:ratio
  - name: kubernetes-absent
  rules:

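These recording-rule updates track the metric renames introduced with node_exporter 0.16 (node_cpu became node_cpu_seconds_total, node_filesystem_size became node_filesystem_size_bytes, node_network_receive_bytes became node_network_receive_bytes_total, and so on), which the v0.17.0 image above exposes. A standalone sketch of one of the rules with the new names, assuming the same group layout as the manifest:

groups:
- name: kube-prometheus-node-recording.rules
  rules:
  - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY (instance)
    record: instance:node_cpu:rate:sum
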
@@ -311,7 +314,7 @@ spec:
  message: Alertmanager has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
  expr: |
- absent(up{job="alertmanager-main"} == 1)
+ absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
  for: 15m
  labels:
  severity: critical

@@ -383,7 +386,7 @@ spec:
  message: Prometheus has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
  expr: |
- absent(up{job="prometheus-k8s"} == 1)
+ absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
  for: 15m
  labels:
  severity: critical

@@ -392,7 +395,7 @@ spec:
  message: PrometheusOperator has disappeared from Prometheus target discovery.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
  expr: |
- absent(up{job="prometheus-operator"} == 1)
+ absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
  for: 15m
  labels:
  severity: critical

@@ -626,8 +629,8 @@ spec:
  }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
  }}.'
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
- expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m]))
- by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m]))
+ expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
+ }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
  by (container_name, pod_name, namespace)\n > 25 \n"
  for: 15m
  labels:

@@ -773,7 +776,8 @@ spec:
  severity: warning
  - alert: KubeClientCertificateExpiration
  annotations:
- message: Kubernetes API certificate is expiring in less than 7 days.
+ message: A client certificate used to authenticate to the apiserver is expiring
+ in less than 7 days.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  expr: |
  histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800

@@ -781,7 +785,8 @@ spec:
  severity: warning
  - alert: KubeClientCertificateExpiration
  annotations:
- message: Kubernetes API certificate is expiring in less than 24 hours.
+ message: A client certificate used to authenticate to the apiserver is expiring
+ in less than 24 hours.
  runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
  expr: |
  histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400

@@ -794,7 +799,7 @@ spec:
  message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
  are out of sync.
  expr: |
- count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+ count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
  for: 5m
  labels:
  severity: critical

@@ -803,7 +808,7 @@ spec:
  message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
  }}/{{ $labels.pod}}.
  expr: |
- alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
+ alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
  for: 10m
  labels:
  severity: warning

@@ -811,9 +816,9 @@ spec:
  annotations:
  message: Alertmanager has not found all other members of the cluster.
  expr: |
- alertmanager_cluster_members{job="alertmanager-main"}
+ alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
  != on (service) GROUP_LEFT()
- count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
+ count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
  for: 5m
  labels:
  severity: critical

@@ -860,7 +865,7 @@ spec:
  description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
  summary: Reloading Prometheus' configuration failed
  expr: |
- prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
+ prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
  for: 10m
  labels:
  severity: warning

@@ -870,7 +875,7 @@ spec:
  $labels.pod}}
  summary: Prometheus' alert notification queue is running full
  expr: |
- predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
+ predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
  for: 10m
  labels:
  severity: warning

@@ -880,7 +885,7 @@ spec:
  $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  summary: Errors while sending alert from Prometheus
  expr: |
- rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
  for: 10m
  labels:
  severity: warning

@@ -890,7 +895,7 @@ spec:
  $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
  summary: Errors while sending alerts from Prometheus
  expr: |
- rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
+ rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
  for: 10m
  labels:
  severity: critical

@@ -900,7 +905,7 @@ spec:
  to any Alertmanagers
  summary: Prometheus is not connected to any Alertmanagers
  expr: |
- prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
+ prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
  for: 10m
  labels:
  severity: warning

@@ -910,7 +915,7 @@ spec:
  reload failures over the last four hours.'
  summary: Prometheus has issues reloading data blocks from disk
  expr: |
- increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
+ increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
  for: 12h
  labels:
  severity: warning

@@ -920,7 +925,7 @@ spec:
  compaction failures over the last four hours.'
  summary: Prometheus has issues compacting sample blocks
  expr: |
- increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
+ increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
  for: 12h
  labels:
  severity: warning

@@ -930,7 +935,7 @@ spec:
  log (WAL).'
  summary: Prometheus write-ahead log is corrupted
  expr: |
- tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
+ tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
  for: 4h
  labels:
  severity: warning

@@ -940,7 +945,7 @@ spec:
  samples.
  summary: Prometheus isn't ingesting samples
  expr: |
- rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
+ rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
  for: 10m
  labels:
  severity: warning

@@ -950,7 +955,7 @@ spec:
  due to duplicate timestamps but different values'
  summary: Prometheus has many samples rejected
  expr: |
- increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
+ increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
  for: 10m
  labels:
  severity: warning

@@ -961,7 +966,7 @@ spec:
  message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
  }} Namespace.
  expr: |
- rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+ rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  for: 10m
  labels:
  severity: warning

@@ -969,7 +974,7 @@ spec:
  annotations:
  message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
  expr: |
- rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+ rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
  for: 10m
  labels:
  severity: warning

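The recurring change across these alert expressions is the extra namespace="monitoring" matcher next to each job selector, so the rules only consider the monitoring stack's own targets even if another namespace runs a job with the same name. The pattern on its own, shown as a hedged before/after sketch:

# before (matches the job in any namespace):
expr: absent(up{job="prometheus-k8s"} == 1)
# after (scoped to the monitoring stack's own namespace):
expr: absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
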
@@ -13,3 +13,4 @@ spec:
  selector:
  app: prometheus
  prometheus: k8s
+ sessionAffinity: ClientIP

@@ -10,6 +10,7 @@ spec:
  - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
  interval: 15s
  port: metrics
  jobLabel: k8s-app
  namespaceSelector:
  matchNames:
  - kube-system

@@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  addonResizer: "carlosedp/addon-resizer",
  nodeExporter: "carlosedp/node_exporter",
  prometheusOperator: "carlosedp/prometheus-operator",
- prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64",
+ prometheusAdapter: "carlosedp/k8s-prometheus-adapter",
  grafana: "carlosedp/monitoring-grafana",
  configmapReloader: "carlosedp/configmap-reload",
  prometheusConfigReloader: "carlosedp/prometheus-config-reloader",

@@ -45,7 +45,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
  prometheus+:: {
  names: 'k8s',
  replicas: 1,
- namespaces: ["default", "kube-system","monitoring"],
+ namespaces: ["default", "kube-system","monitoring","logging"],
  },

  alertmanager+:: {