mirror of
https://github.com/carlosedp/cluster-monitoring.git
synced 2024-11-20 19:07:17 +01:00
Updated vendor libraries
This commit is contained in:
parent
53b31ec856
commit
5065d369e1
@ -8,7 +8,7 @@
|
|||||||
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
|
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "9536d7787789b74b692cd8a5482a2801b1aba232"
|
"version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "ksonnet",
|
"name": "ksonnet",
|
||||||
@ -28,7 +28,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144"
|
"version": "668950e4af13f0153fa1d7b58ebe7023b33f2217"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafonnet",
|
"name": "grafonnet",
|
||||||
@ -38,7 +38,7 @@
|
|||||||
"subdir": "grafonnet"
|
"subdir": "grafonnet"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e"
|
"version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafana-builder",
|
"name": "grafana-builder",
|
||||||
@ -48,7 +48,7 @@
|
|||||||
"subdir": "grafana-builder"
|
"subdir": "grafana-builder"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "c6932cf90bce4fef218b4308effc9f15c4219a01"
|
"version": "ec3d4f943df01f517a083305666cd1c87bcc7e94"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafana",
|
"name": "grafana",
|
||||||
@ -58,7 +58,7 @@
|
|||||||
"subdir": "grafana"
|
"subdir": "grafana"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee"
|
"version": "9ddf5a198b0f7c898dc061158ea427112acbae11"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "prometheus-operator",
|
"name": "prometheus-operator",
|
||||||
@ -68,7 +68,7 @@
|
|||||||
"subdir": "jsonnet/prometheus-operator"
|
"subdir": "jsonnet/prometheus-operator"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "72ec4b9b16ef11700724dc71fec77112536eed40"
|
"version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "etcd-mixin",
|
"name": "etcd-mixin",
|
||||||
@ -78,7 +78,7 @@
|
|||||||
"subdir": "Documentation/etcd-mixin"
|
"subdir": "Documentation/etcd-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "15b6a17be48dea91a11497980b9adab541add7f0"
|
"version": "6070db22ed3d46372a5600fe8f35907f4d706bdb"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
@ -1378,6 +1378,12 @@ spec:
|
|||||||
under. This is necessary to generate correct URLs. This is necessary
|
under. This is necessary to generate correct URLs. This is necessary
|
||||||
if Alertmanager is not served from root of a DNS name.
|
if Alertmanager is not served from root of a DNS name.
|
||||||
type: string
|
type: string
|
||||||
|
image:
|
||||||
|
description: Image if specified has precedence over baseImage, tag and
|
||||||
|
sha combinations. Specifying the version is still necessary to ensure
|
||||||
|
the Prometheus Operator knows what version of Alertmanager is being
|
||||||
|
configured.
|
||||||
|
type: string
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
description: An optional list of references to secrets in the same namespace
|
description: An optional list of references to secrets in the same namespace
|
||||||
to use for pulling prometheus and alertmanager images from registries
|
to use for pulling prometheus and alertmanager images from registries
|
||||||
|
@ -1550,6 +1550,12 @@ spec:
|
|||||||
under. This is necessary to generate correct URLs. This is necessary
|
under. This is necessary to generate correct URLs. This is necessary
|
||||||
if Prometheus is not served from root of a DNS name.
|
if Prometheus is not served from root of a DNS name.
|
||||||
type: string
|
type: string
|
||||||
|
image:
|
||||||
|
description: Image if specified has precedence over baseImage, tag and
|
||||||
|
sha combinations. Specifying the version is still necessary to ensure
|
||||||
|
the Prometheus Operator knows what version of Prometheus is being
|
||||||
|
configured.
|
||||||
|
type: string
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
description: An optional list of references to secrets in the same namespace
|
description: An optional list of references to secrets in the same namespace
|
||||||
to use for pulling prometheus and alertmanager images from registries
|
to use for pulling prometheus and alertmanager images from registries
|
||||||
@ -1863,6 +1869,21 @@ spec:
|
|||||||
priorityClassName:
|
priorityClassName:
|
||||||
description: Priority class assigned to the Pods
|
description: Priority class assigned to the Pods
|
||||||
type: string
|
type: string
|
||||||
|
query:
|
||||||
|
description: QuerySpec defines the query command line flags when starting
|
||||||
|
Prometheus.
|
||||||
|
properties:
|
||||||
|
lookbackDelta:
|
||||||
|
description: The delta difference allowed for retrieving metrics
|
||||||
|
during expression evaluations.
|
||||||
|
type: string
|
||||||
|
maxConcurrency:
|
||||||
|
description: Number of concurrent queries that can be run at once.
|
||||||
|
format: int32
|
||||||
|
type: integer
|
||||||
|
timeout:
|
||||||
|
description: Maximum time a query may take before being aborted.
|
||||||
|
type: string
|
||||||
remoteRead:
|
remoteRead:
|
||||||
description: If specified, the remote_read spec. This is an experimental
|
description: If specified, the remote_read spec. This is an experimental
|
||||||
feature, it may change in any upcoming release in a breaking way.
|
feature, it may change in any upcoming release in a breaking way.
|
||||||
@ -2943,6 +2964,12 @@ spec:
|
|||||||
type: boolean
|
type: boolean
|
||||||
required:
|
required:
|
||||||
- key
|
- key
|
||||||
|
image:
|
||||||
|
description: Image if specified has precedence over baseImage, tag
|
||||||
|
and sha combinations. Specifying the version is still necessary
|
||||||
|
to ensure the Prometheus Operator knows what version of Thanos
|
||||||
|
is being configured.
|
||||||
|
type: string
|
||||||
peers:
|
peers:
|
||||||
description: Peers is a DNS name for Thanos to discover peers through.
|
description: Peers is a DNS name for Thanos to discover peers through.
|
||||||
type: string
|
type: string
|
||||||
|
@ -4896,6 +4896,12 @@ items:
|
|||||||
data:
|
data:
|
||||||
nodes.json: |-
|
nodes.json: |-
|
||||||
{
|
{
|
||||||
|
"__inputs": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
|
||||||
|
],
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": [
|
"list": [
|
||||||
|
|
||||||
@ -6208,6 +6214,12 @@ items:
|
|||||||
data:
|
data:
|
||||||
persistentvolumesusage.json: |-
|
persistentvolumesusage.json: |-
|
||||||
{
|
{
|
||||||
|
"__inputs": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
|
||||||
|
],
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": [
|
"list": [
|
||||||
|
|
||||||
@ -6551,6 +6563,12 @@ items:
|
|||||||
data:
|
data:
|
||||||
pods.json: |-
|
pods.json: |-
|
||||||
{
|
{
|
||||||
|
"__inputs": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
|
||||||
|
],
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": [
|
"list": [
|
||||||
|
|
||||||
@ -6730,7 +6748,7 @@ items:
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
|
"expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{ container_name }}",
|
"legendFormat": "{{ container_name }}",
|
||||||
@ -6833,7 +6851,7 @@ items:
|
|||||||
"steppedLine": false,
|
"steppedLine": false,
|
||||||
"targets": [
|
"targets": [
|
||||||
{
|
{
|
||||||
"expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))",
|
"expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))",
|
||||||
"format": "time_series",
|
"format": "time_series",
|
||||||
"intervalFactor": 2,
|
"intervalFactor": 2,
|
||||||
"legendFormat": "{{ pod_name }}",
|
"legendFormat": "{{ pod_name }}",
|
||||||
@ -7035,6 +7053,12 @@ items:
|
|||||||
data:
|
data:
|
||||||
statefulset.json: |-
|
statefulset.json: |-
|
||||||
{
|
{
|
||||||
|
"__inputs": [
|
||||||
|
|
||||||
|
],
|
||||||
|
"__requires": [
|
||||||
|
|
||||||
|
],
|
||||||
"annotations": {
|
"annotations": {
|
||||||
"list": [
|
"list": [
|
||||||
|
|
||||||
|
@ -72,6 +72,8 @@ spec:
|
|||||||
- mountPath: /etc/grafana
|
- mountPath: /etc/grafana
|
||||||
name: grafana-config
|
name: grafana-config
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
nodeSelector:
|
||||||
|
beta.kubernetes.io/os: linux
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
runAsUser: 65534
|
runAsUser: 65534
|
||||||
|
@ -1,6 +1,8 @@
|
|||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: Service
|
kind: Service
|
||||||
metadata:
|
metadata:
|
||||||
|
labels:
|
||||||
|
app: grafana
|
||||||
name: grafana
|
name: grafana
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
spec:
|
spec:
|
||||||
|
12
manifests/grafana-serviceMonitor.yaml
Normal file
12
manifests/grafana-serviceMonitor.yaml
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
apiVersion: monitoring.coreos.com/v1
|
||||||
|
kind: ServiceMonitor
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
endpoints:
|
||||||
|
- interval: 15s
|
||||||
|
port: http
|
||||||
|
selector:
|
||||||
|
matchLabels:
|
||||||
|
app: grafana
|
@ -67,3 +67,10 @@ rules:
|
|||||||
- subjectaccessreviews
|
- subjectaccessreviews
|
||||||
verbs:
|
verbs:
|
||||||
- create
|
- create
|
||||||
|
- apiGroups:
|
||||||
|
- policy
|
||||||
|
resources:
|
||||||
|
- poddisruptionbudgets
|
||||||
|
verbs:
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
@ -19,6 +19,7 @@ spec:
|
|||||||
- --web.listen-address=127.0.0.1:9100
|
- --web.listen-address=127.0.0.1:9100
|
||||||
- --path.procfs=/host/proc
|
- --path.procfs=/host/proc
|
||||||
- --path.sysfs=/host/sys
|
- --path.sysfs=/host/sys
|
||||||
|
- --path.rootfs=/host/root
|
||||||
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
|
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
|
||||||
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
|
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
|
||||||
image: carlosedp/node_exporter:v0.17.0
|
image: carlosedp/node_exporter:v0.17.0
|
||||||
@ -42,7 +43,9 @@ spec:
|
|||||||
name: root
|
name: root
|
||||||
readOnly: true
|
readOnly: true
|
||||||
- args:
|
- args:
|
||||||
|
- --logtostderr
|
||||||
- --secure-listen-address=$(IP):9100
|
- --secure-listen-address=$(IP):9100
|
||||||
|
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
|
||||||
- --upstream=http://127.0.0.1:9100/
|
- --upstream=http://127.0.0.1:9100/
|
||||||
env:
|
env:
|
||||||
- name: IP
|
- name: IP
|
||||||
|
@ -25,7 +25,7 @@ spec:
|
|||||||
- --metrics-relist-interval=1m
|
- --metrics-relist-interval=1m
|
||||||
- --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
|
- --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
|
||||||
- --secure-port=6443
|
- --secure-port=6443
|
||||||
image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1
|
image: carlosedp/k8s-prometheus-adapter:v0.4.1
|
||||||
name: prometheus-adapter
|
name: prometheus-adapter
|
||||||
ports:
|
ports:
|
||||||
- containerPort: 6443
|
- containerPort: 6443
|
||||||
@ -39,6 +39,8 @@ spec:
|
|||||||
- mountPath: /etc/adapter
|
- mountPath: /etc/adapter
|
||||||
name: config
|
name: config
|
||||||
readOnly: false
|
readOnly: false
|
||||||
|
nodeSelector:
|
||||||
|
beta.kubernetes.io/os: linux
|
||||||
serviceAccountName: prometheus-adapter
|
serviceAccountName: prometheus-adapter
|
||||||
volumes:
|
volumes:
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
|
@ -39,4 +39,17 @@ items:
|
|||||||
- kind: ServiceAccount
|
- kind: ServiceAccount
|
||||||
name: prometheus-k8s
|
name: prometheus-k8s
|
||||||
namespace: monitoring
|
namespace: monitoring
|
||||||
|
- apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: RoleBinding
|
||||||
|
metadata:
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: logging
|
||||||
|
roleRef:
|
||||||
|
apiGroup: rbac.authorization.k8s.io
|
||||||
|
kind: Role
|
||||||
|
name: prometheus-k8s
|
||||||
|
subjects:
|
||||||
|
- kind: ServiceAccount
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: monitoring
|
||||||
kind: RoleBindingList
|
kind: RoleBindingList
|
||||||
|
@ -9,7 +9,6 @@ items:
|
|||||||
- apiGroups:
|
- apiGroups:
|
||||||
- ""
|
- ""
|
||||||
resources:
|
resources:
|
||||||
- nodes
|
|
||||||
- services
|
- services
|
||||||
- endpoints
|
- endpoints
|
||||||
- pods
|
- pods
|
||||||
@ -26,7 +25,6 @@ items:
|
|||||||
- apiGroups:
|
- apiGroups:
|
||||||
- ""
|
- ""
|
||||||
resources:
|
resources:
|
||||||
- nodes
|
|
||||||
- services
|
- services
|
||||||
- endpoints
|
- endpoints
|
||||||
- pods
|
- pods
|
||||||
@ -43,7 +41,22 @@ items:
|
|||||||
- apiGroups:
|
- apiGroups:
|
||||||
- ""
|
- ""
|
||||||
resources:
|
resources:
|
||||||
- nodes
|
- services
|
||||||
|
- endpoints
|
||||||
|
- pods
|
||||||
|
verbs:
|
||||||
|
- get
|
||||||
|
- list
|
||||||
|
- watch
|
||||||
|
- apiVersion: rbac.authorization.k8s.io/v1
|
||||||
|
kind: Role
|
||||||
|
metadata:
|
||||||
|
name: prometheus-k8s
|
||||||
|
namespace: logging
|
||||||
|
rules:
|
||||||
|
- apiGroups:
|
||||||
|
- ""
|
||||||
|
resources:
|
||||||
- services
|
- services
|
||||||
- endpoints
|
- endpoints
|
||||||
- pods
|
- pods
|
||||||
|
@ -288,21 +288,24 @@ spec:
|
|||||||
record: 'node:node_inodes_free:'
|
record: 'node:node_inodes_free:'
|
||||||
- name: kube-prometheus-node-recording.rules
|
- name: kube-prometheus-node-recording.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||||
|
(instance)
|
||||||
record: instance:node_cpu:rate:sum
|
record: instance:node_cpu:rate:sum
|
||||||
- expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
|
- expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
|
||||||
BY (instance)
|
BY (instance)
|
||||||
record: instance:node_filesystem_usage:sum
|
record: instance:node_filesystem_usage:sum
|
||||||
- expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
|
- expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
|
||||||
record: instance:node_network_receive_bytes:rate:sum
|
record: instance:node_network_receive_bytes:rate:sum
|
||||||
- expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
|
- expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
|
||||||
record: instance:node_network_transmit_bytes:rate:sum
|
record: instance:node_network_transmit_bytes:rate:sum
|
||||||
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
|
||||||
/ ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
|
(cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
|
||||||
|
BY (instance, cpu)) BY (instance)
|
||||||
record: instance:node_cpu:ratio
|
record: instance:node_cpu:ratio
|
||||||
- expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
|
||||||
record: cluster:node_cpu:sum_rate5m
|
record: cluster:node_cpu:sum_rate5m
|
||||||
- expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
|
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
|
||||||
|
BY (instance, cpu))
|
||||||
record: cluster:node_cpu:ratio
|
record: cluster:node_cpu:ratio
|
||||||
- name: kubernetes-absent
|
- name: kubernetes-absent
|
||||||
rules:
|
rules:
|
||||||
@ -311,7 +314,7 @@ spec:
|
|||||||
message: Alertmanager has disappeared from Prometheus target discovery.
|
message: Alertmanager has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="alertmanager-main"} == 1)
|
absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -383,7 +386,7 @@ spec:
|
|||||||
message: Prometheus has disappeared from Prometheus target discovery.
|
message: Prometheus has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="prometheus-k8s"} == 1)
|
absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -392,7 +395,7 @@ spec:
|
|||||||
message: PrometheusOperator has disappeared from Prometheus target discovery.
|
message: PrometheusOperator has disappeared from Prometheus target discovery.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
|
||||||
expr: |
|
expr: |
|
||||||
absent(up{job="prometheus-operator"} == 1)
|
absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -626,8 +629,8 @@ spec:
|
|||||||
}} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
|
}} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
|
||||||
}}.'
|
}}.'
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
|
||||||
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m]))
|
expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
|
||||||
by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m]))
|
}[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
|
||||||
by (container_name, pod_name, namespace)\n > 25 \n"
|
by (container_name, pod_name, namespace)\n > 25 \n"
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
@ -773,7 +776,8 @@ spec:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubernetes API certificate is expiring in less than 7 days.
|
message: A client certificate used to authenticate to the apiserver is expiring
|
||||||
|
in less than 7 days.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
|
||||||
@ -781,7 +785,8 @@ spec:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientCertificateExpiration
|
- alert: KubeClientCertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
message: Kubernetes API certificate is expiring in less than 24 hours.
|
message: A client certificate used to authenticate to the apiserver is expiring
|
||||||
|
in less than 24 hours.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
|
||||||
expr: |
|
expr: |
|
||||||
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
|
||||||
@ -794,7 +799,7 @@ spec:
|
|||||||
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
||||||
are out of sync.
|
are out of sync.
|
||||||
expr: |
|
expr: |
|
||||||
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
|
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -803,7 +808,7 @@ spec:
|
|||||||
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
|
message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
|
||||||
}}/{{ $labels.pod}}.
|
}}/{{ $labels.pod}}.
|
||||||
expr: |
|
expr: |
|
||||||
alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
|
alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -811,9 +816,9 @@ spec:
|
|||||||
annotations:
|
annotations:
|
||||||
message: Alertmanager has not found all other members of the cluster.
|
message: Alertmanager has not found all other members of the cluster.
|
||||||
expr: |
|
expr: |
|
||||||
alertmanager_cluster_members{job="alertmanager-main"}
|
alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
|
||||||
!= on (service) GROUP_LEFT()
|
!= on (service) GROUP_LEFT()
|
||||||
count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
|
count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -860,7 +865,7 @@ spec:
|
|||||||
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
|
||||||
summary: Reloading Prometheus' configuration failed
|
summary: Reloading Prometheus' configuration failed
|
||||||
expr: |
|
expr: |
|
||||||
prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
|
prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -870,7 +875,7 @@ spec:
|
|||||||
$labels.pod}}
|
$labels.pod}}
|
||||||
summary: Prometheus' alert notification queue is running full
|
summary: Prometheus' alert notification queue is running full
|
||||||
expr: |
|
expr: |
|
||||||
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
|
predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -880,7 +885,7 @@ spec:
|
|||||||
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
||||||
summary: Errors while sending alert from Prometheus
|
summary: Errors while sending alert from Prometheus
|
||||||
expr: |
|
expr: |
|
||||||
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
|
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -890,7 +895,7 @@ spec:
|
|||||||
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
$labels.pod}} to Alertmanager {{$labels.Alertmanager}}
|
||||||
summary: Errors while sending alerts from Prometheus
|
summary: Errors while sending alerts from Prometheus
|
||||||
expr: |
|
expr: |
|
||||||
rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
|
rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
@ -900,7 +905,7 @@ spec:
|
|||||||
to any Alertmanagers
|
to any Alertmanagers
|
||||||
summary: Prometheus is not connected to any Alertmanagers
|
summary: Prometheus is not connected to any Alertmanagers
|
||||||
expr: |
|
expr: |
|
||||||
prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
|
prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -910,7 +915,7 @@ spec:
|
|||||||
reload failures over the last four hours.'
|
reload failures over the last four hours.'
|
||||||
summary: Prometheus has issues reloading data blocks from disk
|
summary: Prometheus has issues reloading data blocks from disk
|
||||||
expr: |
|
expr: |
|
||||||
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
|
increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
|
||||||
for: 12h
|
for: 12h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -920,7 +925,7 @@ spec:
|
|||||||
compaction failures over the last four hours.'
|
compaction failures over the last four hours.'
|
||||||
summary: Prometheus has issues compacting sample blocks
|
summary: Prometheus has issues compacting sample blocks
|
||||||
expr: |
|
expr: |
|
||||||
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
|
increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
|
||||||
for: 12h
|
for: 12h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -930,7 +935,7 @@ spec:
|
|||||||
log (WAL).'
|
log (WAL).'
|
||||||
summary: Prometheus write-ahead log is corrupted
|
summary: Prometheus write-ahead log is corrupted
|
||||||
expr: |
|
expr: |
|
||||||
tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
|
tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
|
||||||
for: 4h
|
for: 4h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -940,7 +945,7 @@ spec:
|
|||||||
samples.
|
samples.
|
||||||
summary: Prometheus isn't ingesting samples
|
summary: Prometheus isn't ingesting samples
|
||||||
expr: |
|
expr: |
|
||||||
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
|
rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -950,7 +955,7 @@ spec:
|
|||||||
due to duplicate timestamps but different values'
|
due to duplicate timestamps but different values'
|
||||||
summary: Prometheus has many samples rejected
|
summary: Prometheus has many samples rejected
|
||||||
expr: |
|
expr: |
|
||||||
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
|
increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -961,7 +966,7 @@ spec:
|
|||||||
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
|
message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
|
||||||
}} Namespace.
|
}} Namespace.
|
||||||
expr: |
|
expr: |
|
||||||
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
|
rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
@ -969,7 +974,7 @@ spec:
|
|||||||
annotations:
|
annotations:
|
||||||
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
|
message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
|
||||||
expr: |
|
expr: |
|
||||||
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
|
rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
@ -13,3 +13,4 @@ spec:
|
|||||||
selector:
|
selector:
|
||||||
app: prometheus
|
app: prometheus
|
||||||
prometheus: k8s
|
prometheus: k8s
|
||||||
|
sessionAffinity: ClientIP
|
||||||
|
@ -10,6 +10,7 @@ spec:
|
|||||||
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
|
||||||
interval: 15s
|
interval: 15s
|
||||||
port: metrics
|
port: metrics
|
||||||
|
jobLabel: k8s-app
|
||||||
namespaceSelector:
|
namespaceSelector:
|
||||||
matchNames:
|
matchNames:
|
||||||
- kube-system
|
- kube-system
|
||||||
|
@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
|||||||
addonResizer: "carlosedp/addon-resizer",
|
addonResizer: "carlosedp/addon-resizer",
|
||||||
nodeExporter: "carlosedp/node_exporter",
|
nodeExporter: "carlosedp/node_exporter",
|
||||||
prometheusOperator: "carlosedp/prometheus-operator",
|
prometheusOperator: "carlosedp/prometheus-operator",
|
||||||
prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64",
|
prometheusAdapter: "carlosedp/k8s-prometheus-adapter",
|
||||||
grafana: "carlosedp/monitoring-grafana",
|
grafana: "carlosedp/monitoring-grafana",
|
||||||
configmapReloader: "carlosedp/configmap-reload",
|
configmapReloader: "carlosedp/configmap-reload",
|
||||||
prometheusConfigReloader: "carlosedp/prometheus-config-reloader",
|
prometheusConfigReloader: "carlosedp/prometheus-config-reloader",
|
||||||
@ -45,7 +45,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
|
|||||||
prometheus+:: {
|
prometheus+:: {
|
||||||
names: 'k8s',
|
names: 'k8s',
|
||||||
replicas: 1,
|
replicas: 1,
|
||||||
namespaces: ["default", "kube-system","monitoring"],
|
namespaces: ["default", "kube-system","monitoring","logging"],
|
||||||
},
|
},
|
||||||
|
|
||||||
alertmanager+:: {
|
alertmanager+:: {
|
||||||
|
Loading…
Reference in New Issue
Block a user