Updated vendor libraries

CarlosEDP 2019-02-01 11:52:53 -02:00
parent 53b31ec856
commit 5065d369e1
16 changed files with 164 additions and 46 deletions

View File

@@ -8,7 +8,7 @@
                     "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
                 }
             },
-            "version": "9536d7787789b74b692cd8a5482a2801b1aba232"
+            "version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e"
         },
         {
             "name": "ksonnet",
@@ -28,7 +28,7 @@
                     "subdir": ""
                 }
             },
-            "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144"
+            "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217"
         },
         {
             "name": "grafonnet",
@@ -38,7 +38,7 @@
                     "subdir": "grafonnet"
                 }
             },
-            "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e"
+            "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5"
         },
         {
             "name": "grafana-builder",
@@ -48,7 +48,7 @@
                     "subdir": "grafana-builder"
                 }
             },
-            "version": "c6932cf90bce4fef218b4308effc9f15c4219a01"
+            "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94"
         },
         {
             "name": "grafana",
@@ -58,7 +58,7 @@
                     "subdir": "grafana"
                 }
             },
-            "version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee"
+            "version": "9ddf5a198b0f7c898dc061158ea427112acbae11"
         },
         {
             "name": "prometheus-operator",
@@ -68,7 +68,7 @@
                     "subdir": "jsonnet/prometheus-operator"
                 }
             },
-            "version": "72ec4b9b16ef11700724dc71fec77112536eed40"
+            "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8"
         },
         {
             "name": "etcd-mixin",
@@ -78,7 +78,7 @@
                     "subdir": "Documentation/etcd-mixin"
                 }
             },
-            "version": "15b6a17be48dea91a11497980b9adab541add7f0"
+            "version": "6070db22ed3d46372a5600fe8f35907f4d706bdb"
         }
     ]
 }

View File

@@ -1378,6 +1378,12 @@ spec:
                 under. This is necessary to generate correct URLs. This is necessary
                 if Alertmanager is not served from root of a DNS name.
               type: string
+            image:
+              description: Image if specified has precedence over baseImage, tag and
+                sha combinations. Specifying the version is still necessary to ensure
+                the Prometheus Operator knows what version of Alertmanager is being
+                configured.
+              type: string
             imagePullSecrets:
               description: An optional list of references to secrets in the same namespace
                 to use for pulling prometheus and alertmanager images from registries
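For reference, a minimal sketch (not part of this diff) of how the new field is used in an Alertmanager resource; the image value is a hypothetical placeholder, and version is still set so the operator knows which Alertmanager release it is configuring:

apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  namespace: monitoring
spec:
  image: example/alertmanager:v0.16.1  # hypothetical reference; overrides baseImage, tag and sha when set
  version: v0.16.1                     # still required so the operator knows the Alertmanager version
  replicas: 3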

View File

@@ -1550,6 +1550,12 @@ spec:
                 under. This is necessary to generate correct URLs. This is necessary
                 if Prometheus is not served from root of a DNS name.
               type: string
+            image:
+              description: Image if specified has precedence over baseImage, tag and
+                sha combinations. Specifying the version is still necessary to ensure
+                the Prometheus Operator knows what version of Prometheus is being
+                configured.
+              type: string
             imagePullSecrets:
               description: An optional list of references to secrets in the same namespace
                 to use for pulling prometheus and alertmanager images from registries
@@ -1863,6 +1869,21 @@ spec:
             priorityClassName:
               description: Priority class assigned to the Pods
               type: string
+            query:
+              description: QuerySpec defines the query command line flags when starting
+                Prometheus.
+              properties:
+                lookbackDelta:
+                  description: The delta difference allowed for retrieving metrics
+                    during expression evaluations.
+                  type: string
+                maxConcurrency:
+                  description: Number of concurrent queries that can be run at once.
+                  format: int32
+                  type: integer
+                timeout:
+                  description: Maximum time a query may take before being aborted.
+                  type: string
             remoteRead:
               description: If specified, the remote_read spec. This is an experimental
                 feature, it may change in any upcoming release in a breaking way.
@@ -2943,6 +2964,12 @@ spec:
                       type: boolean
                     required:
                     - key
+                image:
+                  description: Image if specified has precedence over baseImage, tag
+                    and sha combinations. Specifying the version is still necessary
+                    to ensure the Prometheus Operator knows what version of Thanos
+                    is being configured.
+                  type: string
                 peers:
                   description: Peers is a DNS name for Thanos to discover peers through.
                   type: string
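Illustrative only (not taken from this repository): a Prometheus resource exercising the new image and query fields described by the schema above; all values are assumptions:

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  namespace: monitoring
spec:
  image: example/prometheus:v2.7.1  # hypothetical; takes precedence over baseImage, tag and sha
  version: v2.7.1                   # still needed so the operator knows the Prometheus version
  query:
    lookbackDelta: 5m               # delta allowed when retrieving metrics during evaluation
    maxConcurrency: 20              # concurrent queries allowed at once
    timeout: 2m                     # abort queries running longer than this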

View File

@@ -4896,6 +4896,12 @@ items:
   data:
     nodes.json: |-
       {
+        "__inputs": [
+        ],
+        "__requires": [
+        ],
         "annotations": {
           "list": [
@@ -6208,6 +6214,12 @@ items:
   data:
     persistentvolumesusage.json: |-
       {
+        "__inputs": [
+        ],
+        "__requires": [
+        ],
         "annotations": {
           "list": [
@@ -6551,6 +6563,12 @@ items:
   data:
     pods.json: |-
       {
+        "__inputs": [
+        ],
+        "__requires": [
+        ],
         "annotations": {
           "list": [
@@ -6730,7 +6748,7 @@ items:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
+              "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{ container_name }}",
@@ -6833,7 +6851,7 @@ items:
           "steppedLine": false,
           "targets": [
             {
-              "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))",
+              "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))",
               "format": "time_series",
               "intervalFactor": 2,
               "legendFormat": "{{ pod_name }}",
@@ -7035,6 +7053,12 @@ items:
   data:
     statefulset.json: |-
       {
+        "__inputs": [
+        ],
+        "__requires": [
+        ],
         "annotations": {
           "list": [

View File

@@ -72,6 +72,8 @@ spec:
         - mountPath: /etc/grafana
           name: grafana-config
           readOnly: false
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       securityContext:
         runAsNonRoot: true
         runAsUser: 65534

View File

@@ -1,6 +1,8 @@
 apiVersion: v1
 kind: Service
 metadata:
+  labels:
+    app: grafana
   name: grafana
   namespace: monitoring
 spec:

View File

@@ -0,0 +1,12 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  endpoints:
+  - interval: 15s
+    port: http
+  selector:
+    matchLabels:
+      app: grafana

View File

@@ -67,3 +67,10 @@ rules:
   - subjectaccessreviews
   verbs:
   - create
+- apiGroups:
+  - policy
+  resources:
+  - poddisruptionbudgets
+  verbs:
+  - list
+  - watch

View File

@@ -19,6 +19,7 @@ spec:
         - --web.listen-address=127.0.0.1:9100
         - --path.procfs=/host/proc
         - --path.sysfs=/host/sys
+        - --path.rootfs=/host/root
         - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
         - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
         image: carlosedp/node_exporter:v0.17.0
@@ -42,7 +43,9 @@ spec:
           name: root
           readOnly: true
       - args:
+        - --logtostderr
         - --secure-listen-address=$(IP):9100
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
         - --upstream=http://127.0.0.1:9100/
         env:
         - name: IP

View File

@@ -25,7 +25,7 @@ spec:
         - --metrics-relist-interval=1m
         - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
         - --secure-port=6443
-        image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1
+        image: carlosedp/k8s-prometheus-adapter:v0.4.1
         name: prometheus-adapter
         ports:
         - containerPort: 6443
@@ -39,6 +39,8 @@ spec:
         - mountPath: /etc/adapter
           name: config
           readOnly: false
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       serviceAccountName: prometheus-adapter
       volumes:
       - emptyDir: {}

View File

@@ -39,4 +39,17 @@ items:
   - kind: ServiceAccount
     name: prometheus-k8s
     namespace: monitoring
+- apiVersion: rbac.authorization.k8s.io/v1
+  kind: RoleBinding
+  metadata:
+    name: prometheus-k8s
+    namespace: logging
+  roleRef:
+    apiGroup: rbac.authorization.k8s.io
+    kind: Role
+    name: prometheus-k8s
+  subjects:
+  - kind: ServiceAccount
+    name: prometheus-k8s
+    namespace: monitoring
 kind: RoleBindingList

View File

@@ -9,7 +9,6 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
     - services
     - endpoints
     - pods
@@ -26,7 +25,6 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
     - services
     - endpoints
     - pods
@@ -43,7 +41,22 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
+    - services
+    - endpoints
+    - pods
+    verbs:
+    - get
+    - list
+    - watch
+- apiVersion: rbac.authorization.k8s.io/v1
+  kind: Role
+  metadata:
+    name: prometheus-k8s
+    namespace: logging
+  rules:
+  - apiGroups:
+    - ""
+    resources:
     - services
     - endpoints
     - pods

View File

@@ -288,21 +288,24 @@ spec:
       record: 'node:node_inodes_free:'
   - name: kube-prometheus-node-recording.rules
     rules:
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
+        (instance)
       record: instance:node_cpu:rate:sum
-    - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
         BY (instance)
       record: instance:node_filesystem_usage:sum
-    - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
       record: instance:node_network_receive_bytes:rate:sum
-    - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
       record: instance:node_network_transmit_bytes:rate:sum
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
-        / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
+        (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+        BY (instance, cpu)) BY (instance)
       record: instance:node_cpu:ratio
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
       record: cluster:node_cpu:sum_rate5m
-    - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
+    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+        BY (instance, cpu))
       record: cluster:node_cpu:ratio
   - name: kubernetes-absent
     rules:
@@ -311,7 +314,7 @@
         message: Alertmanager has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
       expr: |
-        absent(up{job="alertmanager-main"} == 1)
+        absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -383,7 +386,7 @@
         message: Prometheus has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
       expr: |
-        absent(up{job="prometheus-k8s"} == 1)
+        absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -392,7 +395,7 @@
         message: PrometheusOperator has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
       expr: |
-        absent(up{job="prometheus-operator"} == 1)
+        absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -626,8 +629,8 @@
         }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
         }}.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
-      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m]))
-        by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m]))
+      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
+        }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
         by (container_name, pod_name, namespace)\n > 25 \n"
       for: 15m
       labels:
@@ -773,7 +776,7 @@
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 7 days.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 7 days.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -781,7 +785,8 @@
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 24 hours.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 24 hours.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -794,7 +799,7 @@
         message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
           are out of sync.
       expr: |
-        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
       for: 5m
       labels:
         severity: critical
@@ -803,7 +808,7 @@
         message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
           }}/{{ $labels.pod}}.
       expr: |
-        alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
+        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
       for: 10m
       labels:
         severity: warning
@@ -811,9 +816,9 @@
       annotations:
         message: Alertmanager has not found all other members of the cluster.
       expr: |
-        alertmanager_cluster_members{job="alertmanager-main"}
+        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
           != on (service) GROUP_LEFT()
-        count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
+        count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
       for: 5m
       labels:
         severity: critical
@@ -860,7 +865,7 @@
         description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
         summary: Reloading Prometheus' configuration failed
       expr: |
-        prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
+        prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
       for: 10m
       labels:
         severity: warning
@@ -870,7 +875,7 @@
           $labels.pod}}
         summary: Prometheus' alert notification queue is running full
       expr: |
-        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
+        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
       for: 10m
       labels:
         severity: warning
@@ -880,7 +885,7 @@
           $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
         summary: Errors while sending alert from Prometheus
       expr: |
-        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
       for: 10m
       labels:
         severity: warning
@@ -890,7 +895,7 @@
           $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
         summary: Errors while sending alerts from Prometheus
       expr: |
-        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
       for: 10m
       labels:
         severity: critical
@@ -900,7 +905,7 @@
           to any Alertmanagers
         summary: Prometheus is not connected to any Alertmanagers
       expr: |
-        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
+        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
       for: 10m
       labels:
         severity: warning
@@ -910,7 +915,7 @@
           reload failures over the last four hours.'
         summary: Prometheus has issues reloading data blocks from disk
       expr: |
-        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
+        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
       for: 12h
       labels:
         severity: warning
@@ -920,7 +925,7 @@
           compaction failures over the last four hours.'
         summary: Prometheus has issues compacting sample blocks
       expr: |
-        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
+        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
       for: 12h
       labels:
         severity: warning
@@ -930,7 +935,7 @@
           log (WAL).'
         summary: Prometheus write-ahead log is corrupted
       expr: |
-        tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
+        tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
       for: 4h
       labels:
         severity: warning
@@ -940,7 +945,7 @@
           samples.
         summary: Prometheus isn't ingesting samples
       expr: |
-        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
+        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
       for: 10m
       labels:
         severity: warning
@@ -950,7 +955,7 @@
           due to duplicate timestamps but different values'
         summary: Prometheus has many samples rejected
       expr: |
-        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
+        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
       for: 10m
       labels:
         severity: warning
@@ -961,7 +966,7 @@
         message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
           }} Namespace.
       expr: |
-        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
       for: 10m
       labels:
         severity: warning
@@ -969,7 +974,7 @@
       annotations:
         message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
       expr: |
-        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
       for: 10m
       labels:
         severity: warning

View File

@@ -13,3 +13,4 @@ spec:
   selector:
     app: prometheus
     prometheus: k8s
+  sessionAffinity: ClientIP

View File

@@ -10,6 +10,7 @@ spec:
   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
     interval: 15s
     port: metrics
+  jobLabel: k8s-app
   namespaceSelector:
     matchNames:
     - kube-system

View File

@@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
       addonResizer: "carlosedp/addon-resizer",
       nodeExporter: "carlosedp/node_exporter",
       prometheusOperator: "carlosedp/prometheus-operator",
-      prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64",
+      prometheusAdapter: "carlosedp/k8s-prometheus-adapter",
       grafana: "carlosedp/monitoring-grafana",
       configmapReloader: "carlosedp/configmap-reload",
       prometheusConfigReloader: "carlosedp/prometheus-config-reloader",
@@ -45,7 +45,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
     prometheus+:: {
       names: 'k8s',
       replicas: 1,
-      namespaces: ["default", "kube-system","monitoring"],
+      namespaces: ["default", "kube-system","monitoring","logging"],
     },
     alertmanager+:: {