Change references to kube-prometheus library. Update libs and regenerate manifests

This commit is contained in:
CarlosEDP 2019-04-22 15:17:53 -03:00
parent 1f50c68326
commit a2f54dddd0
9 changed files with 2485 additions and 81 deletions

View File

@ -4,8 +4,8 @@
"name": "kube-prometheus", "name": "kube-prometheus",
"source": { "source": {
"git": { "git": {
"remote": "https://github.com/coreos/prometheus-operator", "remote": "https://github.com/coreos/kube-prometheus",
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "master" "version": "master"

View File

@ -4,11 +4,11 @@
"name": "kube-prometheus", "name": "kube-prometheus",
"source": { "source": {
"git": { "git": {
"remote": "https://github.com/coreos/prometheus-operator", "remote": "https://github.com/coreos/kube-prometheus",
"subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "650359b3e627ae97a1f18cbd10d7ed9b2293c240" "version": "7bd745ef78dce3c14ef9e315506d0c1c32fdcc9e"
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@ -28,7 +28,7 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "19db38fc449df024446059f21d5a329babaa3927" "version": "7360753d27aa428758c918434503c1c35afcd3bb"
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@ -48,7 +48,7 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "e30a6040f3d7270655a980ab04d16142da4b429d" "version": "ecaeaed2e21c0db29098811e7826a9b923e706c5"
}, },
{ {
"name": "grafana", "name": "grafana",
@ -78,7 +78,7 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "7a5acb4a43aa06bd9e32ab59a46271ab88d497e4" "version": "216808eab50f74e02410e96878bbf2175d2916cb"
} }
] ]
} }

View File

@ -13,3 +13,4 @@ spec:
selector: selector:
alertmanager: main alertmanager: main
app: alertmanager app: alertmanager
sessionAffinity: ClientIP

File diff suppressed because it is too large Load Diff

View File

@ -60,6 +60,12 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod
name: grafana-dashboard-k8s-resources-pod name: grafana-dashboard-k8s-resources-pod
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
name: grafana-dashboard-k8s-resources-workload
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
name: grafana-dashboard-k8s-resources-workloads-namespace
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard
readOnly: false readOnly: false
@ -114,6 +120,12 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-k8s-resources-pod name: grafana-dashboard-k8s-resources-pod
name: grafana-dashboard-k8s-resources-pod name: grafana-dashboard-k8s-resources-pod
- configMap:
name: grafana-dashboard-k8s-resources-workload
name: grafana-dashboard-k8s-resources-workload
- configMap:
name: grafana-dashboard-k8s-resources-workloads-namespace
name: grafana-dashboard-k8s-resources-workloads-namespace
- configMap: - configMap:
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard
name: grafana-dashboard-kubernetes-cluster-dashboard name: grafana-dashboard-kubernetes-cluster-dashboard

View File

@ -22,6 +22,7 @@ spec:
- --path.rootfs=/host/root - --path.rootfs=/host/root
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
- --collector.ntp
image: carlosedp/node_exporter:v0.17.0 image: carlosedp/node_exporter:v0.17.0
name: node-exporter name: node-exporter
resources: resources:

View File

@ -3,7 +3,7 @@ data:
config.yaml: | config.yaml: |
resourceRules: resourceRules:
cpu: cpu:
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>)
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources: resources:
overrides: overrides:
@ -15,7 +15,7 @@ data:
resource: pod resource: pod
containerLabel: container_name containerLabel: container_name
memory: memory:
containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>)
nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>) nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources: resources:
overrides: overrides:

View File

@ -44,11 +44,44 @@ spec:
record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
- expr: | - expr: |
sum by (namespace, label_name) ( sum by (namespace, label_name) (
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
* on (namespace, pod) group_left(label_name) * on (namespace, pod) group_left(label_name)
label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
) )
record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
# Recording rule: attach a "workload" label to pods owned by a ReplicaSet.
# kube_pod_owner's owner_name is first copied into "replicaset"; the join on
# (replicaset, namespace) with group_left(owner_name) then takes owner_name
# from kube_replicaset_owner, and that value is copied into "workload".
# The result is aggregated by (namespace, workload, pod) and recorded with
# workload_type=deployment.
# NOTE(review): kube_replicaset_owner is not filtered by owner_kind here, so
# this presumably assumes every ReplicaSet owner is a Deployment - confirm.
- expr: |
sum(
label_replace(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: deployment
record: mixin_pod_workload
# Recording rule: attach a "workload" label to pods whose kube_pod_owner
# series has owner_kind="DaemonSet". The owner_name label is copied verbatim
# into "workload", the result is aggregated by (namespace, workload, pod),
# and the series is recorded with workload_type=daemonset.
- expr: |
sum(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: daemonset
record: mixin_pod_workload
# Recording rule: attach a "workload" label to pods whose kube_pod_owner
# series has owner_kind="StatefulSet". The owner_name label is copied verbatim
# into "workload", the result is aggregated by (namespace, workload, pod),
# and the series is recorded with workload_type=statefulset.
- expr: |
sum(
label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)"
)
) by (namespace, workload, pod)
labels:
workload_type: statefulset
record: mixin_pod_workload
- name: kube-scheduler.rules - name: kube-scheduler.rules
rules: rules:
- expr: | - expr: |
@ -235,11 +268,11 @@ spec:
) )
record: node:node_disk_utilisation:avg_irate record: node:node_disk_utilisation:avg_irate
- expr: | - expr: |
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3) avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
record: :node_disk_saturation:avg_irate record: :node_disk_saturation:avg_irate
- expr: | - expr: |
avg by (node) ( avg by (node) (
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3 irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
* on (namespace, pod) group_left(node) * on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info: node_namespace_pod:kube_pod_info:
) )
@ -813,7 +846,7 @@ spec:
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring message: A client certificate used to authenticate to the apiserver is expiring
in less than 7 days. in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@ -822,7 +855,7 @@ spec:
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
annotations: annotations:
message: A client certificate used to authenticate to the apiserver is expiring message: A client certificate used to authenticate to the apiserver is expiring
in less than 24 hours. in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@ -898,6 +931,46 @@ spec:
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
# Rule group holding the node clock alert.
# ClockSkewDetected raises a warning when node_ntp_offset_seconds for the
# node-exporter job stays below -0.03 or above 0.03 for 2 minutes.
# NOTE(review): the metric presumably only exists when node-exporter runs with
# its NTP collector enabled (e.g. --collector.ntp) - confirm.
- name: node-time
rules:
- alert: ClockSkewDetected
annotations:
message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
}}. Ensure NTP is configured correctly on this host.
expr: |
node_ntp_offset_seconds{job="node-exporter"} < -0.03 or node_ntp_offset_seconds{job="node-exporter"} > 0.03
for: 2m
labels:
severity: warning
# Rule group with warning-level alerts on node network interfaces.
# Every expression reads node-exporter metrics and excludes devices whose
# name matches "veth.+"; each condition must hold for 2 minutes before firing.
# Messages use folded block scalars (>-) so they render as a single line with
# no trailing newline.
- name: node-network
  rules:
  # Fires while the per-second rate of receive errors over 2m is above zero.
  - alert: NetworkReceiveErrors
    annotations:
      message: >-
        Network interface "{{ $labels.device }}" showing receive errors on
        node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
    expr: |
      rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
    for: 2m
    labels:
      severity: warning
  # Fires while the per-second rate of transmit errors over 2m is above zero.
  - alert: NetworkTransmitErrors
    annotations:
      message: >-
        Network interface "{{ $labels.device }}" showing transmit errors on
        node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
    expr: |
      rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
    for: 2m
    labels:
      severity: warning
  # Fires when node_network_up changed value more than twice within 2m.
  - alert: NodeNetworkInterfaceFlapping
    annotations:
      message: >-
        Network interface "{{ $labels.device }}" changing its up status often
        on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}
    expr: |
      changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
    for: 2m
    labels:
      severity: warning
- name: prometheus.rules - name: prometheus.rules
rules: rules:
- alert: PrometheusConfigReloadFailed - alert: PrometheusConfigReloadFailed

View File

@ -18,11 +18,6 @@ spec:
honorLabels: true honorLabels: true
interval: 30s interval: 30s
metricRelabelings: metricRelabelings:
- action: drop
regex: container_([a-z_]+);
sourceLabels:
- __name__
- image
- action: drop - action: drop
regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
sourceLabels: sourceLabels: