Mirror of https://github.com/carlosedp/cluster-monitoring.git (synced 2024-11-20 19:07:17 +01:00)

Commit a2f54dddd0 (parent 1f50c68326): Change references to kube-prometheus library. Update libs and regenerate manifests
jsonnetfile.json:

@@ -4,8 +4,8 @@
     "name": "kube-prometheus",
     "source": {
       "git": {
-        "remote": "https://github.com/coreos/prometheus-operator",
-        "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
+        "remote": "https://github.com/coreos/kube-prometheus",
+        "subdir": "jsonnet/kube-prometheus"
       }
     },
     "version": "master"
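Note: the kube-prometheus jsonnet library moved out of the prometheus-operator repository's contrib/ directory into its own repository, which is why both the git remote and the subdir change. After editing jsonnetfile.json, the vendored library would typically be refreshed with jsonnet-bundler (jb update), which also rewrites the lock file below. A sketch of the resulting dependency entry, reconstructed from the hunk above:

    {
      "name": "kube-prometheus",
      "source": {
        "git": {
          "remote": "https://github.com/coreos/kube-prometheus",
          "subdir": "jsonnet/kube-prometheus"
        }
      },
      "version": "master"
    }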
jsonnetfile.lock.json:

@@ -4,11 +4,11 @@
     "name": "kube-prometheus",
     "source": {
       "git": {
-        "remote": "https://github.com/coreos/prometheus-operator",
-        "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
+        "remote": "https://github.com/coreos/kube-prometheus",
+        "subdir": "jsonnet/kube-prometheus"
       }
     },
-    "version": "650359b3e627ae97a1f18cbd10d7ed9b2293c240"
+    "version": "7bd745ef78dce3c14ef9e315506d0c1c32fdcc9e"
   },
   {
     "name": "ksonnet",
@@ -28,7 +28,7 @@
         "subdir": ""
       }
     },
-    "version": "19db38fc449df024446059f21d5a329babaa3927"
+    "version": "7360753d27aa428758c918434503c1c35afcd3bb"
   },
   {
     "name": "grafonnet",
@@ -48,7 +48,7 @@
         "subdir": "grafana-builder"
       }
     },
-    "version": "e30a6040f3d7270655a980ab04d16142da4b429d"
+    "version": "ecaeaed2e21c0db29098811e7826a9b923e706c5"
   },
   {
     "name": "grafana",
@@ -78,7 +78,7 @@
         "subdir": "Documentation/etcd-mixin"
       }
     },
-    "version": "7a5acb4a43aa06bd9e32ab59a46271ab88d497e4"
+    "version": "216808eab50f74e02410e96878bbf2175d2916cb"
   }
 ]
}
Alertmanager Service manifest:

@@ -13,3 +13,4 @@ spec:
   selector:
     alertmanager: main
     app: alertmanager
+  sessionAffinity: ClientIP
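Adding sessionAffinity: ClientIP makes kube-proxy keep all connections from a given client IP on the same Alertmanager pod, so the web UI behaves consistently when the Service fronts several replicas. A minimal sketch of the resulting Service spec (the metadata names are the usual kube-prometheus defaults, shown here only for illustration):

    apiVersion: v1
    kind: Service
    metadata:
      name: alertmanager-main
      namespace: monitoring
    spec:
      selector:
        alertmanager: main
        app: alertmanager
      sessionAffinity: ClientIP  # pin each client IP to one pod; Kubernetes' default affinity timeout is 3 hours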
(File diff suppressed because it is too large; not shown here.)
Grafana Deployment manifest:

@@ -60,6 +60,12 @@ spec:
         - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod
           name: grafana-dashboard-k8s-resources-pod
           readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
+          name: grafana-dashboard-k8s-resources-workload
+          readOnly: false
+        - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workloads-namespace
+          name: grafana-dashboard-k8s-resources-workloads-namespace
+          readOnly: false
         - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
           name: grafana-dashboard-kubernetes-cluster-dashboard
           readOnly: false
@@ -114,6 +120,12 @@ spec:
       - configMap:
           name: grafana-dashboard-k8s-resources-pod
         name: grafana-dashboard-k8s-resources-pod
+      - configMap:
+          name: grafana-dashboard-k8s-resources-workload
+        name: grafana-dashboard-k8s-resources-workload
+      - configMap:
+          name: grafana-dashboard-k8s-resources-workloads-namespace
+        name: grafana-dashboard-k8s-resources-workloads-namespace
       - configMap:
           name: grafana-dashboard-kubernetes-cluster-dashboard
         name: grafana-dashboard-kubernetes-cluster-dashboard
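Each generated Grafana dashboard follows the same wiring: a ConfigMap holding the dashboard JSON is declared as a pod volume and mounted under /grafana-dashboard-definitions/<n>/<dashboard>, where Grafana's provisioning picks it up. The two hunks above add that pairing for the two new workload dashboards; a sketch of the pattern for one of them (names as in the hunks above):

    # container spec
    volumeMounts:
    - mountPath: /grafana-dashboard-definitions/0/k8s-resources-workload
      name: grafana-dashboard-k8s-resources-workload
      readOnly: false
    # pod spec
    volumes:
    - configMap:
        name: grafana-dashboard-k8s-resources-workload
      name: grafana-dashboard-k8s-resources-workload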
node-exporter DaemonSet manifest:

@@ -22,6 +22,7 @@ spec:
         - --path.rootfs=/host/root
         - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
         - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
+        - --collector.ntp
         image: carlosedp/node_exporter:v0.17.0
         name: node-exporter
         resources:
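The new --collector.ntp flag enables node_exporter's NTP collector, which exposes node_ntp_offset_seconds (the node's estimated clock offset, in seconds). The ClockSkewDetected alert added further down in this commit reads exactly that metric, so the flag and the alert go together.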
prometheus-adapter ConfigMap (resource metrics queries):

@@ -3,7 +3,7 @@ data:
   config.yaml: |
     resourceRules:
       cpu:
-        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>)
+        containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}[1m])) by (<<.GroupBy>>)
         nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[1m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
@@ -15,7 +15,7 @@ data:
             resource: pod
         containerLabel: container_name
       memory:
-        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>)
+        containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,container_name!="POD",container_name!="",pod_name!=""}) by (<<.GroupBy>>)
         nodeQuery: sum(node:node_memory_bytes_total:sum{<<.LabelMatchers>>} - node:node_memory_bytes_available:sum{<<.LabelMatchers>>}) by (<<.GroupBy>>)
         resources:
           overrides:
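The added matchers tighten what the adapter feeds the resource metrics API: container_name="POD" is cAdvisor's pause-container cgroup, which accounts for the whole pod and would double-count usage, and empty container_name/pod_name values mark aggregate series rather than real containers. As an illustration, the templated cpu containerQuery expands to something like the following for one pod (the namespace and pod values here are made up):

    sum(rate(container_cpu_usage_seconds_total{namespace="monitoring",pod_name="grafana-7b9578fb4-abcde",container_name!="POD",container_name!="",pod_name!=""}[1m])) by (container_name)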
Prometheus rules manifest:

@@ -44,11 +44,44 @@ spec:
       record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum
     - expr: |
         sum by (namespace, label_name) (
-          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod)
+          sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod)
         * on (namespace, pod) group_left(label_name)
           label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)")
         )
       record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum
+    - expr: |
+        sum(
+          label_replace(
+            label_replace(
+              kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
+              "replicaset", "$1", "owner_name", "(.*)"
+            ) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: deployment
+      record: mixin_pod_workload
+    - expr: |
+        sum(
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: daemonset
+      record: mixin_pod_workload
+    - expr: |
+        sum(
+          label_replace(
+            kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
+            "workload", "$1", "owner_name", "(.*)"
+          )
+        ) by (namespace, workload, pod)
+      labels:
+        workload_type: statefulset
+      record: mixin_pod_workload
   - name: kube-scheduler.rules
     rules:
     - expr: |
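The new mixin_pod_workload record maps every pod to the workload that owns it: for Deployments the join goes pod to ReplicaSet (kube_pod_owner) to Deployment (kube_replicaset_owner), while DaemonSets and StatefulSets own their pods directly, so a single label_replace suffices there. The mapping can be inspected with an ad-hoc query such as (not part of the manifests):

    count by (namespace, workload, workload_type) (mixin_pod_workload)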
@@ -235,11 +268,11 @@ spec:
         )
       record: node:node_disk_utilisation:avg_irate
     - expr: |
-        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3)
+        avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
       record: :node_disk_saturation:avg_irate
     - expr: |
         avg by (node) (
-          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) / 1e3
+          irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
         * on (namespace, pod) group_left(node)
           node_namespace_pod:kube_pod_info:
         )
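Dropping the / 1e3 corrects the units: node_disk_io_time_weighted_seconds_total is already expressed in seconds (node_exporter renamed the old millisecond-based metric in the 0.16 series), so the leftover scale factor was understating disk saturation by a factor of 1000. For example, a disk accumulating 0.5s of weighted I/O time per second should now record 0.5 rather than 0.0005.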
@@ -813,7 +846,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: A client certificate used to authenticate to the apiserver is expiring
-          in less than 7 days.
+          in less than 7.0 days.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -822,7 +855,7 @@ spec:
     - alert: KubeClientCertificateExpiration
       annotations:
         message: A client certificate used to authenticate to the apiserver is expiring
-          in less than 24 hours.
+          in less than 24.0 hours.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -898,6 +931,46 @@ spec:
       for: 10m
       labels:
         severity: critical
+  - name: node-time
+    rules:
+    - alert: ClockSkewDetected
+      annotations:
+        message: Clock skew detected on node-exporter {{ $labels.namespace }}/{{ $labels.pod
+          }}. Ensure NTP is configured correctly on this host.
+      expr: |
+        node_ntp_offset_seconds{job="node-exporter"} < -0.03 or node_ntp_offset_seconds{job="node-exporter"} > 0.03
+      for: 2m
+      labels:
+        severity: warning
+  - name: node-network
+    rules:
+    - alert: NetworkReceiveErrors
+      annotations:
+        message: Network interface "{{ $labels.device }}" showing receive errors on
+          node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        rate(node_network_receive_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+      for: 2m
+      labels:
+        severity: warning
+    - alert: NetworkTransmitErrors
+      annotations:
+        message: Network interface "{{ $labels.device }}" showing transmit errors
+          on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        rate(node_network_transmit_errs_total{job="node-exporter",device!~"veth.+"}[2m]) > 0
+      for: 2m
+      labels:
+        severity: warning
+    - alert: NodeNetworkInterfaceFlapping
+      annotations:
+        message: Network interface "{{ $labels.device }}" changing it's up status
+          often on node-exporter {{ $labels.namespace }}/{{ $labels.pod }}"
+      expr: |
+        changes(node_network_up{job="node-exporter",device!~"veth.+"}[2m]) > 2
+      for: 2m
+      labels:
+        severity: warning
   - name: prometheus.rules
     rules:
     - alert: PrometheusConfigReloadFailed
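The ClockSkewDetected alert fires when a node's NTP offset stays outside the range of -0.03s to 0.03s (30ms) for 2 minutes; it depends on the --collector.ntp flag enabled on node-exporter earlier in this commit. The node-network alerts exclude veth.+ devices, i.e. the pod-side virtual interfaces, so they only watch real host NICs.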
ServiceMonitor for kubelet/cAdvisor metrics:

@@ -18,11 +18,6 @@ spec:
     honorLabels: true
     interval: 30s
     metricRelabelings:
-    - action: drop
-      regex: container_([a-z_]+);
-      sourceLabels:
-      - __name__
-      - image
     - action: drop
      regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
       sourceLabels:
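The removed relabeling dropped every container_* series whose image label was empty; those include cAdvisor's pod-level and system-cgroup aggregates. Keeping them at scrape time and filtering in the queries instead (see the container_name!="POD" matchers in the adapter config above) matches how the updated kube-prometheus rules consume these metrics. That reading is inferred from the hunks in this commit.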