diff --git a/Makefile b/Makefile index 45ecf77..36ab2be 100644 --- a/Makefile +++ b/Makefile @@ -11,7 +11,7 @@ manifests: jsonnet rm -rf manifests ./scripts/build.sh main.jsonnet $(JSONNET_BIN) -update: +update: jsonnet_bundler jb update vendor: jsonnet_bundler jsonnetfile.json jsonnetfile.lock.json diff --git a/base_operator_stack.jsonnet b/base_operator_stack.jsonnet index 6ffa13b..79085b4 100644 --- a/base_operator_stack.jsonnet +++ b/base_operator_stack.jsonnet @@ -120,6 +120,38 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; grafanaDashboards+:: $._config.grafanaDashboards, + kubeStateMetrics+:: { + // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset + deployment+: { + spec+: { + template+: { + spec+: { + containers: + std.map( + function(c) + if std.startsWith(c.name, 'addon-resizer') then + c { + command: [ + '/pod_nanny', + '--container=kube-state-metrics', + '--cpu=100m', + '--extra-cpu=2m', + '--memory=150Mi', + '--extra-memory=30Mi', + '--acceptance-offset=5', + '--deployment=kube-state-metrics', + ], + } + else + c, + super.containers, + ), + }, + }, + }, + }, + }, + // Create ingress objects per application ingress+: { local secret = k.core.v1.secret, diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index 91ffe49..f892b64 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "99e5661f8d46ec2173f65b69eeb97f6e52d38e0d" + "version": "650359b3e627ae97a1f18cbd10d7ed9b2293c240" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "9069b2c1be0ce32f63f9a01c4a4f8d69bc4e37d5" + "version": "19db38fc449df024446059f21d5a329babaa3927" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5" + "version": "d270f529db9eb750425a173188c534ab92532f47" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "5cc4bfab6e2453266e47d01b78cbae0b2643426e" + "version": "e30a6040f3d7270655a980ab04d16142da4b429d" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "9ddf5a198b0f7c898dc061158ea427112acbae11" + "version": "de2ec3f0f9115da2d47dc6b86af9b402e2bf146d" }, { "name": "prometheus-operator", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "e1ca3b4434945e57e8e3a451cdbde74a903cc8e1" + "version": "7a5acb4a43aa06bd9e32ab59a46271ab88d497e4" } ] } diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 9be86af..cdfe51c 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg== + alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg== kind: Secret metadata: name: alertmanager-main diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index aeac56d..36b75d2 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -12736,7 +12736,7 @@ items: ], "timeFrom": null, "timeShift": null, - "title": "CPU Utilizaion", + "title": "CPU Utilization", "tooltip": { "shared": false, "sort": 0, @@ -14361,11 +14361,25 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", cluster=\"$cluster\", namespace=\"$namespace\", image!=\"\", pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"}[1m]))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ container_name }}", + "legendFormat": "Current: {{ container_name }}", "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests_cpu_cores{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits_cpu_cores{job=\"kube-state-metrics\", cluster=\"$cluster\", namespace=\"$namespace\", pod=\"$pod\", container=~\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" } ], "thresholds": [ diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 357725f..0db3b9a 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -37,7 +37,7 @@ spec: record: namespace_name:container_memory_usage_bytes:sum - expr: | sum by (namespace, label_name) ( - sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"^(Pending|Running)$"} == 1)) by (namespace, pod) * on (namespace, pod) group_left(label_name) label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") ) @@ -629,7 +629,7 @@ spec: message: Cluster has overcommitted CPU resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="cpu"}) / sum(node:node_num_cpu:sum) > 1.5 @@ -641,7 +641,7 @@ spec: message: Cluster has overcommitted memory resource requests for Namespaces. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit expr: | - sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="memory"}) / sum(node_memory_MemTotal_bytes{job="node-exporter"}) > 1.5 @@ -842,7 +842,7 @@ spec: in less than 7 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | - histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 labels: severity: warning - alert: KubeClientCertificateExpiration @@ -851,7 +851,7 @@ spec: in less than 24 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | - histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 labels: severity: critical - name: alertmanager.rules