diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index c726a9a..1fb7373 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "0e22050ca3ebe6b7033f8c0f1cb1605e3abe63c5" + "version": "685bc278917085efe30ef6ff7aecc532387da693" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217" + "version": "5525c8cc8a4a52d272bdaf481dd77b53a0c0f051" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94" + "version": "403b7d0120d2903d21854eae217b4e4863c454d1" }, { "name": "grafana", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8" + "version": "338addbabc8a29b46840df0bb0355c12b96a6f21" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1fe6f109c87c4fa47775426a6a60c3b954ed5c33" + "version": "4cd0bf8ea846a0d158761d55899f631eb2a423cf" } ] } diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 627ce96..158c5cb 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,6 +1538,14 @@ spec: required: - name type: array + enableAdminAPI: + description: 'Enable access to prometheus web admin API. Defaults to + the value of `false`. WARNING: Enabling the admin APIs enables mutating + endpoints, to delete data, shutdown Prometheus, and more. Enabling + this should be done with care and the user is advised to add additional + authentication authorization via a proxy to ensure only clients authorized + to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' + type: boolean evaluationInterval: description: Interval between consecutive evaluations. type: string @@ -1572,6 +1580,9 @@ spec: description: ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP. type: boolean + logFormat: + description: Log format for Prometheus to be configured with. + type: string logLevel: description: Log level for Prometheus to be configured with. type: string @@ -2059,6 +2070,11 @@ spec: description: MinBackoff is the initial retry delay. Gets doubled for every retry. type: string + minShards: + description: MinShards is the minimum number of shards, i.e. + amount of concurrency. + format: int32 + type: integer remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string @@ -2243,6 +2259,25 @@ spec: "In", and the values array contains only "value". The requirements are ANDed. type: object + rules: + description: /--rules.*/ command-line arguments + properties: + alert: + description: /--rules.alert.*/ command-line arguments + properties: + forGracePeriod: + description: Minimum duration between alert and restored 'for' + state. This is maintained only for alerts with configured + 'for' time greater than grace period. + type: string + forOutageTolerance: + description: Max time to tolerate prometheus outage for restoring + 'for' state of alert. + type: string + resendDelay: + description: Minimum amount of time to wait before resending + an alert to Alertmanager. + type: string scrapeInterval: description: Interval between consecutive scrapes. type: string @@ -2941,8 +2976,9 @@ spec: description: Thanos base image if other than default. type: string gcs: - description: ThanosGCSSpec defines parameters for use of Google - Cloud Storage (GCS) with Thanos. + description: 'Deprecated: ThanosGCSSpec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosGCSSpec + will be removed.' properties: bucket: description: Google Cloud Storage bucket name for stored blocks. @@ -2970,6 +3006,22 @@ spec: to ensure the Prometheus Operator knows what version of Thanos is being configured. type: string + objectStorageConfig: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be + defined + type: boolean + required: + - key peers: description: Peers is a DNS name for Thanos to discover peers through. type: string @@ -2988,8 +3040,9 @@ spec: to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object s3: - description: ThanosS3Spec defines parameters for of AWS Simple Storage - Service (S3) with Thanos. (S3 compatible services apply as well) + description: 'Deprecated: ThanosS3Spec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosS3Spec + will be removed.' properties: accessKey: description: SecretKeySelector selects a key of a Secret. diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index e0ac283..123f78e 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -44,11 +44,13 @@ rules: - "" resources: - services + - services/finalizers - endpoints verbs: - get - create - update + - delete - apiGroups: - "" resources: diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 3f93c1b..610ffd5 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1510,7 +1510,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -1539,7 +1539,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "expr": "node:cluster_cpu_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1596,7 +1596,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -1694,7 +1694,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -1723,7 +1723,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_utilisation:ratio", + "expr": "node:cluster_memory_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1780,7 +1780,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -1878,7 +1878,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -1964,7 +1964,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -2062,7 +2062,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -2148,7 +2148,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -2246,7 +2246,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -2335,7 +2335,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -2387,7 +2387,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Cluster", + "title": "Kubernetes / USE Method / Cluster", "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 } @@ -2426,7 +2426,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -2512,7 +2512,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -2610,7 +2610,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -2696,7 +2696,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -2794,7 +2794,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -2880,7 +2880,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -2978,7 +2978,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -3064,7 +3064,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -3162,7 +3162,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -3251,7 +3251,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -3330,7 +3330,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Node", + "title": "Kubernetes / USE Method / Node", "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 } @@ -3370,7 +3370,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -3454,7 +3454,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -3538,7 +3538,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -3622,7 +3622,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -3706,7 +3706,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -3790,7 +3790,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -3885,7 +3885,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -3983,7 +3983,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -4250,7 +4250,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -4348,7 +4348,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -4606,7 +4606,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -4658,7 +4658,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Cluster", + "title": "Kubernetes / Compute Resources / Cluster", "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 } @@ -4697,7 +4697,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -4795,7 +4795,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -5062,7 +5062,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -5160,7 +5160,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -5418,7 +5418,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -5497,7 +5497,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Namespace", + "title": "Kubernetes / Compute Resources / Namespace", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } @@ -5536,7 +5536,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -5634,7 +5634,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -5901,7 +5901,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -5999,7 +5999,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -6257,7 +6257,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -6363,7 +6363,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Pod", + "title": "Kubernetes / Compute Resources / Pod", "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 } @@ -9456,7 +9456,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -9534,7 +9534,7 @@ items: ] }, "timezone": "", - "title": "Nodes", + "title": "Kubernetes / Nodes", "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 } @@ -9779,7 +9779,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -9883,7 +9883,7 @@ items: ] }, "timezone": "", - "title": "Persistent Volumes", + "title": "Kubernetes / Persistent Volumes", "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 } @@ -10243,7 +10243,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -10373,7 +10373,7 @@ items: ] }, "timezone": "", - "title": "Pods", + "title": "Kubernetes / Pods", "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } @@ -16874,7 +16874,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -16978,7 +16978,7 @@ items: ] }, "timezone": "", - "title": "StatefulSets", + "title": "Kubernetes / StatefulSets", "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 443943c..19432b5 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -136,6 +136,13 @@ spec: * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilisation:avg1m + - expr: | + node:node_cpu_utilisation:avg1m + * + node:node_num_cpu:sum + / + scalar(sum(node:node_num_cpu:sum)) + record: node:cluster_cpu_utilisation:ratio - expr: | sum(node_load1{job="node-exporter"}) / @@ -179,8 +186,13 @@ spec: - expr: | (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / - scalar(sum(node:node_memory_bytes_total:sum)) + node:node_memory_bytes_total:sum record: node:node_memory_utilisation:ratio + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:cluster_memory_utilisation:ratio - expr: | 1e3 * sum( (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) @@ -241,25 +253,25 @@ spec: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | - sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_utilisation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilisation:sum_irate - expr: | - sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -691,11 +703,11 @@ spec: severity: warning - alert: KubeVersionMismatch annotations: - message: There are {{ $value }} different versions of Kubernetes components - running. + message: There are {{ $value }} different semantic versions of Kubernetes + components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch expr: | - count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning @@ -831,10 +843,14 @@ spec: for: 10m labels: severity: warning - - alert: DeadMansSwitch + - alert: Watchdog annotations: - message: This is a DeadMansSwitch meant to ensure that the entire alerting - pipeline is functional. + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. expr: vector(1) labels: severity: none diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml index 6d884a2..5dea38e 100644 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ b/manifests/prometheus-serviceMonitorApiserver.yaml @@ -14,6 +14,14 @@ spec: regex: etcd_(debugging|disk|request|server).* sourceLabels: - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ port: https scheme: https tlsConfig: diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml index 97d7f1a..590a5cd 100644 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -17,6 +17,16 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true interval: 30s + metricRelabelings: + - action: drop + regex: container_([a-z_]+); + sourceLabels: + - __name__ + - image + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ path: /metrics/cadvisor port: https-metrics scheme: https diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 8e596c8..4791816 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -122,37 +122,6 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + 'traefik-dashboard.json': (import 'grafana-dashboards/traefik-dashboard.json'), 'coredns-dashboard.json': (import 'grafana-dashboards/coredns-dashboard.json'), }, - kubeStateMetrics+:: { - // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset - deployment+: { - spec+: { - template+: { - spec+: { - containers: - std.map( - function(c) - if std.startsWith(c.name, 'addon-resizer') then - c { - command: [ - '/pod_nanny', - '--container=kube-state-metrics', - '--cpu=100m', - '--extra-cpu=2m', - '--memory=150Mi', - '--extra-memory=30Mi', - '--acceptance-offset=5', - '--deployment=kube-state-metrics', - ], - } - else - c, - super.containers, - ), - }, - }, - }, - }, - }, // Create ingress objects per application ingress+: {