diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json
index be90246..efbc09a 100644
--- a/jsonnetfile.lock.json
+++ b/jsonnetfile.lock.json
@@ -8,7 +8,7 @@
                     "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus"
                 }
             },
-            "version": "9536d7787789b74b692cd8a5482a2801b1aba232"
+            "version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e"
         },
         {
             "name": "ksonnet",
@@ -28,7 +28,7 @@
                     "subdir": ""
                 }
             },
-            "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144"
+            "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217"
         },
         {
             "name": "grafonnet",
@@ -38,7 +38,7 @@
                     "subdir": "grafonnet"
                 }
             },
-            "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e"
+            "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5"
         },
         {
             "name": "grafana-builder",
@@ -48,7 +48,7 @@
                     "subdir": "grafana-builder"
                 }
             },
-            "version": "c6932cf90bce4fef218b4308effc9f15c4219a01"
+            "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94"
         },
         {
             "name": "grafana",
@@ -58,7 +58,7 @@
                     "subdir": "grafana"
                 }
             },
-            "version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee"
+            "version": "9ddf5a198b0f7c898dc061158ea427112acbae11"
         },
         {
             "name": "prometheus-operator",
@@ -68,7 +68,7 @@
                     "subdir": "jsonnet/prometheus-operator"
                 }
             },
-            "version": "72ec4b9b16ef11700724dc71fec77112536eed40"
+            "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8"
         },
         {
             "name": "etcd-mixin",
@@ -78,7 +78,7 @@
                     "subdir": "Documentation/etcd-mixin"
                 }
             },
-            "version": "15b6a17be48dea91a11497980b9adab541add7f0"
+            "version": "6070db22ed3d46372a5600fe8f35907f4d706bdb"
         }
     ]
 }
diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml
index d5c94fc..89748f1 100644
--- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml
+++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml
@@ -1378,6 +1378,12 @@ spec:
             under. This is necessary to generate correct URLs. This is necessary
             if Alertmanager is not served from root of a DNS name.
           type: string
+        image:
+          description: Image if specified has precedence over baseImage, tag and
+            sha combinations. Specifying the version is still necessary to ensure
+            the Prometheus Operator knows what version of Alertmanager is being
+            configured.
+          type: string
         imagePullSecrets:
           description: An optional list of references to secrets in the same namespace
             to use for pulling prometheus and alertmanager images from registries
diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
index d825277..627ce96 100644
--- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
+++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml
@@ -1550,6 +1550,12 @@ spec:
             under. This is necessary to generate correct URLs. This is necessary
             if Prometheus is not served from root of a DNS name.
           type: string
+        image:
+          description: Image if specified has precedence over baseImage, tag and
+            sha combinations. Specifying the version is still necessary to ensure
+            the Prometheus Operator knows what version of Prometheus is being
+            configured.
+          type: string
         imagePullSecrets:
           description: An optional list of references to secrets in the same namespace
             to use for pulling prometheus and alertmanager images from registries
@@ -1863,6 +1869,21 @@ spec:
         priorityClassName:
           description: Priority class assigned to the Pods
           type: string
+        query:
+          description: QuerySpec defines the query command line flags when starting
+            Prometheus.
+          properties:
+            lookbackDelta:
+              description: The delta difference allowed for retrieving metrics
+                during expression evaluations.
+              type: string
+            maxConcurrency:
+              description: Number of concurrent queries that can be run at once.
+              format: int32
+              type: integer
+            timeout:
+              description: Maximum time a query may take before being aborted.
+              type: string
         remoteRead:
           description: If specified, the remote_read spec. This is an experimental
             feature, it may change in any upcoming release in a breaking way.
@@ -2943,6 +2964,12 @@ spec:
                           type: boolean
                       required:
                       - key
+                image:
+                  description: Image if specified has precedence over baseImage, tag
+                    and sha combinations. Specifying the version is still necessary
+                    to ensure the Prometheus Operator knows what version of Thanos
+                    is being configured.
+                  type: string
                 peers:
                   description: Peers is a DNS name for Thanos to discover peers through.
                   type: string
diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml
index 4900caa..f3374ed 100644
--- a/manifests/grafana-dashboardDefinitions.yaml
+++ b/manifests/grafana-dashboardDefinitions.yaml
@@ -4896,6 +4896,12 @@ items:
   data:
     nodes.json: |-
         {
+            "__inputs": [
+
+            ],
+            "__requires": [
+
+            ],
             "annotations": {
                 "list": [

@@ -6208,6 +6214,12 @@ items:
   data:
     persistentvolumesusage.json: |-
        {
+            "__inputs": [
+
+            ],
+            "__requires": [
+
+            ],
             "annotations": {
                 "list": [

@@ -6551,6 +6563,12 @@ items:
   data:
     pods.json: |-
        {
+            "__inputs": [
+
+            ],
+            "__requires": [
+
+            ],
             "annotations": {
                 "list": [

@@ -6730,7 +6748,7 @@
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
+                    "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{ container_name }}",
@@ -6833,7 +6851,7 @@
             "steppedLine": false,
             "targets": [
                 {
-                    "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))",
+                    "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))",
                     "format": "time_series",
                     "intervalFactor": 2,
                     "legendFormat": "{{ pod_name }}",
@@ -7035,6 +7053,12 @@ items:
   data:
     statefulset.json: |-
        {
+            "__inputs": [
+
+            ],
+            "__requires": [
+
+            ],
             "annotations": {
                 "list": [

diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml
index 6e0b26f..47b3034 100644
--- a/manifests/grafana-deployment.yaml
+++ b/manifests/grafana-deployment.yaml
@@ -72,6 +72,8 @@ spec:
         - mountPath: /etc/grafana
           name: grafana-config
           readOnly: false
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       securityContext:
         runAsNonRoot: true
         runAsUser: 65534
diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml
index 45f77a0..3acdf1e 100644
--- a/manifests/grafana-service.yaml
+++ b/manifests/grafana-service.yaml
@@ -1,6 +1,8 @@
 apiVersion: v1
 kind: Service
 metadata:
+  labels:
+    app: grafana
   name: grafana
   namespace: monitoring
 spec:
diff --git a/manifests/grafana-serviceMonitor.yaml b/manifests/grafana-serviceMonitor.yaml
new file mode 100644
index 0000000..7ede266
--- /dev/null
+++ b/manifests/grafana-serviceMonitor.yaml
@@ -0,0 +1,12 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: grafana
+  namespace: monitoring
+spec:
+  endpoints:
+  - interval: 15s
+    port: http
+  selector:
+    matchLabels:
+      app: grafana
diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml
index c519a91..b939df6 100644
--- a/manifests/kube-state-metrics-clusterRole.yaml
+++ b/manifests/kube-state-metrics-clusterRole.yaml
@@ -67,3 +67,10 @@ rules:
   - subjectaccessreviews
   verbs:
   - create
+- apiGroups:
+  - policy
+  resources:
+  - poddisruptionbudgets
+  verbs:
+  - list
+  - watch
diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml
index ed85468..213714d 100644
--- a/manifests/node-exporter-daemonset.yaml
+++ b/manifests/node-exporter-daemonset.yaml
@@ -19,6 +19,7 @@ spec:
         - --web.listen-address=127.0.0.1:9100
         - --path.procfs=/host/proc
         - --path.sysfs=/host/sys
+        - --path.rootfs=/host/root
         - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
         - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
         image: carlosedp/node_exporter:v0.17.0
@@ -42,7 +43,9 @@ spec:
           name: root
           readOnly: true
       - args:
+        - --logtostderr
         - --secure-listen-address=$(IP):9100
+        - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
         - --upstream=http://127.0.0.1:9100/
         env:
         - name: IP
diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml
index cdba964..a7f6028 100644
--- a/manifests/prometheus-adapter-deployment.yaml
+++ b/manifests/prometheus-adapter-deployment.yaml
@@ -25,7 +25,7 @@ spec:
         - --metrics-relist-interval=1m
         - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/
         - --secure-port=6443
-        image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1
+        image: carlosedp/k8s-prometheus-adapter:v0.4.1
         name: prometheus-adapter
         ports:
        - containerPort: 6443
@@ -39,6 +39,8 @@ spec:
         - mountPath: /etc/adapter
           name: config
           readOnly: false
+      nodeSelector:
+        beta.kubernetes.io/os: linux
       serviceAccountName: prometheus-adapter
       volumes:
       - emptyDir: {}
diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml
index c7527f6..087cdcf 100644
--- a/manifests/prometheus-roleBindingSpecificNamespaces.yaml
+++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml
@@ -39,4 +39,17 @@ items:
   - kind: ServiceAccount
     name: prometheus-k8s
     namespace: monitoring
+- apiVersion: rbac.authorization.k8s.io/v1
+  kind: RoleBinding
+  metadata:
+    name: prometheus-k8s
+    namespace: logging
+  roleRef:
+    apiGroup: rbac.authorization.k8s.io
+    kind: Role
+    name: prometheus-k8s
+  subjects:
+  - kind: ServiceAccount
+    name: prometheus-k8s
+    namespace: monitoring
 kind: RoleBindingList
diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml
index b305774..56a7370 100644
--- a/manifests/prometheus-roleSpecificNamespaces.yaml
+++ b/manifests/prometheus-roleSpecificNamespaces.yaml
@@ -9,7 +9,6 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
     - services
     - endpoints
     - pods
@@ -26,7 +25,6 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
     - services
     - endpoints
     - pods
@@ -43,7 +41,22 @@ items:
   - apiGroups:
     - ""
     resources:
-    - nodes
+    - services
+    - endpoints
+    - pods
+    verbs:
+    - get
+    - list
+    - watch
+- apiVersion: rbac.authorization.k8s.io/v1
+  kind: Role
+  metadata:
+    name: prometheus-k8s
+    namespace: logging
+  rules:
+  - apiGroups:
+    - ""
+    resources:
     - services
     - endpoints
     - pods
diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml
index 05e0deb..443943c 100644
--- a/manifests/prometheus-rules.yaml
+++ b/manifests/prometheus-rules.yaml
@@ -288,21 +288,24 @@ spec:
       record: 'node:node_inodes_free:'
   - name: kube-prometheus-node-recording.rules
     rules:
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
+        (instance)
       record: instance:node_cpu:rate:sum
-    - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
+    - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}))
         BY (instance)
       record: instance:node_filesystem_usage:sum
-    - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
+    - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance)
      record: instance:node_network_receive_bytes:rate:sum
-    - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
+    - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance)
       record: instance:node_network_transmit_bytes:rate:sum
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode)
-        / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT
+        (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total)
+        BY (instance, cpu)) BY (instance)
       record: instance:node_cpu:ratio
-    - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m]))
+    - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m]))
       record: cluster:node_cpu:sum_rate5m
-    - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu))
+    - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
+        BY (instance, cpu))
       record: cluster:node_cpu:ratio
   - name: kubernetes-absent
     rules:
@@ -311,7 +314,7 @@
       annotations:
         message: Alertmanager has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown
       expr: |
-        absent(up{job="alertmanager-main"} == 1)
+        absent(up{job="alertmanager-main",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -383,7 +386,7 @@
         message: Prometheus has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown
       expr: |
-        absent(up{job="prometheus-k8s"} == 1)
+        absent(up{job="prometheus-k8s",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -392,7 +395,7 @@
         message: PrometheusOperator has disappeared from Prometheus target discovery.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown
       expr: |
-        absent(up{job="prometheus-operator"} == 1)
+        absent(up{job="prometheus-operator",namespace="monitoring"} == 1)
       for: 15m
       labels:
         severity: critical
@@ -626,8 +629,8 @@ spec:
          }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name
          }}.'
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh
-      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m]))
-        by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m]))
+      expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\",
+        }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m]))
         by (container_name, pod_name, namespace)\n > 25 \n"
       for: 15m
       labels:
@@ -773,7 +776,8 @@
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 7 days.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 7 days.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
@@ -781,7 +785,8 @@
         severity: warning
     - alert: KubeClientCertificateExpiration
       annotations:
-        message: Kubernetes API certificate is expiring in less than 24 hours.
+        message: A client certificate used to authenticate to the apiserver is expiring
+          in less than 24 hours.
         runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
       expr: |
         histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
@@ -794,7 +799,7 @@
         message: The configuration of the instances of the Alertmanager cluster
           `{{$labels.service}}` are out of sync.
       expr: |
-        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
+        count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
       for: 5m
       labels:
         severity: critical
@@ -803,7 +808,7 @@
         message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
           }}/{{ $labels.pod}}.
       expr: |
-        alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0
+        alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0
       for: 10m
       labels:
         severity: warning
@@ -811,9 +816,9 @@
       annotations:
         message: Alertmanager has not found all other members of the cluster.
       expr: |
-        alertmanager_cluster_members{job="alertmanager-main"}
+        alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}
           != on (service) GROUP_LEFT()
-        count by (service) (alertmanager_cluster_members{job="alertmanager-main"})
+        count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"})
       for: 5m
       labels:
         severity: critical
@@ -860,7 +865,7 @@
         description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
         summary: Reloading Prometheus' configuration failed
       expr: |
-        prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0
+        prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0
       for: 10m
       labels:
         severity: warning
@@ -870,7 +875,7 @@
         description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
          $labels.pod}}
         summary: Prometheus' alert notification queue is running full
       expr: |
-        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"}
+        predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"}
       for: 10m
       labels:
         severity: warning
@@ -880,7 +885,7 @@
         description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
          $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
         summary: Errors while sending alert from Prometheus
       expr: |
-        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01
       for: 10m
       labels:
         severity: warning
@@ -890,7 +895,7 @@
         description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
          $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
         summary: Errors while sending alerts from Prometheus
       expr: |
-        rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03
+        rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03
       for: 10m
       labels:
         severity: critical
@@ -900,7 +905,7 @@
         description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
          to any Alertmanagers
         summary: Prometheus is not connected to any Alertmanagers
       expr: |
-        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1
+        prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1
       for: 10m
       labels:
         severity: warning
@@ -910,7 +915,7 @@
         description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
          reload failures over the last four hours.'
         summary: Prometheus has issues reloading data blocks from disk
       expr: |
-        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0
+        increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
       for: 12h
       labels:
         severity: warning
@@ -920,7 +925,7 @@
         description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
          compaction failures over the last four hours.'
         summary: Prometheus has issues compacting sample blocks
       expr: |
-        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0
+        increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0
       for: 12h
       labels:
         severity: warning
@@ -930,7 +935,7 @@
         description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
          log (WAL).'
         summary: Prometheus write-ahead log is corrupted
       expr: |
-        tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0
+        tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0
       for: 4h
       labels:
         severity: warning
@@ -940,7 +945,7 @@
         description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting
          samples.
         summary: Prometheus isn't ingesting samples
       expr: |
-        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0
+        rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0
       for: 10m
       labels:
         severity: warning
@@ -950,7 +955,7 @@
         description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected
          due to duplicate timestamps but different values'
         summary: Prometheus has many samples rejected
       expr: |
-        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0
+        increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0
       for: 10m
       labels:
         severity: warning
@@ -961,7 +966,7 @@
       annotations:
         message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace
          }} Namespace.
       expr: |
-        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1
+        rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
       for: 10m
       labels:
         severity: warning
@@ -969,7 +974,7 @@
       annotations:
         message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace.
       expr: |
-        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1
+        rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1
       for: 10m
       labels:
         severity: warning
diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml
index 85b007f..4f61e88 100644
--- a/manifests/prometheus-service.yaml
+++ b/manifests/prometheus-service.yaml
@@ -13,3 +13,4 @@ spec:
   selector:
     app: prometheus
     prometheus: k8s
+  sessionAffinity: ClientIP
diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml
index 14a2454..633aa18 100644
--- a/manifests/prometheus-serviceMonitorCoreDNS.yaml
+++ b/manifests/prometheus-serviceMonitorCoreDNS.yaml
@@ -10,6 +10,7 @@ spec:
   - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
     interval: 15s
     port: metrics
+  jobLabel: k8s-app
   namespaceSelector:
     matchNames:
     - kube-system
diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet
index 7b36362..71150ef 100644
--- a/operator_stack.jsonnet
+++ b/operator_stack.jsonnet
@@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
       addonResizer: "carlosedp/addon-resizer",
       nodeExporter: "carlosedp/node_exporter",
       prometheusOperator: "carlosedp/prometheus-operator",
-      prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64",
+      prometheusAdapter: "carlosedp/k8s-prometheus-adapter",
       grafana: "carlosedp/monitoring-grafana",
       configmapReloader: "carlosedp/configmap-reload",
       prometheusConfigReloader: "carlosedp/prometheus-config-reloader",
@@ -45,7 +45,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + {
     prometheus+:: {
       names: 'k8s',
      replicas: 1,
-      namespaces: ["default", "kube-system","monitoring"],
+      namespaces: ["default", "kube-system","monitoring","logging"],
     },
     alertmanager+:: {