commit 063e040dac51e111fc79ef7fb4b3a25d72883658 Author: CarlosEDP Date: Thu Mar 1 19:03:53 2018 -0500 Initial import diff --git a/Readme.md b/Readme.md new file mode 100644 index 0000000..267b3da --- /dev/null +++ b/Readme.md @@ -0,0 +1,23 @@ +# Prometheus Operator for ARM platform + +The Prometheus Operator for Kubernetes provides easy monitoring definitions for Kubernetes services and deployment and management of Prometheus instances. + +This project aims on porting the [official manifests](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) and images to the ARM platform. This have been tested on a ARM64 Kubernetes cluster deployed as [this article](medium.com/@carlosedp/building-an-arm-kubernetes-cluster-ef31032636f9). + +## Changes to Kubeadm for Prometheus Operator + +According to the official deployment documentation [here](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/docs/kube-prometheus-on-kubeadm.md), a couple of changes on the cluster are required: + +We need to expose the cadvisor that is installed and managed by the kubelet daemon and allow webhook token authentication. To do so, we do the following on **all the masters and nodes**: + + sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf + sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" + systemctl daemon-reload + systemctl restart kubelet + +In case you already have a Kubernetes deployed with kubeadm, change the address kube-controller-manager and kube-scheduler listens **on master node** in addition to previous kubelet change: + + sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml + sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml + + diff --git a/deploy b/deploy new file mode 100755 index 0000000..7e0434d --- /dev/null +++ b/deploy @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +# CAUTION - setting NAMESPACE will deploy most components to the given namespace +# however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests. + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kubectl create namespace "$NAMESPACE" + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kubectl apply -f manifests/k8s + +kctl apply -f manifests/prometheus-operator + +# Wait for CRDs to be ready. +printf "Waiting for Operator to register custom resource definitions..." +until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done +echo "done!" + +kctl apply -f manifests/node-exporter +kctl apply -f manifests/kube-state-metrics +kctl apply -f manifests/grafana/grafana-credentials.yaml +kctl apply -f manifests/grafana +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; +kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml +kctl apply -f manifests/alertmanager/ + diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests/alertmanager/alertmanager-config.yaml new file mode 100644 index 0000000..62d3901 --- /dev/null +++ b/manifests/alertmanager/alertmanager-config.yaml @@ -0,0 +1,6 @@ +apiVersion: v1 +kind: Secret +metadata: + name: alertmanager-main +data: + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== diff --git a/manifests/alertmanager/alertmanager-ingress.yaml b/manifests/alertmanager/alertmanager-ingress.yaml new file mode 100644 index 0000000..86c133b --- /dev/null +++ b/manifests/alertmanager/alertmanager-ingress.yaml @@ -0,0 +1,15 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: alertmanager + namespace: monitoring +spec: + rules: + - host: alertmanager.internal.carlosedp.com + http: + paths: + - path: / + backend: + serviceName: alertmanager-main + servicePort: web + diff --git a/manifests/alertmanager/alertmanager-service.yaml b/manifests/alertmanager/alertmanager-service.yaml new file mode 100644 index 0000000..a541310 --- /dev/null +++ b/manifests/alertmanager/alertmanager-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + alertmanager: main + name: alertmanager-main +spec: + type: NodePort + ports: + - name: web + nodePort: 30903 + port: 9093 + protocol: TCP + targetPort: web + selector: + alertmanager: main diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests/alertmanager/alertmanager.yaml new file mode 100644 index 0000000..deadd75 --- /dev/null +++ b/manifests/alertmanager/alertmanager.yaml @@ -0,0 +1,11 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + name: main + labels: + alertmanager: main +spec: + replicas: 1 + baseImage: carlosedp/alertmanager + version: v0.14.0 + diff --git a/manifests/armexporter/daemonset.yaml b/manifests/armexporter/daemonset.yaml new file mode 100644 index 0000000..9d30040 --- /dev/null +++ b/manifests/armexporter/daemonset.yaml @@ -0,0 +1,22 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: arm-exporter + namespace: monitoring + labels: + k8s-app: arm-exporter +spec: + template: + metadata: + name: arm-exporter + labels: + k8s-app: arm-exporter + spec: + hostNetwork: true + containers: + - image: carlosedp/arm_exporter + name: arm-exporter + ports: + - name: http + containerPort: 9243 + hostPort: 9243 diff --git a/manifests/armexporter/node-exporter-service.yaml b/manifests/armexporter/node-exporter-service.yaml new file mode 100644 index 0000000..f9bfc58 --- /dev/null +++ b/manifests/armexporter/node-exporter-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: node-exporter + k8s-app: node-exporter + name: node-exporter +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http + port: 9100 + protocol: TCP + selector: + app: node-exporter + diff --git a/manifests/armexporter/service.yaml b/manifests/armexporter/service.yaml new file mode 100644 index 0000000..af8986b --- /dev/null +++ b/manifests/armexporter/service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: arm-exporter + name: arm-exporter +spec: + ports: + - name: http + port: 9243 + protocol: TCP + selector: + k8s-app: arm-exporter + diff --git a/manifests/grafana/grafana-claim.yaml b/manifests/grafana/grafana-claim.yaml new file mode 100644 index 0000000..186bd5c --- /dev/null +++ b/manifests/grafana/grafana-claim.yaml @@ -0,0 +1,12 @@ +kind: PersistentVolumeClaim +apiVersion: v1 +metadata: + name: grafana-claim + namespace: monitoring +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 5Gi + diff --git a/manifests/grafana/grafana-configmap.yaml b/manifests/grafana/grafana-configmap.yaml new file mode 100644 index 0000000..144f55b --- /dev/null +++ b/manifests/grafana/grafana-configmap.yaml @@ -0,0 +1,23 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-config + namespace: monitoring +data: + config.ini: | + [database] + path = /data/grafana.db + + [paths] + data = /data + logs = /data/log + plugins = /data/plugins + + [session] + provider = memory + + [auth.basic] + enabled = false + + [auth.anonymous] + enabled = false diff --git a/manifests/grafana/grafana-credentials.yaml b/manifests/grafana/grafana-credentials.yaml new file mode 100644 index 0000000..c3da1b6 --- /dev/null +++ b/manifests/grafana/grafana-credentials.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: Secret +metadata: + name: grafana-credentials +data: + user: YWRtaW4= + password: YWRtaW4= diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests/grafana/grafana-dashboard-definitions.yaml new file mode 100644 index 0000000..dbd2a30 --- /dev/null +++ b/manifests/grafana/grafana-dashboard-definitions.yaml @@ -0,0 +1,7360 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboard-definitions-0 +data: + deployment-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "200px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$deployment_namespace\",pod_name=~\"$deployment_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "100px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_deployment_spec_replicas", + "refId": "A", + "step": 600 + } + ], + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_status_observed_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_deployment_metadata_generation{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_deployment_status_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "current replicas", + "refId": "A", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_available{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_deployment_status_replicas_unavailable{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "unavailable", + "refId": "C", + "step": 30 + }, + { + "expr": "min(kube_deployment_status_replicas_updated{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "D", + "step": 30 + }, + { + "expr": "max(kube_deployment_spec_replicas{deployment=\"$deployment_name\",namespace=\"$deployment_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "deployment_namespace", + "options": [], + "query": "label_values(kube_deployment_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Deployment", + "multi": false, + "name": "deployment_name", + "options": [], + "query": "label_values(kube_deployment_metadata_generation{namespace=\"$deployment_namespace\"}, deployment)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "deployment", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Deployment", + "version": 1 + } + etcd-dashboard.json: |+ + { + "__inputs": [ + { + "name": "prometheus", + "label": "prometheus", + "description": "", + "type": "datasource", + "pluginId": "prometheus", + "pluginName": "Prometheus" + } + ], + "__requires": [ + { + "type": "grafana", + "id": "grafana", + "name": "Grafana", + "version": "4.5.2" + }, + { + "type": "panel", + "id": "graph", + "name": "Graph", + "version": "" + }, + { + "type": "datasource", + "id": "prometheus", + "name": "Prometheus", + "version": "1.0.0" + }, + { + "type": "panel", + "id": "singlestat", + "name": "Singlestat", + "version": "" + } + ], + "annotations": { + "list": [] + }, + "description": "etcd sample Grafana dashboard with Prometheus", + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "error": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 28, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(etcd_server_has_leader)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "metric": "etcd_server_has_leader", + "refId": "A", + "step": 20 + } + ], + "thresholds": "", + "title": "Up", + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 23, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(grpc_server_started_total{grpc_type=\"unary\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Rate", + "metric": "grpc_server_started_total", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(rate(grpc_server_handled_total{grpc_type=\"unary\",grpc_code!=\"OK\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "RPC Failed Rate", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "RPC Rate", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 41, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Watch\",grpc_type=\"bidi_stream\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Watch Streams", + "metric": "grpc_server_handled_total", + "refId": "A", + "step": 4 + }, + { + "expr": "sum(grpc_server_started_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"}) - sum(grpc_server_handled_total{grpc_service=\"etcdserverpb.Lease\",grpc_type=\"bidi_stream\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Lease Streams", + "metric": "grpc_server_handled_total", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Active Streams", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "etcd_debugging_mvcc_db_total_size_in_bytes", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB Size", + "metric": "", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "DB Size", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": true, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{instance}} WAL fsync", + "metric": "etcd_disk_wal_fsync_duration_seconds_bucket", + "refId": "A", + "step": 4 + }, + { + "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} DB fsync", + "metric": "etcd_disk_backend_commit_duration_seconds_bucket", + "refId": "B", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Disk Sync Duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Resident Memory", + "metric": "process_resident_memory_bytes", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 5, + "id": 22, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_received_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic In", + "metric": "etcd_network_client_grpc_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Client Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 5, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "rate(etcd_network_client_grpc_sent_bytes_total[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Client Traffic Out", + "metric": "etcd_network_client_grpc_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Client Traffic Out", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_received_bytes_total[5m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic In", + "metric": "etcd_network_peer_received_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic In", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": null, + "editable": false, + "error": false, + "fill": 0, + "grid": {}, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 3, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_network_peer_sent_bytes_total[5m])) by (instance)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{instance}} Peer Traffic Out", + "metric": "etcd_network_peer_sent_bytes_total", + "refId": "A", + "step": 4 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Peer Traffic Out", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 0, + "id": 40, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(etcd_server_proposals_failed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Failure Rate", + "metric": "etcd_server_proposals_failed_total", + "refId": "A", + "step": 2 + }, + { + "expr": "sum(etcd_server_proposals_pending)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Pending Total", + "metric": "etcd_server_proposals_pending", + "refId": "B", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_committed_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Commit Rate", + "metric": "etcd_server_proposals_committed_total", + "refId": "C", + "step": 2 + }, + { + "expr": "sum(rate(etcd_server_proposals_applied_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Proposal Apply Rate", + "refId": "D", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Raft Proposals", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": false, + "error": false, + "fill": 0, + "id": 19, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "changes(etcd_server_leader_changes_seen_total[1d])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}} Total Leader Elections Per Day", + "metric": "etcd_server_leader_changes_seen_total", + "refId": "A", + "step": 2 + } + ], + "thresholds": [], + "timeFrom": null, + "timeShift": null, + "title": "Total Leader Elections Per Day", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "New row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-15m", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "etcd", + "version": 4 + } + kubernetes-capacity-planning-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_cpu{mode=\"idle\"}[2m])) * 100", + "hide": false, + "intervalFactor": 10, + "legendFormat": "", + "refId": "A", + "step": 50 + } + ], + "title": "Idle CPU", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 9, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_load1)", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load5)", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "sum(node_load15)", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "title": "System Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)", + "intervalFactor": 2, + "legendFormat": "memory usage", + "metric": "memo", + "refId": "A", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Buffers)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "memo", + "refId": "B", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_Cached)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "memo", + "refId": "C", + "step": 10, + "target": "" + }, + { + "expr": "sum(node_memory_MemFree)", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "memo", + "refId": "D", + "step": 10, + "target": "" + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "intervalFactor": 2, + "metric": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "246px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_disk_bytes_read[5m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum(rate(node_disk_bytes_written[5m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum(rate(node_disk_io_time_ms[5m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "ms", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 12, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk Space Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 8, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_receive_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10, + "target": "" + } + ], + "title": "Network Received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 10, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(node_network_transmit_bytes{device!~\"lo\"}[5m]))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10, + "target": "" + } + ], + "title": "Network Transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "276px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 11, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 11, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_info)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current number of Pods", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_node_status_capacity_pods)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Maximum capacity of pods", + "refId": "B", + "step": 10 + } + ], + "title": "Cluster Pod Utilization", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Pod Utilization", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Capacity Planning", + "version": 4 + } + kubernetes-cluster-health-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "editable": false, + "height": "254px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Control Plane Components Down", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "Everything UP and healthy", + "value": "null" + }, + { + "op": "=", + "text": "", + "value": "" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Alerts Firing", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"pending\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "3, 5", + "title": "Alerts Pending", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Crashlooping Pods", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"Ready\",status!=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Not Ready", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"DiskPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Disk Pressure", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_status_condition{condition=\"MemoryPressure\",status=\"true\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Node Memory Pressure", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(kube_node_spec_unschedulable)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Nodes Unschedulable", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Health", + "version": 9 + } + kubernetes-cluster-status-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "129px", + "panels": [ + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(up{job=~\"apiserver|kube-scheduler|kube-controller-manager\"} == 0)", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Control Plane UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "UP", + "value": "null" + } + ], + "valueName": "total" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 6, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "3, 5", + "title": "Alerts Firing", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": true, + "title": "Cluster Health", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "168px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / count(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / count(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / count(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Schedulers UP", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "colorBackground": false, + "colorValue": true, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "count(increase(kube_pod_container_status_restarts{namespace=~\"kube-system|tectonic-system\"}[1h]) > 5)", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "1, 3", + "title": "Crashlooping Control Plane Pods", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": true, + "title": "Control Plane Status", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "158px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "sum(100 - (avg by (instance) (rate(node_cpu{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)) / count(node_cpu{job=\"node-exporter\",mode=\"idle\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "CPU Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((sum(node_memory_MemTotal) - sum(node_memory_MemFree) - sum(node_memory_Buffers) - sum(node_memory_Cached)) / sum(node_memory_MemTotal)) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Memory Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\"}) - sum(node_filesystem_free{device!=\"rootfs\"})) / sum(node_filesystem_size{device!=\"rootfs\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Filesystem Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 10, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "100 - (sum(kube_node_status_capacity_pods) - sum(kube_pod_info)) / sum(kube_node_status_capacity_pods) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "80, 90", + "title": "Pod Utilization", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": true, + "title": "Capacity Planning", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Cluster Status", + "version": 3 + } + kubernetes-control-plane-status-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 1, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"apiserver\"} == 1) / sum(up{job=\"apiserver\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "API Servers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-controller-manager\"} == 1) / sum(up{job=\"kube-controller-manager\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Controller Managers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(up{job=\"kube-scheduler\"} == 1) / sum(up{job=\"kube-scheduler\"})) * 100", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "thresholds": "50, 80", + "title": "Schedulers UP", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(sum by(instance) (rate(apiserver_request_count{code=~\"5..\"}[5m])) / sum by(instance) (rate(apiserver_request_count[5m]))) * 100", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 600 + } + ], + "thresholds": "5, 10", + "title": "API Server Request Error Rate", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 7, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(verb) (rate(apiserver_latency_seconds:quantile[5m]) >= 0)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 30 + } + ], + "title": "API Server Request Latency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 5, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "cluster:scheduler_e2e_scheduling_latency_seconds:quantile", + "format": "time_series", + "intervalFactor": 2, + "refId": "A", + "step": 60 + } + ], + "title": "End to End Scheduling Latency", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "dtdurations", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(instance) (rate(apiserver_request_count{code!~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Error Rate", + "refId": "A", + "step": 60 + }, + { + "expr": "sum by(instance) (rate(apiserver_request_count[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Request Rate", + "refId": "B", + "step": 60 + } + ], + "title": "API Server Request Rates", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Control Plane Status", + "version": 3 + } + kubernetes-resource-requests-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "300px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "This represents the total [CPU resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-cpu) in the cluster.\nFor comparison the total [allocatable CPU cores](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable CPU Cores", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Requested CPU Cores", + "refId": "B", + "step": 20 + } + ], + "title": "CPU Cores", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": "CPU Cores", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_cpu_cores) by (instance)) / min(sum(kube_node_status_allocatable_cpu_cores) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "CPU Cores", + "transparent": false, + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "CPU Cores", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "300px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "description": "This represents the total [memory resource requests](https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/#meaning-of-memory) in the cluster.\nFor comparison the total [allocatable memory](https://github.com/kubernetes/community/blob/master/contributors/design-proposals/node-allocatable.md) is also shown.", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(sum(kube_node_status_allocatable_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Allocatable Memory", + "refId": "A", + "step": 20 + }, + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance))", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Requested Memory", + "refId": "B", + "step": 20 + } + ], + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": "Memory", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 4, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "max(sum(kube_pod_container_resource_requests_memory_bytes) by (instance)) / min(sum(kube_node_status_allocatable_memory_bytes) by (instance)) * 100", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 240 + } + ], + "thresholds": "80, 90", + "title": "Memory", + "transparent": false, + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Memory", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes Resource Requests", + "version": 2 + } + nodes-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "description": "Dashboard to get an overview of one server", + "editable": false, + "gnetId": 22, + "graphTooltip": 0, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (cpu) (irate(node_cpu{mode=\"idle\", instance=\"$server\"}[5m])) * 100)", + "hide": false, + "intervalFactor": 10, + "legendFormat": "{{cpu}}", + "refId": "A", + "step": 50 + } + ], + "title": "Idle CPU", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percent", + "label": "cpu usage", + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 9, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_load1{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 1m", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "node_load5{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 5m", + "refId": "B", + "step": 20, + "target": "" + }, + { + "expr": "node_load15{instance=\"$server\"}", + "intervalFactor": 4, + "legendFormat": "load 15m", + "refId": "C", + "step": 20, + "target": "" + } + ], + "title": "System Load", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 4, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "node_memory_SwapFree{instance=\"172.17.0.1:9100\",job=\"prometheus\"}", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory used", + "metric": "", + "refId": "C", + "step": 10 + }, + { + "expr": "node_memory_Buffers{instance=\"$server\"}", + "interval": "", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "metric": "", + "refId": "E", + "step": 10 + }, + { + "expr": "node_memory_Cached{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory cached", + "metric": "", + "refId": "F", + "step": 10 + }, + { + "expr": "node_memory_MemFree{instance=\"$server\"}", + "intervalFactor": 2, + "legendFormat": "memory free", + "metric": "", + "refId": "D", + "step": 10 + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "((node_memory_MemTotal{instance=\"$server\"} - node_memory_MemFree{instance=\"$server\"} - node_memory_Buffers{instance=\"$server\"} - node_memory_Cached{instance=\"$server\"}) / node_memory_MemTotal{instance=\"$server\"}) * 100", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 6, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "{instance=\"172.17.0.1:9100\"}", + "yaxis": 2 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (instance) (rate(node_disk_bytes_read{instance=\"$server\"}[2m]))", + "hide": false, + "intervalFactor": 4, + "legendFormat": "read", + "refId": "A", + "step": 20, + "target": "" + }, + { + "expr": "sum by (instance) (rate(node_disk_bytes_written{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "written", + "refId": "B", + "step": 20 + }, + { + "expr": "sum by (instance) (rate(node_disk_io_time_ms{instance=\"$server\"}[2m]))", + "intervalFactor": 4, + "legendFormat": "io time", + "refId": "C", + "step": 20 + } + ], + "title": "Disk I/O", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "ms", + "logBase": 1, + "show": true + } + ] + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": false, + "format": "percentunit", + "gauge": { + "maxValue": 1, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "hideTimeOverride": false, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "(sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"}) - sum(node_filesystem_free{device!=\"rootfs\",instance=\"$server\"})) / sum(node_filesystem_size{device!=\"rootfs\",instance=\"$server\"})", + "intervalFactor": 2, + "refId": "A", + "step": 60, + "target": "" + } + ], + "thresholds": "0.75, 0.9", + "title": "Disk Space Usage", + "transparent": false, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 8, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_receive_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A", + "step": 10, + "target": "" + } + ], + "title": "Network Received", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 10, + "isNew": false, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "transmitted", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(node_network_transmit_bytes{instance=\"$server\",device!~\"lo\"}[5m])", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "B", + "step": 10, + "target": "" + } + ], + "title": "Network Transmitted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "bytes", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "server", + "options": [], + "query": "label_values(node_boot_time, instance)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Nodes", + "version": 2 + } + pods-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "refresh": false, + "rows": [ + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{pod_name=\"$pod\", container_name=~\"$container\", container_name!=\"POD\"})", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Current: {{ container_name }}", + "metric": "container_memory_usage_bytes", + "refId": "A", + "step": 15 + }, + { + "expr": "kube_pod_container_resource_requests_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_memory_bytes", + "refId": "B", + "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_memory_bytes{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 + } + ], + "title": "Memory Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 2, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name)(rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A", + "step": 30 + }, + { + "expr": "kube_pod_container_resource_requests_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "metric": "kube_pod_container_resource_requests_cpu_cores", + "refId": "B", + "step": 20 + }, + { + "expr": "kube_pod_container_resource_limits_cpu_cores{pod=\"$pod\", container=~\"$container\"}", + "interval": "10s", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "metric": "kube_pod_container_resource_limits_memory_bytes", + "refId": "C", + "step": 20 + } + ], + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "250px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 3, + "isNew": false, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{pod_name=\"$pod\"}[1m])))", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A", + "step": 30 + } + ], + "title": "Network I/O", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ] + } + ], + "showTitle": false, + "title": "New Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [], + "query": "label_values(kube_pod_info{namespace=~\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Pods", + "version": 1 + } + statefulset-dashboard.json: |+ + { + "__inputs": [ + { + "description": "", + "label": "prometheus", + "name": "prometheus", + "pluginId": "prometheus", + "pluginName": "Prometheus", + "type": "datasource" + } + ], + "annotations": { + "list": [] + }, + "editable": false, + "graphTooltip": 1, + "hideControls": false, + "links": [], + "rows": [ + { + "collapse": false, + "editable": false, + "height": "200px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 8, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "CPU", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 9, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "80%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}) / 1024^3", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Memory", + "type": "singlestat", + "valueFontSize": "110%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "Bps", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 7, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$statefulset_namespace\",pod_name=~\"$statefulset_name.*\"}[3m]))", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "100px", + "panels": [ + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": false + }, + "id": 5, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "metric": "kube_statefulset_replicas", + "refId": "A", + "step": 600 + } + ], + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 6, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Available Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 3, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": "prometheus", + "editable": false, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "id": 2, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "refId": "A", + "step": 600 + } + ], + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + }, + { + "collapse": false, + "editable": false, + "height": "350px", + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": false, + "error": false, + "fill": 1, + "grid": { + "threshold1Color": "rgba(216, 200, 27, 0.27)", + "threshold2Color": "rgba(234, 112, 112, 0.22)" + }, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "hideZero": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "available", + "refId": "B", + "step": 30 + }, + { + "expr": "max(kube_statefulset_replicas{statefulset=\"$statefulset_name\",namespace=\"$statefulset_namespace\"}) without (instance, pod)", + "intervalFactor": 2, + "legendFormat": "desired", + "refId": "E", + "step": 30 + } + ], + "title": "Replicas", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "none", + "label": "", + "logBase": 1, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 1, + "show": false + } + ] + } + ], + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "sharedCrosshair": false, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "allValue": ".*", + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "statefulset_namespace", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation, namespace)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": null, + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": {}, + "datasource": "prometheus", + "hide": 0, + "includeAll": false, + "label": "StatefulSet", + "multi": false, + "name": "statefulset_name", + "options": [], + "query": "label_values(kube_statefulset_metadata_generation{namespace=\"$statefulset_namespace\"}, statefulset)", + "refresh": 1, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "statefulset", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "StatefulSet", + "version": 1 + } +--- diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests/grafana/grafana-dashboards.yaml new file mode 100644 index 0000000..772d3f6 --- /dev/null +++ b/manifests/grafana/grafana-dashboards.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards +data: + dashboards.yaml: |+ + - name: '0' + org_id: 1 + folder: '' + type: file + options: + folder: /grafana-dashboard-definitions/0 diff --git a/manifests/grafana/grafana-datasources.yaml b/manifests/grafana/grafana-datasources.yaml new file mode 100644 index 0000000..33c3081 --- /dev/null +++ b/manifests/grafana/grafana-datasources.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-datasources +data: + prometheus.yaml: |+ + datasources: + - name: prometheus + type: prometheus + access: proxy + org_id: 1 + url: http://prometheus-k8s.monitoring.svc:9090 + version: 1 + editable: false + diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests/grafana/grafana-deployment.yaml new file mode 100644 index 0000000..cd69c5f --- /dev/null +++ b/manifests/grafana/grafana-deployment.yaml @@ -0,0 +1,60 @@ +apiVersion: apps/v1beta1 +kind: Deployment +metadata: + name: grafana +spec: + replicas: 1 + template: + metadata: + labels: + app: grafana + spec: + securityContext: + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: grafana + image: carlosedp/monitoring-grafana:latest + volumeMounts: + - name: grafana-config + mountPath: /grafana/conf/config.ini + subPath: config.ini + - name: grafana-storage + mountPath: /data + - name: grafana-datasources + mountPath: /grafana/conf/provisioning/datasources + - name: grafana-dashboards + mountPath: /grafana/conf/provisioning/dashboards + - name: grafana-dashboard-definitions-0 + mountPath: /grafana-dashboard-definitions/0 + ports: + - name: web + containerPort: 3000 + env: + - name: GF_INSTALL_PLUGINS + value: "grafana-clock-panel,grafana-piechart-panel" + - name: GF_PATHS_PLUGINS + value: "/data/plugins" + resources: + requests: + memory: 100Mi + cpu: 100m + limits: + memory: 200Mi + cpu: 200m + volumes: + - name: grafana-config + configMap: + name: grafana-config + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana-claim + - name: grafana-datasources + configMap: + name: grafana-datasources + - name: grafana-dashboards + configMap: + name: grafana-dashboards + - name: grafana-dashboard-definitions-0 + configMap: + name: grafana-dashboard-definitions-0 diff --git a/manifests/grafana/grafana-ingress.yaml b/manifests/grafana/grafana-ingress.yaml new file mode 100644 index 0000000..cc55605 --- /dev/null +++ b/manifests/grafana/grafana-ingress.yaml @@ -0,0 +1,15 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: grafana + namespace: monitoring +spec: + rules: + - host: grafana.internal.carlosedp.com + http: + paths: + - path: / + backend: + serviceName: grafana + servicePort: 3000 + diff --git a/manifests/grafana/grafana-service.yaml b/manifests/grafana/grafana-service.yaml new file mode 100644 index 0000000..fbcee40 --- /dev/null +++ b/manifests/grafana/grafana-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + labels: + app: grafana +spec: + type: NodePort + ports: + - port: 3000 + protocol: TCP + nodePort: 30902 + targetPort: web + selector: + app: grafana diff --git a/manifests/k8s/kube-controller-manager.yaml b/manifests/k8s/kube-controller-manager.yaml new file mode 100644 index 0000000..bd8d7cb --- /dev/null +++ b/manifests/k8s/kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + namespace: kube-system + name: kube-controller-manager-prometheus-discovery + labels: + k8s-app: kube-controller-manager +spec: + selector: + component: kube-controller-manager + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + protocol: TCP diff --git a/manifests/k8s/kube-scheduler.yaml b/manifests/k8s/kube-scheduler.yaml new file mode 100644 index 0000000..2d90097 --- /dev/null +++ b/manifests/k8s/kube-scheduler.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + namespace: kube-system + name: kube-scheduler-prometheus-discovery + labels: + k8s-app: kube-scheduler +spec: + selector: + component: kube-scheduler + type: ClusterIP + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 10251 + protocol: TCP diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml new file mode 100644 index 0000000..8284fc1 --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml new file mode 100644 index 0000000..ef5e91a --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml @@ -0,0 +1,45 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: [""] + resources: + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["list", "watch"] +- apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + verbs: ["list", "watch"] +- apiGroups: ["apps"] + resources: + - statefulsets + verbs: ["list", "watch"] +- apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["list", "watch"] +- apiGroups: ["autoscaling"] + resources: + - horizontalpodautoscalers + verbs: ["list", "watch"] +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] \ No newline at end of file diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml new file mode 100644 index 0000000..dd74f5e --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml @@ -0,0 +1,55 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + name: kube-state-metrics +spec: + replicas: 1 + template: + metadata: + labels: + app: kube-state-metrics + spec: + serviceAccountName: kube-state-metrics + securityContext: + runAsNonRoot: true + runAsUser: 65534 + containers: + - name: kube-state-metrics + image: carlosedp/kube-state-metrics-arm64:v1.2.0 + args: + - "--host=0.0.0.0" + - "--port=8443" + - "--telemetry-host=0.0.0.0" + - "--telemetry-port=9443" + ports: + - name: http-main + containerPort: 8443 + - name: http-self + containerPort: 9443 + - name: addon-resizer + image: gcr.io/google-containers/addon-resizer-arm64:2.1 + resources: + limits: + cpu: 100m + memory: 30Mi + requests: + cpu: 100m + memory: 30Mi + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + command: + - /pod_nanny + - --container=kube-state-metrics + - --cpu=100m + - --extra-cpu=2m + - --memory=150Mi + - --extra-memory=30Mi + #- --threshold=5 + - --deployment=kube-state-metrics diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml new file mode 100644 index 0000000..a93c396 --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics-resizer +subjects: +- kind: ServiceAccount + name: kube-state-metrics + diff --git a/manifests/kube-state-metrics/kube-state-metrics-role.yaml b/manifests/kube-state-metrics/kube-state-metrics-role.yaml new file mode 100644 index 0000000..6bf21fb --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-role.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: kube-state-metrics-resizer +rules: +- apiGroups: [""] + resources: + - pods + verbs: ["get"] +- apiGroups: ["extensions"] + resources: + - deployments + resourceNames: ["kube-state-metrics"] + verbs: ["get", "update"] + diff --git a/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml b/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml new file mode 100644 index 0000000..9977935 --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics diff --git a/manifests/kube-state-metrics/kube-state-metrics-service.yaml b/manifests/kube-state-metrics/kube-state-metrics-service.yaml new file mode 100644 index 0000000..dfe9d9a --- /dev/null +++ b/manifests/kube-state-metrics/kube-state-metrics-service.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: kube-state-metrics + k8s-app: kube-state-metrics + name: kube-state-metrics +spec: + clusterIP: None + ports: + - name: http-main + port: 8443 + targetPort: http-main + protocol: TCP + - name: http-self + port: 9443 + targetPort: http-self + protocol: TCP + selector: + app: kube-state-metrics + diff --git a/manifests/node-exporter/node-exporter-cluster-role-binding.yaml b/manifests/node-exporter/node-exporter-cluster-role-binding.yaml new file mode 100644 index 0000000..a5a2050 --- /dev/null +++ b/manifests/node-exporter/node-exporter-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring diff --git a/manifests/node-exporter/node-exporter-cluster-role.yaml b/manifests/node-exporter/node-exporter-cluster-role.yaml new file mode 100644 index 0000000..932b776 --- /dev/null +++ b/manifests/node-exporter/node-exporter-cluster-role.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: +- apiGroups: ["authentication.k8s.io"] + resources: + - tokenreviews + verbs: ["create"] +- apiGroups: ["authorization.k8s.io"] + resources: + - subjectaccessreviews + verbs: ["create"] diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests/node-exporter/node-exporter-daemonset.yaml new file mode 100644 index 0000000..433a48c --- /dev/null +++ b/manifests/node-exporter/node-exporter-daemonset.yaml @@ -0,0 +1,57 @@ +apiVersion: extensions/v1beta1 +kind: DaemonSet +metadata: + name: node-exporter +spec: + updateStrategy: + rollingUpdate: + maxUnavailable: 1 + type: RollingUpdate + template: + metadata: + labels: + app: node-exporter + name: node-exporter + spec: + serviceAccountName: node-exporter + securityContext: + runAsNonRoot: true + runAsUser: 65534 + hostNetwork: true + hostPID: true + containers: + - image: napnap75/rpi-prometheus:node_exporter + args: + #- "--web.listen-address=0.0.0.1:9100" + - "--path.procfs=/host/proc" + - "--path.sysfs=/host/sys" + name: node-exporter + ports: + - containerPort: 9100 + hostPort: 9100 + name: http + resources: + requests: + memory: 30Mi + cpu: 100m + limits: + memory: 50Mi + cpu: 200m + volumeMounts: + - name: proc + readOnly: true + mountPath: /host/proc + - name: sys + readOnly: true + mountPath: /host/sys + tolerations: + - effect: NoSchedule + operator: Exists + volumes: + - name: proc + hostPath: + path: /proc + - name: sys + hostPath: + path: /sys + diff --git a/manifests/node-exporter/node-exporter-service-account.yaml b/manifests/node-exporter/node-exporter-service-account.yaml new file mode 100644 index 0000000..703a274 --- /dev/null +++ b/manifests/node-exporter/node-exporter-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter diff --git a/manifests/node-exporter/node-exporter-service.yaml b/manifests/node-exporter/node-exporter-service.yaml new file mode 100644 index 0000000..f9bfc58 --- /dev/null +++ b/manifests/node-exporter/node-exporter-service.yaml @@ -0,0 +1,17 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + app: node-exporter + k8s-app: node-exporter + name: node-exporter +spec: + type: ClusterIP + clusterIP: None + ports: + - name: http + port: 9100 + protocol: TCP + selector: + app: node-exporter + diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml new file mode 100644 index 0000000..e7e03a2 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-operator +subjects: +- kind: ServiceAccount + name: prometheus-operator + namespace: monitoring diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml new file mode 100644 index 0000000..1b13c89 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus-operator +rules: +- apiGroups: + - extensions + resources: + - thirdpartyresources + verbs: + - "*" +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - "*" +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + - prometheuses/finalizers + - alertmanagers/finalizers + - servicemonitors + verbs: + - "*" +- apiGroups: + - apps + resources: + - statefulsets + verbs: ["*"] +- apiGroups: [""] + resources: + - configmaps + - secrets + verbs: ["*"] +- apiGroups: [""] + resources: + - pods + verbs: ["list", "delete"] +- apiGroups: [""] + resources: + - services + - endpoints + verbs: ["get", "create", "update"] +- apiGroups: [""] + resources: + - nodes + verbs: ["list", "watch"] +- apiGroups: [""] + resources: + - namespaces + verbs: ["list"] diff --git a/manifests/prometheus-operator/prometheus-operator-service-account.yaml b/manifests/prometheus-operator/prometheus-operator-service-account.yaml new file mode 100644 index 0000000..38d18cc --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests/prometheus-operator/prometheus-operator-service.yaml new file mode 100644 index 0000000..8882d4a --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + type: ClusterIP + ports: + - name: http + port: 8080 + targetPort: http + protocol: TCP + selector: + k8s-app: prometheus-operator diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests/prometheus-operator/prometheus-operator.yaml new file mode 100644 index 0000000..c47e5b4 --- /dev/null +++ b/manifests/prometheus-operator/prometheus-operator.yaml @@ -0,0 +1,34 @@ +apiVersion: extensions/v1beta1 +kind: Deployment +metadata: + labels: + k8s-app: prometheus-operator + name: prometheus-operator +spec: + replicas: 1 + template: + metadata: + labels: + k8s-app: prometheus-operator + spec: + containers: + - args: + - --kubelet-service=kube-system/kubelet + - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.0.2 + - --config-reloader-image=carlosedp/configmap-reload:latest + image: carlosedp/prometheus-operator:v0.17.0 + name: prometheus-operator + ports: + - containerPort: 8080 + name: http + resources: + limits: + cpu: 200m + memory: 100Mi + requests: + cpu: 100m + memory: 50Mi + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-ingress.yaml b/manifests/prometheus/prometheus-k8s-ingress.yaml new file mode 100644 index 0000000..7dccd73 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-ingress.yaml @@ -0,0 +1,14 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: prometheus + namespace: monitoring +spec: + rules: + - host: prometheus.internal.carlosedp.com + http: + paths: + - path: / + backend: + serviceName: prometheus-k8s + servicePort: web diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests/prometheus/prometheus-k8s-role-bindings.yaml new file mode 100644 index 0000000..5f190e7 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-role-bindings.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: RoleBinding +metadata: + name: prometheus-k8s + namespace: default +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRoleBinding +metadata: + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests/prometheus/prometheus-k8s-roles.yaml new file mode 100644 index 0000000..4f738e7 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-roles.yaml @@ -0,0 +1,55 @@ +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: monitoring +rules: +- apiGroups: [""] + resources: + - nodes + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +- apiGroups: [""] + resources: + - configmaps + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: kube-system +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: Role +metadata: + name: prometheus-k8s + namespace: default +rules: +- apiGroups: [""] + resources: + - services + - endpoints + - pods + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1beta1 +kind: ClusterRole +metadata: + name: prometheus-k8s +rules: +- apiGroups: [""] + resources: + - nodes/metrics + verbs: ["get"] +- nonResourceURLs: ["/metrics"] + verbs: ["get"] diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests/prometheus/prometheus-k8s-rules.yaml new file mode 100644 index 0000000..d563a57 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-rules.yaml @@ -0,0 +1,602 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: prometheus-k8s-rules + labels: + role: prometheus-rulefiles + prometheus: k8s +data: + alertmanager.rules.yaml: |+ + groups: + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) + GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", + "alertmanager-$1", "alertmanager", "(.*)") != 1 + for: 5m + labels: + severity: critical + annotations: + description: The configuration of the instances of the Alertmanager cluster + `{{$labels.service}}` are out of sync. + - alert: AlertmanagerDownOrMissing + expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", + "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 + for: 5m + labels: + severity: warning + annotations: + description: An unexpected number of Alertmanagers are scraped or Alertmanagers + disappeared from discovery. + - alert: AlertmanagerFailedReload + expr: alertmanager_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + etcd3.rules.yaml: |+ + groups: + - name: ./etcd3.rules + rules: + - alert: InsufficientMembers + expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) + for: 3m + labels: + severity: critical + annotations: + description: If one more etcd member goes down the cluster will be unavailable + summary: etcd cluster insufficient members + - alert: NoLeader + expr: etcd_server_has_leader{job="etcd"} == 0 + for: 1m + labels: + severity: critical + annotations: + description: etcd member {{ $labels.instance }} has no leader + summary: etcd member has no leader + - alert: HighNumberOfLeaderChanges + expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader + changes within the last hour + summary: a high number of leader changes within the etcd cluster are happening + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: HighNumberOfFailedGRPCRequests + expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) + / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed + on etcd instance {{ $labels.instance }}' + summary: a high number of gRPC requests are failing + - alert: GRPCRequestsSlow + expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) + > 0.15 + for: 10m + labels: + severity: critical + annotations: + description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method + }} are slow + summary: slow gRPC requests + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HighNumberOfFailedHTTPRequests + expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) + BY (method) > 0.05 + for: 5m + labels: + severity: critical + annotations: + description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd + instance {{ $labels.instance }}' + summary: a high number of HTTP requests are failing + - alert: HTTPRequestsSlow + expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method + }} are slow + summary: slow HTTP requests + - alert: EtcdMemberCommunicationSlow + expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) + > 0.15 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} member communication with + {{ $labels.To }} is slow + summary: etcd member communication is slow + - alert: HighNumberOfFailedProposals + expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal + failures within the last hour + summary: a high number of proposals within the etcd cluster are failing + - alert: HighFsyncDurations + expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) + > 0.5 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} fync durations are high + summary: high fsync durations + - alert: HighCommitDurations + expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) + > 0.25 + for: 10m + labels: + severity: warning + annotations: + description: etcd instance {{ $labels.instance }} commit durations are high + summary: high commit durations + general.rules.yaml: |+ + groups: + - name: general.rules + rules: + - alert: TargetDown + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $value }}% of {{ $labels.job }} targets are down.' + summary: Targets are down + - alert: DeadMansSwitch + expr: vector(1) + labels: + severity: none + annotations: + description: This is a DeadMansSwitch meant to ensure that the entire Alerting + pipeline is functional. + summary: Alerting DeadMansSwitch + - record: fd_utilization + expr: process_open_fds / process_max_fds + - alert: FdExhaustionClose + expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 + for: 10m + labels: + severity: warning + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next 4 hours' + summary: file descriptors soon exhausted + - alert: FdExhaustionClose + expr: predict_linear(fd_utilization[10m], 3600) > 1 + for: 10m + labels: + severity: critical + annotations: + description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance + will exhaust in file/socket descriptors within the next hour' + summary: file descriptors soon exhausted + kube-controller-manager.rules.yaml: |+ + groups: + - name: kube-controller-manager.rules + rules: + - alert: K8SControllerManagerDown + expr: absent(up{job="kube-controller-manager"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S controller manager. Deployments and replication + controllers are not making progress. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager + summary: Controller manager is down + kube-scheduler.rules.yaml: |+ + groups: + - name: kube-scheduler.rules + rules: + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.99" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.9" + - record: cluster:scheduler_binding_latency_seconds:quantile + expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) + BY (le, cluster)) / 1e+06 + labels: + quantile: "0.5" + - alert: K8SSchedulerDown + expr: absent(up{job="kube-scheduler"} == 1) + for: 5m + labels: + severity: critical + annotations: + description: There is no running K8S scheduler. New pods are not being assigned + to nodes. + runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler + summary: Scheduler is down + kube-state-metrics.rules.yaml: |+ + groups: + - name: kube-state-metrics.rules + rules: + - alert: DeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 15m + labels: + severity: warning + annotations: + description: Observed deployment generation does not match expected one for + deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment is outdated + - alert: DeploymentReplicasNotUpdated + expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) + or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) + unless (kube_deployment_spec_paused == 1) + for: 15m + labels: + severity: warning + annotations: + description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}} + summary: Deployment replicas are outdated + - alert: DaemonSetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled + * 100 < 100 + for: 15m + labels: + severity: warning + annotations: + description: Only {{$value}}% of desired pods scheduled and ready for daemon + set {{$labels.namespaces}}/{{$labels.daemonset}} + summary: DaemonSet is missing pods + - alert: K8SDaemonSetsNotScheduled + expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled + > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are not scheduled. + summary: Daemonsets are not scheduled correctly + - alert: DaemonSetsMissScheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 10m + labels: + severity: warning + annotations: + description: A number of daemonsets are running where they are not supposed + to run. + summary: Daemonsets are not scheduled correctly + - alert: PodFrequentlyRestarting + expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 + for: 10m + labels: + severity: warning + annotations: + description: Pod {{$labels.namespaces}}/{{$labels.pod}} is was restarted {{$value}} + times within the last hour + summary: Pod is restarting frequently + kubelet.rules.yaml: |+ + groups: + - name: kubelet.rules + rules: + - alert: K8SNodeNotReady + expr: kube_node_status_condition{condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + annotations: + description: The Kubelet on {{ $labels.node }} has not checked in with the API, + or has set itself to NotReady, for more than an hour + summary: Node status is NotReady + - alert: K8SManyNodesNotReady + expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) + > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == + 0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 + for: 1m + labels: + severity: critical + annotations: + description: '{{ $value }}% of Kubernetes nodes are not ready' + - alert: K8SKubeletDown + expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 + for: 1h + labels: + severity: warning + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets. + - alert: K8SKubeletDown + expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) + * 100 > 1 + for: 1h + labels: + severity: critical + annotations: + description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets + have disappeared from service discovery. + summary: Many Kubelets cannot be scraped + - alert: K8SKubeletTooManyPods + expr: kubelet_running_pod_count > 100 + for: 10m + labels: + severity: warning + annotations: + description: Kubelet {{$labels.instance}} is running {{$value}} pods, close + to the limit of 110 + summary: Kubelet is close to pod limit + kubernetes.rules.yaml: |+ + groups: + - name: kubernetes.rules + rules: + - record: pod_name:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (pod_name) + - record: pod_name:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: pod_name:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + BY (pod_name) + - record: pod_name:container_fs_usage_bytes:sum + expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) + - record: namespace:container_memory_usage_bytes:sum + expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) + - record: namespace:container_spec_cpu_shares:sum + expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) + - record: namespace:container_cpu_usage:sum + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) + BY (namespace) + - record: cluster:memory_usage:ratio + expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY + (cluster) / sum(machine_memory_bytes) BY (cluster) + - record: cluster:container_spec_cpu_shares:ratio + expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 + / sum(machine_cpu_cores) + - record: cluster:container_cpu_usage:ratio + expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) + / sum(machine_cpu_cores) + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 + labels: + quantile: "0.99" + - record: apiserver_latency:quantile_seconds + expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 + labels: + quantile: "0.9" + - record: apiserver_latency_seconds:quantile + expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / + 1e+06 + labels: + quantile: "0.5" + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 1 + for: 10m + labels: + severity: warning + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerLatencyHigh + expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} + > 4 + for: 10m + labels: + severity: critical + annotations: + description: the API server has a 99th percentile latency of {{ $value }} seconds + for {{$labels.verb}} {{$labels.resource}} + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 2 + for: 10m + labels: + severity: warning + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: APIServerErrorsHigh + expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) + * 100 > 5 + for: 10m + labels: + severity: critical + annotations: + description: API server returns errors for {{ $value }}% of requests + - alert: K8SApiserverDown + expr: absent(up{job="apiserver"} == 1) + for: 20m + labels: + severity: critical + annotations: + description: No API servers are reachable or all have disappeared from service + discovery + + - alert: K8sCertificateExpirationNotice + labels: + severity: warning + annotations: + description: Kubernetes API Certificate is expiring soon (less than 7 days) + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 + + - alert: K8sCertificateExpirationNotice + labels: + severity: critical + annotations: + description: Kubernetes API Certificate is expiring in less than 1 day + expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 + node.rules.yaml: |+ + groups: + - name: node.rules + rules: + - record: instance:node_cpu:rate:sum + expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) + BY (instance) + - record: instance:node_filesystem_usage:sum + expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + - record: instance:node_network_receive_bytes:rate:sum + expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - record: instance:node_network_transmit_bytes:rate:sum + expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - record: instance:node_cpu:ratio + expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) + GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - record: cluster:node_cpu:sum_rate5m + expr: sum(rate(node_cpu{mode!="idle"}[5m])) + - record: cluster:node_cpu:ratio + expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + - alert: NodeExporterDown + expr: absent(up{job="node-exporter"} == 1) + for: 10m + labels: + severity: warning + annotations: + description: Prometheus could not scrape a node-exporter for more than 10m, + or node-exporters have disappeared from discovery + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 + for: 30m + labels: + severity: warning + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 24 hours (mounted at {{$labels.mountpoint}}) + - alert: NodeDiskRunningFull + expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 + for: 10m + labels: + severity: critical + annotations: + description: device {{$labels.device}} on node {{$labels.instance}} is running + full within the next 2 hours (mounted at {{$labels.mountpoint}}) + prometheus.rules.yaml: |+ + groups: + - name: prometheus.rules + rules: + - alert: PrometheusConfigReloadFailed + expr: prometheus_config_last_reload_successful == 0 + for: 10m + labels: + severity: warning + annotations: + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + - alert: PrometheusNotificationQueueRunningFull + expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity + for: 10m + labels: + severity: warning + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.01 + for: 10m + labels: + severity: warning + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusErrorSendingAlerts + expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) + > 0.03 + for: 10m + labels: + severity: critical + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + - alert: PrometheusNotConnectedToAlertmanagers + expr: prometheus_notifications_alertmanagers_discovered < 1 + for: 10m + labels: + severity: warning + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers + - alert: PrometheusTSDBReloadsFailing + expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + - alert: PrometheusTSDBCompactionsFailing + expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 + for: 12h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + - alert: PrometheusTSDBWALCorruptions + expr: tsdb_wal_corruptions_total > 0 + for: 4h + labels: + severity: warning + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' + summary: Prometheus write-ahead log is corrupted diff --git a/manifests/prometheus/prometheus-k8s-service-account.yaml b/manifests/prometheus/prometheus-k8s-service-account.yaml new file mode 100644 index 0000000..58d5342 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-account.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-k8s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml new file mode 100644 index 0000000..19669e3 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: alertmanager + labels: + k8s-app: alertmanager +spec: + selector: + matchLabels: + alertmanager: main + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: web + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml new file mode 100644 index 0000000..40361f0 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml @@ -0,0 +1,23 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-apiserver + labels: + k8s-app: apiserver +spec: + jobLabel: component + selector: + matchLabels: + component: apiserver + provider: kubernetes + namespaceSelector: + matchNames: + - default + endpoints: + - port: https + interval: 30s + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml new file mode 100644 index 0000000..b7d4cb2 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: arm-exporter + namespace: monitoring + labels: + k8s-app: arm-exporter +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: arm-exporter + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: http + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml new file mode 100644 index 0000000..681c320 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-controller-manager + labels: + k8s-app: kube-controller-manager +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-controller-manager + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml new file mode 100644 index 0000000..6927f58 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-scheduler + labels: + k8s-app: kube-scheduler +spec: + jobLabel: k8s-app + endpoints: + - port: http-metrics + interval: 30s + selector: + matchLabels: + k8s-app: kube-scheduler + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml new file mode 100644 index 0000000..62174b4 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml @@ -0,0 +1,28 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kube-state-metrics + labels: + k8s-app: kube-state-metrics +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: kube-state-metrics + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: http-main + #scheme: https + interval: 30s + honorLabels: true + #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + #tlsConfig: + # insecureSkipVerify: true + - port: http-self + # scheme: https + interval: 30s + # bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + # tlsConfig: + # insecureSkipVerify: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml new file mode 100644 index 0000000..16c9752 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml @@ -0,0 +1,29 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: kubelet + labels: + k8s-app: kubelet +spec: + jobLabel: k8s-app + endpoints: + - port: https-metrics + scheme: https + interval: 30s + tlsConfig: + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + - port: https-metrics + scheme: https + path: /metrics/cadvisor + interval: 30s + honorLabels: true + tlsConfig: + insecureSkipVerify: true + bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + selector: + matchLabels: + k8s-app: kubelet + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml new file mode 100644 index 0000000..4c78ea1 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml @@ -0,0 +1,21 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: node-exporter + labels: + k8s-app: node-exporter +spec: + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: node-exporter + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: http + #scheme: http + interval: 30s + #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + #tlsConfig: + # insecureSkipVerify: true diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml new file mode 100644 index 0000000..0b8028e --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus-operator + labels: + k8s-app: prometheus-operator +spec: + endpoints: + - port: http + selector: + matchLabels: + k8s-app: prometheus-operator diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml new file mode 100644 index 0000000..c3d11e5 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml @@ -0,0 +1,16 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: prometheus + labels: + k8s-app: prometheus +spec: + selector: + matchLabels: + prometheus: k8s + namespaceSelector: + matchNames: + - monitoring + endpoints: + - port: web + interval: 30s diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml b/manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml new file mode 100644 index 0000000..a415768 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml @@ -0,0 +1,17 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: traefik-ingress-lb + labels: + k8s-app: traefik-ingress-lb +spec: + jobLabel: k8s-app + endpoints: + - port: admin + interval: 30s + selector: + matchLabels: + k8s-app: traefik-ingress-lb + namespaceSelector: + matchNames: + - kube-system diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests/prometheus/prometheus-k8s-service.yaml new file mode 100644 index 0000000..5cd3b65 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + prometheus: k8s + name: prometheus-k8s +spec: + type: NodePort + ports: + - name: web + nodePort: 30900 + port: 9090 + protocol: TCP + targetPort: web + selector: + prometheus: k8s diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests/prometheus/prometheus-k8s.yaml new file mode 100644 index 0000000..919c056 --- /dev/null +++ b/manifests/prometheus/prometheus-k8s.yaml @@ -0,0 +1,39 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + name: k8s + labels: + prometheus: k8s +spec: + replicas: 1 + baseImage: carlosedp/prometheus + version: v2.1.0 + retention: "168h" + serviceAccountName: prometheus-k8s + serviceMonitorSelector: + matchExpressions: + - {key: k8s-app, operator: Exists} + ruleSelector: + matchLabels: + role: prometheus-rulefiles + prometheus: k8s + resources: + requests: + # 2Gi is default, but won't schedule if you don't have a node with >2Gi + # memory. Modify based on your target and time-series count for + # production use. This value is mainly meant for demonstration/testing + # purposes. + memory: 400Mi + alerting: + alertmanagers: + - namespace: monitoring + name: alertmanager-main + port: web + storage: + volumeClaimTemplate: + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 50Gi diff --git a/teardown b/teardown new file mode 100755 index 0000000..b2c4c54 --- /dev/null +++ b/teardown @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +if [ -z "${KUBECONFIG}" ]; then + export KUBECONFIG=~/.kube/config +fi + +# CAUTION - NAMESPACE must match its value when deploy script was run. +# Some resources are always deployed to the monitoring namespace. + +if [ -z "${NAMESPACE}" ]; then + NAMESPACE=monitoring +fi + +kctl() { + kubectl --namespace "$NAMESPACE" "$@" +} + +kctl delete -f manifests/node-exporter +kctl delete -f manifests/kube-state-metrics +kctl delete -f manifests/grafana +find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; +kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml +kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml +kctl delete -f manifests/alertmanager + +# Hack: wait a bit to let the controller delete the deployed Prometheus server. +sleep 5 + +kctl delete -f manifests/prometheus-operator +