From f59ff6aece194ce4b9e439bb195061295089d1b0 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Wed, 12 Dec 2018 15:18:00 -0200 Subject: [PATCH 01/30] New prometheus-operator structure using jsonnet build --- .gitignore | 1 + Makefile | 43 + arm_exporter.jsonnet | 95 + build.sh | 16 + build_images.sh | 198 +- deploy | 52 - example.jsonnet | 14 + jsonnetfile.json | 14 + jsonnetfile.lock.json | 84 + main.jsonnet | 2 + .../alertmanager/alertmanager-config.yaml | 0 .../alertmanager/alertmanager-ingress.yaml | 0 .../alertmanager/alertmanager-service.yaml | 0 .../alertmanager/alertmanager.yaml | 0 .../armexporter/daemonset.yaml | 0 .../armexporter/service.yaml | 0 .../grafana/grafana-claim.yaml | 0 .../grafana/grafana-configmap.yaml | 0 .../grafana/grafana-credentials.yaml | 0 .../grafana-dashboard-definitions.yaml | 0 .../grafana/grafana-dashboards.yaml | 0 .../grafana/grafana-datasources.yaml | 0 .../grafana/grafana-deployment.yaml | 0 .../grafana/grafana-external-ingress.yaml | 0 .../grafana/grafana-ingress.yaml | 0 .../grafana/grafana-service.yaml | 0 .../k8s/kube-controller-manager.yaml | 0 .../k8s/kube-scheduler.yaml | 0 ...be-state-metrics-cluster-role-binding.yaml | 0 .../kube-state-metrics-cluster-role.yaml | 0 .../kube-state-metrics-deployment.yaml | 0 .../kube-state-metrics-role-binding.yaml | 0 .../kube-state-metrics-role.yaml | 0 .../kube-state-metrics-service-account.yaml | 0 .../kube-state-metrics-service.yaml | 0 .../node-exporter-cluster-role-binding.yaml | 0 .../node-exporter-cluster-role.yaml | 0 .../node-exporter-daemonset.yaml | 0 .../node-exporter-service-account.yaml | 0 .../node-exporter/node-exporter-service.yaml | 0 ...metheus-operator-cluster-role-binding.yaml | 0 .../prometheus-operator-cluster-role.yaml | 0 .../prometheus-operator-service-account.yaml | 0 .../prometheus-operator-service.yaml | 0 .../prometheus-operator.yaml | 0 .../prometheus/prometheus-k8s-ingress.yaml | 0 .../prometheus-k8s-role-bindings.yaml | 0 .../prometheus/prometheus-k8s-roles.yaml | 0 .../prometheus/prometheus-k8s-rules.yaml | 0 .../prometheus-k8s-service-account.yaml | 0 ...heus-k8s-service-monitor-alertmanager.yaml | 0 ...metheus-k8s-service-monitor-apiserver.yaml | 0 ...heus-k8s-service-monitor-arm-exporter.yaml | 0 ...rvice-monitor-kube-controller-manager.yaml | 0 ...us-k8s-service-monitor-kube-scheduler.yaml | 0 ...8s-service-monitor-kube-state-metrics.yaml | 0 ...rometheus-k8s-service-monitor-kubelet.yaml | 0 ...eus-k8s-service-monitor-node-exporter.yaml | 0 ...s-service-monitor-prometheus-operator.yaml | 0 ...etheus-k8s-service-monitor-prometheus.yaml | 0 ...rometheus-k8s-service-monitor-traefik.yaml | 0 .../prometheus/prometheus-k8s-service.yaml | 0 .../prometheus/prometheus-k8s.yaml | 0 .../smtp-server/create_secret.sh | 0 .../smtp-server/smtp.yaml | 0 .../snmp-exporter-deployment.yaml | 0 .../snmp-exporter/snmp-exporter-service.yaml | 0 .../snmp-exporter-servicemonitor-router.yaml | 0 manifests/00namespace-namespace.yaml | 4 + ...0alertmanagerCustomResourceDefinition.yaml | 2406 +++++ ...r-0prometheusCustomResourceDefinition.yaml | 3107 +++++++ ...rometheusruleCustomResourceDefinition.yaml | 342 + ...ervicemonitorCustomResourceDefinition.yaml | 291 + .../0prometheus-operator-clusterRole.yaml | 66 + ...rometheus-operator-clusterRoleBinding.yaml | 12 + .../0prometheus-operator-deployment.yaml | 44 + manifests/0prometheus-operator-service.yaml | 15 + .../0prometheus-operator-serviceAccount.yaml | 5 + .../0prometheus-operator-serviceMonitor.yaml | 14 + manifests/alertmanager-alertmanager.yaml | 18 + manifests/alertmanager-secret.yaml | 8 + manifests/alertmanager-service.yaml | 15 + manifests/alertmanager-serviceAccount.yaml | 5 + manifests/alertmanager-serviceMonitor.yaml | 14 + manifests/arm-exporter-daemonset.yaml | 44 + manifests/arm-exporter-service.yaml | 15 + manifests/arm-exporter-serviceMonitor.yaml | 19 + manifests/grafana-config.yaml | 8 + manifests/grafana-dashboardDatasources.yaml | 8 + manifests/grafana-dashboardDefinitions.yaml | 7891 +++++++++++++++++ manifests/grafana-dashboardSources.yaml | 21 + manifests/grafana-deployment.yaml | 132 + manifests/grafana-service.yaml | 12 + manifests/grafana-serviceAccount.yaml | 5 + manifests/ingress-alertmanager-main.yaml | 14 + manifests/ingress-grafana.yaml | 14 + manifests/ingress-prometheus-k8s.yaml | 14 + manifests/kube-state-metrics-clusterRole.yaml | 69 + ...kube-state-metrics-clusterRoleBinding.yaml | 12 + manifests/kube-state-metrics-deployment.yaml | 53 + manifests/kube-state-metrics-role.yaml | 30 + manifests/kube-state-metrics-roleBinding.yaml | 12 + manifests/kube-state-metrics-service.yaml | 18 + .../kube-state-metrics-serviceAccount.yaml | 5 + .../kube-state-metrics-serviceMonitor.yaml | 27 + manifests/node-exporter-clusterRole.yaml | 17 + .../node-exporter-clusterRoleBinding.yaml | 12 + manifests/node-exporter-daemonset.yaml | 85 + manifests/node-exporter-service.yaml | 15 + manifests/node-exporter-serviceAccount.yaml | 5 + manifests/node-exporter-serviceMonitor.yaml | 19 + manifests/prometheus-adapter-apiService.yaml | 13 + manifests/prometheus-adapter-clusterRole.yaml | 16 + ...prometheus-adapter-clusterRoleBinding.yaml | 13 + ...s-adapter-clusterRoleBindingDelegator.yaml | 12 + ...us-adapter-clusterRoleServerResources.yaml | 11 + manifests/prometheus-adapter-configMap.yaml | 33 + manifests/prometheus-adapter-deployment.yaml | 50 + ...metheus-adapter-roleBindingAuthReader.yaml | 13 + manifests/prometheus-adapter-service.yaml | 14 + .../prometheus-adapter-serviceAccount.yaml | 5 + manifests/prometheus-clusterRole.yaml | 15 + manifests/prometheus-clusterRoleBinding.yaml | 12 + manifests/prometheus-prometheus.yaml | 45 + manifests/prometheus-roleBindingConfig.yaml | 13 + ...metheus-roleBindingSpecificNamespaces.yaml | 42 + manifests/prometheus-roleConfig.yaml | 12 + .../prometheus-roleSpecificNamespaces.yaml | 54 + manifests/prometheus-rules.yaml | 975 ++ manifests/prometheus-service.yaml | 15 + manifests/prometheus-serviceAccount.yaml | 5 + manifests/prometheus-serviceMonitor.yaml | 14 + .../prometheus-serviceMonitorApiserver.yaml | 29 + .../prometheus-serviceMonitorCoreDNS.yaml | 18 + ...s-serviceMonitorKubeControllerManager.yaml | 23 + ...rometheus-serviceMonitorKubeScheduler.yaml | 18 + .../prometheus-serviceMonitorKubelet.yaml | 31 + operator_stack.jsonnet | 288 + teardown | 31 - 139 files changed, 17175 insertions(+), 106 deletions(-) create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 arm_exporter.jsonnet create mode 100755 build.sh delete mode 100755 deploy create mode 100644 example.jsonnet create mode 100644 jsonnetfile.json create mode 100644 jsonnetfile.lock.json create mode 100644 main.jsonnet rename {manifests => manifests-old}/alertmanager/alertmanager-config.yaml (100%) rename {manifests => manifests-old}/alertmanager/alertmanager-ingress.yaml (100%) rename {manifests => manifests-old}/alertmanager/alertmanager-service.yaml (100%) rename {manifests => manifests-old}/alertmanager/alertmanager.yaml (100%) rename {manifests => manifests-old}/armexporter/daemonset.yaml (100%) rename {manifests => manifests-old}/armexporter/service.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-claim.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-configmap.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-credentials.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-dashboard-definitions.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-dashboards.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-datasources.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-deployment.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-external-ingress.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-ingress.yaml (100%) rename {manifests => manifests-old}/grafana/grafana-service.yaml (100%) rename {manifests => manifests-old}/k8s/kube-controller-manager.yaml (100%) rename {manifests => manifests-old}/k8s/kube-scheduler.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-cluster-role.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-deployment.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-role-binding.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-role.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-service-account.yaml (100%) rename {manifests => manifests-old}/kube-state-metrics/kube-state-metrics-service.yaml (100%) rename {manifests => manifests-old}/node-exporter/node-exporter-cluster-role-binding.yaml (100%) rename {manifests => manifests-old}/node-exporter/node-exporter-cluster-role.yaml (100%) rename {manifests => manifests-old}/node-exporter/node-exporter-daemonset.yaml (100%) rename {manifests => manifests-old}/node-exporter/node-exporter-service-account.yaml (100%) rename {manifests => manifests-old}/node-exporter/node-exporter-service.yaml (100%) rename {manifests => manifests-old}/prometheus-operator/prometheus-operator-cluster-role-binding.yaml (100%) rename {manifests => manifests-old}/prometheus-operator/prometheus-operator-cluster-role.yaml (100%) rename {manifests => manifests-old}/prometheus-operator/prometheus-operator-service-account.yaml (100%) rename {manifests => manifests-old}/prometheus-operator/prometheus-operator-service.yaml (100%) rename {manifests => manifests-old}/prometheus-operator/prometheus-operator.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-ingress.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-role-bindings.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-roles.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-rules.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-account.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-apiserver.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-kubelet.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-prometheus.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service-monitor-traefik.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s-service.yaml (100%) rename {manifests => manifests-old}/prometheus/prometheus-k8s.yaml (100%) rename {manifests => manifests-old}/smtp-server/create_secret.sh (100%) rename {manifests => manifests-old}/smtp-server/smtp.yaml (100%) rename {manifests => manifests-old}/snmp-exporter/snmp-exporter-deployment.yaml (100%) rename {manifests => manifests-old}/snmp-exporter/snmp-exporter-service.yaml (100%) rename {manifests => manifests-old}/snmp-exporter/snmp-exporter-servicemonitor-router.yaml (100%) create mode 100644 manifests/00namespace-namespace.yaml create mode 100644 manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml create mode 100644 manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml create mode 100644 manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml create mode 100644 manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml create mode 100644 manifests/0prometheus-operator-clusterRole.yaml create mode 100644 manifests/0prometheus-operator-clusterRoleBinding.yaml create mode 100644 manifests/0prometheus-operator-deployment.yaml create mode 100644 manifests/0prometheus-operator-service.yaml create mode 100644 manifests/0prometheus-operator-serviceAccount.yaml create mode 100644 manifests/0prometheus-operator-serviceMonitor.yaml create mode 100644 manifests/alertmanager-alertmanager.yaml create mode 100644 manifests/alertmanager-secret.yaml create mode 100644 manifests/alertmanager-service.yaml create mode 100644 manifests/alertmanager-serviceAccount.yaml create mode 100644 manifests/alertmanager-serviceMonitor.yaml create mode 100644 manifests/arm-exporter-daemonset.yaml create mode 100644 manifests/arm-exporter-service.yaml create mode 100644 manifests/arm-exporter-serviceMonitor.yaml create mode 100644 manifests/grafana-config.yaml create mode 100644 manifests/grafana-dashboardDatasources.yaml create mode 100644 manifests/grafana-dashboardDefinitions.yaml create mode 100644 manifests/grafana-dashboardSources.yaml create mode 100644 manifests/grafana-deployment.yaml create mode 100644 manifests/grafana-service.yaml create mode 100644 manifests/grafana-serviceAccount.yaml create mode 100644 manifests/ingress-alertmanager-main.yaml create mode 100644 manifests/ingress-grafana.yaml create mode 100644 manifests/ingress-prometheus-k8s.yaml create mode 100644 manifests/kube-state-metrics-clusterRole.yaml create mode 100644 manifests/kube-state-metrics-clusterRoleBinding.yaml create mode 100644 manifests/kube-state-metrics-deployment.yaml create mode 100644 manifests/kube-state-metrics-role.yaml create mode 100644 manifests/kube-state-metrics-roleBinding.yaml create mode 100644 manifests/kube-state-metrics-service.yaml create mode 100644 manifests/kube-state-metrics-serviceAccount.yaml create mode 100644 manifests/kube-state-metrics-serviceMonitor.yaml create mode 100644 manifests/node-exporter-clusterRole.yaml create mode 100644 manifests/node-exporter-clusterRoleBinding.yaml create mode 100644 manifests/node-exporter-daemonset.yaml create mode 100644 manifests/node-exporter-service.yaml create mode 100644 manifests/node-exporter-serviceAccount.yaml create mode 100644 manifests/node-exporter-serviceMonitor.yaml create mode 100644 manifests/prometheus-adapter-apiService.yaml create mode 100644 manifests/prometheus-adapter-clusterRole.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleBinding.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml create mode 100644 manifests/prometheus-adapter-clusterRoleServerResources.yaml create mode 100644 manifests/prometheus-adapter-configMap.yaml create mode 100644 manifests/prometheus-adapter-deployment.yaml create mode 100644 manifests/prometheus-adapter-roleBindingAuthReader.yaml create mode 100644 manifests/prometheus-adapter-service.yaml create mode 100644 manifests/prometheus-adapter-serviceAccount.yaml create mode 100644 manifests/prometheus-clusterRole.yaml create mode 100644 manifests/prometheus-clusterRoleBinding.yaml create mode 100644 manifests/prometheus-prometheus.yaml create mode 100644 manifests/prometheus-roleBindingConfig.yaml create mode 100644 manifests/prometheus-roleBindingSpecificNamespaces.yaml create mode 100644 manifests/prometheus-roleConfig.yaml create mode 100644 manifests/prometheus-roleSpecificNamespaces.yaml create mode 100644 manifests/prometheus-rules.yaml create mode 100644 manifests/prometheus-service.yaml create mode 100644 manifests/prometheus-serviceAccount.yaml create mode 100644 manifests/prometheus-serviceMonitor.yaml create mode 100644 manifests/prometheus-serviceMonitorApiserver.yaml create mode 100644 manifests/prometheus-serviceMonitorCoreDNS.yaml create mode 100644 manifests/prometheus-serviceMonitorKubeControllerManager.yaml create mode 100644 manifests/prometheus-serviceMonitorKubeScheduler.yaml create mode 100644 manifests/prometheus-serviceMonitorKubelet.yaml create mode 100644 operator_stack.jsonnet delete mode 100755 teardown diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..22d0d82 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +vendor diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..71d48e3 --- /dev/null +++ b/Makefile @@ -0,0 +1,43 @@ +JSONNET_FMT := jsonnet fmt -n 2 --max-blank-lines 2 --string-style s --comment-style s + +JB_BINARY:=$(GOPATH)/bin/jb + +.PHONY: generate vendor fmt manifests + +all: generate + +generate: manifests + +manifests: $(JSONNET) + rm -rf manifests + ./build.sh main.jsonnet + +update: + jb update + +vendor: $(JB_BINARY) jsonnetfile.json jsonnetfile.lock.json + rm -rf vendor + $(JB_BINARY) install + +fmt: + find . -name 'vendor' -prune -o -name '*.libsonnet' -o -name '*.jsonnet' -print | xargs -n 1 -- $(JSONNET_FMT) -i + +deploy: + kubectl apply -f ./manifests/ + echo "Will wait 40 seconds to reapply manifests" + sleep 40 + kubectl apply -f ./manifests/ + +teardown: + kubectl delete -f ./manifests/ + +tar: manifests + rm -rf manifests.tar + tar -cf manifests.tar manifests + +$(JB_BINARY): + go get -u github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb + +$(JSONNET): + go get github.com/google/go-jsonnet/jsonnet + go get github.com/brancz/gojsontoyaml diff --git a/arm_exporter.jsonnet b/arm_exporter.jsonnet new file mode 100644 index 0000000..cb5dc20 --- /dev/null +++ b/arm_exporter.jsonnet @@ -0,0 +1,95 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + + versions+:: { + armExporter: 'latest', + kubeRbacProxy: 'v0.4.0', + }, + + imageRepos+:: { + armExporter: 'carlosedp/arm_exporter', + kubeRbacProxy: 'carlosedp/kube-rbac-proxy', + }, + }, + + armExporter+:: { + daemonset: + local daemonset = k.apps.v1beta2.daemonSet; + local container = daemonset.mixin.spec.template.spec.containersType; + local containerPort = container.portsType; + + local podLabels = { 'k8s-app': 'arm-exporter' }; + + local armExporter = + container.new('arm-exporter', $._config.imageRepos.armExporter + ':' + $._config.versions.armExporter) + + container.mixin.resources.withRequests({ cpu: '100m', memory: '180Mi' }) + + container.mixin.resources.withLimits({ cpu: '200m', memory: '180Mi' }); + + local proxy = + container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + + container.withArgs([ + '--secure-listen-address=$(IP):9243', + '--upstream=http://127.0.0.1:9243/', + ]) + + container.withPorts(containerPort.new(9243) + containerPort.withHostPort(9243) + containerPort.withName('https')) + + container.mixin.resources.withRequests({ cpu: '10m', memory: '20Mi' }) + + container.mixin.resources.withLimits({ cpu: '20m', memory: '40Mi' }); + local c = [armExporter, proxy]; + + daemonset.new() + + daemonset.mixin.metadata.withName('arm-exporter') + + daemonset.mixin.metadata.withNamespace($._config.namespace) + + daemonset.mixin.metadata.withLabels(podLabels) + + daemonset.mixin.spec.selector.withMatchLabels(podLabels) + + daemonset.mixin.spec.template.metadata.withLabels(podLabels) + + daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + + daemonset.mixin.spec.template.spec.withContainers(c), + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'arm-exporter', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'arm-exporter', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'arm-exporter', + }, + }, + endpoints: [ + { + port: 'https', + scheme: 'https', + interval: '30s', + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + tlsConfig: { + insecureSkipVerify: true, + }, + }, + ], + }, + }, + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local armExporterPort = servicePort.newNamed('https', 9243, 'https'); + + service.new('arm-exporter', $.armExporter.daemonset.spec.selector.matchLabels, armExporterPort) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ 'k8s-app': 'arm-exporter' }) + + service.mixin.spec.withClusterIp('None'), + }, +}; + +{ ['arm-exporter-' + name]: kp.armExporter[name] for name in std.objectFields(kp.armExporter) } \ No newline at end of file diff --git a/build.sh b/build.sh new file mode 100755 index 0000000..f68cd44 --- /dev/null +++ b/build.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +# This script uses arg $1 (name of *.jsonnet file to use) to generate the manifests/*.yaml files. + +set -e +set -x +# only exit with zero if all commands of the pipeline exit successfully +set -o pipefail + +# Make sure to start with a clean 'manifests' dir +rm -rf manifests +mkdir manifests + + # optional, but we would like to generate yaml, not json +jsonnet -J vendor -m manifests "${1-example.jsonnet}" | xargs -I{} sh -c 'cat {} | gojsontoyaml > {}.yaml; rm -f {}' -- {} + diff --git a/build_images.sh b/build_images.sh index 14e5c4c..da5a57b 100644 --- a/build_images.sh +++ b/build_images.sh @@ -3,43 +3,67 @@ REPO=carlosedp AOM_VERSION=2.1 -KSM_VERSION=v1.3.0 -VERSION=v0.20.0 +KSM_VERSION=v1.4.0 +VERSION=v0.26.0 PROMCONFIGRELOADER_VERSION=v0.20.0 +#------------------------------------------------------------------------------- # Kubernetes addon-resizer # Retag Addon-resizer google images to have unified manifest on DockerHub docker pull gcr.io/google-containers/addon-resizer-arm64:$AOM_VERSION docker pull gcr.io/google-containers/addon-resizer-arm:$AOM_VERSION +docker pull gcr.io/google-containers/addon-resizer-amd64:$AOM_VERSION + docker tag gcr.io/google-containers/addon-resizer-arm64:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm64 +docker tag gcr.io/google-containers/addon-resizer-amd64:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm64 docker tag gcr.io/google-containers/addon-resizer-arm:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm docker push $REPO/addon-resizer:$AOM_VERSION-arm docker push $REPO/addon-resizer:$AOM_VERSION-arm64 +docker push $REPO/addon-resizer:$AOM_VERSION-amd64 manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/addon-resizer:$AOM_VERSION-ARCH --target $REPO/addon-resizer:$AOM_VERSION manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/addon-resizer:$AOM_VERSION-ARCH --target $REPO/addon-resizer:latest +#------------------------------------------------------------------------------- # Kube-state-metrics + +export DOCKER_CLI_EXPERIMENTAL=enabled +IMAGE=carlosedp/kube-state-metrics +ALL_ARCH='amd64 arm arm64' + go get github.com/kubernetes/kube-state-metrics -mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics +#mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics cd $HOME/go/src/k8s.io/kube-state-metrics git checkout ${KSM_VERSION} + +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/alpine:3.7/' > Dockerfile.arm + +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/alpine:3.7/' > Dockerfile.arm64 + GOOS=linux GOARCH=arm go build . -docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm . +docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm -f Dockerfile.arm . GOOS=linux GOARCH=arm64 go build . -docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm64 . +docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm64 -f Dockerfile.arm64 . + +GOOS=linux GOARCH=amd64 go build . +docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-amd64 -f Dockerfile . docker push $REPO/kube-state-metrics:$KSM_VERSION-arm docker push $REPO/kube-state-metrics:$KSM_VERSION-arm64 +docker push $REPO/kube-state-metrics:$KSM_VERSION-amd64 -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/kube-state-metrics:$KSM_VERSION-ARCH --target $REPO/kube-state-metrics:$KSM_VERSION -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/kube-state-metrics:$KSM_VERSION-ARCH --target $REPO/kube-state-metrics:latest - +docker manifest create --amend $IMAGE:$KSM_VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$KSM_VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$KSM_VERSION $IMAGE:$KSM_VERSION-$arch; done +docker manifest push $IMAGE:$KSM_VERSION +#------------------------------------------------------------------------------- # Prometheus-operator +export DOCKER_CLI_EXPERIMENTAL=enabled +IMAGE=carlosedp/prometheus-operator +ALL_ARCH='amd64 arm arm64' go get github.com/coreos/prometheus-operator cd $HOME/go/src/github.com/coreos/prometheus-operator @@ -47,40 +71,168 @@ git checkout ${VERSION} go get -u github.com/prometheus/promu -cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM busybox/' > Dockerfile.arm +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/busybox/' > Dockerfile.arm + +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/busybox/' > Dockerfile.arm64 GOOS=linux GOARCH=arm $GOPATH/bin/promu build --prefix `pwd` docker build -t $REPO/prometheus-operator:${VERSION}-arm -f Dockerfile.arm . GOOS=linux GOARCH=arm64 $GOPATH/bin/promu build --prefix `pwd` -docker build -t $REPO/prometheus-operator:${VERSION}-arm64 -f Dockerfile.arm . +docker build -t $REPO/prometheus-operator:${VERSION}-arm64 -f Dockerfile.arm64 . docker push $REPO/prometheus-operator:$VERSION-arm docker push $REPO/prometheus-operator:$VERSION-arm64 -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/prometheus-operator:$VERSION-ARCH --target $REPO/prometheus-operator:$VERSION -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/prometheus-operator:$VERSION-ARCH --target $REPO/prometheus-operator:latest +docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done +docker manifest push $IMAGE:$VERSION rm Dockerfile.arm +rm Dockerfile.arm64 +#------------------------------------------------------------------------------- +# kube-rbac-proxy +export DOCKER_CLI_EXPERIMENTAL=enabled +IMAGE=carlosedp/kube-rbac-proxy +VERSION=v0.4.0 +ALL_ARCH='amd64 arm arm64' + +go get github.com/brancz/kube-rbac-proxy +cd $HOME/go/src/github.com/brancz/kube-rbac-proxy +git checkout ${VERSION} + +cat > Dockerfile.arm < Dockerfile.arm64 < Dockerfile.amd64 < Dockerfile.arm +go get github.com/coreos/prometheus-operator +cd $HOME/go/src/github.com/coreos/prometheus-operator/cmd/prometheus-config-reloader +git checkout ${VERSION} + +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/busybox/' > Dockerfile.arm + +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/busybox/' > Dockerfile.arm64 GOOS=linux GOARCH=arm CGO_ENABLED=0 go build -o prometheus-config-reloader main.go -docker build -t $REPO/prometheus-config-reloader:${PROMCONFIGRELOADER_VERSION}-arm -f Dockerfile.arm . +docker build -t $IMAGE:$VERSION-arm -f Dockerfile.arm . GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -o prometheus-config-reloader main.go -docker build -t $REPO/prometheus-config-reloader:${PROMCONFIGRELOADER_VERSION}-arm64 -f Dockerfile.arm . +docker build -t $IMAGE:$VERSION-arm64 -f Dockerfile.arm64 . -docker push $REPO/prometheus-config-reloader:$PROMCONFIGRELOADER_VERSION-arm -docker push $REPO/prometheus-config-reloader:$PROMCONFIGRELOADER_VERSION-arm64 +GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o prometheus-config-reloader main.go +docker build -t $IMAGE:$VERSION-amd64 -f Dockerfile . -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/prometheus-config-reloader:$PROMCONFIGRELOADER_VERSION-ARCH --target $REPO/prometheus-config-reloader:$VERSION -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/prometheus-config-reloader:$PROMCONFIGRELOADER_VERSION-ARCH --target $REPO/prometheus-config-reloader:latest +docker push $IMAGE:$VERSION-arm +docker push $IMAGE:$VERSION-arm64 +docker push $IMAGE:$VERSION-amd64 + +docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done +docker manifest push $IMAGE:$VERSION rm Dockerfile.arm +rm Dockerfile.arm64 + +#------------------------------------------------------------------------------- +# configmap-reload +export DOCKER_CLI_EXPERIMENTAL=enabled +IMAGE=carlosedp/configmap-reload +VERSION=v0.2.2 +ALL_ARCH='amd64 arm arm64' + +go get github.com/openshift/configmap-reload +cd $HOME/go/src/github.com/openshift/configmap-reload +#git checkout ${VERSION} + +cat > Dockerfile.arm < Dockerfile.arm64 < Dockerfile.amd64 < /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done -echo "done!" - -kctl apply -f manifests/node-exporter -kctl apply -f manifests/armexporter/daemonset.yaml -kctl apply -f manifests/armexporter/service.yaml - -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-role.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-service-account.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-service.yaml -kctl apply -f manifests/kube-state-metrics/kube-state-metrics-deployment.yaml - -kctl apply -f manifests/grafana/grafana-credentials.yaml -kctl apply -f manifests/grafana -find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \; -kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml -kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml -kctl apply -f manifests/alertmanager/ -kctl apply -f manifests/smtp-server/smtp.yaml diff --git a/example.jsonnet b/example.jsonnet new file mode 100644 index 0000000..2a10509 --- /dev/null +++ b/example.jsonnet @@ -0,0 +1,14 @@ +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, +}; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } diff --git a/jsonnetfile.json b/jsonnetfile.json new file mode 100644 index 0000000..13cd6fa --- /dev/null +++ b/jsonnetfile.json @@ -0,0 +1,14 @@ +{ + "dependencies": [ + { + "name": "kube-prometheus", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" + } + }, + "version": "master" + } + ] +} diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json new file mode 100644 index 0000000..be90246 --- /dev/null +++ b/jsonnetfile.lock.json @@ -0,0 +1,84 @@ +{ + "dependencies": [ + { + "name": "kube-prometheus", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" + } + }, + "version": "9536d7787789b74b692cd8a5482a2801b1aba232" + }, + { + "name": "ksonnet", + "source": { + "git": { + "remote": "https://github.com/ksonnet/ksonnet-lib", + "subdir": "" + } + }, + "version": "d03da231d6c8bd74437b74a1e9e8b966f13dffa2" + }, + { + "name": "kubernetes-mixin", + "source": { + "git": { + "remote": "https://github.com/kubernetes-monitoring/kubernetes-mixin", + "subdir": "" + } + }, + "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" + }, + { + "name": "grafonnet", + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib", + "subdir": "grafonnet" + } + }, + "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e" + }, + { + "name": "grafana-builder", + "source": { + "git": { + "remote": "https://github.com/kausalco/public", + "subdir": "grafana-builder" + } + }, + "version": "c6932cf90bce4fef218b4308effc9f15c4219a01" + }, + { + "name": "grafana", + "source": { + "git": { + "remote": "https://github.com/brancz/kubernetes-grafana", + "subdir": "grafana" + } + }, + "version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee" + }, + { + "name": "prometheus-operator", + "source": { + "git": { + "remote": "https://github.com/coreos/prometheus-operator", + "subdir": "jsonnet/prometheus-operator" + } + }, + "version": "72ec4b9b16ef11700724dc71fec77112536eed40" + }, + { + "name": "etcd-mixin", + "source": { + "git": { + "remote": "https://github.com/coreos/etcd", + "subdir": "Documentation/etcd-mixin" + } + }, + "version": "15b6a17be48dea91a11497980b9adab541add7f0" + } + ] +} diff --git a/main.jsonnet b/main.jsonnet new file mode 100644 index 0000000..6099c2c --- /dev/null +++ b/main.jsonnet @@ -0,0 +1,2 @@ +(import 'operator_stack.jsonnet') + +(import 'arm_exporter.jsonnet') diff --git a/manifests/alertmanager/alertmanager-config.yaml b/manifests-old/alertmanager/alertmanager-config.yaml similarity index 100% rename from manifests/alertmanager/alertmanager-config.yaml rename to manifests-old/alertmanager/alertmanager-config.yaml diff --git a/manifests/alertmanager/alertmanager-ingress.yaml b/manifests-old/alertmanager/alertmanager-ingress.yaml similarity index 100% rename from manifests/alertmanager/alertmanager-ingress.yaml rename to manifests-old/alertmanager/alertmanager-ingress.yaml diff --git a/manifests/alertmanager/alertmanager-service.yaml b/manifests-old/alertmanager/alertmanager-service.yaml similarity index 100% rename from manifests/alertmanager/alertmanager-service.yaml rename to manifests-old/alertmanager/alertmanager-service.yaml diff --git a/manifests/alertmanager/alertmanager.yaml b/manifests-old/alertmanager/alertmanager.yaml similarity index 100% rename from manifests/alertmanager/alertmanager.yaml rename to manifests-old/alertmanager/alertmanager.yaml diff --git a/manifests/armexporter/daemonset.yaml b/manifests-old/armexporter/daemonset.yaml similarity index 100% rename from manifests/armexporter/daemonset.yaml rename to manifests-old/armexporter/daemonset.yaml diff --git a/manifests/armexporter/service.yaml b/manifests-old/armexporter/service.yaml similarity index 100% rename from manifests/armexporter/service.yaml rename to manifests-old/armexporter/service.yaml diff --git a/manifests/grafana/grafana-claim.yaml b/manifests-old/grafana/grafana-claim.yaml similarity index 100% rename from manifests/grafana/grafana-claim.yaml rename to manifests-old/grafana/grafana-claim.yaml diff --git a/manifests/grafana/grafana-configmap.yaml b/manifests-old/grafana/grafana-configmap.yaml similarity index 100% rename from manifests/grafana/grafana-configmap.yaml rename to manifests-old/grafana/grafana-configmap.yaml diff --git a/manifests/grafana/grafana-credentials.yaml b/manifests-old/grafana/grafana-credentials.yaml similarity index 100% rename from manifests/grafana/grafana-credentials.yaml rename to manifests-old/grafana/grafana-credentials.yaml diff --git a/manifests/grafana/grafana-dashboard-definitions.yaml b/manifests-old/grafana/grafana-dashboard-definitions.yaml similarity index 100% rename from manifests/grafana/grafana-dashboard-definitions.yaml rename to manifests-old/grafana/grafana-dashboard-definitions.yaml diff --git a/manifests/grafana/grafana-dashboards.yaml b/manifests-old/grafana/grafana-dashboards.yaml similarity index 100% rename from manifests/grafana/grafana-dashboards.yaml rename to manifests-old/grafana/grafana-dashboards.yaml diff --git a/manifests/grafana/grafana-datasources.yaml b/manifests-old/grafana/grafana-datasources.yaml similarity index 100% rename from manifests/grafana/grafana-datasources.yaml rename to manifests-old/grafana/grafana-datasources.yaml diff --git a/manifests/grafana/grafana-deployment.yaml b/manifests-old/grafana/grafana-deployment.yaml similarity index 100% rename from manifests/grafana/grafana-deployment.yaml rename to manifests-old/grafana/grafana-deployment.yaml diff --git a/manifests/grafana/grafana-external-ingress.yaml b/manifests-old/grafana/grafana-external-ingress.yaml similarity index 100% rename from manifests/grafana/grafana-external-ingress.yaml rename to manifests-old/grafana/grafana-external-ingress.yaml diff --git a/manifests/grafana/grafana-ingress.yaml b/manifests-old/grafana/grafana-ingress.yaml similarity index 100% rename from manifests/grafana/grafana-ingress.yaml rename to manifests-old/grafana/grafana-ingress.yaml diff --git a/manifests/grafana/grafana-service.yaml b/manifests-old/grafana/grafana-service.yaml similarity index 100% rename from manifests/grafana/grafana-service.yaml rename to manifests-old/grafana/grafana-service.yaml diff --git a/manifests/k8s/kube-controller-manager.yaml b/manifests-old/k8s/kube-controller-manager.yaml similarity index 100% rename from manifests/k8s/kube-controller-manager.yaml rename to manifests-old/k8s/kube-controller-manager.yaml diff --git a/manifests/k8s/kube-scheduler.yaml b/manifests-old/k8s/kube-scheduler.yaml similarity index 100% rename from manifests/k8s/kube-scheduler.yaml rename to manifests-old/k8s/kube-scheduler.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-cluster-role-binding.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-cluster-role.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-cluster-role.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-cluster-role.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-deployment.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-deployment.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-deployment.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-deployment.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-role-binding.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-role-binding.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-role-binding.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-role.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-role.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-role.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-role.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-service-account.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-service-account.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-service-account.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-service-account.yaml diff --git a/manifests/kube-state-metrics/kube-state-metrics-service.yaml b/manifests-old/kube-state-metrics/kube-state-metrics-service.yaml similarity index 100% rename from manifests/kube-state-metrics/kube-state-metrics-service.yaml rename to manifests-old/kube-state-metrics/kube-state-metrics-service.yaml diff --git a/manifests/node-exporter/node-exporter-cluster-role-binding.yaml b/manifests-old/node-exporter/node-exporter-cluster-role-binding.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-cluster-role-binding.yaml rename to manifests-old/node-exporter/node-exporter-cluster-role-binding.yaml diff --git a/manifests/node-exporter/node-exporter-cluster-role.yaml b/manifests-old/node-exporter/node-exporter-cluster-role.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-cluster-role.yaml rename to manifests-old/node-exporter/node-exporter-cluster-role.yaml diff --git a/manifests/node-exporter/node-exporter-daemonset.yaml b/manifests-old/node-exporter/node-exporter-daemonset.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-daemonset.yaml rename to manifests-old/node-exporter/node-exporter-daemonset.yaml diff --git a/manifests/node-exporter/node-exporter-service-account.yaml b/manifests-old/node-exporter/node-exporter-service-account.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-service-account.yaml rename to manifests-old/node-exporter/node-exporter-service-account.yaml diff --git a/manifests/node-exporter/node-exporter-service.yaml b/manifests-old/node-exporter/node-exporter-service.yaml similarity index 100% rename from manifests/node-exporter/node-exporter-service.yaml rename to manifests-old/node-exporter/node-exporter-service.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml b/manifests-old/prometheus-operator/prometheus-operator-cluster-role-binding.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-cluster-role-binding.yaml rename to manifests-old/prometheus-operator/prometheus-operator-cluster-role-binding.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-cluster-role.yaml b/manifests-old/prometheus-operator/prometheus-operator-cluster-role.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-cluster-role.yaml rename to manifests-old/prometheus-operator/prometheus-operator-cluster-role.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-service-account.yaml b/manifests-old/prometheus-operator/prometheus-operator-service-account.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-service-account.yaml rename to manifests-old/prometheus-operator/prometheus-operator-service-account.yaml diff --git a/manifests/prometheus-operator/prometheus-operator-service.yaml b/manifests-old/prometheus-operator/prometheus-operator-service.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator-service.yaml rename to manifests-old/prometheus-operator/prometheus-operator-service.yaml diff --git a/manifests/prometheus-operator/prometheus-operator.yaml b/manifests-old/prometheus-operator/prometheus-operator.yaml similarity index 100% rename from manifests/prometheus-operator/prometheus-operator.yaml rename to manifests-old/prometheus-operator/prometheus-operator.yaml diff --git a/manifests/prometheus/prometheus-k8s-ingress.yaml b/manifests-old/prometheus/prometheus-k8s-ingress.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-ingress.yaml rename to manifests-old/prometheus/prometheus-k8s-ingress.yaml diff --git a/manifests/prometheus/prometheus-k8s-role-bindings.yaml b/manifests-old/prometheus/prometheus-k8s-role-bindings.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-role-bindings.yaml rename to manifests-old/prometheus/prometheus-k8s-role-bindings.yaml diff --git a/manifests/prometheus/prometheus-k8s-roles.yaml b/manifests-old/prometheus/prometheus-k8s-roles.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-roles.yaml rename to manifests-old/prometheus/prometheus-k8s-roles.yaml diff --git a/manifests/prometheus/prometheus-k8s-rules.yaml b/manifests-old/prometheus/prometheus-k8s-rules.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-rules.yaml rename to manifests-old/prometheus/prometheus-k8s-rules.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-account.yaml b/manifests-old/prometheus/prometheus-k8s-service-account.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-account.yaml rename to manifests-old/prometheus/prometheus-k8s-service-account.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-alertmanager.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-apiserver.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-apiserver.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-apiserver.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-arm-exporter.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-kube-controller-manager.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-kube-scheduler.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-kube-state-metrics.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-kubelet.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-kubelet.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-kubelet.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-node-exporter.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-prometheus-operator.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-prometheus.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-prometheus.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-prometheus.yaml diff --git a/manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml b/manifests-old/prometheus/prometheus-k8s-service-monitor-traefik.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service-monitor-traefik.yaml rename to manifests-old/prometheus/prometheus-k8s-service-monitor-traefik.yaml diff --git a/manifests/prometheus/prometheus-k8s-service.yaml b/manifests-old/prometheus/prometheus-k8s-service.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s-service.yaml rename to manifests-old/prometheus/prometheus-k8s-service.yaml diff --git a/manifests/prometheus/prometheus-k8s.yaml b/manifests-old/prometheus/prometheus-k8s.yaml similarity index 100% rename from manifests/prometheus/prometheus-k8s.yaml rename to manifests-old/prometheus/prometheus-k8s.yaml diff --git a/manifests/smtp-server/create_secret.sh b/manifests-old/smtp-server/create_secret.sh similarity index 100% rename from manifests/smtp-server/create_secret.sh rename to manifests-old/smtp-server/create_secret.sh diff --git a/manifests/smtp-server/smtp.yaml b/manifests-old/smtp-server/smtp.yaml similarity index 100% rename from manifests/smtp-server/smtp.yaml rename to manifests-old/smtp-server/smtp.yaml diff --git a/manifests/snmp-exporter/snmp-exporter-deployment.yaml b/manifests-old/snmp-exporter/snmp-exporter-deployment.yaml similarity index 100% rename from manifests/snmp-exporter/snmp-exporter-deployment.yaml rename to manifests-old/snmp-exporter/snmp-exporter-deployment.yaml diff --git a/manifests/snmp-exporter/snmp-exporter-service.yaml b/manifests-old/snmp-exporter/snmp-exporter-service.yaml similarity index 100% rename from manifests/snmp-exporter/snmp-exporter-service.yaml rename to manifests-old/snmp-exporter/snmp-exporter-service.yaml diff --git a/manifests/snmp-exporter/snmp-exporter-servicemonitor-router.yaml b/manifests-old/snmp-exporter/snmp-exporter-servicemonitor-router.yaml similarity index 100% rename from manifests/snmp-exporter/snmp-exporter-servicemonitor-router.yaml rename to manifests-old/snmp-exporter/snmp-exporter-servicemonitor-router.yaml diff --git a/manifests/00namespace-namespace.yaml b/manifests/00namespace-namespace.yaml new file mode 100644 index 0000000..d325236 --- /dev/null +++ b/manifests/00namespace-namespace.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Namespace +metadata: + name: monitoring diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml new file mode 100644 index 0000000..d5c94fc --- /dev/null +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -0,0 +1,2406 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: alertmanagers.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: Alertmanager + plural: alertmanagers + scope: Namespaced + validation: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: 'AlertmanagerSpec is a specification of the desired behavior + of the Alertmanager cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + additionalPeers: + description: AdditionalPeers allows injecting a set of additional Alertmanagers + to peer with to form a highly available cluster. + items: + type: string + type: array + affinity: + description: Affinity is a group of affinity scheduling rules. + properties: + nodeAffinity: + description: Node affinity is a group of node affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term matches all + objects with implicit weight 0 (i.e. it's a no-op). A null + preferred scheduling term matches no objects (i.e. is also + a no-op). + properties: + preference: + description: A null or empty node selector term matches + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - preference + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: A node selector represents the union of the results + of one or more label queries over a set of nodes; that is, + it represents the OR of the selectors represented by the node + selector terms. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: A null or empty node selector term matches + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + type: array + required: + - nodeSelectorTerms + podAffinity: + description: Pod affinity is a group of inter pod affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node has pods which matches the corresponding podAffinityTerm; + the node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this + field are not met at scheduling time, the pod will not be + scheduled onto the node. If the affinity requirements specified + by this field cease to be met at some point during pod execution + (e.g. due to a pod label update), the system may or may not + try to eventually evict the pod from its node. When there + are multiple elements, the lists of nodes corresponding to + each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + podAntiAffinity: + description: Pod anti affinity is a group of inter pod anti affinity + scheduling rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the anti-affinity expressions specified by this + field, but it may choose a node that violates one or more + of the expressions. The node that is most preferred is the + one with the greatest sum of weights, i.e. for each node that + meets all of the scheduling requirements (resource request, + requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field + and adding "weight" to the sum if the node has pods which + matches the corresponding podAffinityTerm; the node(s) with + the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements specified by + this field are not met at scheduling time, the pod will not + be scheduled onto the node. If the anti-affinity requirements + specified by this field cease to be met at some point during + pod execution (e.g. due to a pod label update), the system + may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + baseImage: + description: Base image that is used to deploy pods, without tag. + type: string + configMaps: + description: ConfigMaps is a list of ConfigMaps in the same namespace + as the Alertmanager object, which shall be mounted into the Alertmanager + Pods. The ConfigMaps are mounted into /etc/alertmanager/configmaps/. + items: + type: string + type: array + containers: + description: Containers allows injecting additional containers. This + is meant to allow adding an authentication proxy to an Alertmanager + pod. + items: + description: A single application container that you want to run within + a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s + CMD is used if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. If a variable + cannot be resolved, the reference in the input string will be + unchanged. The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never be expanded, + regardless of whether the variable exists or not. Cannot be + updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The + docker image''s ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container''s + environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. The $(VAR_NAME) syntax + can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable exists + or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded + using the previous defined environment variables in the + container and any service environment variables. If a + variable cannot be resolved, the reference in the input + string will be unchanged. The $(VAR_NAME) syntax can be + escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: EnvVarSource represents a source for the value + of an EnvVar. + properties: + configMapKeyRef: + description: Selects a key from a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap or it's + key must be defined + type: boolean + required: + - key + fieldRef: + description: ObjectFieldSelector selects an APIVersioned + field of an object. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + resourceFieldRef: + description: ResourceFieldSelector represents container + resources (cpu, memory) and their output format + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: {} + resource: + description: 'Required: resource to select' + type: string + required: + - resource + secretKeyRef: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's + key must be defined + type: boolean + required: + - key + required: + - name + type: array + envFrom: + description: List of sources to populate environment variables + in the container. The keys defined within a source must be a + C_IDENTIFIER. All invalid keys will be reported as an event + when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take + precedence. Values defined by an Env with a duplicate key will + take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of + ConfigMaps + properties: + configMapRef: + description: |- + ConfigMapEnvSource selects a ConfigMap to populate the environment variables with. + + The contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + prefix: + description: An optional identifier to prepend to each key + in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: |- + SecretEnvSource selects a Secret to populate the environment variables with. + + The contents of the target Secret's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management + to default or override container images in workload controllers + like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent + otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle describes actions that the management system + should take in response to container lifecycle events. For the + PostStart and PreStop lifecycle handlers, management of the + container blocks until the action is complete, unless the container + process fails, in which case the handler is aborted. + properties: + postStart: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + preStop: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + livenessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + name: + description: Name of the container specified as a DNS_LABEL. Each + container in a pod must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from the container. Exposing + a port here gives the system additional information about the + network connections a container uses, but is primarily informational. + Not specifying a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default "0.0.0.0" + address inside a container will be accessible from the network. + Cannot be updated. + items: + description: ContainerPort represents a network port in a single + container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, + this must be a valid port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. Most containers + do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME + and unique within the pod. Each named port in a pod must + have a unique name. Name for the port that can be referred + to by services. + type: string + protocol: + description: Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: array + readinessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + securityContext: + description: SecurityContext holds security configuration that + will be applied to a container. Some fields are present in both + SecurityContext and PodSecurityContext. When both are set, + the values in SecurityContext take precedence. + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a + process can gain more privileges than its parent process. + This bool directly controls if the no_new_privs flag will + be set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run as Privileged + 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: Adds and removes POSIX capabilities from running + containers. + properties: + add: + description: Added capabilities + items: + type: string + type: array + drop: + description: Removed capabilities + items: + type: string + type: array + privileged: + description: Run container in privileged mode. Processes in + privileged containers are essentially equivalent to root + on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type of proc mount to use + for the containers. The default is DefaultProcMount which + uses the container runtime defaults for readonly paths and + masked paths. This requires the ProcMountType feature flag + to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. + Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container + process. Uses runtime default if unset. May also be set + in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail + to start the container if it does. If unset or false, no + such validation will be performed. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container + process. Defaults to user specified in image metadata if + unspecified. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to + the container + properties: + level: + description: Level is SELinux level label that applies + to the container. + type: string + role: + description: Role is a SELinux role label that applies + to the container. + type: string + type: + description: Type is a SELinux type label that applies + to the container. + type: string + user: + description: User is a SELinux user label that applies + to the container. + type: string + stdin: + description: Whether this container should allocate a buffer for + stdin in the container runtime. If this is not set, reads from + stdin in the container will always result in EOF. Default is + false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin + channel after it has been opened by a single attach. When stdin + is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container + start, is empty until the first client attaches to stdin, and + then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container + is restarted. If this flag is false, a container processes that + reads from stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s + termination message will be written is mounted into the container''s + filesystem. Message written is intended to be brief final status, + such as an assertion failure message. Will be truncated by the + node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. + File will use the contents of terminationMessagePath to populate + the container status message on both success and failure. FallbackToLogsOnError + will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for + itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be + used by the container. This is an alpha feature and may change + in the future. + items: + description: volumeDevice describes a mapping of a raw block + device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container + that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim + in the pod + type: string + required: + - name + - devicePath + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within + a container. + properties: + mountPath: + description: Path within the container at which the volume + should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are + propagated from the host to container and the other way + around. When not set, MountPropagationNone is used. This + field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise + (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's + volume should be mounted. Defaults to "" (volume's root). + type: string + required: + - name + - mountPath + type: array + workingDir: + description: Container's working directory. If not specified, + the container runtime's default will be used, which might be + configured in the container image. Cannot be updated. + type: string + required: + - name + type: array + externalUrl: + description: The external URL the Alertmanager instances will be available + under. This is necessary to generate correct URLs. This is necessary + if Alertmanager is not served from root of a DNS name. + type: string + imagePullSecrets: + description: An optional list of references to secrets in the same namespace + to use for pulling prometheus and alertmanager images from registries + see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod + items: + description: LocalObjectReference contains enough information to let + you locate the referenced object inside the same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + type: array + listenLocal: + description: ListenLocal makes the Alertmanager server listen on loopback, + so that it does not bind against the Pod IP. Note this is only for + the Alertmanager UI, not the gossip communication. + type: boolean + logLevel: + description: Log level for Alertmanager to be configured with. + type: string + nodeSelector: + description: Define which Nodes the Pods are scheduled on. + type: object + paused: + description: If set to true all actions on the underlaying managed objects + are not goint to be performed, except for delete actions. + type: boolean + podMetadata: + description: ObjectMeta is metadata that all persisted resources must + have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map stored + with a resource that may be set by external tools to store and + retrieve arbitrary metadata. They are not queryable and should + be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. + This is used to distinguish resources with same name and namespace + in different clusters. This field is not set anywhere right now + and apiserver is going to ignore it if set in create or update + request. + type: string + creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set + when deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the + registry. Each entry is an identifier for the responsible component + that will remove the entry from the list. If the deletionTimestamp + of the object is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the + initializers struct will be set to nil and the object is considered + as initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible for + initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of + this representation of an object. Servers should convert + recognized schemas to the latest internal value, and may + reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, + 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object + defines what attributes will be set. Clients must ignore + fields that do not match the defined type of each attribute, + and should assume that any attribute may be empty, invalid, + or under defined. + properties: + causes: + description: The Causes array includes more details + associated with the StatusReason failure. Not all + StatusReasons may provide detailed causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases when + multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the + cause of the error. This field may be presented + as-is to a reader. + type: string + reason: + description: A machine-readable description of + the cause of the error. If this value is empty + there is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may + differ from the requested resource Kind. More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single + name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before + the operation should be retried. Some errors may indicate + the client must take an alternate action - for those + errors this field may indicate how long to wait before + taking the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a + single resource which can be described). More info: + http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST + resource this object represents. Servers may infer this + from the endpoint the client submits requests to. Cannot + be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status + of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various status + objects. A resource may have only one of {ObjectMeta, + ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that + the server has more data available. The value is opaque + and may be used to issue another request to the endpoint + that served this list to retrieve the next set of + available objects. Continuing a consistent list may + not be possible if the server configuration has changed + or more than a few minutes have passed. The resourceVersion + field returned when using this continue value will + be identical to the value in the first response, unless + you have received this token from an error message. + type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients + to determine when objects have changed. Value must + be treated as opaque by clients and passed unmodified + back to the server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this + operation is in the "Failure" status. If this value is + empty there is no information available. A Reason clarifies + an HTTP status code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to + organize and categorize (scope and select) objects. May match + selectors of replication controllers and services. More info: + http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required + when creating resources, although some resources may allow a client + to request the generation of an appropriate name automatically. + Name is primarily intended for creation idempotence and configuration + definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this + list will point to this controller, with the controller field + set to true. There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let + you identify an owning object. Currently, an owning object must + be in the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. + To set this field, a user needs "delete" permission of the + owner, otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing + controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated + by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + priorityClassName: + description: Priority class assigned to the Pods + type: string + replicas: + description: Size is the expected size of the alertmanager cluster. + The controller will eventually make the size of the running cluster + equal to the expected size. + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute resources + allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute resources + required. If Requests is omitted for a container, it defaults + to Limits if that is explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + retention: + description: Time duration Alertmanager shall retain data for. Default + is '120h', and must match the regular expression `[0-9]+(ms|s|m|h)` + (milliseconds seconds minutes hours). + type: string + routePrefix: + description: The route prefix Alertmanager registers HTTP handlers for. + This is useful, if using ExternalURL and a proxy is rewriting HTTP + routes of a request, and the actual ExternalURL is still true, but + the server serves requests under a different route prefix. For example + for use with `kubectl proxy`. + type: string + secrets: + description: Secrets is a list of Secrets in the same namespace as the + Alertmanager object, which shall be mounted into the Alertmanager + Pods. The Secrets are mounted into /etc/alertmanager/secrets/. + items: + type: string + type: array + securityContext: + description: PodSecurityContext holds pod-level security attributes + and common container settings. Some fields are also present in container.securityContext. Field + values of container.securityContext take precedence over field values + of PodSecurityContext. + properties: + fsGroup: + description: |- + A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: + + 1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + format: int64 + type: integer + runAsGroup: + description: The GID to run the entrypoint of the container process. + Uses runtime default if unset. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, the value + specified in SecurityContext takes precedence for that container. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail to start + the container if it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If set + in both SecurityContext and PodSecurityContext, the value specified + in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. May + also be set in SecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to the + container + properties: + level: + description: Level is SELinux level label that applies to the + container. + type: string + role: + description: Role is a SELinux role label that applies to the + container. + type: string + type: + description: Type is a SELinux type label that applies to the + container. + type: string + user: + description: User is a SELinux user label that applies to the + container. + type: string + supplementalGroups: + description: A list of groups applied to the first process run in + each container, in addition to the container's primary GID. If + unspecified, no groups will be added to any container. + items: + format: int64 + type: integer + type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array + serviceAccountName: + description: ServiceAccountName is the name of the ServiceAccount to + use to run the Prometheus Pods. + type: string + sha: + description: SHA of Alertmanager container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. + type: string + storage: + description: StorageSpec defines the configured storage for a group + Prometheus servers. If neither `emptyDir` nor `volumeClaimTemplate` + is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) + will be used. + properties: + emptyDir: + description: Represents an empty directory for a pod. Empty directory + volumes support ownership management and SELinux relabeling. + properties: + medium: + description: 'What type of storage medium should back this directory. + The default is "" which means to use the node''s default medium. + Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: {} + volumeClaimTemplate: + description: PersistentVolumeClaim is a user's request for and claim + to a persistent volume + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources + must have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map + stored with a resource that may be set by external tools + to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs + to. This is used to distinguish resources with same name + and namespace in different clusters. This field is not + set anywhere right now and apiserver is going to ignore + it if set in create or update request. + type: string + creationTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to + gracefully terminate before it will be removed from the + system. Only set when deletionTimestamp is also set. May + only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted + from the registry. Each entry is an identifier for the + responsible component that will remove the entry from + the list. If the deletionTimestamp of the object is non-nil, + entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that + must execute in order before this object is visible. + When the last pending initializer is removed, and + no failing result is set, the initializers struct + will be set to nil and the object is considered as + initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible + for initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that + don't return other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema + of this representation of an object. Servers should + convert recognized schemas to the latest internal + value, and may reject unrecognized values. More + info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this + status, 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional + properties that MAY be set by the server to provide + additional information about a response. The Reason + field of a Status object defines what attributes + will be set. Clients must ignore fields that do + not match the defined type of each attribute, + and should assume that any attribute may be empty, + invalid, or under defined. + properties: + causes: + description: The Causes array includes more + details associated with the StatusReason failure. + Not all StatusReasons may provide detailed + causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases + when multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description + of the cause of the error. This field + may be presented as-is to a reader. + type: string + reason: + description: A machine-readable description + of the cause of the error. If this value + is empty there is no information available. + type: string + type: array + group: + description: The group attribute of the resource + associated with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource + associated with the status StatusReason. On + some operations may differ from the requested + resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource + associated with the status StatusReason (when + there is a single name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds + before the operation should be retried. Some + errors may indicate the client must take an + alternate action - for those errors this field + may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there + is a single resource which can be described). + More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing + the REST resource this object represents. Servers + may infer this from the endpoint the client submits + requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the + status of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various + status objects. A resource may have only one of + {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user + set a limit on the number of items returned, + and indicates that the server has more data + available. The value is opaque and may be + used to issue another request to the endpoint + that served this list to retrieve the next + set of available objects. Continuing a consistent + list may not be possible if the server configuration + has changed or more than a few minutes have + passed. The resourceVersion field returned + when using this continue value will be identical + to the value in the first response, unless + you have received this token from an error + message. + type: string + resourceVersion: + description: 'String that identifies the server''s + internal version of this object that can be + used by clients to determine when objects + have changed. Value must be treated as opaque + by clients and passed unmodified back to the + server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing + this object. Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why + this operation is in the "Failure" status. If + this value is empty there is no information available. + A Reason clarifies an HTTP status code but does + not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be + used to organize and categorize (scope and select) objects. + May match selectors of replication controllers and services. + More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is + required when creating resources, although some resources + may allow a client to request the generation of an appropriate + name automatically. Name is primarily intended for creation + idempotence and configuration definition. Cannot be updated. + More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If + ALL objects in the list have been deleted, this object + will be garbage collected. If this object is managed by + a controller, then an entry in this list will point to + this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information + to let you identify an owning object. Currently, an + owning object must be in the same namespace, so there + is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from + the key-value store until this reference is removed. + Defaults to false. To set this field, a user needs + "delete" permission of the owner, otherwise 422 + (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the + managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'AccessModes contains the desired access modes + the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: TypedLocalObjectReference contains enough information + to let you locate the typed referenced object inside the + same namespace. + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, the + specified Kind must be in the core API group. For + any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + resources: + description: ResourceRequirements describes the compute + resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of + compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + storageClassName: + description: 'Name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is required + by the claim. Value of Filesystem is implied when not + included in claim spec. This is an alpha feature and may + change in the future. + type: string + volumeName: + description: VolumeName is the binding reference to the + PersistentVolume backing this claim. + type: string + status: + description: PersistentVolumeClaimStatus is the current status + of a persistent volume claim. + properties: + accessModes: + description: 'AccessModes contains the actual access modes + the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + capacity: + description: Represents the actual resources of the underlying + volume. + type: object + conditions: + description: Current Condition of persistent volume claim. + If underlying persistent volume is being resized then + the Condition will be set to 'ResizeStarted'. + items: + description: PersistentVolumeClaimCondition contails details + about state of pvc + properties: + lastProbeTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. + format: date-time + type: string + lastTransitionTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition. + type: string + reason: + description: Unique, this should be a short, machine + understandable string that gives the reason for + condition's last transition. If it reports "ResizeStarted" + that means the underlying persistent volume is being + resized. + type: string + status: + type: string + type: + type: string + required: + - type + - status + type: array + phase: + description: Phase represents the current phase of PersistentVolumeClaim. + type: string + tag: + description: Tag of Alertmanager container image to be deployed. Defaults + to the value of `version`. Version is ignored if Tag is set. + type: string + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached to tolerates any + taint that matches the triple using the matching + operator . + properties: + effect: + description: Effect indicates the taint effect to match. Empty + means match all taint effects. When specified, allowed values + are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Key is the taint key that the toleration applies + to. Empty means match all taint keys. If the key is empty, operator + must be Exists; this combination means to match all values and + all keys. + type: string + operator: + description: Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. Exists + is equivalent to wildcard for value, so that a pod can tolerate + all taints of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents the period of time the + toleration (which must be of effect NoExecute, otherwise this + field is ignored) tolerates the taint. By default, it is not + set, which means tolerate the taint forever (do not evict). + Zero and negative values will be treated as 0 (evict immediately) + by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise + just a regular string. + type: string + type: array + version: + description: Version the cluster should be on. + type: string + status: + description: 'AlertmanagerStatus is the most recent observed status of the + Alertmanager cluster. Read-only. Not included when requesting from the + apiserver, only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + availableReplicas: + description: Total number of available pods (ready for at least minReadySeconds) + targeted by this Alertmanager cluster. + format: int32 + type: integer + paused: + description: Represents whether any actions on the underlaying managed + objects are being performed. Only delete actions will be performed. + type: boolean + replicas: + description: Total number of non-terminated pods targeted by this Alertmanager + cluster (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: Total number of unavailable pods targeted by this Alertmanager + cluster. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted by this Alertmanager + cluster that have the desired version spec. + format: int32 + type: integer + required: + - paused + - replicas + - updatedReplicas + - availableReplicas + - unavailableReplicas + version: v1 diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml new file mode 100644 index 0000000..d825277 --- /dev/null +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -0,0 +1,3107 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: prometheuses.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: Prometheus + plural: prometheuses + scope: Namespaced + validation: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: 'PrometheusSpec is a specification of the desired behavior + of the Prometheus cluster. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + additionalAlertManagerConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key + additionalAlertRelabelConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key + additionalScrapeConfigs: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be a valid + secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be defined + type: boolean + required: + - key + affinity: + description: Affinity is a group of affinity scheduling rules. + properties: + nodeAffinity: + description: Node affinity is a group of node affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node matches the corresponding matchExpressions; the + node(s) with the highest sum are the most preferred. + items: + description: An empty preferred scheduling term matches all + objects with implicit weight 0 (i.e. it's a no-op). A null + preferred scheduling term matches no objects (i.e. is also + a no-op). + properties: + preference: + description: A null or empty node selector term matches + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + weight: + description: Weight associated with matching the corresponding + nodeSelectorTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - preference + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: A node selector represents the union of the results + of one or more label queries over a set of nodes; that is, + it represents the OR of the selectors represented by the node + selector terms. + properties: + nodeSelectorTerms: + description: Required. A list of node selector terms. The + terms are ORed. + items: + description: A null or empty node selector term matches + no objects. The requirements of them are ANDed. The + TopologySelectorTerm type implements a subset of the + NodeSelectorTerm. + properties: + matchExpressions: + description: A list of node selector requirements + by node's labels. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchFields: + description: A list of node selector requirements + by node's fields. + items: + description: A node selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: The label key that the selector + applies to. + type: string + operator: + description: Represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: An array of string values. If the + operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be + empty. If the operator is Gt or Lt, the values + array must have a single element, which will + be interpreted as an integer. This array is + replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + type: array + required: + - nodeSelectorTerms + podAffinity: + description: Pod affinity is a group of inter pod affinity scheduling + rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the affinity expressions specified by this field, + but it may choose a node that violates one or more of the + expressions. The node that is most preferred is the one with + the greatest sum of weights, i.e. for each node that meets + all of the scheduling requirements (resource request, requiredDuringScheduling + affinity expressions, etc.), compute a sum by iterating through + the elements of this field and adding "weight" to the sum + if the node has pods which matches the corresponding podAffinityTerm; + the node(s) with the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the affinity requirements specified by this + field are not met at scheduling time, the pod will not be + scheduled onto the node. If the affinity requirements specified + by this field cease to be met at some point during pod execution + (e.g. due to a pod label update), the system may or may not + try to eventually evict the pod from its node. When there + are multiple elements, the lists of nodes corresponding to + each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + podAntiAffinity: + description: Pod anti affinity is a group of inter pod anti affinity + scheduling rules. + properties: + preferredDuringSchedulingIgnoredDuringExecution: + description: The scheduler will prefer to schedule pods to nodes + that satisfy the anti-affinity expressions specified by this + field, but it may choose a node that violates one or more + of the expressions. The node that is most preferred is the + one with the greatest sum of weights, i.e. for each node that + meets all of the scheduling requirements (resource request, + requiredDuringScheduling anti-affinity expressions, etc.), + compute a sum by iterating through the elements of this field + and adding "weight" to the sum if the node has pods which + matches the corresponding podAffinityTerm; the node(s) with + the highest sum are the most preferred. + items: + description: The weights of all of the matched WeightedPodAffinityTerm + fields are added per-node to find the most preferred node(s) + properties: + podAffinityTerm: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) + that this pod should be co-located (affinity) or not + co-located (anti-affinity) with, where co-located is + defined as running on a node whose value of the label + with key matches that of any node on which + a pod of the set of pods is running + properties: + labelSelector: + description: A label selector is a label query over + a set of resources. The result of matchLabels and + matchExpressions are ANDed. An empty label selector + matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label + selector requirements. The requirements are + ANDed. + items: + description: A label selector requirement is + a selector that contains values, a key, and + an operator that relates the key and values. + properties: + key: + description: key is the label key that the + selector applies to. + type: string + operator: + description: operator represents a key's + relationship to a set of values. Valid + operators are In, NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string + values. If the operator is In or NotIn, + the values array must be non-empty. If + the operator is Exists or DoesNotExist, + the values array must be empty. This array + is replaced during a strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} + pairs. A single {key,value} in the matchLabels + map is equivalent to an element of matchExpressions, + whose key field is "key", the operator is "In", + and the values array contains only "value". + The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces + the labelSelector applies to (matches against); + null or empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods + matching the labelSelector in the specified namespaces, + where co-located is defined as running on a node + whose value of the label with key topologyKey matches + that of any node on which any of the selected pods + is running. Empty topologyKey is not allowed. + type: string + required: + - topologyKey + weight: + description: weight associated with matching the corresponding + podAffinityTerm, in the range 1-100. + format: int32 + type: integer + required: + - weight + - podAffinityTerm + type: array + requiredDuringSchedulingIgnoredDuringExecution: + description: If the anti-affinity requirements specified by + this field are not met at scheduling time, the pod will not + be scheduled onto the node. If the anti-affinity requirements + specified by this field cease to be met at some point during + pod execution (e.g. due to a pod label update), the system + may or may not try to eventually evict the pod from its node. + When there are multiple elements, the lists of nodes corresponding + to each podAffinityTerm are intersected, i.e. all terms must + be satisfied. + items: + description: Defines a set of pods (namely those matching + the labelSelector relative to the given namespace(s)) that + this pod should be co-located (affinity) or not co-located + (anti-affinity) with, where co-located is defined as running + on a node whose value of the label with key + matches that of any node on which a pod of the set of pods + is running + properties: + labelSelector: + description: A label selector is a label query over a + set of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values + array must be non-empty. If the operator is + Exists or DoesNotExist, the values array must + be empty. This array is replaced during a + strategic merge patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + namespaces: + description: namespaces specifies which namespaces the + labelSelector applies to (matches against); null or + empty list means "this pod's namespace" + items: + type: string + type: array + topologyKey: + description: This pod should be co-located (affinity) + or not co-located (anti-affinity) with the pods matching + the labelSelector in the specified namespaces, where + co-located is defined as running on a node whose value + of the label with key topologyKey matches that of any + node on which any of the selected pods is running. Empty + topologyKey is not allowed. + type: string + required: + - topologyKey + type: array + alerting: + description: AlertingSpec defines parameters for alerting configuration + of Prometheus servers. + properties: + alertmanagers: + description: AlertmanagerEndpoints Prometheus should fire alerts + against. + items: + description: AlertmanagerEndpoints defines a selection of a single + Endpoints object containing alertmanager IPs to fire alerts + against. + properties: + bearerTokenFile: + description: BearerTokenFile to read from filesystem to use + when authenticating to Alertmanager. + type: string + name: + description: Name of Endpoints object in Namespace. + type: string + namespace: + description: Namespace of Endpoints object. + type: string + pathPrefix: + description: Prefix for the HTTP path alerts are pushed to. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use when firing alerts. + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + required: + - namespace + - name + - port + type: array + required: + - alertmanagers + apiserverConfig: + description: 'APIServerConfig defines a host and auth methods to access + apiserver. More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#kubernetes_sd_config' + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over basic + authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: Bearer token for accessing apiserver. + type: string + bearerTokenFile: + description: File to read bearer token for accessing apiserver. + type: string + host: + description: Host of apiserver. A valid string consisting of a hostname + or IP followed by an optional port number + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + required: + - host + baseImage: + description: Base image to use for a Prometheus deployment. + type: string + configMaps: + description: ConfigMaps is a list of ConfigMaps in the same namespace + as the Prometheus object, which shall be mounted into the Prometheus + Pods. The ConfigMaps are mounted into /etc/prometheus/configmaps/. + items: + type: string + type: array + containers: + description: Containers allows injecting additional containers. This + is meant to allow adding an authentication proxy to a Prometheus pod. + items: + description: A single application container that you want to run within + a pod. + properties: + args: + description: 'Arguments to the entrypoint. The docker image''s + CMD is used if this is not provided. Variable references $(VAR_NAME) + are expanded using the container''s environment. If a variable + cannot be resolved, the reference in the input string will be + unchanged. The $(VAR_NAME) syntax can be escaped with a double + $$, ie: $$(VAR_NAME). Escaped references will never be expanded, + regardless of whether the variable exists or not. Cannot be + updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + command: + description: 'Entrypoint array. Not executed within a shell. The + docker image''s ENTRYPOINT is used if this is not provided. + Variable references $(VAR_NAME) are expanded using the container''s + environment. If a variable cannot be resolved, the reference + in the input string will be unchanged. The $(VAR_NAME) syntax + can be escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable exists + or not. Cannot be updated. More info: https://kubernetes.io/docs/tasks/inject-data-application/define-command-argument-container/#running-a-command-in-a-shell' + items: + type: string + type: array + env: + description: List of environment variables to set in the container. + Cannot be updated. + items: + description: EnvVar represents an environment variable present + in a Container. + properties: + name: + description: Name of the environment variable. Must be a + C_IDENTIFIER. + type: string + value: + description: 'Variable references $(VAR_NAME) are expanded + using the previous defined environment variables in the + container and any service environment variables. If a + variable cannot be resolved, the reference in the input + string will be unchanged. The $(VAR_NAME) syntax can be + escaped with a double $$, ie: $$(VAR_NAME). Escaped references + will never be expanded, regardless of whether the variable + exists or not. Defaults to "".' + type: string + valueFrom: + description: EnvVarSource represents a source for the value + of an EnvVar. + properties: + configMapKeyRef: + description: Selects a key from a ConfigMap. + properties: + key: + description: The key to select. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap or it's + key must be defined + type: boolean + required: + - key + fieldRef: + description: ObjectFieldSelector selects an APIVersioned + field of an object. + properties: + apiVersion: + description: Version of the schema the FieldPath + is written in terms of, defaults to "v1". + type: string + fieldPath: + description: Path of the field to select in the + specified API version. + type: string + required: + - fieldPath + resourceFieldRef: + description: ResourceFieldSelector represents container + resources (cpu, memory) and their output format + properties: + containerName: + description: 'Container name: required for volumes, + optional for env vars' + type: string + divisor: {} + resource: + description: 'Required: resource to select' + type: string + required: + - resource + secretKeyRef: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's + key must be defined + type: boolean + required: + - key + required: + - name + type: array + envFrom: + description: List of sources to populate environment variables + in the container. The keys defined within a source must be a + C_IDENTIFIER. All invalid keys will be reported as an event + when the container is starting. When a key exists in multiple + sources, the value associated with the last source will take + precedence. Values defined by an Env with a duplicate key will + take precedence. Cannot be updated. + items: + description: EnvFromSource represents the source of a set of + ConfigMaps + properties: + configMapRef: + description: |- + ConfigMapEnvSource selects a ConfigMap to populate the environment variables with. + + The contents of the target ConfigMap's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the ConfigMap must be defined + type: boolean + prefix: + description: An optional identifier to prepend to each key + in the ConfigMap. Must be a C_IDENTIFIER. + type: string + secretRef: + description: |- + SecretEnvSource selects a Secret to populate the environment variables with. + + The contents of the target Secret's Data field will represent the key-value pairs as environment variables. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret must be defined + type: boolean + type: array + image: + description: 'Docker image name. More info: https://kubernetes.io/docs/concepts/containers/images + This field is optional to allow higher level config management + to default or override container images in workload controllers + like Deployments and StatefulSets.' + type: string + imagePullPolicy: + description: 'Image pull policy. One of Always, Never, IfNotPresent. + Defaults to Always if :latest tag is specified, or IfNotPresent + otherwise. Cannot be updated. More info: https://kubernetes.io/docs/concepts/containers/images#updating-images' + type: string + lifecycle: + description: Lifecycle describes actions that the management system + should take in response to container lifecycle events. For the + PostStart and PreStop lifecycle handlers, management of the + container blocks until the action is complete, unless the container + process fails, in which case the handler is aborted. + properties: + postStart: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + preStop: + description: Handler defines a specific action that should + be taken + properties: + exec: + description: ExecAction describes a "run in container" + action. + properties: + command: + description: Command is the command line to execute + inside the container, the working directory for + the command is root ('/') in the container's filesystem. + The command is simply exec'd, it is not run inside + a shell, so traditional shell instructions ('|', + etc) won't work. To use a shell, you need to explicitly + call out to that shell. Exit status of 0 is treated + as live/healthy and non-zero is unhealthy. + items: + type: string + type: array + httpGet: + description: HTTPGetAction describes an action based on + HTTP Get requests. + properties: + host: + description: Host name to connect to, defaults to + the pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. + HTTP allows repeated headers. + items: + description: HTTPHeader describes a custom header + to be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + tcpSocket: + description: TCPSocketAction describes an action based + on opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + livenessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + name: + description: Name of the container specified as a DNS_LABEL. Each + container in a pod must have a unique name (DNS_LABEL). Cannot + be updated. + type: string + ports: + description: List of ports to expose from the container. Exposing + a port here gives the system additional information about the + network connections a container uses, but is primarily informational. + Not specifying a port here DOES NOT prevent that port from being + exposed. Any port which is listening on the default "0.0.0.0" + address inside a container will be accessible from the network. + Cannot be updated. + items: + description: ContainerPort represents a network port in a single + container. + properties: + containerPort: + description: Number of port to expose on the pod's IP address. + This must be a valid port number, 0 < x < 65536. + format: int32 + type: integer + hostIP: + description: What host IP to bind the external port to. + type: string + hostPort: + description: Number of port to expose on the host. If specified, + this must be a valid port number, 0 < x < 65536. If HostNetwork + is specified, this must match ContainerPort. Most containers + do not need this. + format: int32 + type: integer + name: + description: If specified, this must be an IANA_SVC_NAME + and unique within the pod. Each named port in a pod must + have a unique name. Name for the port that can be referred + to by services. + type: string + protocol: + description: Protocol for port. Must be UDP, TCP, or SCTP. + Defaults to "TCP". + type: string + required: + - containerPort + type: array + readinessProbe: + description: Probe describes a health check to be performed against + a container to determine whether it is alive or ready to receive + traffic. + properties: + exec: + description: ExecAction describes a "run in container" action. + properties: + command: + description: Command is the command line to execute inside + the container, the working directory for the command is + root ('/') in the container's filesystem. The command + is simply exec'd, it is not run inside a shell, so traditional + shell instructions ('|', etc) won't work. To use a shell, + you need to explicitly call out to that shell. Exit + status of 0 is treated as live/healthy and non-zero + is unhealthy. + items: + type: string + type: array + failureThreshold: + description: Minimum consecutive failures for the probe to + be considered failed after having succeeded. Defaults to + 3. Minimum value is 1. + format: int32 + type: integer + httpGet: + description: HTTPGetAction describes an action based on HTTP + Get requests. + properties: + host: + description: Host name to connect to, defaults to the + pod IP. You probably want to set "Host" in httpHeaders + instead. + type: string + httpHeaders: + description: Custom headers to set in the request. HTTP + allows repeated headers. + items: + description: HTTPHeader describes a custom header to + be used in HTTP probes + properties: + name: + description: The header field name + type: string + value: + description: The header field value + type: string + required: + - name + - value + type: array + path: + description: Path to access on the HTTP server. + type: string + port: + anyOf: + - type: string + - type: integer + scheme: + description: Scheme to use for connecting to the host. + Defaults to HTTP. + type: string + required: + - port + initialDelaySeconds: + description: 'Number of seconds after the container has started + before liveness probes are initiated. More info: https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + periodSeconds: + description: How often (in seconds) to perform the probe. + Default to 10 seconds. Minimum value is 1. + format: int32 + type: integer + successThreshold: + description: Minimum consecutive successes for the probe to + be considered successful after having failed. Defaults to + 1. Must be 1 for liveness. Minimum value is 1. + format: int32 + type: integer + tcpSocket: + description: TCPSocketAction describes an action based on + opening a socket + properties: + host: + description: 'Optional: Host name to connect to, defaults + to the pod IP.' + type: string + port: + anyOf: + - type: string + - type: integer + required: + - port + timeoutSeconds: + description: 'Number of seconds after which the probe times + out. Defaults to 1 second. Minimum value is 1. More info: + https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle#container-probes' + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + securityContext: + description: SecurityContext holds security configuration that + will be applied to a container. Some fields are present in both + SecurityContext and PodSecurityContext. When both are set, + the values in SecurityContext take precedence. + properties: + allowPrivilegeEscalation: + description: 'AllowPrivilegeEscalation controls whether a + process can gain more privileges than its parent process. + This bool directly controls if the no_new_privs flag will + be set on the container process. AllowPrivilegeEscalation + is true always when the container is: 1) run as Privileged + 2) has CAP_SYS_ADMIN' + type: boolean + capabilities: + description: Adds and removes POSIX capabilities from running + containers. + properties: + add: + description: Added capabilities + items: + type: string + type: array + drop: + description: Removed capabilities + items: + type: string + type: array + privileged: + description: Run container in privileged mode. Processes in + privileged containers are essentially equivalent to root + on the host. Defaults to false. + type: boolean + procMount: + description: procMount denotes the type of proc mount to use + for the containers. The default is DefaultProcMount which + uses the container runtime defaults for readonly paths and + masked paths. This requires the ProcMountType feature flag + to be enabled. + type: string + readOnlyRootFilesystem: + description: Whether this container has a read-only root filesystem. + Default is false. + type: boolean + runAsGroup: + description: The GID to run the entrypoint of the container + process. Uses runtime default if unset. May also be set + in PodSecurityContext. If set in both SecurityContext and + PodSecurityContext, the value specified in SecurityContext + takes precedence. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail + to start the container if it does. If unset or false, no + such validation will be performed. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container + process. Defaults to user specified in image metadata if + unspecified. May also be set in PodSecurityContext. If + set in both SecurityContext and PodSecurityContext, the + value specified in SecurityContext takes precedence. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to + the container + properties: + level: + description: Level is SELinux level label that applies + to the container. + type: string + role: + description: Role is a SELinux role label that applies + to the container. + type: string + type: + description: Type is a SELinux type label that applies + to the container. + type: string + user: + description: User is a SELinux user label that applies + to the container. + type: string + stdin: + description: Whether this container should allocate a buffer for + stdin in the container runtime. If this is not set, reads from + stdin in the container will always result in EOF. Default is + false. + type: boolean + stdinOnce: + description: Whether the container runtime should close the stdin + channel after it has been opened by a single attach. When stdin + is true the stdin stream will remain open across multiple attach + sessions. If stdinOnce is set to true, stdin is opened on container + start, is empty until the first client attaches to stdin, and + then remains open and accepts data until the client disconnects, + at which time stdin is closed and remains closed until the container + is restarted. If this flag is false, a container processes that + reads from stdin will never receive an EOF. Default is false + type: boolean + terminationMessagePath: + description: 'Optional: Path at which the file to which the container''s + termination message will be written is mounted into the container''s + filesystem. Message written is intended to be brief final status, + such as an assertion failure message. Will be truncated by the + node if greater than 4096 bytes. The total message length across + all containers will be limited to 12kb. Defaults to /dev/termination-log. + Cannot be updated.' + type: string + terminationMessagePolicy: + description: Indicate how the termination message should be populated. + File will use the contents of terminationMessagePath to populate + the container status message on both success and failure. FallbackToLogsOnError + will use the last chunk of container log output if the termination + message file is empty and the container exited with an error. + The log output is limited to 2048 bytes or 80 lines, whichever + is smaller. Defaults to File. Cannot be updated. + type: string + tty: + description: Whether this container should allocate a TTY for + itself, also requires 'stdin' to be true. Default is false. + type: boolean + volumeDevices: + description: volumeDevices is the list of block devices to be + used by the container. This is an alpha feature and may change + in the future. + items: + description: volumeDevice describes a mapping of a raw block + device within a container. + properties: + devicePath: + description: devicePath is the path inside of the container + that the device will be mapped to. + type: string + name: + description: name must match the name of a persistentVolumeClaim + in the pod + type: string + required: + - name + - devicePath + type: array + volumeMounts: + description: Pod volumes to mount into the container's filesystem. + Cannot be updated. + items: + description: VolumeMount describes a mounting of a Volume within + a container. + properties: + mountPath: + description: Path within the container at which the volume + should be mounted. Must not contain ':'. + type: string + mountPropagation: + description: mountPropagation determines how mounts are + propagated from the host to container and the other way + around. When not set, MountPropagationNone is used. This + field is beta in 1.10. + type: string + name: + description: This must match the Name of a Volume. + type: string + readOnly: + description: Mounted read-only if true, read-write otherwise + (false or unspecified). Defaults to false. + type: boolean + subPath: + description: Path within the volume from which the container's + volume should be mounted. Defaults to "" (volume's root). + type: string + required: + - name + - mountPath + type: array + workingDir: + description: Container's working directory. If not specified, + the container runtime's default will be used, which might be + configured in the container image. Cannot be updated. + type: string + required: + - name + type: array + evaluationInterval: + description: Interval between consecutive evaluations. + type: string + externalLabels: + description: The labels to add to any time series or alerts when communicating + with external systems (federation, remote storage, Alertmanager). + type: object + externalUrl: + description: The external URL the Prometheus instances will be available + under. This is necessary to generate correct URLs. This is necessary + if Prometheus is not served from root of a DNS name. + type: string + imagePullSecrets: + description: An optional list of references to secrets in the same namespace + to use for pulling prometheus and alertmanager images from registries + see http://kubernetes.io/docs/user-guide/images#specifying-imagepullsecrets-on-a-pod + items: + description: LocalObjectReference contains enough information to let + you locate the referenced object inside the same namespace. + properties: + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + type: array + listenLocal: + description: ListenLocal makes the Prometheus server listen on loopback, + so that it does not bind against the Pod IP. + type: boolean + logLevel: + description: Log level for Prometheus to be configured with. + type: string + nodeSelector: + description: Define which Nodes the Pods are scheduled on. + type: object + paused: + description: When a Prometheus deployment is paused, no actions except + for deletion will be performed on the underlying objects. + type: boolean + podMetadata: + description: ObjectMeta is metadata that all persisted resources must + have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map stored + with a resource that may be set by external tools to store and + retrieve arbitrary metadata. They are not queryable and should + be preserved when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. + This is used to distinguish resources with same name and namespace + in different clusters. This field is not set anywhere right now + and apiserver is going to ignore it if set in create or update + request. + type: string + creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set + when deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of + the factory methods that the time package offers. + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the + registry. Each entry is an identifier for the responsible component + that will remove the entry from the list. If the deletionTimestamp + of the object is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the + initializers struct will be set to nil and the object is considered + as initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible for + initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of + this representation of an object. Servers should convert + recognized schemas to the latest internal value, and may + reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, + 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object + defines what attributes will be set. Clients must ignore + fields that do not match the defined type of each attribute, + and should assume that any attribute may be empty, invalid, + or under defined. + properties: + causes: + description: The Causes array includes more details + associated with the StatusReason failure. Not all + StatusReasons may provide detailed causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases when + multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the + cause of the error. This field may be presented + as-is to a reader. + type: string + reason: + description: A machine-readable description of + the cause of the error. If this value is empty + there is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may + differ from the requested resource Kind. More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single + name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before + the operation should be retried. Some errors may indicate + the client must take an alternate action - for those + errors this field may indicate how long to wait before + taking the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a + single resource which can be described). More info: + http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST + resource this object represents. Servers may infer this + from the endpoint the client submits requests to. Cannot + be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status + of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various status + objects. A resource may have only one of {ObjectMeta, + ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that + the server has more data available. The value is opaque + and may be used to issue another request to the endpoint + that served this list to retrieve the next set of + available objects. Continuing a consistent list may + not be possible if the server configuration has changed + or more than a few minutes have passed. The resourceVersion + field returned when using this continue value will + be identical to the value in the first response, unless + you have received this token from an error message. + type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients + to determine when objects have changed. Value must + be treated as opaque by clients and passed unmodified + back to the server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this + operation is in the "Failure" status. If this value is + empty there is no information available. A Reason clarifies + an HTTP status code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to + organize and categorize (scope and select) objects. May match + selectors of replication controllers and services. More info: + http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required + when creating resources, although some resources may allow a client + to request the generation of an appropriate name automatically. + Name is primarily intended for creation idempotence and configuration + definition. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this + list will point to this controller, with the controller field + set to true. There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let + you identify an owning object. Currently, an owning object must + be in the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. + To set this field, a user needs "delete" permission of the + owner, otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing + controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated + by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + priorityClassName: + description: Priority class assigned to the Pods + type: string + remoteRead: + description: If specified, the remote_read spec. This is an experimental + feature, it may change in any upcoming release in a breaking way. + items: + description: RemoteReadSpec defines the remote_read configuration + for prometheus. + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over + basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: bearer token for remote read. + type: string + bearerTokenFile: + description: File to read bearer token for remote read. + type: string + proxyUrl: + description: Optional ProxyURL + type: string + readRecent: + description: Whether reads should be made for queries for time + ranges that the local storage should have complete data for. + type: boolean + remoteTimeout: + description: Timeout for requests to the remote read endpoint. + type: string + requiredMatchers: + description: An optional list of equality matchers which have + to be present in a selector to query the remote read endpoint. + type: object + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + url: + description: The URL of the endpoint to send samples to. + type: string + required: + - url + type: array + remoteWrite: + description: If specified, the remote_write spec. This is an experimental + feature, it may change in any upcoming release in a breaking way. + items: + description: RemoteWriteSpec defines the remote_write configuration + for prometheus. + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over + basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerToken: + description: File to read bearer token for remote write. + type: string + bearerTokenFile: + description: File to read bearer token for remote write. + type: string + proxyUrl: + description: Optional ProxyURL + type: string + queueConfig: + description: QueueConfig allows the tuning of remote_write queue_config + parameters. This object is referenced in the RemoteWriteSpec + object. + properties: + batchSendDeadline: + description: BatchSendDeadline is the maximum time a sample + will wait in buffer. + type: string + capacity: + description: Capacity is the number of samples to buffer per + shard before we start dropping them. + format: int32 + type: integer + maxBackoff: + description: MaxBackoff is the maximum retry delay. + type: string + maxRetries: + description: MaxRetries is the maximum number of times to + retry a batch on recoverable errors. + format: int32 + type: integer + maxSamplesPerSend: + description: MaxSamplesPerSend is the maximum number of samples + per send. + format: int32 + type: integer + maxShards: + description: MaxShards is the maximum number of shards, i.e. + amount of concurrency. + format: int32 + type: integer + minBackoff: + description: MinBackoff is the initial retry delay. Gets doubled + for every retry. + type: string + remoteTimeout: + description: Timeout for requests to the remote write endpoint. + type: string + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + url: + description: The URL of the endpoint to send samples to. + type: string + writeRelabelConfigs: + description: The list of remote write relabel configurations. + items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + ``-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. defailt is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array + required: + - url + type: array + replicas: + description: Number of instances to deploy for a Prometheus deployment. + format: int32 + type: integer + resources: + description: ResourceRequirements describes the compute resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute resources + allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute resources + required. If Requests is omitted for a container, it defaults + to Limits if that is explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + retention: + description: Time duration Prometheus shall retain data for. Default + is '24h', and must match the regular expression `[0-9]+(ms|s|m|h|d|w|y)` + (milliseconds seconds minutes hours days weeks years). + type: string + routePrefix: + description: The route prefix Prometheus registers HTTP handlers for. + This is useful, if using ExternalURL and a proxy is rewriting HTTP + routes of a request, and the actual ExternalURL is still true, but + the server serves requests under a different route prefix. For example + for use with `kubectl proxy`. + type: string + ruleNamespaceSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + ruleSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + scrapeInterval: + description: Interval between consecutive scrapes. + type: string + secrets: + description: Secrets is a list of Secrets in the same namespace as the + Prometheus object, which shall be mounted into the Prometheus Pods. + The Secrets are mounted into /etc/prometheus/secrets/. + items: + type: string + type: array + securityContext: + description: PodSecurityContext holds pod-level security attributes + and common container settings. Some fields are also present in container.securityContext. Field + values of container.securityContext take precedence over field values + of PodSecurityContext. + properties: + fsGroup: + description: |- + A special supplemental group that applies to all containers in a pod. Some volume types allow the Kubelet to change the ownership of that volume to be owned by the pod: + + 1. The owning GID will be the FSGroup 2. The setgid bit is set (new files created in the volume will be owned by FSGroup) 3. The permission bits are OR'd with rw-rw---- + + If unset, the Kubelet will not modify the ownership and permissions of any volume. + format: int64 + type: integer + runAsGroup: + description: The GID to run the entrypoint of the container process. + Uses runtime default if unset. May also be set in SecurityContext. If + set in both SecurityContext and PodSecurityContext, the value + specified in SecurityContext takes precedence for that container. + format: int64 + type: integer + runAsNonRoot: + description: Indicates that the container must run as a non-root + user. If true, the Kubelet will validate the image at runtime + to ensure that it does not run as UID 0 (root) and fail to start + the container if it does. If unset or false, no such validation + will be performed. May also be set in SecurityContext. If set + in both SecurityContext and PodSecurityContext, the value specified + in SecurityContext takes precedence. + type: boolean + runAsUser: + description: The UID to run the entrypoint of the container process. + Defaults to user specified in image metadata if unspecified. May + also be set in SecurityContext. If set in both SecurityContext + and PodSecurityContext, the value specified in SecurityContext + takes precedence for that container. + format: int64 + type: integer + seLinuxOptions: + description: SELinuxOptions are the labels to be applied to the + container + properties: + level: + description: Level is SELinux level label that applies to the + container. + type: string + role: + description: Role is a SELinux role label that applies to the + container. + type: string + type: + description: Type is a SELinux type label that applies to the + container. + type: string + user: + description: User is a SELinux user label that applies to the + container. + type: string + supplementalGroups: + description: A list of groups applied to the first process run in + each container, in addition to the container's primary GID. If + unspecified, no groups will be added to any container. + items: + format: int64 + type: integer + type: array + sysctls: + description: Sysctls hold a list of namespaced sysctls used for + the pod. Pods with unsupported sysctls (by the container runtime) + might fail to launch. + items: + description: Sysctl defines a kernel parameter to be set + properties: + name: + description: Name of a property to set + type: string + value: + description: Value of a property to set + type: string + required: + - name + - value + type: array + serviceAccountName: + description: ServiceAccountName is the name of the ServiceAccount to + use to run the Prometheus Pods. + type: string + serviceMonitorNamespaceSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + serviceMonitorSelector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + sha: + description: SHA of Prometheus container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. + type: string + storage: + description: StorageSpec defines the configured storage for a group + Prometheus servers. If neither `emptyDir` nor `volumeClaimTemplate` + is specified, then by default an [EmptyDir](https://kubernetes.io/docs/concepts/storage/volumes/#emptydir) + will be used. + properties: + emptyDir: + description: Represents an empty directory for a pod. Empty directory + volumes support ownership management and SELinux relabeling. + properties: + medium: + description: 'What type of storage medium should back this directory. + The default is "" which means to use the node''s default medium. + Must be an empty string (default) or Memory. More info: https://kubernetes.io/docs/concepts/storage/volumes#emptydir' + type: string + sizeLimit: {} + volumeClaimTemplate: + description: PersistentVolumeClaim is a user's request for and claim + to a persistent volume + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources + must have, which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map + stored with a resource that may be set by external tools + to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs + to. This is used to distinguish resources with same name + and namespace in different clusters. This field is not + set anywhere right now and apiserver is going to ignore + it if set in create or update request. + type: string + creationTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to + gracefully terminate before it will be removed from the + system. Only set when deletionTimestamp is also set. May + only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + description: Time is a wrapper around time.Time which supports + correct marshaling to YAML and JSON. Wrappers are provided + for many of the factory methods that the time package + offers. + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted + from the registry. Each entry is an identifier for the + responsible component that will remove the entry from + the list. If the deletionTimestamp of the object is non-nil, + entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation + of the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that + must execute in order before this object is visible. + When the last pending initializer is removed, and + no failing result is set, the initializers struct + will be set to nil and the object is considered as + initialized and visible to all clients. + items: + description: Initializer is information about an initializer + that has not yet completed. + properties: + name: + description: name of the process that is responsible + for initializing this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that + don't return other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema + of this representation of an object. Servers should + convert recognized schemas to the latest internal + value, and may reject unrecognized values. More + info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this + status, 0 if not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional + properties that MAY be set by the server to provide + additional information about a response. The Reason + field of a Status object defines what attributes + will be set. Clients must ignore fields that do + not match the defined type of each attribute, + and should assume that any attribute may be empty, + invalid, or under defined. + properties: + causes: + description: The Causes array includes more + details associated with the StatusReason failure. + Not all StatusReasons may provide detailed + causes. + items: + description: StatusCause provides more information + about an api.Status failure, including cases + when multiple errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description + of the cause of the error. This field + may be presented as-is to a reader. + type: string + reason: + description: A machine-readable description + of the cause of the error. If this value + is empty there is no information available. + type: string + type: array + group: + description: The group attribute of the resource + associated with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource + associated with the status StatusReason. On + some operations may differ from the requested + resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource + associated with the status StatusReason (when + there is a single name which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds + before the operation should be retried. Some + errors may indicate the client must take an + alternate action - for those errors this field + may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there + is a single resource which can be described). + More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing + the REST resource this object represents. Servers + may infer this from the endpoint the client submits + requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the + status of this operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic + resources must have, including lists and various + status objects. A resource may have only one of + {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user + set a limit on the number of items returned, + and indicates that the server has more data + available. The value is opaque and may be + used to issue another request to the endpoint + that served this list to retrieve the next + set of available objects. Continuing a consistent + list may not be possible if the server configuration + has changed or more than a few minutes have + passed. The resourceVersion field returned + when using this continue value will be identical + to the value in the first response, unless + you have received this token from an error + message. + type: string + resourceVersion: + description: 'String that identifies the server''s + internal version of this object that can be + used by clients to determine when objects + have changed. Value must be treated as opaque + by clients and passed unmodified back to the + server. Populated by the system. Read-only. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing + this object. Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why + this operation is in the "Failure" status. If + this value is empty there is no information available. + A Reason clarifies an HTTP status code but does + not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" + or "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be + used to organize and categorize (scope and select) objects. + May match selectors of replication controllers and services. + More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is + required when creating resources, although some resources + may allow a client to request the generation of an appropriate + name automatically. Name is primarily intended for creation + idempotence and configuration definition. Cannot be updated. + More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If + ALL objects in the list have been deleted, this object + will be garbage collected. If this object is managed by + a controller, then an entry in this list will point to + this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information + to let you identify an owning object. Currently, an + owning object must be in the same namespace, so there + is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from + the key-value store until this reference is removed. + Defaults to false. To set this field, a user needs + "delete" permission of the owner, otherwise 422 + (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the + managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PersistentVolumeClaimSpec describes the common + attributes of storage devices and allows a Source for provider-specific + attributes + properties: + accessModes: + description: 'AccessModes contains the desired access modes + the volume should have. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + dataSource: + description: TypedLocalObjectReference contains enough information + to let you locate the typed referenced object inside the + same namespace. + properties: + apiGroup: + description: APIGroup is the group for the resource + being referenced. If APIGroup is not specified, the + specified Kind must be in the core API group. For + any other third-party types, APIGroup is required. + type: string + kind: + description: Kind is the type of resource being referenced + type: string + name: + description: Name is the name of resource being referenced + type: string + required: + - kind + - name + resources: + description: ResourceRequirements describes the compute + resource requirements. + properties: + limits: + description: 'Limits describes the maximum amount of + compute resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount + of compute resources required. If Requests is omitted + for a container, it defaults to Limits if that is + explicitly specified, otherwise to an implementation-defined + value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + selector: + description: A label selector is a label query over a set + of resources. The result of matchLabels and matchExpressions + are ANDed. An empty label selector matches all objects. + A null label selector matches no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector + requirements. The requirements are ANDed. + items: + description: A label selector requirement is a selector + that contains values, a key, and an operator that + relates the key and values. + properties: + key: + description: key is the label key that the selector + applies to. + type: string + operator: + description: operator represents a key's relationship + to a set of values. Valid operators are In, + NotIn, Exists and DoesNotExist. + type: string + values: + description: values is an array of string values. + If the operator is In or NotIn, the values array + must be non-empty. If the operator is Exists + or DoesNotExist, the values array must be empty. + This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. + A single {key,value} in the matchLabels map is equivalent + to an element of matchExpressions, whose key field + is "key", the operator is "In", and the values array + contains only "value". The requirements are ANDed. + type: object + storageClassName: + description: 'Name of the StorageClass required by the claim. + More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#class-1' + type: string + volumeMode: + description: volumeMode defines what type of volume is required + by the claim. Value of Filesystem is implied when not + included in claim spec. This is an alpha feature and may + change in the future. + type: string + volumeName: + description: VolumeName is the binding reference to the + PersistentVolume backing this claim. + type: string + status: + description: PersistentVolumeClaimStatus is the current status + of a persistent volume claim. + properties: + accessModes: + description: 'AccessModes contains the actual access modes + the volume backing the PVC has. More info: https://kubernetes.io/docs/concepts/storage/persistent-volumes#access-modes-1' + items: + type: string + type: array + capacity: + description: Represents the actual resources of the underlying + volume. + type: object + conditions: + description: Current Condition of persistent volume claim. + If underlying persistent volume is being resized then + the Condition will be set to 'ResizeStarted'. + items: + description: PersistentVolumeClaimCondition contails details + about state of pvc + properties: + lastProbeTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. + format: date-time + type: string + lastTransitionTime: + description: Time is a wrapper around time.Time which + supports correct marshaling to YAML and JSON. Wrappers + are provided for many of the factory methods that + the time package offers. + format: date-time + type: string + message: + description: Human-readable message indicating details + about last transition. + type: string + reason: + description: Unique, this should be a short, machine + understandable string that gives the reason for + condition's last transition. If it reports "ResizeStarted" + that means the underlying persistent volume is being + resized. + type: string + status: + type: string + type: + type: string + required: + - type + - status + type: array + phase: + description: Phase represents the current phase of PersistentVolumeClaim. + type: string + tag: + description: Tag of Prometheus container image to be deployed. Defaults + to the value of `version`. Version is ignored if Tag is set. + type: string + thanos: + description: ThanosSpec defines parameters for a Prometheus server within + a Thanos deployment. + properties: + baseImage: + description: Thanos base image if other than default. + type: string + gcs: + description: ThanosGCSSpec defines parameters for use of Google + Cloud Storage (GCS) with Thanos. + properties: + bucket: + description: Google Cloud Storage bucket name for stored blocks. + If empty it won't store any block inside Google Cloud Storage. + type: string + credentials: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + peers: + description: Peers is a DNS name for Thanos to discover peers through. + type: string + resources: + description: ResourceRequirements describes the compute resource + requirements. + properties: + limits: + description: 'Limits describes the maximum amount of compute + resources allowed. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + requests: + description: 'Requests describes the minimum amount of compute + resources required. If Requests is omitted for a container, + it defaults to Limits if that is explicitly specified, otherwise + to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' + type: object + s3: + description: ThanosS3Spec defines parameters for of AWS Simple Storage + Service (S3) with Thanos. (S3 compatible services apply as well) + properties: + accessKey: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bucket: + description: S3-Compatible API bucket name for stored blocks. + type: string + encryptsse: + description: Whether to use Server Side Encryption + type: boolean + endpoint: + description: S3-Compatible API endpoint for stored blocks. + type: string + insecure: + description: Whether to use an insecure connection with an S3-Compatible + API. + type: boolean + secretKey: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + signatureVersion2: + description: Whether to use S3 Signature Version 2; otherwise + Signature Version 4 will be used. + type: boolean + sha: + description: SHA of Thanos container image to be deployed. Defaults + to the value of `version`. Similar to a tag, but the SHA explicitly + deploys an immutable container image. Version and Tag are ignored + if SHA is set. + type: string + tag: + description: Tag of Thanos sidecar container image to be deployed. + Defaults to the value of `version`. Version is ignored if Tag + is set. + type: string + version: + description: Version describes the version of Thanos to use. + type: string + tolerations: + description: If specified, the pod's tolerations. + items: + description: The pod this Toleration is attached to tolerates any + taint that matches the triple using the matching + operator . + properties: + effect: + description: Effect indicates the taint effect to match. Empty + means match all taint effects. When specified, allowed values + are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Key is the taint key that the toleration applies + to. Empty means match all taint keys. If the key is empty, operator + must be Exists; this combination means to match all values and + all keys. + type: string + operator: + description: Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. Exists + is equivalent to wildcard for value, so that a pod can tolerate + all taints of a particular category. + type: string + tolerationSeconds: + description: TolerationSeconds represents the period of time the + toleration (which must be of effect NoExecute, otherwise this + field is ignored) tolerates the taint. By default, it is not + set, which means tolerate the taint forever (do not evict). + Zero and negative values will be treated as 0 (evict immediately) + by the system. + format: int64 + type: integer + value: + description: Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise + just a regular string. + type: string + type: array + version: + description: Version of Prometheus to be deployed. + type: string + status: + description: 'PrometheusStatus is the most recent observed status of the + Prometheus cluster. Read-only. Not included when requesting from the apiserver, + only from the Prometheus Operator API itself. More info: https://github.com/kubernetes/community/blob/master/contributors/devel/api-conventions.md#spec-and-status' + properties: + availableReplicas: + description: Total number of available pods (ready for at least minReadySeconds) + targeted by this Prometheus deployment. + format: int32 + type: integer + paused: + description: Represents whether any actions on the underlaying managed + objects are being performed. Only delete actions will be performed. + type: boolean + replicas: + description: Total number of non-terminated pods targeted by this Prometheus + deployment (their labels match the selector). + format: int32 + type: integer + unavailableReplicas: + description: Total number of unavailable pods targeted by this Prometheus + deployment. + format: int32 + type: integer + updatedReplicas: + description: Total number of non-terminated pods targeted by this Prometheus + deployment that have the desired version spec. + format: int32 + type: integer + required: + - paused + - replicas + - updatedReplicas + - availableReplicas + - unavailableReplicas + version: v1 diff --git a/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml new file mode 100644 index 0000000..877fada --- /dev/null +++ b/manifests/0prometheus-operator-0prometheusruleCustomResourceDefinition.yaml @@ -0,0 +1,342 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: prometheusrules.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: PrometheusRule + plural: prometheusrules + scope: Namespaced + validation: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + metadata: + description: ObjectMeta is metadata that all persisted resources must have, + which includes all objects users must create. + properties: + annotations: + description: 'Annotations is an unstructured key value map stored with + a resource that may be set by external tools to store and retrieve + arbitrary metadata. They are not queryable and should be preserved + when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations' + type: object + clusterName: + description: The name of the cluster which the object belongs to. This + is used to distinguish resources with same name and namespace in different + clusters. This field is not set anywhere right now and apiserver is + going to ignore it if set in create or update request. + type: string + creationTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of the + factory methods that the time package offers. + format: date-time + type: string + deletionGracePeriodSeconds: + description: Number of seconds allowed for this object to gracefully + terminate before it will be removed from the system. Only set when + deletionTimestamp is also set. May only be shortened. Read-only. + format: int64 + type: integer + deletionTimestamp: + description: Time is a wrapper around time.Time which supports correct + marshaling to YAML and JSON. Wrappers are provided for many of the + factory methods that the time package offers. + format: date-time + type: string + finalizers: + description: Must be empty before the object is deleted from the registry. + Each entry is an identifier for the responsible component that will + remove the entry from the list. If the deletionTimestamp of the object + is non-nil, entries in this list can only be removed. + items: + type: string + type: array + generateName: + description: |- + GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server. + + If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header). + + Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#idempotency + type: string + generation: + description: A sequence number representing a specific generation of + the desired state. Populated by the system. Read-only. + format: int64 + type: integer + initializers: + description: Initializers tracks the progress of initialization. + properties: + pending: + description: Pending is a list of initializers that must execute + in order before this object is visible. When the last pending + initializer is removed, and no failing result is set, the initializers + struct will be set to nil and the object is considered as initialized + and visible to all clients. + items: + description: Initializer is information about an initializer that + has not yet completed. + properties: + name: + description: name of the process that is responsible for initializing + this object. + type: string + required: + - name + type: array + result: + description: Status is a return value for calls that don't return + other objects. + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this + representation of an object. Servers should convert recognized + schemas to the latest internal value, and may reject unrecognized + values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + code: + description: Suggested HTTP return code for this status, 0 if + not set. + format: int32 + type: integer + details: + description: StatusDetails is a set of additional properties + that MAY be set by the server to provide additional information + about a response. The Reason field of a Status object defines + what attributes will be set. Clients must ignore fields that + do not match the defined type of each attribute, and should + assume that any attribute may be empty, invalid, or under + defined. + properties: + causes: + description: The Causes array includes more details associated + with the StatusReason failure. Not all StatusReasons may + provide detailed causes. + items: + description: StatusCause provides more information about + an api.Status failure, including cases when multiple + errors are encountered. + properties: + field: + description: |- + The field of the resource that has caused this error, as named by its JSON serialization. May include dot and postfix notation for nested attributes. Arrays are zero-indexed. Fields may appear more than once in an array of causes due to fields having multiple errors. Optional. + + Examples: + "name" - the field "name" on the current resource + "items[0].name" - the field "name" on the first array entry in "items" + type: string + message: + description: A human-readable description of the cause + of the error. This field may be presented as-is + to a reader. + type: string + reason: + description: A machine-readable description of the + cause of the error. If this value is empty there + is no information available. + type: string + type: array + group: + description: The group attribute of the resource associated + with the status StatusReason. + type: string + kind: + description: 'The kind attribute of the resource associated + with the status StatusReason. On some operations may differ + from the requested resource Kind. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: The name attribute of the resource associated + with the status StatusReason (when there is a single name + which can be described). + type: string + retryAfterSeconds: + description: If specified, the time in seconds before the + operation should be retried. Some errors may indicate + the client must take an alternate action - for those errors + this field may indicate how long to wait before taking + the alternate action. + format: int32 + type: integer + uid: + description: 'UID of the resource. (when there is a single + resource which can be described). More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + kind: + description: 'Kind is a string value representing the REST resource + this object represents. Servers may infer this from the endpoint + the client submits requests to. Cannot be updated. In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + message: + description: A human-readable description of the status of this + operation. + type: string + metadata: + description: ListMeta describes metadata that synthetic resources + must have, including lists and various status objects. A resource + may have only one of {ObjectMeta, ListMeta}. + properties: + continue: + description: continue may be set if the user set a limit + on the number of items returned, and indicates that the + server has more data available. The value is opaque and + may be used to issue another request to the endpoint that + served this list to retrieve the next set of available + objects. Continuing a consistent list may not be possible + if the server configuration has changed or more than a + few minutes have passed. The resourceVersion field returned + when using this continue value will be identical to the + value in the first response, unless you have received + this token from an error message. + type: string + resourceVersion: + description: 'String that identifies the server''s internal + version of this object that can be used by clients to + determine when objects have changed. Value must be treated + as opaque by clients and passed unmodified back to the + server. Populated by the system. Read-only. More info: + https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency' + type: string + selfLink: + description: selfLink is a URL representing this object. + Populated by the system. Read-only. + type: string + reason: + description: A machine-readable description of why this operation + is in the "Failure" status. If this value is empty there is + no information available. A Reason clarifies an HTTP status + code but does not override it. + type: string + status: + description: 'Status of the operation. One of: "Success" or + "Failure". More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#spec-and-status' + type: string + required: + - pending + labels: + description: 'Map of string keys and values that can be used to organize + and categorize (scope and select) objects. May match selectors of + replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels' + type: object + name: + description: 'Name must be unique within a namespace. Is required when + creating resources, although some resources may allow a client to + request the generation of an appropriate name automatically. Name + is primarily intended for creation idempotence and configuration definition. + Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + namespace: + description: |- + Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty. + + Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces + type: string + ownerReferences: + description: List of objects depended by this object. If ALL objects + in the list have been deleted, this object will be garbage collected. + If this object is managed by a controller, then an entry in this list + will point to this controller, with the controller field set to true. + There cannot be more than one managing controller. + items: + description: OwnerReference contains enough information to let you + identify an owning object. Currently, an owning object must be in + the same namespace, so there is no namespace field. + properties: + apiVersion: + description: API version of the referent. + type: string + blockOwnerDeletion: + description: If true, AND if the owner has the "foregroundDeletion" + finalizer, then the owner cannot be deleted from the key-value + store until this reference is removed. Defaults to false. To + set this field, a user needs "delete" permission of the owner, + otherwise 422 (Unprocessable Entity) will be returned. + type: boolean + controller: + description: If true, this reference points to the managing controller. + type: boolean + kind: + description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + name: + description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + uid: + description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids' + type: string + required: + - apiVersion + - kind + - name + - uid + type: array + resourceVersion: + description: |- + An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources. + + Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#concurrency-control-and-consistency + type: string + selfLink: + description: SelfLink is a URL representing this object. Populated by + the system. Read-only. + type: string + uid: + description: |- + UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations. + + Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids + type: string + spec: + description: PrometheusRuleSpec contains specification parameters for a + Rule. + properties: + groups: + description: Content of Prometheus rule file + items: + description: RuleGroup is a list of sequentially evaluated recording + and alerting rules. + properties: + interval: + type: string + name: + type: string + rules: + items: + description: Rule describes an alerting or recording rule. + properties: + alert: + type: string + annotations: + type: object + expr: + anyOf: + - type: string + - type: integer + for: + type: string + labels: + type: object + record: + type: string + required: + - expr + type: array + required: + - name + - rules + type: array + version: v1 diff --git a/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml new file mode 100644 index 0000000..d2e310f --- /dev/null +++ b/manifests/0prometheus-operator-0servicemonitorCustomResourceDefinition.yaml @@ -0,0 +1,291 @@ +apiVersion: apiextensions.k8s.io/v1beta1 +kind: CustomResourceDefinition +metadata: + creationTimestamp: null + name: servicemonitors.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + kind: ServiceMonitor + plural: servicemonitors + scope: Namespaced + validation: + openAPIV3Schema: + properties: + apiVersion: + description: 'APIVersion defines the versioned schema of this representation + of an object. Servers should convert recognized schemas to the latest + internal value, and may reject unrecognized values. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#resources' + type: string + kind: + description: 'Kind is a string value representing the REST resource this + object represents. Servers may infer this from the endpoint the client + submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/api-conventions.md#types-kinds' + type: string + spec: + description: ServiceMonitorSpec contains specification parameters for a + ServiceMonitor. + properties: + endpoints: + description: A list of endpoints allowed as part of this ServiceMonitor. + items: + description: Endpoint defines a scrapeable endpoint serving Prometheus + metrics. + properties: + basicAuth: + description: 'BasicAuth allow an endpoint to authenticate over + basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' + properties: + password: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + username: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must + be a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must + be defined + type: boolean + required: + - key + bearerTokenFile: + description: File to read bearer token for scraping targets. + type: string + honorLabels: + description: HonorLabels chooses the metric's labels on collisions + with target labels. + type: boolean + interval: + description: Interval at which metrics should be scraped + type: string + metricRelabelings: + description: MetricRelabelConfigs to apply to samples before ingestion. + items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + ``-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. defailt is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array + params: + description: Optional HTTP URL parameters + type: object + path: + description: HTTP path to scrape for metrics. + type: string + port: + description: Name of the service port this endpoint refers to. + Mutually exclusive with targetPort. + type: string + proxyUrl: + description: ProxyURL eg http://proxyserver:2195 Directs scrapes + to proxy through this endpoint. + type: string + relabelings: + description: 'RelabelConfigs to apply to samples before ingestion. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#' + items: + description: 'RelabelConfig allows dynamic rewriting of the + label set, being applied to samples before ingestion. It defines + ``-section of Prometheus configuration. + More info: https://prometheus.io/docs/prometheus/latest/configuration/configuration/#metric_relabel_configs' + properties: + action: + description: Action to perform based on regex matching. + Default is 'replace' + type: string + modulus: + description: Modulus to take of the hash of the source label + values. + format: int64 + type: integer + regex: + description: Regular expression against which the extracted + value is matched. defailt is '(.*)' + type: string + replacement: + description: Replacement value against which a regex replace + is performed if the regular expression matches. Regex + capture groups are available. Default is '$1' + type: string + separator: + description: Separator placed between concatenated source + label values. default is ';'. + type: string + sourceLabels: + description: The source labels select values from existing + labels. Their content is concatenated using the configured + separator and matched against the configured regular expression + for the replace, keep, and drop actions. + items: + type: string + type: array + targetLabel: + description: Label to which the resulting value is written + in a replace action. It is mandatory for replace actions. + Regex capture groups are available. + type: string + type: array + scheme: + description: HTTP scheme to use for scraping. + type: string + scrapeTimeout: + description: Timeout after which the scrape is ended + type: string + targetPort: + anyOf: + - type: string + - type: integer + tlsConfig: + description: TLSConfig specifies TLS configuration parameters. + properties: + caFile: + description: The CA cert to use for the targets. + type: string + certFile: + description: The client cert file for the targets. + type: string + insecureSkipVerify: + description: Disable target certificate validation. + type: boolean + keyFile: + description: The client key file for the targets. + type: string + serverName: + description: Used to verify the hostname for the targets. + type: string + type: array + jobLabel: + description: The label to use to retrieve the job name from. + type: string + namespaceSelector: + description: NamespaceSelector is a selector for selecting either all + namespaces or a list of namespaces. + properties: + any: + description: Boolean describing whether all namespaces are selected + in contrast to a list restricting them. + type: boolean + matchNames: + description: List of namespace names. + items: + type: string + type: array + podTargetLabels: + description: PodTargetLabels transfers labels on the Kubernetes Pod + onto the target. + items: + type: string + type: array + sampleLimit: + description: SampleLimit defines per-scrape limit on number of scraped + samples that will be accepted. + format: int64 + type: integer + selector: + description: A label selector is a label query over a set of resources. + The result of matchLabels and matchExpressions are ANDed. An empty + label selector matches all objects. A null label selector matches + no objects. + properties: + matchExpressions: + description: matchExpressions is a list of label selector requirements. + The requirements are ANDed. + items: + description: A label selector requirement is a selector that contains + values, a key, and an operator that relates the key and values. + properties: + key: + description: key is the label key that the selector applies + to. + type: string + operator: + description: operator represents a key's relationship to a + set of values. Valid operators are In, NotIn, Exists and + DoesNotExist. + type: string + values: + description: values is an array of string values. If the operator + is In or NotIn, the values array must be non-empty. If the + operator is Exists or DoesNotExist, the values array must + be empty. This array is replaced during a strategic merge + patch. + items: + type: string + type: array + required: + - key + - operator + type: array + matchLabels: + description: matchLabels is a map of {key,value} pairs. A single + {key,value} in the matchLabels map is equivalent to an element + of matchExpressions, whose key field is "key", the operator is + "In", and the values array contains only "value". The requirements + are ANDed. + type: object + targetLabels: + description: TargetLabels transfers labels on the Kubernetes Service + onto the target. + items: + type: string + type: array + required: + - endpoints + - selector + version: v1 diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml new file mode 100644 index 0000000..e0ac283 --- /dev/null +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -0,0 +1,66 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-operator +rules: +- apiGroups: + - apiextensions.k8s.io + resources: + - customresourcedefinitions + verbs: + - '*' +- apiGroups: + - monitoring.coreos.com + resources: + - alertmanagers + - prometheuses + - prometheuses/finalizers + - alertmanagers/finalizers + - servicemonitors + - prometheusrules + verbs: + - '*' +- apiGroups: + - apps + resources: + - statefulsets + verbs: + - '*' +- apiGroups: + - "" + resources: + - configmaps + - secrets + verbs: + - '*' +- apiGroups: + - "" + resources: + - pods + verbs: + - list + - delete +- apiGroups: + - "" + resources: + - services + - endpoints + verbs: + - get + - create + - update +- apiGroups: + - "" + resources: + - nodes + verbs: + - list + - watch +- apiGroups: + - "" + resources: + - namespaces + verbs: + - get + - list + - watch diff --git a/manifests/0prometheus-operator-clusterRoleBinding.yaml b/manifests/0prometheus-operator-clusterRoleBinding.yaml new file mode 100644 index 0000000..8f1b4d3 --- /dev/null +++ b/manifests/0prometheus-operator-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-operator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-operator +subjects: +- kind: ServiceAccount + name: prometheus-operator + namespace: monitoring diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml new file mode 100644 index 0000000..833bec9 --- /dev/null +++ b/manifests/0prometheus-operator-deployment.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + labels: + k8s-app: prometheus-operator + name: prometheus-operator + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + k8s-app: prometheus-operator + template: + metadata: + labels: + k8s-app: prometheus-operator + spec: + containers: + - args: + - --kubelet-service=kube-system/kubelet + - --logtostderr=true + - --config-reloader-image=carlosedp/configmap-reload:v0.2.2 + - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.26.0 + image: carlosedp/prometheus-operator:v0.26.0 + name: prometheus-operator + ports: + - containerPort: 8080 + name: http + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + securityContext: + allowPrivilegeEscalation: false + readOnlyRootFilesystem: true + nodeSelector: + beta.kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: prometheus-operator diff --git a/manifests/0prometheus-operator-service.yaml b/manifests/0prometheus-operator-service.yaml new file mode 100644 index 0000000..5231b33 --- /dev/null +++ b/manifests/0prometheus-operator-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: prometheus-operator + name: prometheus-operator + namespace: monitoring +spec: + clusterIP: None + ports: + - name: http + port: 8080 + targetPort: http + selector: + k8s-app: prometheus-operator diff --git a/manifests/0prometheus-operator-serviceAccount.yaml b/manifests/0prometheus-operator-serviceAccount.yaml new file mode 100644 index 0000000..11898a6 --- /dev/null +++ b/manifests/0prometheus-operator-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-operator + namespace: monitoring diff --git a/manifests/0prometheus-operator-serviceMonitor.yaml b/manifests/0prometheus-operator-serviceMonitor.yaml new file mode 100644 index 0000000..14f402f --- /dev/null +++ b/manifests/0prometheus-operator-serviceMonitor.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: prometheus-operator + name: prometheus-operator + namespace: monitoring +spec: + endpoints: + - honorLabels: true + port: http + selector: + matchLabels: + k8s-app: prometheus-operator diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml new file mode 100644 index 0000000..e57f39a --- /dev/null +++ b/manifests/alertmanager-alertmanager.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Alertmanager +metadata: + labels: + alertmanager: main + name: main + namespace: monitoring +spec: + baseImage: carlosedp/alertmanager + nodeSelector: + beta.kubernetes.io/os: linux + replicas: 1 + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: alertmanager-main + version: v0.15.3 diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml new file mode 100644 index 0000000..4a143fb --- /dev/null +++ b/manifests/alertmanager-secret.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +data: + alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== +kind: Secret +metadata: + name: alertmanager-main + namespace: monitoring +type: Opaque diff --git a/manifests/alertmanager-service.yaml b/manifests/alertmanager-service.yaml new file mode 100644 index 0000000..0c75679 --- /dev/null +++ b/manifests/alertmanager-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + alertmanager: main + name: alertmanager-main + namespace: monitoring +spec: + ports: + - name: web + port: 9093 + targetPort: web + selector: + alertmanager: main + app: alertmanager diff --git a/manifests/alertmanager-serviceAccount.yaml b/manifests/alertmanager-serviceAccount.yaml new file mode 100644 index 0000000..5c06d5e --- /dev/null +++ b/manifests/alertmanager-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: alertmanager-main + namespace: monitoring diff --git a/manifests/alertmanager-serviceMonitor.yaml b/manifests/alertmanager-serviceMonitor.yaml new file mode 100644 index 0000000..548af0d --- /dev/null +++ b/manifests/alertmanager-serviceMonitor.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: alertmanager + name: alertmanager + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: web + selector: + matchLabels: + alertmanager: main diff --git a/manifests/arm-exporter-daemonset.yaml b/manifests/arm-exporter-daemonset.yaml new file mode 100644 index 0000000..1d247d0 --- /dev/null +++ b/manifests/arm-exporter-daemonset.yaml @@ -0,0 +1,44 @@ +apiVersion: apps/v1beta2 +kind: DaemonSet +metadata: + labels: + k8s-app: arm-exporter + name: arm-exporter + namespace: monitoring +spec: + selector: + matchLabels: + k8s-app: arm-exporter + template: + metadata: + labels: + k8s-app: arm-exporter + spec: + containers: + - image: carlosedp/arm_exporter:latest + name: arm-exporter + resources: + limits: + cpu: 200m + memory: 180Mi + requests: + cpu: 100m + memory: 180Mi + - args: + - --secure-listen-address=$(IP):9243 + - --upstream=http://127.0.0.1:9243/ + image: carlosedp/kube-rbac-proxy:v0.4.0 + name: kube-rbac-proxy + ports: + - containerPort: 9243 + hostPort: 9243 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + nodeSelector: + beta.kubernetes.io/os: linux diff --git a/manifests/arm-exporter-service.yaml b/manifests/arm-exporter-service.yaml new file mode 100644 index 0000000..d83042f --- /dev/null +++ b/manifests/arm-exporter-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: arm-exporter + name: arm-exporter + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https + port: 9243 + targetPort: https + selector: + k8s-app: arm-exporter diff --git a/manifests/arm-exporter-serviceMonitor.yaml b/manifests/arm-exporter-serviceMonitor.yaml new file mode 100644 index 0000000..a92a2a1 --- /dev/null +++ b/manifests/arm-exporter-serviceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: arm-exporter + name: arm-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: arm-exporter diff --git a/manifests/grafana-config.yaml b/manifests/grafana-config.yaml new file mode 100644 index 0000000..51cc4b7 --- /dev/null +++ b/manifests/grafana-config.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +data: + grafana.ini: W2F1dGguYW5vbnltb3VzXQplbmFibGVkID0gZmFsc2UKW2F1dGguYmFzaWNdCmVuYWJsZWQgPSBmYWxzZQpbZGF0YWJhc2VdCnBhdGggPSAvZGF0YS9ncmFmYW5hLmRiCltwYXRoc10KZGF0YSA9IC92YXIvbGliL2dyYWZhbmEKbG9ncyA9IC92YXIvbGliL2dyYWZhbmEvbG9nCnBsdWdpbnMgPSAvdmFyL2xpYi9ncmFmYW5hL3BsdWdpbnMKcHJvdmlzaW9uaW5nID0gL2V0Yy9ncmFmYW5hL3Byb3Zpc2lvbmluZwpbc2Vzc2lvbl0KcHJvdmlkZXIgPSBtZW1vcnkKW3NtdHBdCmVuYWJsZWQgPSB0cnVlCmZyb21fYWRkcmVzcyA9IGNhcmxvc2VkcEBnbWFpbC5jb20KZnJvbV9uYW1lID0gR3JhZmFuYSBBbGVydApob3N0ID0gc210cC1zZXJ2ZXIubW9uaXRvcmluZy5zdmM6MjUKcGFzc3dvcmQgPSAKc2tpcF92ZXJpZnkgPSB0cnVlCnVzZXIgPSAK +kind: Secret +metadata: + name: grafana-config + namespace: monitoring +type: Opaque diff --git a/manifests/grafana-dashboardDatasources.yaml b/manifests/grafana-dashboardDatasources.yaml new file mode 100644 index 0000000..446c686 --- /dev/null +++ b/manifests/grafana-dashboardDatasources.yaml @@ -0,0 +1,8 @@ +apiVersion: v1 +data: + prometheus.yaml: ewogICAgImFwaVZlcnNpb24iOiAxLAogICAgImRhdGFzb3VyY2VzIjogWwogICAgICAgIHsKICAgICAgICAgICAgImFjY2VzcyI6ICJwcm94eSIsCiAgICAgICAgICAgICJlZGl0YWJsZSI6IGZhbHNlLAogICAgICAgICAgICAibmFtZSI6ICJwcm9tZXRoZXVzIiwKICAgICAgICAgICAgIm9yZ0lkIjogMSwKICAgICAgICAgICAgInR5cGUiOiAicHJvbWV0aGV1cyIsCiAgICAgICAgICAgICJ1cmwiOiAiaHR0cDovL3Byb21ldGhldXMtazhzLm1vbml0b3Jpbmcuc3ZjOjkwOTAiLAogICAgICAgICAgICAidmVyc2lvbiI6IDEKICAgICAgICB9CiAgICBdCn0= +kind: Secret +metadata: + name: grafana-datasources + namespace: monitoring +type: Opaque diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml new file mode 100644 index 0000000..4900caa --- /dev/null +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -0,0 +1,7891 @@ +apiVersion: v1 +items: +- apiVersion: v1 + data: + k8s-cluster-rsrc-use.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1: / scalar(sum(min(kube_pod_info) by (node)))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:ratio", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate / scalar(:kube_pod_info_node_count:)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Network", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"} - node_filesystem_avail_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)) by (pod,namespace)\n/ scalar(sum(max(node_filesystem_size_bytes{fstype=\u007e\"ext[234]|btrfs|xfs|zfs\"}) by (device,pod,namespace)))\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{node}}", + "legendLink": "/d/4ac4f123aae0ff6dbaf4f4f66120033b/k8s-node-rsrc-use", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Capacity", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": 1, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Storage", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8s / USE Method / Cluster", + "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-cluster-rsrc-use + namespace: monitoring +- apiVersion: v1 + data: + k8s-node-rsrc-use.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_utilisation:avg1m{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_cpu_saturation_load1:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Saturation (Load1)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_utilisation:{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Memory", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_memory_swap_io_bytes:sum_rate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Swap IO", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Saturation (Swap I/O)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_utilisation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_disk_saturation:avg_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk IO Saturation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_utilisation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Utilisation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Utilisation (Transmitted)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_net_saturation:sum_irate{node=\"$node\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Saturation", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Net Saturation (Dropped)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Net", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_filesystem_usage:\n* on (namespace, pod) group_left (node) node_namespace_pod:kube_pod_info:{node=\"$node\"}\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Disk", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "node", + "multi": false, + "name": "node", + "options": [ + + ], + "query": "label_values(kube_node_info, node)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8s / USE Method / Node", + "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-node-rsrc-use + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-cluster.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "100px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - avg(rate(node_cpu_seconds_total{mode=\"idle\"}[1m]))", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) / sum(node:node_num_cpu:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "CPU Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "1 - sum(:node_memory_MemFreeCachedBuffers_bytes:sum) / sum(:node_memory_MemTotal_bytes:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Utilisation", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Requests Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "format": "percentunit", + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 2, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) / sum(:node_memory_MemTotal_bytes:sum)", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "70,80", + "timeFrom": null, + "timeShift": null, + "title": "Memory Limits Commitment", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "singlestat", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Headlines", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_requests_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate) by (namespace) / sum(kube_pod_container_resource_limits_cpu_cores) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{namespace}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage (w/o cache)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Namespace", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/85a562078cdf77779eaa1add43ccec1e/k8s-resources-namespace?var-datasource=$datasource&var-namespace=$__cell", + "pattern": "namespace", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_requests_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(container_memory_rss{container_name!=\"\"}) by (namespace) / sum(kube_pod_container_resource_limits_memory_bytes) by (namespace)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Requests by Namespace", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Requests", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8s / Compute Resources / Cluster", + "uid": "efa86fd1d0c121a26444b636a3f509a8", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-cluster + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-namespace.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", container_name!=\"\"}) by (pod_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{pod_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Pod", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": true, + "linkTooltip": "Drill down", + "linkUrl": "/d/6581e46e4e5c7ba40a07646395ef7b23/k8s-resources-pod?var-datasource=$datasource&var-namespace=$namespace&var-pod=$__cell", + "pattern": "pod", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\",container_name!=\"\"}, \"pod\", \"$1\", \"pod_name\", \"(.*)\")) by (pod) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\"}) by (pod)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8s / Compute Resources / Namespace", + "uid": "85a562078cdf77779eaa1add43ccec1e", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-namespace + namespace: monitoring +- apiVersion: v1 + data: + k8s-resources-pod.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 0, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "CPU Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "CPU Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "CPU Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(label_replace(namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "CPU Quota", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 10, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 0, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}) by (container_name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{container_name}}", + "legendLink": null, + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Usage", + "titleSize": "h6" + }, + { + "collapse": false, + "height": "250px", + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "pattern": "Time", + "type": "hidden" + }, + { + "alias": "Memory Usage", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #A", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #B", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Requests %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #C", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Memory Limits", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #D", + "thresholds": [ + + ], + "type": "number", + "unit": "decbytes" + }, + { + "alias": "Memory Limits %", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "Value #E", + "thresholds": [ + + ], + "type": "number", + "unit": "percentunit" + }, + { + "alias": "Container", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "link": false, + "linkTooltip": "Drill down", + "linkUrl": "", + "pattern": "container", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"POD\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "A", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "C", + "step": 10 + }, + { + "expr": "sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\", container!=\"\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "D", + "step": 10 + }, + { + "expr": "sum(label_replace(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\"$pod\", container_name!=\"\"}, \"container\", \"$1\", \"container_name\", \"(.*)\")) by (container) / sum(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod=\"$pod\"}) by (container)", + "format": "table", + "instant": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "E", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Quota", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "transform": "table", + "type": "table", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Memory Quota", + "titleSize": "h6" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "text": "prod", + "value": "prod" + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\"$namespace\"}, pod)", + "refresh": 1, + "regex": "", + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "K8s / Compute Resources / Pod", + "uid": "6581e46e4e5c7ba40a07646395ef7b23", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-k8s-resources-pod + namespace: monitoring +- apiVersion: v1 + data: + nodes.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(node_load1{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 1m", + "refId": "A" + }, + { + "expr": "max(node_load5{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 5m", + "refId": "B" + }, + { + "expr": "max(node_load15{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "load 15m", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "System load", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cpu}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Usage Per Core", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": "true", + "avg": "true", + "current": "true", + "max": "false", + "min": "false", + "rightSide": "true", + "show": "true", + "total": "false", + "values": "true" + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max (sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m])) ) * 100\n", + "format": "time_series", + "intervalFactor": 10, + "legendFormat": "{{ cpu }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Utilizaion", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(sum by (cpu) (irate(node_cpu_seconds_total{job=\"node-exporter\", mode!=\"idle\", instance=\"$instance\"}[2m]))) * 100\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "CPU Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory used", + "refId": "A" + }, + { + "expr": "max(node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory buffers", + "refId": "B" + }, + { + "expr": "max(node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory cached", + "refId": "C" + }, + { + "expr": "max(node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "memory free", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(\n (\n (\n node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_MemFree_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Buffers_bytes{job=\"node-exporter\", instance=\"$instance\"}\n - node_memory_Cached_bytes{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_memory_MemTotal_bytes{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Memory Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + { + "alias": "read", + "yaxis": 1 + }, + { + "alias": "io time", + "yaxis": 2 + } + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(node_disk_read_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "read", + "refId": "A" + }, + { + "expr": "max(rate(node_disk_written_bytes_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "written", + "refId": "B" + }, + { + "expr": "max(rate(node_disk_io_time_seconds_total{job=\"node-exporter\", instance=\"$instance\"}[2m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "io time", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node:node_filesystem_usage:\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(node_network_receive_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Received", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(node_network_transmit_bytes_total{job=\"node-exporter\", instance=\"$instance\", device!\u007e\"lo\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{device}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Transmitted", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 9, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inodes used", + "refId": "A" + }, + { + "expr": "max(node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "inodes free", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Inodes Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "$datasource", + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 13, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(\n (\n (\n node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n - node_filesystem_files_free{job=\"node-exporter\", instance=\"$instance\"}\n )\n / node_filesystem_files{job=\"node-exporter\", instance=\"$instance\"}\n ) * 100)\n", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "80, 90", + "title": "Inodes Usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(node_boot_time_seconds{job=\"node-exporter\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Nodes", + "uid": "fa49a4706d07a042595b664c87fb33ea", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-nodes + namespace: monitoring +- apiVersion: v1 + data: + persistentvolumesusage.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} - kubelet_volume_stats_available_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"}) / kubelet_volume_stats_capacity_bytes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ Usage }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Volume Space Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "kubelet_volume_stats_inodes_used{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} / kubelet_volume_stats_inodes{job=\"kubelet\", persistentvolumeclaim=\"$volume\"} * 100\n", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ Usage }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Volume inodes Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": 100, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\"}, exported_namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "PersistentVolumeClaim", + "multi": false, + "name": "volume", + "options": [ + + ], + "query": "label_values(kubelet_volume_stats_capacity_bytes{job=\"kubelet\", exported_namespace=\"$namespace\"}, persistentvolumeclaim)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-7d", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Persistent Volumes", + "uid": "919b92a8e8041bd567af9edab12c840c", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-persistentvolumesusage + namespace: monitoring +- apiVersion: v1 + data: + pods.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 2, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by(container_name) (container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\", container_name=\u007e\"$container\", container_name!=\"POD\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current: {{ container_name }}", + "refId": "A" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_requests_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Requested: {{ container }}", + "refId": "B" + }, + { + "expr": "sum by(container) (kube_pod_container_resource_limits_memory_bytes{job=\"kube-state-metrics\", namespace=\"$namespace\", pod=\"$pod\", container=\u007e\"$container\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Limit: {{ container }}", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ container_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ pod_name }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network I/O", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_pod_info, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Pod", + "multi": false, + "name": "pod", + "options": [ + + ], + "query": "label_values(kube_pod_info{namespace=\u007e\"$namespace\"}, pod)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": "Container", + "multi": false, + "name": "container", + "options": [ + + ], + "query": "label_values(kube_pod_container_info{namespace=\"$namespace\", pod=\"$pod\"}, container)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Pods", + "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-pods + namespace: monitoring +- apiVersion: v1 + data: + statefulset.json: |- + { + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "cores", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "CPU", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 3, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "GB", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(container_memory_usage_bytes{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}) / 1024^3", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Memory", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 4, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "Bps", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 4, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(rate(container_network_transmit_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\u007e\"$statefulset.*\"}[3m])) + sum(rate(container_network_receive_bytes_total{namespace=\"$namespace\",pod_name=\u007e\"$statefulset.*\"}[3m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Network", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "height": "100px", + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 5, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Desired Replicas", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 6, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Replicas of current version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 7, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_status_observed_generation{job=\"kube-state-metrics\", namespace=\"$namespace\", statefulset=\"$statefulset\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Observed Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 8, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 3, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "max(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", statefulset=\"$statefulset\", namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Metadata Generation", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "0", + "value": "null" + } + ], + "valueName": "current" + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(kube_statefulset_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas specified", + "refId": "A" + }, + { + "expr": "max(kube_statefulset_status_replicas{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas created", + "refId": "B" + }, + { + "expr": "min(kube_statefulset_status_replicas_ready{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "ready", + "refId": "C" + }, + { + "expr": "min(kube_statefulset_status_replicas_current{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "replicas of current version", + "refId": "D" + }, + { + "expr": "min(kube_statefulset_status_replicas_updated{job=\"kube-state-metrics\", statefulset=\"$statefulset\",namespace=\"$namespace\"}) without (instance, pod)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "updated", + "refId": "E" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Replicas", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + + ], + "templating": { + "list": [ + { + "current": { + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\"}, namespace)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": false, + "label": "Name", + "multi": false, + "name": "statefulset", + "options": [ + + ], + "query": "label_values(kube_statefulset_metadata_generation{job=\"kube-state-metrics\", namespace=\"$namespace\"}, statefulset)", + "refresh": 2, + "regex": "", + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "StatefulSets", + "uid": "a31c1f46e6f727cb37c0d731a7245005", + "version": 0 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-statefulset + namespace: monitoring +kind: ConfigMapList diff --git a/manifests/grafana-dashboardSources.yaml b/manifests/grafana-dashboardSources.yaml new file mode 100644 index 0000000..d8b401a --- /dev/null +++ b/manifests/grafana-dashboardSources.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +data: + dashboards.yaml: |- + { + "apiVersion": 1, + "providers": [ + { + "folder": "", + "name": "0", + "options": { + "path": "/grafana-dashboard-definitions/0" + }, + "orgId": 1, + "type": "file" + } + ] + } +kind: ConfigMap +metadata: + name: grafana-dashboards + namespace: monitoring diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml new file mode 100644 index 0000000..1ef1d7b --- /dev/null +++ b/manifests/grafana-deployment.yaml @@ -0,0 +1,132 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + labels: + app: grafana + name: grafana + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: grafana + template: + metadata: + labels: + app: grafana + spec: + containers: + - args: + - -config=/etc/grafana/grafana.ini + image: carlosedp/monitoring-grafana:v5.4.0 + name: grafana + ports: + - containerPort: 3000 + name: http + readinessProbe: + httpGet: + path: /api/health + port: http + resources: + limits: + cpu: 200m + memory: 200Mi + requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - mountPath: /var/lib/grafana + name: grafana-storage + readOnly: false + - mountPath: /etc/grafana/provisioning/datasources + name: grafana-datasources + readOnly: false + - mountPath: /etc/grafana/provisioning/dashboards + name: grafana-dashboards + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use + name: grafana-dashboard-k8s-cluster-rsrc-use + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-node-rsrc-use + name: grafana-dashboard-k8s-node-rsrc-use + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster + name: grafana-dashboard-k8s-resources-cluster + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-namespace + name: grafana-dashboard-k8s-resources-namespace + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod + name: grafana-dashboard-k8s-resources-pod + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/nodes + name: grafana-dashboard-nodes + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage + name: grafana-dashboard-persistentvolumesusage + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/pods + name: grafana-dashboard-pods + readOnly: false + - mountPath: /grafana-dashboard-definitions/0/statefulset + name: grafana-dashboard-statefulset + readOnly: false + - mountPath: /etc/grafana + name: grafana-config + readOnly: false + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: grafana + volumes: + - emptyDir: {} + name: grafana-storage + - name: grafana-datasources + secret: + secretName: grafana-datasources + - configMap: + name: grafana-dashboards + name: grafana-dashboards + - configMap: + name: grafana-dashboard-k8s-cluster-rsrc-use + name: grafana-dashboard-k8s-cluster-rsrc-use + - configMap: + name: grafana-dashboard-k8s-node-rsrc-use + name: grafana-dashboard-k8s-node-rsrc-use + - configMap: + name: grafana-dashboard-k8s-resources-cluster + name: grafana-dashboard-k8s-resources-cluster + - configMap: + name: grafana-dashboard-k8s-resources-namespace + name: grafana-dashboard-k8s-resources-namespace + - configMap: + name: grafana-dashboard-k8s-resources-pod + name: grafana-dashboard-k8s-resources-pod + - configMap: + name: grafana-dashboard-nodes + name: grafana-dashboard-nodes + - configMap: + name: grafana-dashboard-persistentvolumesusage + name: grafana-dashboard-persistentvolumesusage + - configMap: + name: grafana-dashboard-pods + name: grafana-dashboard-pods + - configMap: + name: grafana-dashboard-statefulset + name: grafana-dashboard-statefulset + - name: grafana-config + secret: + secretName: grafana-config + volumeClaimTemplate: + apiVersion: v1 + kind: PersistentVolumeClaim + metadata: + name: grafana-storage + namespace: monitoring + spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 2Gi + storageClassName: nfs-ssd-node1 diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml new file mode 100644 index 0000000..45f77a0 --- /dev/null +++ b/manifests/grafana-service.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: Service +metadata: + name: grafana + namespace: monitoring +spec: + ports: + - name: http + port: 3000 + targetPort: http + selector: + app: grafana diff --git a/manifests/grafana-serviceAccount.yaml b/manifests/grafana-serviceAccount.yaml new file mode 100644 index 0000000..3ed3e03 --- /dev/null +++ b/manifests/grafana-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: grafana + namespace: monitoring diff --git a/manifests/ingress-alertmanager-main.yaml b/manifests/ingress-alertmanager-main.yaml new file mode 100644 index 0000000..540a0b2 --- /dev/null +++ b/manifests/ingress-alertmanager-main.yaml @@ -0,0 +1,14 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: alertmanager-main + namespace: monitoring +spec: + rules: + - host: alertmanager.internal.carlosedp.com + http: + paths: + - backend: + serviceName: alertmanager-main + servicePort: web + path: / diff --git a/manifests/ingress-grafana.yaml b/manifests/ingress-grafana.yaml new file mode 100644 index 0000000..cd56db0 --- /dev/null +++ b/manifests/ingress-grafana.yaml @@ -0,0 +1,14 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: grafana + namespace: monitoring +spec: + rules: + - host: grafana.internal.carlosedp.com + http: + paths: + - backend: + serviceName: grafana + servicePort: http + path: / diff --git a/manifests/ingress-prometheus-k8s.yaml b/manifests/ingress-prometheus-k8s.yaml new file mode 100644 index 0000000..aad643d --- /dev/null +++ b/manifests/ingress-prometheus-k8s.yaml @@ -0,0 +1,14 @@ +apiVersion: extensions/v1beta1 +kind: Ingress +metadata: + name: prometheus-k8s + namespace: monitoring +spec: + rules: + - host: prometheus.internal.carlosedp.com + http: + paths: + - backend: + serviceName: prometheus-k8s + servicePort: web + path: / diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml new file mode 100644 index 0000000..c519a91 --- /dev/null +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -0,0 +1,69 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: kube-state-metrics +rules: +- apiGroups: + - "" + resources: + - configmaps + - secrets + - nodes + - pods + - services + - resourcequotas + - replicationcontrollers + - limitranges + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: + - list + - watch +- apiGroups: + - extensions + resources: + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - apps + resources: + - statefulsets + - daemonsets + - deployments + - replicasets + verbs: + - list + - watch +- apiGroups: + - batch + resources: + - cronjobs + - jobs + verbs: + - list + - watch +- apiGroups: + - autoscaling + resources: + - horizontalpodautoscalers + verbs: + - list + - watch +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/manifests/kube-state-metrics-clusterRoleBinding.yaml b/manifests/kube-state-metrics-clusterRoleBinding.yaml new file mode 100644 index 0000000..9a8f311 --- /dev/null +++ b/manifests/kube-state-metrics-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: kube-state-metrics +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics + namespace: monitoring diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml new file mode 100644 index 0000000..32ef699 --- /dev/null +++ b/manifests/kube-state-metrics-deployment.yaml @@ -0,0 +1,53 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + labels: + app: kube-state-metrics + name: kube-state-metrics + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + app: kube-state-metrics + template: + metadata: + labels: + app: kube-state-metrics + spec: + containers: + - command: + - /pod_nanny + - --container=kube-state-metrics + - --cpu=100m + - --extra-cpu=2m + - --memory=150Mi + - --extra-memory=30Mi + - --acceptance-offset=5 + - --deployment=kube-state-metrics + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + image: carlosedp/addon-resizer:2.1 + name: addon-resizer + resources: + limits: + cpu: 50m + memory: 30Mi + requests: + cpu: 10m + memory: 30Mi + nodeSelector: + beta.kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: kube-state-metrics diff --git a/manifests/kube-state-metrics-role.yaml b/manifests/kube-state-metrics-role.yaml new file mode 100644 index 0000000..e03d889 --- /dev/null +++ b/manifests/kube-state-metrics-role.yaml @@ -0,0 +1,30 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: kube-state-metrics + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - pods + verbs: + - get +- apiGroups: + - extensions + resourceNames: + - kube-state-metrics + resources: + - deployments + verbs: + - get + - update +- apiGroups: + - apps + resourceNames: + - kube-state-metrics + resources: + - deployments + verbs: + - get + - update diff --git a/manifests/kube-state-metrics-roleBinding.yaml b/manifests/kube-state-metrics-roleBinding.yaml new file mode 100644 index 0000000..9c61143 --- /dev/null +++ b/manifests/kube-state-metrics-roleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: kube-state-metrics + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: kube-state-metrics +subjects: +- kind: ServiceAccount + name: kube-state-metrics diff --git a/manifests/kube-state-metrics-service.yaml b/manifests/kube-state-metrics-service.yaml new file mode 100644 index 0000000..84927af --- /dev/null +++ b/manifests/kube-state-metrics-service.yaml @@ -0,0 +1,18 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: kube-state-metrics + name: kube-state-metrics + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https-main + port: 8443 + targetPort: https-main + - name: https-self + port: 9443 + targetPort: https-self + selector: + app: kube-state-metrics diff --git a/manifests/kube-state-metrics-serviceAccount.yaml b/manifests/kube-state-metrics-serviceAccount.yaml new file mode 100644 index 0000000..fff1028 --- /dev/null +++ b/manifests/kube-state-metrics-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: kube-state-metrics + namespace: monitoring diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml new file mode 100644 index 0000000..2100449 --- /dev/null +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -0,0 +1,27 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kube-state-metrics + name: kube-state-metrics + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + port: https-main + scheme: https + scrapeTimeout: 30s + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https-self + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: kube-state-metrics diff --git a/manifests/node-exporter-clusterRole.yaml b/manifests/node-exporter-clusterRole.yaml new file mode 100644 index 0000000..ad783ae --- /dev/null +++ b/manifests/node-exporter-clusterRole.yaml @@ -0,0 +1,17 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: node-exporter +rules: +- apiGroups: + - authentication.k8s.io + resources: + - tokenreviews + verbs: + - create +- apiGroups: + - authorization.k8s.io + resources: + - subjectaccessreviews + verbs: + - create diff --git a/manifests/node-exporter-clusterRoleBinding.yaml b/manifests/node-exporter-clusterRoleBinding.yaml new file mode 100644 index 0000000..a5a2050 --- /dev/null +++ b/manifests/node-exporter-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: node-exporter +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: node-exporter +subjects: +- kind: ServiceAccount + name: node-exporter + namespace: monitoring diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml new file mode 100644 index 0000000..9b781c7 --- /dev/null +++ b/manifests/node-exporter-daemonset.yaml @@ -0,0 +1,85 @@ +apiVersion: apps/v1beta2 +kind: DaemonSet +metadata: + labels: + app: node-exporter + name: node-exporter + namespace: monitoring +spec: + selector: + matchLabels: + app: node-exporter + template: + metadata: + labels: + app: node-exporter + spec: + containers: + - args: + - --web.listen-address=127.0.0.1:9100 + - --path.procfs=/host/proc + - --path.sysfs=/host/sys + - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) + - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ + image: carlosedp/node_exporter:v0.17.0 + name: node-exporter + resources: + limits: + cpu: 250m + memory: 180Mi + requests: + cpu: 102m + memory: 180Mi + volumeMounts: + - mountPath: /host/proc + name: proc + readOnly: false + - mountPath: /host/sys + name: sys + readOnly: false + - mountPath: /host/root + mountPropagation: HostToContainer + name: root + readOnly: true + - args: + - --secure-listen-address=$(IP):9100 + - --upstream=http://127.0.0.1:9100/ + env: + - name: IP + valueFrom: + fieldRef: + fieldPath: status.podIP + image: carlosedp/kube-rbac-proxy:v0.4.0 + name: kube-rbac-proxy + ports: + - containerPort: 9100 + hostPort: 9100 + name: https + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + hostNetwork: true + hostPID: true + nodeSelector: + beta.kubernetes.io/os: linux + securityContext: + runAsNonRoot: true + runAsUser: 65534 + serviceAccountName: node-exporter + tolerations: + - effect: NoSchedule + key: node-role.kubernetes.io/master + volumes: + - hostPath: + path: /proc + name: proc + - hostPath: + path: /sys + name: sys + - hostPath: + path: / + name: root diff --git a/manifests/node-exporter-service.yaml b/manifests/node-exporter-service.yaml new file mode 100644 index 0000000..1d728d7 --- /dev/null +++ b/manifests/node-exporter-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: node-exporter + name: node-exporter + namespace: monitoring +spec: + clusterIP: None + ports: + - name: https + port: 9100 + targetPort: https + selector: + app: node-exporter diff --git a/manifests/node-exporter-serviceAccount.yaml b/manifests/node-exporter-serviceAccount.yaml new file mode 100644 index 0000000..8a03ac1 --- /dev/null +++ b/manifests/node-exporter-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: node-exporter + namespace: monitoring diff --git a/manifests/node-exporter-serviceMonitor.yaml b/manifests/node-exporter-serviceMonitor.yaml new file mode 100644 index 0000000..273d274 --- /dev/null +++ b/manifests/node-exporter-serviceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: node-exporter + name: node-exporter + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + port: https + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + selector: + matchLabels: + k8s-app: node-exporter diff --git a/manifests/prometheus-adapter-apiService.yaml b/manifests/prometheus-adapter-apiService.yaml new file mode 100644 index 0000000..95d5c32 --- /dev/null +++ b/manifests/prometheus-adapter-apiService.yaml @@ -0,0 +1,13 @@ +apiVersion: apiregistration.k8s.io/v1beta1 +kind: APIService +metadata: + name: v1beta1.metrics.k8s.io +spec: + group: metrics.k8s.io + groupPriorityMinimum: 100 + insecureSkipTLSVerify: true + service: + name: prometheus-adapter + namespace: monitoring + version: v1beta1 + versionPriority: 100 diff --git a/manifests/prometheus-adapter-clusterRole.yaml b/manifests/prometheus-adapter-clusterRole.yaml new file mode 100644 index 0000000..a02d2bb --- /dev/null +++ b/manifests/prometheus-adapter-clusterRole.yaml @@ -0,0 +1,16 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-adapter +rules: +- apiGroups: + - "" + resources: + - nodes + - namespaces + - pods + - services + verbs: + - get + - list + - watch diff --git a/manifests/prometheus-adapter-clusterRoleBinding.yaml b/manifests/prometheus-adapter-clusterRoleBinding.yaml new file mode 100644 index 0000000..29fa917 --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleBinding.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-adapter + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-adapter +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml new file mode 100644 index 0000000..4295b50 --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleBindingDelegator.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: resource-metrics:system:auth-delegator +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: system:auth-delegator +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-adapter-clusterRoleServerResources.yaml b/manifests/prometheus-adapter-clusterRoleServerResources.yaml new file mode 100644 index 0000000..fcb914c --- /dev/null +++ b/manifests/prometheus-adapter-clusterRoleServerResources.yaml @@ -0,0 +1,11 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: resource-metrics-server-resources +rules: +- apiGroups: + - metrics.k8s.io + resources: + - '*' + verbs: + - '*' diff --git a/manifests/prometheus-adapter-configMap.yaml b/manifests/prometheus-adapter-configMap.yaml new file mode 100644 index 0000000..a231de3 --- /dev/null +++ b/manifests/prometheus-adapter-configMap.yaml @@ -0,0 +1,33 @@ +apiVersion: v1 +data: + config.yaml: | + resourceRules: + cpu: + containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>}[1m])) by (<<.GroupBy>>) + nodeQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>, id='/'}[1m])) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + memory: + containerQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>}) by (<<.GroupBy>>) + nodeQuery: sum(container_memory_working_set_bytes{<<.LabelMatchers>>,id='/'}) by (<<.GroupBy>>) + resources: + overrides: + node: + resource: node + namespace: + resource: namespace + pod_name: + resource: pod + containerLabel: container_name + window: 1m +kind: ConfigMap +metadata: + name: adapter-config + namespace: monitoring diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml new file mode 100644 index 0000000..ddf6a32 --- /dev/null +++ b/manifests/prometheus-adapter-deployment.yaml @@ -0,0 +1,50 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + name: prometheus-adapter + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + name: prometheus-adapter + strategy: + rollingUpdate: + maxSurge: 1 + maxUnavailable: 0 + template: + metadata: + labels: + name: prometheus-adapter + spec: + containers: + - args: + - --cert-dir=/var/run/serving-cert + - --config=/etc/adapter/config.yaml + - --logtostderr=true + - --metrics-relist-interval=1m + - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ + - --secure-port=6443 + image: directxman12/k8s-prometheus-adapter-arm64:v0.4.0 + name: prometheus-adapter + ports: + - containerPort: 6443 + volumeMounts: + - mountPath: /tmp + name: tmpfs + readOnly: false + - mountPath: /var/run/serving-cert + name: volume-serving-cert + readOnly: false + - mountPath: /etc/adapter + name: config + readOnly: false + serviceAccountName: prometheus-adapter + volumes: + - emptyDir: {} + name: tmpfs + - emptyDir: {} + name: volume-serving-cert + - configMap: + name: adapter-config + name: config diff --git a/manifests/prometheus-adapter-roleBindingAuthReader.yaml b/manifests/prometheus-adapter-roleBindingAuthReader.yaml new file mode 100644 index 0000000..48c8f32 --- /dev/null +++ b/manifests/prometheus-adapter-roleBindingAuthReader.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: resource-metrics-auth-reader + namespace: kube-system +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: extension-apiserver-authentication-reader +subjects: +- kind: ServiceAccount + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-adapter-service.yaml b/manifests/prometheus-adapter-service.yaml new file mode 100644 index 0000000..e786e01 --- /dev/null +++ b/manifests/prometheus-adapter-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + name: prometheus-adapter + name: prometheus-adapter + namespace: monitoring +spec: + ports: + - name: https + port: 443 + targetPort: 6443 + selector: + name: prometheus-adapter diff --git a/manifests/prometheus-adapter-serviceAccount.yaml b/manifests/prometheus-adapter-serviceAccount.yaml new file mode 100644 index 0000000..d7e7050 --- /dev/null +++ b/manifests/prometheus-adapter-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-adapter + namespace: monitoring diff --git a/manifests/prometheus-clusterRole.yaml b/manifests/prometheus-clusterRole.yaml new file mode 100644 index 0000000..d5c4598 --- /dev/null +++ b/manifests/prometheus-clusterRole.yaml @@ -0,0 +1,15 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: prometheus-k8s +rules: +- apiGroups: + - "" + resources: + - nodes/metrics + verbs: + - get +- nonResourceURLs: + - /metrics + verbs: + - get diff --git a/manifests/prometheus-clusterRoleBinding.yaml b/manifests/prometheus-clusterRoleBinding.yaml new file mode 100644 index 0000000..554bb6f --- /dev/null +++ b/manifests/prometheus-clusterRoleBinding.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: prometheus-k8s +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: prometheus-k8s +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml new file mode 100644 index 0000000..58ddfb9 --- /dev/null +++ b/manifests/prometheus-prometheus.yaml @@ -0,0 +1,45 @@ +apiVersion: monitoring.coreos.com/v1 +kind: Prometheus +metadata: + labels: + prometheus: k8s + name: k8s + namespace: monitoring +spec: + alerting: + alertmanagers: + - name: alertmanager-main + namespace: monitoring + port: web + baseImage: carlosedp/prometheus + externalUrl: http://prometheus.internal.carlosedp.com + nodeSelector: + beta.kubernetes.io/os: linux + replicas: 1 + resources: + requests: + memory: 400Mi + retention: 15d + ruleSelector: + matchLabels: + prometheus: k8s + role: alert-rules + securityContext: + fsGroup: 2000 + runAsNonRoot: true + runAsUser: 1000 + serviceAccountName: prometheus-k8s + serviceMonitorNamespaceSelector: {} + serviceMonitorSelector: {} + storage: + volumeClaimTemplate: + apiVersion: v1 + kind: PersistentVolumeClaim + spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 20Gi + storageClassName: nfs-ssd-node1 + version: v2.5.0 diff --git a/manifests/prometheus-roleBindingConfig.yaml b/manifests/prometheus-roleBindingConfig.yaml new file mode 100644 index 0000000..ec0129d --- /dev/null +++ b/manifests/prometheus-roleBindingConfig.yaml @@ -0,0 +1,13 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: prometheus-k8s-config + namespace: monitoring +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s-config +subjects: +- kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml new file mode 100644 index 0000000..c7527f6 --- /dev/null +++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -0,0 +1,42 @@ +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: default + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: kube-system + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: monitoring + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring +kind: RoleBindingList diff --git a/manifests/prometheus-roleConfig.yaml b/manifests/prometheus-roleConfig.yaml new file mode 100644 index 0000000..5f1cd04 --- /dev/null +++ b/manifests/prometheus-roleConfig.yaml @@ -0,0 +1,12 @@ +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: prometheus-k8s-config + namespace: monitoring +rules: +- apiGroups: + - "" + resources: + - configmaps + verbs: + - get diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml new file mode 100644 index 0000000..b305774 --- /dev/null +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -0,0 +1,54 @@ +apiVersion: rbac.authorization.k8s.io/v1 +items: +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: default + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: kube-system + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: monitoring + rules: + - apiGroups: + - "" + resources: + - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +kind: RoleList diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml new file mode 100644 index 0000000..05e0deb --- /dev/null +++ b/manifests/prometheus-rules.yaml @@ -0,0 +1,975 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: k8s + role: alert-rules + name: prometheus-k8s-rules + namespace: monitoring +spec: + groups: + - name: k8s.rules + rules: + - expr: | + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace) + record: namespace:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, pod_name, container_name) ( + rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m]) + ) + record: namespace_pod_name_container_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum(container_memory_usage_bytes{job="kubelet", image!="", container_name!=""}) by (namespace) + record: namespace:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container_name!=""}[5m])) by (namespace, pod_name) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_cpu_usage_seconds_total:sum_rate + - expr: | + sum by (namespace, label_name) ( + sum(container_memory_usage_bytes{job="kubelet",image!="", container_name!=""}) by (pod_name, namespace) + * on (namespace, pod_name) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:container_memory_usage_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_memory_bytes:sum + - expr: | + sum by (namespace, label_name) ( + sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} and on(pod) kube_pod_status_scheduled{condition="true"}) by (namespace, pod) + * on (namespace, pod) group_left(label_name) + label_replace(kube_pod_labels{job="kube-state-metrics"}, "pod_name", "$1", "pod", "(.*)") + ) + record: namespace_name:kube_pod_container_resource_requests_cpu_cores:sum + - name: kube-scheduler.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.99, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_e2e_scheduling_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_e2e_scheduling_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_scheduling_algorithm_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_scheduling_algorithm_latency:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(scheduler_binding_latency_microseconds_bucket{job="kube-scheduler"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:scheduler_binding_latency:histogram_quantile + - name: kube-apiserver.rules + rules: + - expr: | + histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.99" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.9, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.9" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - expr: | + histogram_quantile(0.5, sum(rate(apiserver_request_latencies_bucket{job="apiserver"}[5m])) without(instance, pod)) / 1e+06 + labels: + quantile: "0.5" + record: cluster_quantile:apiserver_request_latencies:histogram_quantile + - name: node.rules + rules: + - expr: sum(min(kube_pod_info) by (node)) + record: ':kube_pod_info_node_count:' + - expr: | + max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) + record: 'node_namespace_pod:kube_pod_info:' + - expr: | + count by (node) (sum by (node, cpu) ( + node_cpu_seconds_total{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + )) + record: node:node_num_cpu:sum + - expr: | + 1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])) + record: :node_cpu_utilisation:avg1m + - expr: | + 1 - avg by (node) ( + rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info:) + record: node:node_cpu_utilisation:avg1m + - expr: | + sum(node_load1{job="node-exporter"}) + / + sum(node:node_num_cpu:sum) + record: ':node_cpu_saturation_load1:' + - expr: | + sum by (node) ( + node_load1{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + node:node_num_cpu:sum + record: 'node:node_cpu_saturation_load1:' + - expr: | + 1 - + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: ':node_memory_utilisation:' + - expr: | + sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + record: :node_memory_MemFreeCachedBuffers_bytes:sum + - expr: | + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + record: :node_memory_MemTotal_bytes:sum + - expr: | + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_available:sum + - expr: | + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_bytes_total:sum + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:node_memory_utilisation:ratio + - expr: | + 1e3 * sum( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + ) + record: :node_memory_swap_io_bytes:sum_rate + - expr: | + 1 - + sum by (node) ( + (node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"}) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + / + sum by (node) ( + node_memory_MemTotal_bytes{job="node-exporter"} + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: 'node:node_memory_utilisation:' + - expr: | + 1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum) + record: 'node:node_memory_utilisation_2:' + - expr: | + 1e3 * sum by (node) ( + (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) + + rate(node_vmstat_pgpgout{job="node-exporter"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_memory_swap_io_bytes:sum_rate + - expr: | + avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m])) + record: :node_disk_utilisation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_utilisation:avg_irate + - expr: | + avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3) + record: :node_disk_saturation:avg_irate + - expr: | + avg by (node) ( + irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+"}[1m]) / 1e3 + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_disk_saturation:avg_irate + - expr: | + max by (namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} + - node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_usage:' + - expr: | + max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) + record: 'node:node_filesystem_avail:' + - expr: | + sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_utilisation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_utilisation:sum_irate + - expr: | + sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + record: :node_net_saturation:sum_irate + - expr: | + sum by (node) ( + (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + * on (namespace, pod) group_left(node) + node_namespace_pod:kube_pod_info: + ) + record: node:node_net_saturation:sum_irate + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_total:' + - expr: | + max( + max( + kube_pod_info{job="kube-state-metrics", host_ip!=""} + ) by (node, host_ip) + * on (host_ip) group_right (node) + label_replace( + (max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*" + ) + ) by (node) + record: 'node:node_inodes_free:' + - name: kube-prometheus-node-recording.rules + rules: + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + record: instance:node_cpu:rate:sum + - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + BY (instance) + record: instance:node_filesystem_usage:sum + - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + record: instance:node_network_receive_bytes:rate:sum + - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + record: instance:node_network_transmit_bytes:rate:sum + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) + / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + record: instance:node_cpu:ratio + - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + record: cluster:node_cpu:sum_rate5m + - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + record: cluster:node_cpu:ratio + - name: kubernetes-absent + rules: + - alert: AlertmanagerDown + annotations: + message: Alertmanager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown + expr: | + absent(up{job="alertmanager-main"} == 1) + for: 15m + labels: + severity: critical + - alert: CoreDNSDown + annotations: + message: CoreDNS has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-corednsdown + expr: | + absent(up{job="kube-dns"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeAPIDown + annotations: + message: KubeAPI has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapidown + expr: | + absent(up{job="apiserver"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeControllerManagerDown + annotations: + message: KubeControllerManager has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecontrollermanagerdown + expr: | + absent(up{job="kube-controller-manager"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeSchedulerDown + annotations: + message: KubeScheduler has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeschedulerdown + expr: | + absent(up{job="kube-scheduler"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeStateMetricsDown + annotations: + message: KubeStateMetrics has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricsdown + expr: | + absent(up{job="kube-state-metrics"} == 1) + for: 15m + labels: + severity: critical + - alert: KubeletDown + annotations: + message: Kubelet has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown + expr: | + absent(up{job="kubelet"} == 1) + for: 15m + labels: + severity: critical + - alert: NodeExporterDown + annotations: + message: NodeExporter has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-nodeexporterdown + expr: | + absent(up{job="node-exporter"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusDown + annotations: + message: Prometheus has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown + expr: | + absent(up{job="prometheus-k8s"} == 1) + for: 15m + labels: + severity: critical + - alert: PrometheusOperatorDown + annotations: + message: PrometheusOperator has disappeared from Prometheus target discovery. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown + expr: | + absent(up{job="prometheus-operator"} == 1) + for: 15m + labels: + severity: critical + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is restarting {{ printf "%.2f" $value }} times / 5 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping + expr: | + rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0 + for: 1h + labels: + severity: critical + - alert: KubePodNotReady + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than an hour. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready + expr: | + sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) > 0 + for: 1h + labels: + severity: critical + - alert: KubeDeploymentGenerationMismatch + annotations: + message: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has + not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentgenerationmismatch + expr: | + kube_deployment_status_observed_generation{job="kube-state-metrics"} + != + kube_deployment_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeDeploymentReplicasMismatch + annotations: + message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not + matched the expected number of replicas for longer than an hour. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch + expr: | + kube_deployment_spec_replicas{job="kube-state-metrics"} + != + kube_deployment_status_replicas_available{job="kube-state-metrics"} + for: 1h + labels: + severity: critical + - alert: KubeStatefulSetReplicasMismatch + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch + expr: | + kube_statefulset_status_replicas_ready{job="kube-state-metrics"} + != + kube_statefulset_status_replicas{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetGenerationMismatch + annotations: + message: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetgenerationmismatch + expr: | + kube_statefulset_status_observed_generation{job="kube-state-metrics"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics"} + for: 15m + labels: + severity: critical + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + message: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetupdatenotrolledout + expr: | + max without (revision) ( + kube_statefulset_status_current_revision{job="kube-state-metrics"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics"} + ) + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetRolloutStuck + annotations: + message: Only {{ $value }}% of the desired Pods of DaemonSet {{ $labels.namespace + }}/{{ $labels.daemonset }} are scheduled and ready. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck + expr: | + kube_daemonset_status_number_ready{job="kube-state-metrics"} + / + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} * 100 < 100 + for: 15m + labels: + severity: critical + - alert: KubeDaemonSetNotScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetnotscheduled + expr: | + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + message: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetmisscheduled + expr: | + kube_daemonset_status_number_misscheduled{job="kube-state-metrics"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeCronJobRunning + annotations: + message: CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more + than 1h to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecronjobrunning + expr: | + time() - kube_cronjob_next_schedule_time{job="kube-state-metrics"} > 3600 + for: 1h + labels: + severity: warning + - alert: KubeJobCompletion + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than one hour to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobcompletion + expr: | + kube_job_spec_completions{job="kube-state-metrics"} - kube_job_status_succeeded{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - alert: KubeJobFailed + annotations: + message: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed + expr: | + kube_job_status_failed{job="kube-state-metrics"} > 0 + for: 1h + labels: + severity: warning + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Pods and cannot + tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: | + sum(namespace_name:kube_pod_container_resource_requests_cpu_cores:sum) + / + sum(node:node_num_cpu:sum) + > + (count(node:node_num_cpu:sum)-1) / count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Pods and cannot + tolerate node failure. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: | + sum(namespace_name:kube_pod_container_resource_requests_memory_bytes:sum) + / + sum(node_memory_MemTotal_bytes) + > + (count(node:node_num_cpu:sum)-1) + / + count(node:node_num_cpu:sum) + for: 5m + labels: + severity: warning + - alert: KubeCPUOvercommit + annotations: + message: Cluster has overcommitted CPU resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.cpu"}) + / + sum(node:node_num_cpu:sum) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemOvercommit + annotations: + message: Cluster has overcommitted memory resource requests for Namespaces. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit + expr: | + sum(kube_resourcequota{job="kube-state-metrics", type="hard", resource="requests.memory"}) + / + sum(node_memory_MemTotal_bytes{job="node-exporter"}) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaExceeded + annotations: + message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value + }}% of its {{ $labels.resource }} quota. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + expr: | + 100 * kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 90 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + message: '{{ printf "%0.0f" $value }}% throttling of CPU in namespace {{ $labels.namespace + }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name + }}.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) + by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) + by (container_name, pod_name, namespace)\n > 25 \n" + for: 15m + labels: + severity: warning + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeUsageCritical + annotations: + message: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is only {{ printf "%0.2f" $value + }}% free. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical + expr: | + 100 * kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + < 3 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFullInFourDays + annotations: + message: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} is expected to fill up within four + days. Currently {{ printf "%0.2f" $value }}% is available. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays + expr: | + 100 * ( + kubelet_volume_stats_available_bytes{job="kubelet"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet"} + ) < 15 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 + for: 5m + labels: + severity: critical + - alert: KubePersistentVolumeErrors + annotations: + message: The persistent volume {{ $labels.persistentvolume }} has status {{ + $labels.phase }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeerrors + expr: | + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0 + for: 5m + labels: + severity: critical + - name: kubernetes-system + rules: + - alert: KubeNodeNotReady + annotations: + message: '{{ $labels.node }} has been unready for more than an hour.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready + expr: | + kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0 + for: 1h + labels: + severity: warning + - alert: KubeVersionMismatch + annotations: + message: There are {{ $value }} different versions of Kubernetes components + running. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch + expr: | + count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + for: 1h + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }}% errors.' + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: | + (sum(rate(rest_client_requests_total{code=~"5.."}[5m])) by (instance, job) + / + sum(rate(rest_client_requests_total[5m])) by (instance, job)) + * 100 > 1 + for: 15m + labels: + severity: warning + - alert: KubeClientErrors + annotations: + message: Kubernetes API server client '{{ $labels.job }}/{{ $labels.instance + }}' is experiencing {{ printf "%0.0f" $value }} errors / second. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclienterrors + expr: | + sum(rate(ksm_scrape_error_total{job="kube-state-metrics"}[5m])) by (instance, job) > 0.1 + for: 15m + labels: + severity: warning + - alert: KubeletTooManyPods + annotations: + message: Kubelet {{ $labels.instance }} is running {{ $value }} Pods, close + to the limit of 110. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods + expr: | + kubelet_running_pod_count{job="kubelet"} > 110 * 0.9 + for: 15m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 1 + for: 10m + labels: + severity: warning + - alert: KubeAPILatencyHigh + annotations: + message: The API server has a 99th percentile latency of {{ $value }} seconds + for {{ $labels.verb }} {{ $labels.resource }}. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh + expr: | + cluster_quantile:apiserver_request_latencies:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"^(?:LIST|WATCH|WATCHLIST|PROXY|CONNECT)$"} > 4 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 10 + for: 10m + labels: + severity: critical + - alert: KubeAPIErrorsHigh + annotations: + message: API server is returning errors for {{ $value }}% of requests. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh + expr: | + sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[5m])) without(instance, pod) + / + sum(rate(apiserver_request_count{job="apiserver"}[5m])) without(instance, pod) * 100 > 5 + for: 10m + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 7 days. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 + labels: + severity: warning + - alert: KubeClientCertificateExpiration + annotations: + message: Kubernetes API certificate is expiring in less than 24 hours. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration + expr: | + histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 + labels: + severity: critical + - name: alertmanager.rules + rules: + - alert: AlertmanagerConfigInconsistent + annotations: + message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` + are out of sync. + expr: | + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + for: 5m + labels: + severity: critical + - alert: AlertmanagerFailedReload + annotations: + message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace + }}/{{ $labels.pod}}. + expr: | + alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 + for: 10m + labels: + severity: warning + - alert: AlertmanagerMembersInconsistent + annotations: + message: Alertmanager has not found all other members of the cluster. + expr: | + alertmanager_cluster_members{job="alertmanager-main"} + != on (service) GROUP_LEFT() + count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) + for: 5m + labels: + severity: critical + - name: general.rules + rules: + - alert: TargetDown + annotations: + message: '{{ $value }}% of the {{ $labels.job }} targets are down.' + expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 + for: 10m + labels: + severity: warning + - alert: DeadMansSwitch + annotations: + message: This is a DeadMansSwitch meant to ensure that the entire alerting + pipeline is functional. + expr: vector(1) + labels: + severity: none + - name: kube-prometheus-node-alerting.rules + rules: + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 24 hours. + expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[6h], 3600 * 24) < 0) + for: 30m + labels: + severity: warning + - alert: NodeDiskRunningFull + annotations: + message: Device {{ $labels.device }} of node-exporter {{ $labels.namespace + }}/{{ $labels.pod }} will be full within the next 2 hours. + expr: | + (node:node_filesystem_usage: > 0.85) and (predict_linear(node:node_filesystem_avail:[30m], 3600 * 2) < 0) + for: 10m + labels: + severity: critical + - name: prometheus.rules + rules: + - alert: PrometheusConfigReloadFailed + annotations: + description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} + summary: Reloading Prometheus' configuration failed + expr: | + prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 + for: 10m + labels: + severity: warning + - alert: PrometheusNotificationQueueRunningFull + annotations: + description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ + $labels.pod}} + summary: Prometheus' alert notification queue is running full + expr: | + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"} + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alert from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01 + for: 10m + labels: + severity: warning + - alert: PrometheusErrorSendingAlerts + annotations: + description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ + $labels.pod}} to Alertmanager {{$labels.Alertmanager}} + summary: Errors while sending alerts from Prometheus + expr: | + rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03 + for: 10m + labels: + severity: critical + - alert: PrometheusNotConnectedToAlertmanagers + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected + to any Alertmanagers + summary: Prometheus is not connected to any Alertmanagers + expr: | + prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1 + for: 10m + labels: + severity: warning + - alert: PrometheusTSDBReloadsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + reload failures over the last four hours.' + summary: Prometheus has issues reloading data blocks from disk + expr: | + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBCompactionsFailing + annotations: + description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} + compaction failures over the last four hours.' + summary: Prometheus has issues compacting sample blocks + expr: | + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0 + for: 12h + labels: + severity: warning + - alert: PrometheusTSDBWALCorruptions + annotations: + description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead + log (WAL).' + summary: Prometheus write-ahead log is corrupted + expr: | + tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0 + for: 4h + labels: + severity: warning + - alert: PrometheusNotIngestingSamples + annotations: + description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} isn't ingesting + samples. + summary: Prometheus isn't ingesting samples + expr: | + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0 + for: 10m + labels: + severity: warning + - alert: PrometheusTargetScrapesDuplicate + annotations: + description: '{{$labels.namespace}}/{{$labels.pod}} has many samples rejected + due to duplicate timestamps but different values' + summary: Prometheus has many samples rejected + expr: | + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0 + for: 10m + labels: + severity: warning + - name: prometheus-operator + rules: + - alert: PrometheusOperatorReconcileErrors + annotations: + message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace + }} Namespace. + expr: | + rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + for: 10m + labels: + severity: warning + - alert: PrometheusOperatorNodeLookupErrors + annotations: + message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. + expr: | + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 + for: 10m + labels: + severity: warning diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml new file mode 100644 index 0000000..85b007f --- /dev/null +++ b/manifests/prometheus-service.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + prometheus: k8s + name: prometheus-k8s + namespace: monitoring +spec: + ports: + - name: web + port: 9090 + targetPort: web + selector: + app: prometheus + prometheus: k8s diff --git a/manifests/prometheus-serviceAccount.yaml b/manifests/prometheus-serviceAccount.yaml new file mode 100644 index 0000000..3e55fad --- /dev/null +++ b/manifests/prometheus-serviceAccount.yaml @@ -0,0 +1,5 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: prometheus-k8s + namespace: monitoring diff --git a/manifests/prometheus-serviceMonitor.yaml b/manifests/prometheus-serviceMonitor.yaml new file mode 100644 index 0000000..b7605db --- /dev/null +++ b/manifests/prometheus-serviceMonitor.yaml @@ -0,0 +1,14 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: prometheus + name: prometheus + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: web + selector: + matchLabels: + prometheus: k8s diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml new file mode 100644 index 0000000..6d884a2 --- /dev/null +++ b/manifests/prometheus-serviceMonitorApiserver.yaml @@ -0,0 +1,29 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: apiserver + name: kube-apiserver + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 30s + metricRelabelings: + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + port: https + scheme: https + tlsConfig: + caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + serverName: kubernetes + jobLabel: component + namespaceSelector: + matchNames: + - default + selector: + matchLabels: + component: apiserver + provider: kubernetes diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml new file mode 100644 index 0000000..14a2454 --- /dev/null +++ b/manifests/prometheus-serviceMonitorCoreDNS.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: coredns + name: coredns + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + interval: 15s + port: metrics + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-dns diff --git a/manifests/prometheus-serviceMonitorKubeControllerManager.yaml b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml new file mode 100644 index 0000000..153a90d --- /dev/null +++ b/manifests/prometheus-serviceMonitorKubeControllerManager.yaml @@ -0,0 +1,23 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kube-controller-manager + name: kube-controller-manager + namespace: monitoring +spec: + endpoints: + - interval: 30s + metricRelabelings: + - action: drop + regex: etcd_(debugging|disk|request|server).* + sourceLabels: + - __name__ + port: http-metrics + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-controller-manager diff --git a/manifests/prometheus-serviceMonitorKubeScheduler.yaml b/manifests/prometheus-serviceMonitorKubeScheduler.yaml new file mode 100644 index 0000000..f00db0e --- /dev/null +++ b/manifests/prometheus-serviceMonitorKubeScheduler.yaml @@ -0,0 +1,18 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kube-scheduler + name: kube-scheduler + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: http-metrics + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kube-scheduler diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml new file mode 100644 index 0000000..97d7f1a --- /dev/null +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -0,0 +1,31 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: kubelet + name: kubelet + namespace: monitoring +spec: + endpoints: + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token + honorLabels: true + interval: 30s + path: /metrics/cadvisor + port: https-metrics + scheme: https + tlsConfig: + insecureSkipVerify: true + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: kubelet diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet new file mode 100644 index 0000000..8b2b49f --- /dev/null +++ b/operator_stack.jsonnet @@ -0,0 +1,288 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: "monitoring", + + urls+:: { + prom_externalUrl: 'http://prometheus.internal.carlosedp.com', + alert_externalUrl: 'http://alertmanager.internal.carlosedp.com', + grafana_externalUrl: 'http://grafana.internal.carlosedp.com/', + + prom_ingress: 'prometheus.internal.carlosedp.com', + alert_ingress: 'alertmanager.internal.carlosedp.com', + grafana_ingress: 'grafana.internal.carlosedp.com', + }, + + versions+:: { + prometheus: "v2.5.0", + alertmanager: "v0.15.3", + kubeStateMetrics: "v1.4.0", + kubeRbacProxy: "v0.4.0", + addonResizer: "2.1", + nodeExporter: "v0.17.0", + prometheusOperator: "v0.26.0", + prometheusAdapter: "v0.4.0", + grafana: "v5.4.0", + configmapReloader: "v0.2.2", + prometheusConfigReloader: "v0.26.0", + }, + + imageRepos+:: { + prometheus: "carlosedp/prometheus", + alertmanager: "carlosedp/alertmanager", + kubeStateMetrics: "carlosedp/kube-state-metrics", + kubeRbacProxy: "carlosedp/kube-rbac-proxy", + addonResizer: "carlosedp/addon-resizer", + nodeExporter: "carlosedp/node_exporter", + prometheusOperator: "carlosedp/prometheus-operator", + prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64", + grafana: "carlosedp/monitoring-grafana", + configmapReloader: "carlosedp/configmap-reload", + prometheusConfigReloader: "carlosedp/prometheus-config-reloader", + }, + + prometheus+:: { + names: 'k8s', + replicas: 1, + namespaces: ["default", "kube-system","monitoring"], + }, + + alertmanager+:: { + name: 'main', + config: ||| + global: + resolve_timeout: 5m + route: + group_by: ['job'] + group_wait: 30s + group_interval: 5m + repeat_interval: 12h + receiver: 'null' + routes: + - match: + alertname: DeadMansSwitch + receiver: 'null' + receivers: + - name: 'null' + |||, + replicas: 1, + // Configure External URL's + alertmanager+: { + spec+: { + externalUrl: $._config.urls.alert_externalUrl, + }, + }, + }, + + kubeStateMetrics+:: { + collectors: '', // empty string gets a default set + scrapeInterval: '30s', + scrapeTimeout: '30s', + + baseCPU: '100m', + baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', + }, + + grafana+:: { + config: { + sections: { + database: { path: '/data/grafana.db' }, + paths: { + data: '/var/lib/grafana', + logs: '/var/lib/grafana/log', + plugins: '/var/lib/grafana/plugins', + provisioning: '/etc/grafana/provisioning', + }, + session: { provider: 'memory' }, + 'auth.basic': {enabled: false}, + 'auth.anonymous': {enabled: false}, + smtp: { + enabled: true, + host: 'smtp-server.monitoring.svc:25', + user: '', + password: '', + from_address:'carlosedp@gmail.com', + from_name: 'Grafana Alert', + skip_verify: true + }, + }, + }, + }, + + //--------------------------------------- + // End of _config + //--------------------------------------- + + }, + prometheus+:: { + local pvc = k.core.v1.persistentVolumeClaim, + + prometheus+: { + spec+: { + retention: '15d', + externalUrl: $._config.urls.prom_externalUrl, + storage: { + volumeClaimTemplate: + pvc.new() + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + pvc.mixin.spec.resources.withRequests({ storage: '20Gi' }) + + pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), + }, + }, + }, + }, + + # Override command for Grafana to load ini from correct path + grafana+:: { + deployment+: + { + local pvc = k.core.v1.persistentVolumeClaim, + spec+: { + volumeClaimTemplate: + pvc.new() + + pvc.mixin.metadata.withNamespace($._config.namespace) + + pvc.mixin.metadata.withName("grafana-storage") + + pvc.mixin.spec.withAccessModes('ReadWriteMany') + + pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }) + + pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), + template+: { + spec+: { + containers: + std.map( + function(c) + if c.name == 'grafana' then + c { + args+: [ + '-config=/etc/grafana/grafana.ini', + ], + } + else + c, + super.containers, + ), + }, + }, + }, + }, + }, + + // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset + kubeStateMetrics+:: { + deployment+: { + spec+: { + template+: { + spec+: { + containers: + std.filterMap( + function(c) c.name == 'addon-resizer', + function(c) + if std.startsWith(c.name, 'addon-resizer') then + c { + command: [ + '/pod_nanny', + '--container=kube-state-metrics', + '--cpu=' + $._config.kubeStateMetrics.baseCPU, + '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, + '--memory=' + $._config.kubeStateMetrics.baseMemory, + '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, + '--acceptance-offset=5', + '--deployment=kube-state-metrics', + ], + }, + super.containers, + ), + }, + }, + }, + }, + }, + + // Create ingress objects per application + ingress+: { + local secret = k.core.v1.secret, + local ingress = k.extensions.v1beta1.ingress, + local ingressTls = ingress.mixin.spec.tlsType, + local ingressRule = ingress.mixin.spec.rulesType, + local httpIngressPath = ingressRule.mixin.http.pathsType, + + 'alertmanager-main': + ingress.new() + + ingress.mixin.metadata.withName('alertmanager-main') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + // ingress.mixin.metadata.withAnnotations({ + // 'nginx.ingress.kubernetes.io/auth-type': 'basic', + // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + // }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.alert_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('alertmanager-main') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + 'grafana': + ingress.new() + + ingress.mixin.metadata.withName('grafana') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + // ingress.mixin.metadata.withAnnotations({ + // 'nginx.ingress.kubernetes.io/auth-type': 'basic', + // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + // }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.grafana_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('grafana') + + httpIngressPath.mixin.backend.withServicePort('http') + ), + ), + 'prometheus-k8s': + ingress.new() + + ingress.mixin.metadata.withName('prometheus-k8s') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + // ingress.mixin.metadata.withAnnotations({ + // 'nginx.ingress.kubernetes.io/auth-type': 'basic', + // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', + // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', + // }) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.prom_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + }, + }; + // + { + // Create basic auth secret - replace 'auth' file with your own + // Create with htpasswd -c auth [USERNAME] +// ingress+:: { +// 'basic-auth-secret': +// secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + +// secret.mixin.metadata.withNamespace($._config.namespace), +// }, +// }; + +{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + +{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + +{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + +{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + +{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + +{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + +{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + +{ ['ingress-' + name]: kp.ingress[name] for name in std.objectFields(kp.ingress) } + +{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } \ No newline at end of file diff --git a/teardown b/teardown deleted file mode 100755 index d6c6f66..0000000 --- a/teardown +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/env bash - -if [ -z "${KUBECONFIG}" ]; then - export KUBECONFIG=~/.kube/config -fi - -# CAUTION - NAMESPACE must match its value when deploy script was run. -# Some resources are always deployed to the monitoring namespace. - -if [ -z "${NAMESPACE}" ]; then - NAMESPACE=monitoring -fi - -kctl() { - kubectl --namespace "$NAMESPACE" "$@" -} - -kctl delete -f manifests/node-exporter -kctl delete -f manifests/armexporter -kctl delete -f manifests/kube-state-metrics -kctl delete -f manifests/grafana -find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; -kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml -kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml -kctl delete -f manifests/alertmanager -kctl delete -f manifests/smtp-server -# Hack: wait a bit to let the controller delete the deployed Prometheus server. -sleep 5 - -kctl delete -f manifests/prometheus-operator - From 703f8207e33dd0a3fec0ab37f17ff5acb87fb1dd Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 31 Jan 2019 18:31:53 -0200 Subject: [PATCH 02/30] Added image to script --- build_images.sh | 73 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 51 insertions(+), 22 deletions(-) diff --git a/build_images.sh b/build_images.sh index da5a57b..45f18bb 100644 --- a/build_images.sh +++ b/build_images.sh @@ -1,37 +1,65 @@ #!/bin/bash +export DOCKER_CLI_EXPERIMENTAL=enabled + REPO=carlosedp -AOM_VERSION=2.1 +AOR_VERSION=2.1 KSM_VERSION=v1.4.0 -VERSION=v0.26.0 +PROM_OP_VERSION=v0.26.0 PROMCONFIGRELOADER_VERSION=v0.20.0 - +PROM_ADAPTER_VERSION=v0.4.1 +PROM_CONFIG_RELOADER_VERSION=v0.26.0 +KUBE_RBAC_VERSION=v0.4.0 +CONFIGMAP_RELOAD_VERSION=v0.2.2 #------------------------------------------------------------------------------- # Kubernetes addon-resizer # Retag Addon-resizer google images to have unified manifest on DockerHub -docker pull gcr.io/google-containers/addon-resizer-arm64:$AOM_VERSION -docker pull gcr.io/google-containers/addon-resizer-arm:$AOM_VERSION -docker pull gcr.io/google-containers/addon-resizer-amd64:$AOM_VERSION +docker pull gcr.io/google-containers/addon-resizer-arm:$AOR_VERSION +docker pull gcr.io/google-containers/addon-resizer-arm64:$AOR_VERSION +docker pull gcr.io/google-containers/addon-resizer-amd64:$AOR_VERSION -docker tag gcr.io/google-containers/addon-resizer-arm64:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm64 -docker tag gcr.io/google-containers/addon-resizer-amd64:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm64 -docker tag gcr.io/google-containers/addon-resizer-arm:$AOM_VERSION $REPO/addon-resizer:$AOM_VERSION-arm +docker tag gcr.io/google-containers/addon-resizer-arm:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-arm +docker tag gcr.io/google-containers/addon-resizer-arm64:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-arm64 +docker tag gcr.io/google-containers/addon-resizer-amd64:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-amd64 -docker push $REPO/addon-resizer:$AOM_VERSION-arm -docker push $REPO/addon-resizer:$AOM_VERSION-arm64 -docker push $REPO/addon-resizer:$AOM_VERSION-amd64 +docker push $REPO/addon-resizer:$AOR_VERSION-arm +docker push $REPO/addon-resizer:$AOR_VERSION-arm64 +docker push $REPO/addon-resizer:$AOR_VERSION-amd64 -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/addon-resizer:$AOM_VERSION-ARCH --target $REPO/addon-resizer:$AOM_VERSION -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64 --template $REPO/addon-resizer:$AOM_VERSION-ARCH --target $REPO/addon-resizer:latest +manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:$AOR_VERSION +manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:latest +#------------------------------------------------------------------------------- +# Prometheus-adapter +# Retag prometheus-adapter from directxman12 images to have unified manifest on DockerHub + +docker pull directxman12/k8s-prometheus-adapter-arm:$PROM_ADAPTER_VERSION +docker pull directxman12/k8s-prometheus-adapter-arm64:$PROM_ADAPTER_VERSION +docker pull directxman12/k8s-prometheus-adapter-amd64:$PROM_ADAPTER_VERSION + +docker tag directxman12/k8s-prometheus-adapter-arm:$PROM_ADAPTER_VERSION $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm +docker tag directxman12/k8s-prometheus-adapter-arm64:$PROM_ADAPTER_VERSION $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm64 +docker tag directxman12/k8s-prometheus-adapter-amd64:$PROM_ADAPTER_VERSION $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-amd64 + +docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm +docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm64 +docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-amd64 + +IMAGE=$REPO/k8s-prometheus-adapter +VERSION=$PROM_ADAPTER_VERSION +ALL_ARCH='amd64 arm arm64' + +docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done +docker manifest push $IMAGE:$VERSION #------------------------------------------------------------------------------- # Kube-state-metrics -export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/kube-state-metrics ALL_ARCH='amd64 arm arm64' +VERSION=$KSM_VERSION go get github.com/kubernetes/kube-state-metrics #mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics @@ -55,15 +83,15 @@ docker push $REPO/kube-state-metrics:$KSM_VERSION-arm docker push $REPO/kube-state-metrics:$KSM_VERSION-arm64 docker push $REPO/kube-state-metrics:$KSM_VERSION-amd64 -docker manifest create --amend $IMAGE:$KSM_VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$KSM_VERSION\-&~g"` -for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$KSM_VERSION $IMAGE:$KSM_VERSION-$arch; done -docker manifest push $IMAGE:$KSM_VERSION +docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done +docker manifest push $IMAGE:$VERSION #------------------------------------------------------------------------------- # Prometheus-operator -export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/prometheus-operator ALL_ARCH='amd64 arm arm64' +VERSION=$PROM_OP_VERSION go get github.com/coreos/prometheus-operator cd $HOME/go/src/github.com/coreos/prometheus-operator @@ -95,7 +123,7 @@ rm Dockerfile.arm64 # kube-rbac-proxy export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/kube-rbac-proxy -VERSION=v0.4.0 +VERSION=$KUBE_RBAC_VERSION ALL_ARCH='amd64 arm arm64' go get github.com/brancz/kube-rbac-proxy @@ -156,7 +184,8 @@ docker manifest push $IMAGE:$VERSION # prometheus-config-reloader export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/prometheus-config-reloader -VERSION=v0.26.0 + +VERSION=$PROM_CONFIG_RELOADER_VERSION ALL_ARCH='amd64 arm arm64' go get github.com/coreos/prometheus-operator @@ -191,7 +220,7 @@ rm Dockerfile.arm64 # configmap-reload export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/configmap-reload -VERSION=v0.2.2 +VERSION=$CONFIGMAP_RELOAD_VERSION ALL_ARCH='amd64 arm arm64' go get github.com/openshift/configmap-reload From 3fa21f619206dc69fcddecb1eb8dcd9fdb62705f Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 31 Jan 2019 18:45:20 -0200 Subject: [PATCH 03/30] Cleanup after build images --- build_images.sh | 39 ++++++++++++++++++++++++++++++++------- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/build_images.sh b/build_images.sh index 45f18bb..a71157f 100644 --- a/build_images.sh +++ b/build_images.sh @@ -19,7 +19,6 @@ docker pull gcr.io/google-containers/addon-resizer-arm:$AOR_VERSION docker pull gcr.io/google-containers/addon-resizer-arm64:$AOR_VERSION docker pull gcr.io/google-containers/addon-resizer-amd64:$AOR_VERSION - docker tag gcr.io/google-containers/addon-resizer-arm:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-arm docker tag gcr.io/google-containers/addon-resizer-arm64:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-arm64 docker tag gcr.io/google-containers/addon-resizer-amd64:$AOR_VERSION $REPO/addon-resizer:$AOR_VERSION-amd64 @@ -28,6 +27,13 @@ docker push $REPO/addon-resizer:$AOR_VERSION-arm docker push $REPO/addon-resizer:$AOR_VERSION-arm64 docker push $REPO/addon-resizer:$AOR_VERSION-amd64 +docker rmi gcr.io/google-containers/addon-resizer-arm:$AOR_VERSION +docker rmi gcr.io/google-containers/addon-resizer-arm64:$AOR_VERSION +docker rmi gcr.io/google-containers/addon-resizer-amd64:$AOR_VERSION +docker rmi $REPO/addon-resizer:$AOR_VERSION-arm +docker rmi $REPO/addon-resizer:$AOR_VERSION-arm64 +docker rmi $REPO/addon-resizer:$AOR_VERSION-amd64 + manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:$AOR_VERSION manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:latest @@ -47,6 +53,13 @@ docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm64 docker push $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-amd64 +docker rmi directxman12/k8s-prometheus-adapter-arm:$PROM_ADAPTER_VERSION +docker rmi directxman12/k8s-prometheus-adapter-arm64:$PROM_ADAPTER_VERSION +docker rmi directxman12/k8s-prometheus-adapter-amd64:$PROM_ADAPTER_VERSION +docker rmi $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm +docker rmi $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-arm64 +docker rmi $REPO/k8s-prometheus-adapter:$PROM_ADAPTER_VERSION-amd64 + IMAGE=$REPO/k8s-prometheus-adapter VERSION=$PROM_ADAPTER_VERSION ALL_ARCH='amd64 arm arm64' @@ -63,13 +76,15 @@ VERSION=$KSM_VERSION go get github.com/kubernetes/kube-state-metrics #mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics -cd $HOME/go/src/k8s.io/kube-state-metrics +pushd $GOPATH/src/k8s.io/kube-state-metrics git checkout ${KSM_VERSION} cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/alpine:3.7/' > Dockerfile.arm cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/alpine:3.7/' > Dockerfile.arm64 +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM amd64\/alpine:3.7/' > Dockerfile.amd64 + GOOS=linux GOARCH=arm go build . docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm -f Dockerfile.arm . @@ -77,16 +92,21 @@ GOOS=linux GOARCH=arm64 go build . docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm64 -f Dockerfile.arm64 . GOOS=linux GOARCH=amd64 go build . -docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-amd64 -f Dockerfile . +docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-amd64 -f Dockerfile.amd64 . docker push $REPO/kube-state-metrics:$KSM_VERSION-arm docker push $REPO/kube-state-metrics:$KSM_VERSION-arm64 docker push $REPO/kube-state-metrics:$KSM_VERSION-amd64 +docker rmi $REPO/kube-state-metrics:$KSM_VERSION-arm +docker rmi $REPO/kube-state-metrics:$KSM_VERSION-arm64 +docker rmi $REPO/kube-state-metrics:$KSM_VERSION-amd64 + docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done docker manifest push $IMAGE:$VERSION +popd #------------------------------------------------------------------------------- # Prometheus-operator IMAGE=carlosedp/prometheus-operator @@ -103,14 +123,20 @@ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/busybox/' > Dockerfile.arm64 +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM amd64\/busybox/' > Dockerfile.amd64 + GOOS=linux GOARCH=arm $GOPATH/bin/promu build --prefix `pwd` docker build -t $REPO/prometheus-operator:${VERSION}-arm -f Dockerfile.arm . GOOS=linux GOARCH=arm64 $GOPATH/bin/promu build --prefix `pwd` docker build -t $REPO/prometheus-operator:${VERSION}-arm64 -f Dockerfile.arm64 . +GOOS=linux GOARCH=amd64 $GOPATH/bin/promu build --prefix `pwd` +docker build -t $REPO/prometheus-operator:${VERSION}-amd64 -f Dockerfile.amd64 . + docker push $REPO/prometheus-operator:$VERSION-arm docker push $REPO/prometheus-operator:$VERSION-arm64 +docker push $REPO/prometheus-operator:$VERSION-amd64 docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done @@ -121,7 +147,6 @@ rm Dockerfile.arm64 #------------------------------------------------------------------------------- # kube-rbac-proxy -export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/kube-rbac-proxy VERSION=$KUBE_RBAC_VERSION ALL_ARCH='amd64 arm arm64' @@ -182,7 +207,6 @@ docker manifest push $IMAGE:$VERSION #------------------------------------------------------------------------------- # prometheus-config-reloader -export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/prometheus-config-reloader VERSION=$PROM_CONFIG_RELOADER_VERSION @@ -196,6 +220,8 @@ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/busybox/' > Dockerfile.arm64 +cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM amd64\/busybox/' > Dockerfile.amd64 + GOOS=linux GOARCH=arm CGO_ENABLED=0 go build -o prometheus-config-reloader main.go docker build -t $IMAGE:$VERSION-arm -f Dockerfile.arm . @@ -203,7 +229,7 @@ GOOS=linux GOARCH=arm64 CGO_ENABLED=0 go build -o prometheus-config-reloader mai docker build -t $IMAGE:$VERSION-arm64 -f Dockerfile.arm64 . GOOS=linux GOARCH=amd64 CGO_ENABLED=0 go build -o prometheus-config-reloader main.go -docker build -t $IMAGE:$VERSION-amd64 -f Dockerfile . +docker build -t $IMAGE:$VERSION-amd64 -f Dockerfile.amd64 . docker push $IMAGE:$VERSION-arm docker push $IMAGE:$VERSION-arm64 @@ -218,7 +244,6 @@ rm Dockerfile.arm64 #------------------------------------------------------------------------------- # configmap-reload -export DOCKER_CLI_EXPERIMENTAL=enabled IMAGE=carlosedp/configmap-reload VERSION=$CONFIGMAP_RELOAD_VERSION ALL_ARCH='amd64 arm arm64' From ae215c494737786a129a95cabf89fdc4ade02ccb Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 31 Jan 2019 19:04:07 -0200 Subject: [PATCH 04/30] Fix script --- build_images.sh | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/build_images.sh b/build_images.sh index a71157f..f239884 100644 --- a/build_images.sh +++ b/build_images.sh @@ -1,16 +1,19 @@ #!/bin/bash +# Build images for Prometheus Operator and dependencies +# Run on Linux AMD64 machine due to qemu image for rbac-proxy + export DOCKER_CLI_EXPERIMENTAL=enabled REPO=carlosedp AOR_VERSION=2.1 -KSM_VERSION=v1.4.0 -PROM_OP_VERSION=v0.26.0 -PROMCONFIGRELOADER_VERSION=v0.20.0 PROM_ADAPTER_VERSION=v0.4.1 -PROM_CONFIG_RELOADER_VERSION=v0.26.0 -KUBE_RBAC_VERSION=v0.4.0 +KSM_VERSION=v1.5.0 +PROM_OP_VERSION=v0.28.0 +KUBE_RBAC_VERSION=v0.4.1 +PROMCONFIGRELOADER_VERSION=v0.20.0 +PROM_CONFIG_RELOADER_VERSION=v0.28.0 CONFIGMAP_RELOAD_VERSION=v0.2.2 #------------------------------------------------------------------------------- # Kubernetes addon-resizer @@ -37,6 +40,13 @@ docker rmi $REPO/addon-resizer:$AOR_VERSION-amd64 manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:$AOR_VERSION manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:latest +IMAGE=$REPO/addon-resizer +VERSION=$AOR_VERSION +ALL_ARCH='amd64 arm arm64' + +docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` +for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done +docker manifest push $IMAGE:$VERSION #------------------------------------------------------------------------------- # Prometheus-adapter # Retag prometheus-adapter from directxman12 images to have unified manifest on DockerHub @@ -77,6 +87,7 @@ VERSION=$KSM_VERSION go get github.com/kubernetes/kube-state-metrics #mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics pushd $GOPATH/src/k8s.io/kube-state-metrics +git pull git checkout ${KSM_VERSION} cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/alpine:3.7/' > Dockerfile.arm @@ -115,6 +126,7 @@ VERSION=$PROM_OP_VERSION go get github.com/coreos/prometheus-operator cd $HOME/go/src/github.com/coreos/prometheus-operator +git pull git checkout ${VERSION} go get -u github.com/prometheus/promu @@ -153,6 +165,7 @@ ALL_ARCH='amd64 arm arm64' go get github.com/brancz/kube-rbac-proxy cd $HOME/go/src/github.com/brancz/kube-rbac-proxy +git pull git checkout ${VERSION} cat > Dockerfile.arm < Dockerfile.arm @@ -250,7 +264,8 @@ ALL_ARCH='amd64 arm arm64' go get github.com/openshift/configmap-reload cd $HOME/go/src/github.com/openshift/configmap-reload -#git checkout ${VERSION} +git pull +git checkout ${VERSION} cat > Dockerfile.arm < Date: Thu, 31 Jan 2019 19:06:10 -0200 Subject: [PATCH 05/30] Clean wrong command --- build_images.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/build_images.sh b/build_images.sh index f239884..41bbcee 100644 --- a/build_images.sh +++ b/build_images.sh @@ -37,9 +37,6 @@ docker rmi $REPO/addon-resizer:$AOR_VERSION-arm docker rmi $REPO/addon-resizer:$AOR_VERSION-arm64 docker rmi $REPO/addon-resizer:$AOR_VERSION-amd64 -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:$AOR_VERSION -manifest-tool-linux-arm64 push from-args --platforms linux/arm,linux/arm64,linux/amd64 --template $REPO/addon-resizer:$AOR_VERSION-ARCH --target $REPO/addon-resizer:latest - IMAGE=$REPO/addon-resizer VERSION=$AOR_VERSION ALL_ARCH='amd64 arm arm64' From 3dabb24eb15f482539f95e5ff0b27113b3942426 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Thu, 31 Jan 2019 19:47:22 -0200 Subject: [PATCH 06/30] Fix script --- build_images.sh | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/build_images.sh b/build_images.sh index 41bbcee..66adf74 100644 --- a/build_images.sh +++ b/build_images.sh @@ -12,7 +12,6 @@ PROM_ADAPTER_VERSION=v0.4.1 KSM_VERSION=v1.5.0 PROM_OP_VERSION=v0.28.0 KUBE_RBAC_VERSION=v0.4.1 -PROMCONFIGRELOADER_VERSION=v0.20.0 PROM_CONFIG_RELOADER_VERSION=v0.28.0 CONFIGMAP_RELOAD_VERSION=v0.2.2 #------------------------------------------------------------------------------- @@ -84,7 +83,7 @@ VERSION=$KSM_VERSION go get github.com/kubernetes/kube-state-metrics #mv $HOME/go/src/github.com/kubernetes/kube-state-metrics $HOME/go/src/k8s.io/kube-state-metrics pushd $GOPATH/src/k8s.io/kube-state-metrics -git pull +git fetch git checkout ${KSM_VERSION} cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm32v6\/alpine:3.7/' > Dockerfile.arm @@ -123,7 +122,7 @@ VERSION=$PROM_OP_VERSION go get github.com/coreos/prometheus-operator cd $HOME/go/src/github.com/coreos/prometheus-operator -git pull +git fetch git checkout ${VERSION} go get -u github.com/prometheus/promu @@ -162,7 +161,7 @@ ALL_ARCH='amd64 arm arm64' go get github.com/brancz/kube-rbac-proxy cd $HOME/go/src/github.com/brancz/kube-rbac-proxy -git pull +git fetch git checkout ${VERSION} cat > Dockerfile.arm < Dockerfile.arm +sed -i '/^FROM/a COPY qemu-arm-static /usr/bin/qemu-arm-static' Dockerfile.arm +sed -i '/^RUN/a RUN rm /usr/bin/qemu-arm-static' Dockerfile.arm cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM arm64v8\/busybox/' > Dockerfile.arm64 +sed -i '/^FROM/a COPY qemu-aarch64-static /usr/bin/qemu-aarch64-static' Dockerfile.arm64 +sed -i '/^RUN/a RUN rm /usr/bin/qemu-aarch64-static' Dockerfile.arm64 cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM amd64\/busybox/' > Dockerfile.amd64 @@ -252,6 +260,7 @@ docker manifest push $IMAGE:$VERSION rm Dockerfile.arm rm Dockerfile.arm64 +rm Dockerfile.amd64 #------------------------------------------------------------------------------- # configmap-reload @@ -261,7 +270,7 @@ ALL_ARCH='amd64 arm arm64' go get github.com/openshift/configmap-reload cd $HOME/go/src/github.com/openshift/configmap-reload -git pull +git fetch git checkout ${VERSION} cat > Dockerfile.arm < Date: Fri, 1 Feb 2019 11:43:22 -0200 Subject: [PATCH 07/30] Added Traefik. Updated image versions --- arm_exporter.jsonnet | 5 +- main.jsonnet | 3 +- .../0prometheus-operator-deployment.yaml | 4 +- manifests/arm-exporter-daemonset.yaml | 8 +- manifests/grafana-deployment.yaml | 17 +---- manifests/node-exporter-daemonset.yaml | 2 +- manifests/prometheus-adapter-deployment.yaml | 2 +- manifests/traefik-serviceMonitor.yaml | 19 +++++ operator_stack.jsonnet | 74 +++++++++---------- traefik.jsonnet | 45 +++++++++++ 10 files changed, 114 insertions(+), 65 deletions(-) create mode 100644 manifests/traefik-serviceMonitor.yaml create mode 100644 traefik.jsonnet diff --git a/arm_exporter.jsonnet b/arm_exporter.jsonnet index cb5dc20..21f0bbb 100644 --- a/arm_exporter.jsonnet +++ b/arm_exporter.jsonnet @@ -25,9 +25,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { local armExporter = container.new('arm-exporter', $._config.imageRepos.armExporter + ':' + $._config.versions.armExporter) + - container.mixin.resources.withRequests({ cpu: '100m', memory: '180Mi' }) + - container.mixin.resources.withLimits({ cpu: '200m', memory: '180Mi' }); - + container.mixin.resources.withRequests({ cpu: '50m', memory: '50Mi' }) + + container.mixin.resources.withLimits({ cpu: '100m', memory: '100Mi' }); local proxy = container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ diff --git a/main.jsonnet b/main.jsonnet index 6099c2c..e594914 100644 --- a/main.jsonnet +++ b/main.jsonnet @@ -1,2 +1,3 @@ (import 'operator_stack.jsonnet') + -(import 'arm_exporter.jsonnet') +(import 'arm_exporter.jsonnet') + +(import 'traefik.jsonnet') diff --git a/manifests/0prometheus-operator-deployment.yaml b/manifests/0prometheus-operator-deployment.yaml index 833bec9..eb02935 100644 --- a/manifests/0prometheus-operator-deployment.yaml +++ b/manifests/0prometheus-operator-deployment.yaml @@ -20,8 +20,8 @@ spec: - --kubelet-service=kube-system/kubelet - --logtostderr=true - --config-reloader-image=carlosedp/configmap-reload:v0.2.2 - - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.26.0 - image: carlosedp/prometheus-operator:v0.26.0 + - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.28.0 + image: carlosedp/prometheus-operator:v0.28.0 name: prometheus-operator ports: - containerPort: 8080 diff --git a/manifests/arm-exporter-daemonset.yaml b/manifests/arm-exporter-daemonset.yaml index 1d247d0..11b3703 100644 --- a/manifests/arm-exporter-daemonset.yaml +++ b/manifests/arm-exporter-daemonset.yaml @@ -19,11 +19,11 @@ spec: name: arm-exporter resources: limits: - cpu: 200m - memory: 180Mi - requests: cpu: 100m - memory: 180Mi + memory: 100Mi + requests: + cpu: 50m + memory: 50Mi - args: - --secure-listen-address=$(IP):9243 - --upstream=http://127.0.0.1:9243/ diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 1ef1d7b..6e0b26f 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,9 +16,7 @@ spec: app: grafana spec: containers: - - args: - - -config=/etc/grafana/grafana.ini - image: carlosedp/monitoring-grafana:v5.4.0 + - image: carlosedp/monitoring-grafana:v5.4.0 name: grafana ports: - containerPort: 3000 @@ -117,16 +115,3 @@ spec: - name: grafana-config secret: secretName: grafana-config - volumeClaimTemplate: - apiVersion: v1 - kind: PersistentVolumeClaim - metadata: - name: grafana-storage - namespace: monitoring - spec: - accessModes: - - ReadWriteMany - resources: - requests: - storage: 2Gi - storageClassName: nfs-ssd-node1 diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 9b781c7..ed85468 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -49,7 +49,7 @@ spec: valueFrom: fieldRef: fieldPath: status.podIP - image: carlosedp/kube-rbac-proxy:v0.4.0 + image: carlosedp/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy ports: - containerPort: 9100 diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index ddf6a32..cdba964 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -25,7 +25,7 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - --secure-port=6443 - image: directxman12/k8s-prometheus-adapter-arm64:v0.4.0 + image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1 name: prometheus-adapter ports: - containerPort: 6443 diff --git a/manifests/traefik-serviceMonitor.yaml b/manifests/traefik-serviceMonitor.yaml new file mode 100644 index 0000000..fdf7d3d --- /dev/null +++ b/manifests/traefik-serviceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: traefik-ingress-lb + name: traefik-ingress-lb + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: admin + scheme: https + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: traefik-ingress-lb diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 8b2b49f..7b36362 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -17,15 +17,15 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { versions+:: { prometheus: "v2.5.0", alertmanager: "v0.15.3", - kubeStateMetrics: "v1.4.0", - kubeRbacProxy: "v0.4.0", + kubeStateMetrics: "v1.5.0", + kubeRbacProxy: "v0.4.1", addonResizer: "2.1", nodeExporter: "v0.17.0", - prometheusOperator: "v0.26.0", - prometheusAdapter: "v0.4.0", + prometheusOperator: "v0.28.0", + prometheusAdapter: "v0.4.1", grafana: "v5.4.0", configmapReloader: "v0.2.2", - prometheusConfigReloader: "v0.26.0", + prometheusConfigReloader: "v0.28.0", }, imageRepos+:: { @@ -136,38 +136,38 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { }, # Override command for Grafana to load ini from correct path - grafana+:: { - deployment+: - { - local pvc = k.core.v1.persistentVolumeClaim, - spec+: { - volumeClaimTemplate: - pvc.new() + - pvc.mixin.metadata.withNamespace($._config.namespace) + - pvc.mixin.metadata.withName("grafana-storage") + - pvc.mixin.spec.withAccessModes('ReadWriteMany') + - pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }) + - pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), - template+: { - spec+: { - containers: - std.map( - function(c) - if c.name == 'grafana' then - c { - args+: [ - '-config=/etc/grafana/grafana.ini', - ], - } - else - c, - super.containers, - ), - }, - }, - }, - }, - }, + // grafana+:: { + // deployment+: + // { + // local pvc = k.core.v1.persistentVolumeClaim, + // spec+: { + // volumeClaimTemplate: + // pvc.new() + + // pvc.mixin.metadata.withNamespace($._config.namespace) + + // pvc.mixin.metadata.withName("grafana-storage") + + // pvc.mixin.spec.withAccessModes('ReadWriteMany') + + // pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }) + + // pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), + // template+: { + // spec+: { + // containers: + // std.map( + // function(c) + // if c.name == 'grafana' then + // c { + // args+: [ + // '-config=/etc/grafana/grafana.ini', + // ], + // } + // else + // c, + // super.containers, + // ), + // }, + // }, + // }, + // }, + // }, // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset kubeStateMetrics+:: { diff --git a/traefik.jsonnet b/traefik.jsonnet new file mode 100644 index 0000000..9fa0926 --- /dev/null +++ b/traefik.jsonnet @@ -0,0 +1,45 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { + _config+:: { + namespace: 'monitoring', + }, + + traefik+:: { + serviceMonitor: + { + + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'traefik-ingress-lb', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'traefik-ingress-lb', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'traefik-ingress-lb', + }, + }, + endpoints: [ + { + port: 'admin', + scheme: 'https', + interval: '30s', + }, + ], + namespaceSelector: { + matchNames: [ + 'kube-system', + ] + }, + }, + }, + }, +}; + +{ ['traefik-' + name]: kp.traefik[name] for name in std.objectFields(kp.traefik) } \ No newline at end of file From 5065d369e13e144ddb94c93693bbb1b637cded4d Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Fri, 1 Feb 2019 11:52:53 -0200 Subject: [PATCH 08/30] Updated vendor libraries --- jsonnetfile.lock.json | 14 ++-- ...0alertmanagerCustomResourceDefinition.yaml | 6 ++ ...r-0prometheusCustomResourceDefinition.yaml | 27 ++++++++ manifests/grafana-dashboardDefinitions.yaml | 28 +++++++- manifests/grafana-deployment.yaml | 2 + manifests/grafana-service.yaml | 2 + manifests/grafana-serviceMonitor.yaml | 12 ++++ manifests/kube-state-metrics-clusterRole.yaml | 7 ++ manifests/node-exporter-daemonset.yaml | 3 + manifests/prometheus-adapter-deployment.yaml | 4 +- ...metheus-roleBindingSpecificNamespaces.yaml | 13 ++++ .../prometheus-roleSpecificNamespaces.yaml | 19 +++++- manifests/prometheus-rules.yaml | 67 ++++++++++--------- manifests/prometheus-service.yaml | 1 + .../prometheus-serviceMonitorCoreDNS.yaml | 1 + operator_stack.jsonnet | 4 +- 16 files changed, 164 insertions(+), 46 deletions(-) create mode 100644 manifests/grafana-serviceMonitor.yaml diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index be90246..efbc09a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "9536d7787789b74b692cd8a5482a2801b1aba232" + "version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "4c23c06fff9ef50744f5ed306c9ab0c4bd78a144" + "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217" }, { "name": "grafonnet", @@ -38,7 +38,7 @@ "subdir": "grafonnet" } }, - "version": "eea8b5ba6b8883cf2df5a17c39a42c4b57c0d63e" + "version": "11022f5e920ac1ea960556193e3f0ab57d70d7c5" }, { "name": "grafana-builder", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "c6932cf90bce4fef218b4308effc9f15c4219a01" + "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94" }, { "name": "grafana", @@ -58,7 +58,7 @@ "subdir": "grafana" } }, - "version": "da19aef6f5b378fb5281e6f61dbadbbf734d45ee" + "version": "9ddf5a198b0f7c898dc061158ea427112acbae11" }, { "name": "prometheus-operator", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "72ec4b9b16ef11700724dc71fec77112536eed40" + "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "15b6a17be48dea91a11497980b9adab541add7f0" + "version": "6070db22ed3d46372a5600fe8f35907f4d706bdb" } ] } diff --git a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml index d5c94fc..89748f1 100644 --- a/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0alertmanagerCustomResourceDefinition.yaml @@ -1378,6 +1378,12 @@ spec: under. This is necessary to generate correct URLs. This is necessary if Alertmanager is not served from root of a DNS name. type: string + image: + description: Image if specified has precedence over baseImage, tag and + sha combinations. Specifying the version is still necessary to ensure + the Prometheus Operator knows what version of Alertmanager is being + configured. + type: string imagePullSecrets: description: An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index d825277..627ce96 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1550,6 +1550,12 @@ spec: under. This is necessary to generate correct URLs. This is necessary if Prometheus is not served from root of a DNS name. type: string + image: + description: Image if specified has precedence over baseImage, tag and + sha combinations. Specifying the version is still necessary to ensure + the Prometheus Operator knows what version of Prometheus is being + configured. + type: string imagePullSecrets: description: An optional list of references to secrets in the same namespace to use for pulling prometheus and alertmanager images from registries @@ -1863,6 +1869,21 @@ spec: priorityClassName: description: Priority class assigned to the Pods type: string + query: + description: QuerySpec defines the query command line flags when starting + Prometheus. + properties: + lookbackDelta: + description: The delta difference allowed for retrieving metrics + during expression evaluations. + type: string + maxConcurrency: + description: Number of concurrent queries that can be run at once. + format: int32 + type: integer + timeout: + description: Maximum time a query may take before being aborted. + type: string remoteRead: description: If specified, the remote_read spec. This is an experimental feature, it may change in any upcoming release in a breaking way. @@ -2943,6 +2964,12 @@ spec: type: boolean required: - key + image: + description: Image if specified has precedence over baseImage, tag + and sha combinations. Specifying the version is still necessary + to ensure the Prometheus Operator knows what version of Thanos + is being configured. + type: string peers: description: Peers is a DNS name for Thanos to discover peers through. type: string diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 4900caa..f3374ed 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4896,6 +4896,12 @@ items: data: nodes.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -6208,6 +6214,12 @@ items: data: persistentvolumesusage.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -6551,6 +6563,12 @@ items: data: pods.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ @@ -6730,7 +6748,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", + "expr": "sum by (container_name) (rate(container_cpu_usage_seconds_total{job=\"kubelet\", namespace=\"$namespace\", image!=\"\",container_name!=\"POD\",pod_name=\"$pod\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ container_name }}", @@ -6833,7 +6851,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", pod_name=\"$pod\"}[1m])))", + "expr": "sort_desc(sum by (pod_name) (rate(container_network_receive_bytes_total{job=\"kubelet\", namespace=\"$namespace\", pod_name=\"$pod\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ pod_name }}", @@ -7035,6 +7053,12 @@ items: data: statefulset.json: |- { + "__inputs": [ + + ], + "__requires": [ + + ], "annotations": { "list": [ diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 6e0b26f..47b3034 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -72,6 +72,8 @@ spec: - mountPath: /etc/grafana name: grafana-config readOnly: false + nodeSelector: + beta.kubernetes.io/os: linux securityContext: runAsNonRoot: true runAsUser: 65534 diff --git a/manifests/grafana-service.yaml b/manifests/grafana-service.yaml index 45f77a0..3acdf1e 100644 --- a/manifests/grafana-service.yaml +++ b/manifests/grafana-service.yaml @@ -1,6 +1,8 @@ apiVersion: v1 kind: Service metadata: + labels: + app: grafana name: grafana namespace: monitoring spec: diff --git a/manifests/grafana-serviceMonitor.yaml b/manifests/grafana-serviceMonitor.yaml new file mode 100644 index 0000000..7ede266 --- /dev/null +++ b/manifests/grafana-serviceMonitor.yaml @@ -0,0 +1,12 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: grafana + namespace: monitoring +spec: + endpoints: + - interval: 15s + port: http + selector: + matchLabels: + app: grafana diff --git a/manifests/kube-state-metrics-clusterRole.yaml b/manifests/kube-state-metrics-clusterRole.yaml index c519a91..b939df6 100644 --- a/manifests/kube-state-metrics-clusterRole.yaml +++ b/manifests/kube-state-metrics-clusterRole.yaml @@ -67,3 +67,10 @@ rules: - subjectaccessreviews verbs: - create +- apiGroups: + - policy + resources: + - poddisruptionbudgets + verbs: + - list + - watch diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index ed85468..213714d 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -19,6 +19,7 @@ spec: - --web.listen-address=127.0.0.1:9100 - --path.procfs=/host/proc - --path.sysfs=/host/sys + - --path.rootfs=/host/root - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ image: carlosedp/node_exporter:v0.17.0 @@ -42,7 +43,9 @@ spec: name: root readOnly: true - args: + - --logtostderr - --secure-listen-address=$(IP):9100 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - --upstream=http://127.0.0.1:9100/ env: - name: IP diff --git a/manifests/prometheus-adapter-deployment.yaml b/manifests/prometheus-adapter-deployment.yaml index cdba964..a7f6028 100644 --- a/manifests/prometheus-adapter-deployment.yaml +++ b/manifests/prometheus-adapter-deployment.yaml @@ -25,7 +25,7 @@ spec: - --metrics-relist-interval=1m - --prometheus-url=http://prometheus-k8s.monitoring.svc:9090/ - --secure-port=6443 - image: directxman12/k8s-prometheus-adapter-arm64:v0.4.1 + image: carlosedp/k8s-prometheus-adapter:v0.4.1 name: prometheus-adapter ports: - containerPort: 6443 @@ -39,6 +39,8 @@ spec: - mountPath: /etc/adapter name: config readOnly: false + nodeSelector: + beta.kubernetes.io/os: linux serviceAccountName: prometheus-adapter volumes: - emptyDir: {} diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml index c7527f6..087cdcf 100644 --- a/manifests/prometheus-roleBindingSpecificNamespaces.yaml +++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -39,4 +39,17 @@ items: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: logging + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring kind: RoleBindingList diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml index b305774..56a7370 100644 --- a/manifests/prometheus-roleSpecificNamespaces.yaml +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -9,7 +9,6 @@ items: - apiGroups: - "" resources: - - nodes - services - endpoints - pods @@ -26,7 +25,6 @@ items: - apiGroups: - "" resources: - - nodes - services - endpoints - pods @@ -43,7 +41,22 @@ items: - apiGroups: - "" resources: - - nodes + - services + - endpoints + - pods + verbs: + - get + - list + - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: logging + rules: + - apiGroups: + - "" + resources: - services - endpoints - pods diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 05e0deb..443943c 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -288,21 +288,24 @@ spec: record: 'node:node_inodes_free:' - name: kube-prometheus-node-recording.rules rules: - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[3m])) BY (instance) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY + (instance) record: instance:node_cpu:rate:sum - - expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) + - expr: sum((node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"})) BY (instance) record: instance:node_filesystem_usage:sum - - expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) + - expr: sum(rate(node_network_receive_bytes_total[3m])) BY (instance) record: instance:node_network_receive_bytes:rate:sum - - expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) + - expr: sum(rate(node_network_transmit_bytes_total[3m])) BY (instance) record: instance:node_network_transmit_bytes:rate:sum - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) WITHOUT (cpu, mode) - / ON(instance) GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) WITHOUT + (cpu, mode) / ON(instance) GROUP_LEFT() count(sum(node_cpu_seconds_total) + BY (instance, cpu)) BY (instance) record: instance:node_cpu:ratio - - expr: sum(rate(node_cpu{mode!="idle",mode!="iowait"}[5m])) + - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[5m])) record: cluster:node_cpu:sum_rate5m - - expr: cluster:node_cpu:rate5m / count(sum(node_cpu) BY (instance, cpu)) + - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) + BY (instance, cpu)) record: cluster:node_cpu:ratio - name: kubernetes-absent rules: @@ -311,7 +314,7 @@ spec: message: Alertmanager has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-alertmanagerdown expr: | - absent(up{job="alertmanager-main"} == 1) + absent(up{job="alertmanager-main",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -383,7 +386,7 @@ spec: message: Prometheus has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusdown expr: | - absent(up{job="prometheus-k8s"} == 1) + absent(up{job="prometheus-k8s",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -392,7 +395,7 @@ spec: message: PrometheusOperator has disappeared from Prometheus target discovery. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-prometheusoperatordown expr: | - absent(up{job="prometheus-operator"} == 1) + absent(up{job="prometheus-operator",namespace="monitoring"} == 1) for: 15m labels: severity: critical @@ -626,8 +629,8 @@ spec: }} for container {{ $labels.container_name }} in pod {{ $labels.pod_name }}.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-cputhrottlinghigh - expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{}[5m])) - by (container_name, pod_name, namespace) \n / \nsum(increase(container_cpu_cfs_periods_total{}[5m])) + expr: "100 * sum(increase(container_cpu_cfs_throttled_periods_total{container_name!=\"\", + }[5m])) by (container_name, pod_name, namespace)\n /\nsum(increase(container_cpu_cfs_periods_total{}[5m])) by (container_name, pod_name, namespace)\n > 25 \n" for: 15m labels: @@ -773,7 +776,8 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - message: Kubernetes API certificate is expiring in less than 7 days. + message: A client certificate used to authenticate to the apiserver is expiring + in less than 7 days. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 @@ -781,7 +785,8 @@ spec: severity: warning - alert: KubeClientCertificateExpiration annotations: - message: Kubernetes API certificate is expiring in less than 24 hours. + message: A client certificate used to authenticate to the apiserver is expiring + in less than 24 hours. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration expr: | histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 @@ -794,7 +799,7 @@ spec: message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}` are out of sync. expr: | - count_values("config_hash", alertmanager_config_hash{job="alertmanager-main"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 + count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1 for: 5m labels: severity: critical @@ -803,7 +808,7 @@ spec: message: Reloading Alertmanager's configuration has failed for {{ $labels.namespace }}/{{ $labels.pod}}. expr: | - alertmanager_config_last_reload_successful{job="alertmanager-main"} == 0 + alertmanager_config_last_reload_successful{job="alertmanager-main",namespace="monitoring"} == 0 for: 10m labels: severity: warning @@ -811,9 +816,9 @@ spec: annotations: message: Alertmanager has not found all other members of the cluster. expr: | - alertmanager_cluster_members{job="alertmanager-main"} + alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"} != on (service) GROUP_LEFT() - count by (service) (alertmanager_cluster_members{job="alertmanager-main"}) + count by (service) (alertmanager_cluster_members{job="alertmanager-main",namespace="monitoring"}) for: 5m labels: severity: critical @@ -860,7 +865,7 @@ spec: description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} summary: Reloading Prometheus' configuration failed expr: | - prometheus_config_last_reload_successful{job="prometheus-k8s"} == 0 + prometheus_config_last_reload_successful{job="prometheus-k8s",namespace="monitoring"} == 0 for: 10m labels: severity: warning @@ -870,7 +875,7 @@ spec: $labels.pod}} summary: Prometheus' alert notification queue is running full expr: | - predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s"} + predict_linear(prometheus_notifications_queue_length{job="prometheus-k8s",namespace="monitoring"}[5m], 60 * 30) > prometheus_notifications_queue_capacity{job="prometheus-k8s",namespace="monitoring"} for: 10m labels: severity: warning @@ -880,7 +885,7 @@ spec: $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alert from Prometheus expr: | - rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.01 + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.01 for: 10m labels: severity: warning @@ -890,7 +895,7 @@ spec: $labels.pod}} to Alertmanager {{$labels.Alertmanager}} summary: Errors while sending alerts from Prometheus expr: | - rate(prometheus_notifications_errors_total{job="prometheus-k8s"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s"}[5m]) > 0.03 + rate(prometheus_notifications_errors_total{job="prometheus-k8s",namespace="monitoring"}[5m]) / rate(prometheus_notifications_sent_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0.03 for: 10m labels: severity: critical @@ -900,7 +905,7 @@ spec: to any Alertmanagers summary: Prometheus is not connected to any Alertmanagers expr: | - prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s"} < 1 + prometheus_notifications_alertmanagers_discovered{job="prometheus-k8s",namespace="monitoring"} < 1 for: 10m labels: severity: warning @@ -910,7 +915,7 @@ spec: reload failures over the last four hours.' summary: Prometheus has issues reloading data blocks from disk expr: | - increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s"}[2h]) > 0 + increase(prometheus_tsdb_reloads_failures_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0 for: 12h labels: severity: warning @@ -920,7 +925,7 @@ spec: compaction failures over the last four hours.' summary: Prometheus has issues compacting sample blocks expr: | - increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s"}[2h]) > 0 + increase(prometheus_tsdb_compactions_failed_total{job="prometheus-k8s",namespace="monitoring"}[2h]) > 0 for: 12h labels: severity: warning @@ -930,7 +935,7 @@ spec: log (WAL).' summary: Prometheus write-ahead log is corrupted expr: | - tsdb_wal_corruptions_total{job="prometheus-k8s"} > 0 + tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"} > 0 for: 4h labels: severity: warning @@ -940,7 +945,7 @@ spec: samples. summary: Prometheus isn't ingesting samples expr: | - rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s"}[5m]) <= 0 + rate(prometheus_tsdb_head_samples_appended_total{job="prometheus-k8s",namespace="monitoring"}[5m]) <= 0 for: 10m labels: severity: warning @@ -950,7 +955,7 @@ spec: due to duplicate timestamps but different values' summary: Prometheus has many samples rejected expr: | - increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s"}[5m]) > 0 + increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{job="prometheus-k8s",namespace="monitoring"}[5m]) > 0 for: 10m labels: severity: warning @@ -961,7 +966,7 @@ spec: message: Errors while reconciling {{ $labels.controller }} in {{ $labels.namespace }} Namespace. expr: | - rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator"}[5m]) > 0.1 + rate(prometheus_operator_reconcile_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning @@ -969,7 +974,7 @@ spec: annotations: message: Errors while reconciling Prometheus in {{ $labels.namespace }} Namespace. expr: | - rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator"}[5m]) > 0.1 + rate(prometheus_operator_node_address_lookup_errors_total{job="prometheus-operator",namespace="monitoring"}[5m]) > 0.1 for: 10m labels: severity: warning diff --git a/manifests/prometheus-service.yaml b/manifests/prometheus-service.yaml index 85b007f..4f61e88 100644 --- a/manifests/prometheus-service.yaml +++ b/manifests/prometheus-service.yaml @@ -13,3 +13,4 @@ spec: selector: app: prometheus prometheus: k8s + sessionAffinity: ClientIP diff --git a/manifests/prometheus-serviceMonitorCoreDNS.yaml b/manifests/prometheus-serviceMonitorCoreDNS.yaml index 14a2454..633aa18 100644 --- a/manifests/prometheus-serviceMonitorCoreDNS.yaml +++ b/manifests/prometheus-serviceMonitorCoreDNS.yaml @@ -10,6 +10,7 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token interval: 15s port: metrics + jobLabel: k8s-app namespaceSelector: matchNames: - kube-system diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 7b36362..71150ef 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { addonResizer: "carlosedp/addon-resizer", nodeExporter: "carlosedp/node_exporter", prometheusOperator: "carlosedp/prometheus-operator", - prometheusAdapter: "directxman12/k8s-prometheus-adapter-arm64", + prometheusAdapter: "carlosedp/k8s-prometheus-adapter", grafana: "carlosedp/monitoring-grafana", configmapReloader: "carlosedp/configmap-reload", prometheusConfigReloader: "carlosedp/prometheus-config-reloader", @@ -45,7 +45,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { prometheus+:: { names: 'k8s', replicas: 1, - namespaces: ["default", "kube-system","monitoring"], + namespaces: ["default", "kube-system","monitoring","logging"], }, alertmanager+:: { From 6405a1ecf70042ee4cc9fe71861cd925eed3a739 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 17:22:30 -0200 Subject: [PATCH 09/30] Fix image building script --- build_images.sh | 40 ++++++++++++++++------------------------ 1 file changed, 16 insertions(+), 24 deletions(-) mode change 100644 => 100755 build_images.sh diff --git a/build_images.sh b/build_images.sh old mode 100644 new mode 100755 index 66adf74..dda3680 --- a/build_images.sh +++ b/build_images.sh @@ -42,7 +42,7 @@ ALL_ARCH='amd64 arm arm64' docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done -docker manifest push $IMAGE:$VERSION +docker manifest push --purge $IMAGE:$VERSION #------------------------------------------------------------------------------- # Prometheus-adapter # Retag prometheus-adapter from directxman12 images to have unified manifest on DockerHub @@ -72,7 +72,7 @@ ALL_ARCH='amd64 arm arm64' docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done -docker manifest push $IMAGE:$VERSION +docker manifest push --purge $IMAGE:$VERSION #------------------------------------------------------------------------------- # Kube-state-metrics @@ -92,26 +92,22 @@ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^ cat Dockerfile |sed -e 's/\.build\/linux-amd64\/operator/operator/' |sed -e 's/^FROM.*/FROM amd64\/alpine:3.7/' > Dockerfile.amd64 -GOOS=linux GOARCH=arm go build . +CGO_ENABLED=0 GOOS=linux GOARCH=arm go build . docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm -f Dockerfile.arm . -GOOS=linux GOARCH=arm64 go build . +CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build . docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-arm64 -f Dockerfile.arm64 . -GOOS=linux GOARCH=amd64 go build . +CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -installsuffix cgo -ldflags '-extldflags "-static"' . docker build -t $REPO/kube-state-metrics:${KSM_VERSION}-amd64 -f Dockerfile.amd64 . docker push $REPO/kube-state-metrics:$KSM_VERSION-arm docker push $REPO/kube-state-metrics:$KSM_VERSION-arm64 docker push $REPO/kube-state-metrics:$KSM_VERSION-amd64 -docker rmi $REPO/kube-state-metrics:$KSM_VERSION-arm -docker rmi $REPO/kube-state-metrics:$KSM_VERSION-arm64 -docker rmi $REPO/kube-state-metrics:$KSM_VERSION-amd64 - docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done -docker manifest push $IMAGE:$VERSION +docker manifest push --purge $IMAGE:$VERSION popd #------------------------------------------------------------------------------- @@ -148,7 +144,7 @@ docker push $REPO/prometheus-operator:$VERSION-amd64 docker manifest create --amend $IMAGE:$VERSION `echo $ALL_ARCH | sed -e "s~[^ ]*~$IMAGE:$VERSION\-&~g"` for arch in $ALL_ARCH; do docker manifest annotate --arch $arch $IMAGE:$VERSION $IMAGE:$VERSION-$arch; done -docker manifest push $IMAGE:$VERSION +docker manifest push --purge $IMAGE:$VERSION rm Dockerfile.arm rm Dockerfile.arm64 @@ -184,7 +180,7 @@ EXPOSE 8080 EOF cat > Dockerfile.amd64 < Date: Mon, 4 Feb 2019 17:23:24 -0200 Subject: [PATCH 10/30] Fix arm_exporter --- arm_exporter.jsonnet | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arm_exporter.jsonnet b/arm_exporter.jsonnet index 21f0bbb..e5a0792 100644 --- a/arm_exporter.jsonnet +++ b/arm_exporter.jsonnet @@ -1,18 +1,10 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'image_sources_versions.jsonnet') + + { _config+:: { namespace: 'monitoring', - - versions+:: { - armExporter: 'latest', - kubeRbacProxy: 'v0.4.0', - }, - - imageRepos+:: { - armExporter: 'carlosedp/arm_exporter', - kubeRbacProxy: 'carlosedp/kube-rbac-proxy', - }, }, armExporter+:: { From e48d9116329db84b05a88360a7902e1ea2b3fbcc Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 17:23:51 -0200 Subject: [PATCH 11/30] Update libs --- image_sources_versions.jsonnet | 35 ++++++++++++++++++++++++++++++++++ jsonnetfile.lock.json | 4 ++-- 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 image_sources_versions.jsonnet diff --git a/image_sources_versions.jsonnet b/image_sources_versions.jsonnet new file mode 100644 index 0000000..9151408 --- /dev/null +++ b/image_sources_versions.jsonnet @@ -0,0 +1,35 @@ +{ + _config+:: { + versions+:: { + prometheus: "v2.7.0", + alertmanager: "v0.16.0", + kubeStateMetrics: "v1.5.0", + kubeRbacProxy: "v0.4.1", + addonResizer: "2.1", + nodeExporter: "v0.17.0", + prometheusOperator: "v0.28.0", + prometheusAdapter: "v0.4.1", + grafana: "5.4.3", + configmapReloader: "v0.2.2", + prometheusConfigReloader: "v0.28.0", + armExporter: 'latest', + smtpServer: 'latest', + }, + + imageRepos+:: { + prometheus: "carlosedp/prometheus", + alertmanager: "carlosedp/alertmanager", + kubeStateMetrics: "carlosedp/kube-state-metrics", + kubeRbacProxy: "carlosedp/kube-rbac-proxy", + addonResizer: "carlosedp/addon-resizer", + nodeExporter: "carlosedp/node_exporter", + prometheusOperator: "carlosedp/prometheus-operator", + prometheusAdapter: "carlosedp/k8s-prometheus-adapter", + grafana: "grafana/grafana", + configmapReloader: "carlosedp/configmap-reload", + prometheusConfigReloader: "carlosedp/prometheus-config-reloader", + armExporter: 'carlosedp/arm_exporter', + smtpServer: 'carlosedp/docker-smtp', + }, + }, +} \ No newline at end of file diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index efbc09a..c726a9a 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "e123a1de479dbb911b4070f1bfcbd1e65e02209e" + "version": "0e22050ca3ebe6b7033f8c0f1cb1605e3abe63c5" }, { "name": "ksonnet", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "6070db22ed3d46372a5600fe8f35907f4d706bdb" + "version": "1fe6f109c87c4fa47775426a6a60c3b954ed5c33" } ] } From abd76d847a6ef8784ef76db9609205239517f200 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 17:24:49 -0200 Subject: [PATCH 12/30] Add SMTP relay for Gmail --- create_gmail_auth.sh | 10 ++++++++ smtp_server.jsonnet | 57 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100755 create_gmail_auth.sh create mode 100644 smtp_server.jsonnet diff --git a/create_gmail_auth.sh b/create_gmail_auth.sh new file mode 100755 index 0000000..f29ae38 --- /dev/null +++ b/create_gmail_auth.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +echo "Please enter your Gmail account"; +read username; + +echo "Please enter your Gmail password"; +read -s password; + +echo "Creating secret" +kubectl create secret generic smtp-account -n monitoring --from-literal=username=${username} --from-literal=password=${password} diff --git a/smtp_server.jsonnet b/smtp_server.jsonnet new file mode 100644 index 0000000..f0ca3de --- /dev/null +++ b/smtp_server.jsonnet @@ -0,0 +1,57 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'image_sources_versions.jsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + + smtpServer+:: { + deployment: + local deployment = k.apps.v1beta2.deployment; + local container = k.apps.v1beta2.deployment.mixin.spec.template.spec.containersType; + local containerPort = container.portsType; + + local podLabels = { run: 'smtp-server' }; + + local smtpServer = + container.new('smtp-server', $._config.imageRepos.smtpServer + ':' + $._config.versions.smtpServer) + + container.withPorts(containerPort.newNamed('smtp', 25)) + + container.withEnv([ + { + name: 'GMAIL_USER', + valueFrom: { + secretKeyRef: { name: 'smtp-account', key: 'username' }, + }, + }, + { + name: 'GMAIL_PASSWORD', + valueFrom: { + secretKeyRef: { name: 'smtp-account', key: 'password' }, + }, + }, + {name: 'DISABLE_IPV6', + value: 'True'}, + {name: 'RELAY_DOMAINS', value: ':192.168.0.0/24:10.0.0.0/16'}, + ]); + + local c = [smtpServer]; + + deployment.new('smtp-server', 1, c, podLabels) + + deployment.mixin.metadata.withNamespace($._config.namespace) + + deployment.mixin.metadata.withLabels(podLabels) + + deployment.mixin.spec.selector.withMatchLabels(podLabels), + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + local smtpServerPorts = servicePort.newNamed('smtp', 25, 'smtp'); + + service.new('smtp-server', $.smtpServer.deployment.spec.selector.matchLabels, smtpServerPorts) + + service.mixin.metadata.withNamespace($._config.namespace) + + service.mixin.metadata.withLabels({ 'run': 'smtp-server' }) + }, +}; + +{ ['smtp-server-' + name]: kp.smtpServer[name] for name in std.objectFields(kp.smtpServer) } \ No newline at end of file From 8d794a45ec8c5c6cbbe6a71f5283a7dee65fd22f Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 17:25:28 -0200 Subject: [PATCH 13/30] Fix Traefik monitor --- traefik.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/traefik.jsonnet b/traefik.jsonnet index 9fa0926..507d1ef 100644 --- a/traefik.jsonnet +++ b/traefik.jsonnet @@ -28,7 +28,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { endpoints: [ { port: 'admin', - scheme: 'https', + scheme: 'http', interval: '30s', }, ], From 06671c0121e62164955653c68d40dcd4ff6b5f58 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:01:11 -0200 Subject: [PATCH 14/30] Added MetalLB ServiceMonitor --- metallb.jsonnet | 58 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 metallb.jsonnet diff --git a/metallb.jsonnet b/metallb.jsonnet new file mode 100644 index 0000000..04cb9d9 --- /dev/null +++ b/metallb.jsonnet @@ -0,0 +1,58 @@ +local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; + +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + (import 'image_sources_versions.jsonnet') + + { + _config+:: { + namespace: 'monitoring', + }, + + metallb+:: { + serviceMonitor: + { + apiVersion: 'monitoring.coreos.com/v1', + kind: 'ServiceMonitor', + metadata: { + name: 'metallb', + namespace: $._config.namespace, + labels: { + 'k8s-app': 'metallb-controller', + }, + }, + spec: { + jobLabel: 'k8s-app', + selector: { + matchLabels: { + 'k8s-app': 'metallb-controller', + }, + }, + endpoints: [ + { + port: 'http', + scheme: 'http', + interval: '30s', + }, + ], + namespaceSelector: { + matchNames: [ + 'metallb-system', + ] + }, + + }, + }, + + service: + local service = k.core.v1.service; + local servicePort = k.core.v1.service.mixin.spec.portsType; + + local metallbPort = servicePort.newNamed('http', 7472, 7472); + + service.new('metallb-controller', {"app": "metallb", "component": "controller"}, metallbPort) + + service.mixin.metadata.withNamespace('metallb-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'metallb-controller' }) + + service.mixin.spec.withClusterIp('None'), + }, +}; + +{ ['metallb-' + name]: kp.metallb[name] for name in std.objectFields(kp.metallb) } \ No newline at end of file From 158acde3ac1134f7b9916d75bb66f72347977425 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:02:26 -0200 Subject: [PATCH 15/30] Added new config to the config generator. Import Grafana dashboards --- generate_ingress_auth.sh | 6 + ...Exporter Server Metrics-1520350339243.json | 1609 ----------- ...aefik Realtime Metrics-1520350498858.json | 1180 -------- .../kubernetes-cluster-dashboard.json | 1671 +++++++++++ ...0511205.json => prometheus-dashboard.json} | 323 ++- ...monitoring.json => traefik-dashboard.json} | 2521 ++++++++--------- operator_stack.jsonnet | 470 ++- 7 files changed, 3320 insertions(+), 4460 deletions(-) create mode 100644 generate_ingress_auth.sh delete mode 100644 grafana-dashboards/Node Exporter Server Metrics-1520350339243.json delete mode 100644 grafana-dashboards/Traefik Realtime Metrics-1520350498858.json create mode 100644 grafana-dashboards/kubernetes-cluster-dashboard.json rename grafana-dashboards/{Prometheus2.0-1520350511205.json => prometheus-dashboard.json} (96%) rename grafana-dashboards/{Kubernetes cluster monitoring.json => traefik-dashboard.json} (53%) diff --git a/generate_ingress_auth.sh b/generate_ingress_auth.sh new file mode 100644 index 0000000..d3fb7af --- /dev/null +++ b/generate_ingress_auth.sh @@ -0,0 +1,6 @@ +#!/bin/bash +if [[ $# -eq 0 ]] ; then + echo "Run the script with the required auth user and namespace for the secret: ${0} [user] [namespace]" + exit 0 +fi +printf "${1}:`openssl passwd -apr1`\n" >> auth \ No newline at end of file diff --git a/grafana-dashboards/Node Exporter Server Metrics-1520350339243.json b/grafana-dashboards/Node Exporter Server Metrics-1520350339243.json deleted file mode 100644 index afcbba7..0000000 --- a/grafana-dashboards/Node Exporter Server Metrics-1520350339243.json +++ /dev/null @@ -1,1609 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Dashboard to view multiple servers", - "editable": true, - "gnetId": 405, - "graphTooltip": 0, - "id": null, - "iteration": 1520350315682, - "links": [], - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 56, - "panels": [], - "title": "Main", - "type": "row" - }, - { - "content": "", - "editable": true, - "error": false, - "gridPos": { - "h": 1, - "w": 8, - "x": 0, - "y": 1 - }, - "id": 11, - "links": [], - "minSpan": 4, - "mode": "html", - "repeat": "node", - "repeatDirection": "h", - "style": {}, - "title": "$node", - "type": "text" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 3, - "w": 8, - "x": 0, - "y": 2 - }, - "id": 20, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "minSpan": 4, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "repeat": "node", - "repeatDirection": "h", - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "count(node_cpu{instance=~\"$node\", mode=\"system\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "", - "metric": "", - "refId": "A", - "step": 14400, - "target": "" - } - ], - "thresholds": "", - "title": "CPU Cores", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 3, - "editable": true, - "error": false, - "fill": 10, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 5 - }, - "id": 7, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "hideEmpty": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 0, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": true, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "repeatDirection": "h", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "sum by (mode)(irate(node_cpu{mode=\"system\",instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{mode}}", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='user',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "user", - "refId": "B", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='nice',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "nice", - "refId": "C", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='iowait',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "iowait", - "refId": "E", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='steal',instance=~'$node'}[5m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "steal", - "refId": "H", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='idle',instance=~'$node'}[5m]))", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "idle", - "refId": "D", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='irq',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "irq", - "refId": "F", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='softirq',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "softirq", - "refId": "G", - "step": 1200 - }, - { - "expr": "sum by (mode)(irate(node_cpu{mode='guest',instance=~'$node'}[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "guest", - "refId": "I", - "step": 1200 - } - ], - "thresholds": [ - { - "colorMode": "custom", - "fill": true, - "fillColor": "rgba(216, 200, 27, 0.27)", - "op": "gt", - "value": 0 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "%", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { - "Slab": "#E5A8E2", - "Swap": "#E24D42" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 12 - }, - "id": 17, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [ - { - "alias": "/Apps|Buffers|Cached|Free|Slab|SwapCached|PageTables|VmallocUsed/", - "fill": 5, - "stack": true - }, - { - "alias": "Swap", - "fill": 5, - "stack": true - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "( node_memory_MemTotal{instance=~'$node'} - node_memory_MemFree{instance=~'$node'} - node_memory_Buffers{instance=~'$node'} - node_memory_Cached{instance=~'$node'} - node_memory_SwapCached{instance=~'$node'} - node_memory_Slab{instance=~'$node'} - node_memory_PageTables{instance=~'$node'} - node_memory_VmallocUsed{instance=~'$node'} )", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Apps", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - }, - { - "expr": "node_memory_Buffers{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Buffers", - "refId": "B", - "step": 1200 - }, - { - "expr": "node_memory_Cached{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Cached", - "refId": "D", - "step": 1200 - }, - { - "expr": "node_memory_MemFree{instance=~'$node'}", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "Free", - "refId": "E", - "step": 1200 - }, - { - "expr": "node_memory_Slab{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Slab", - "refId": "F", - "step": 1200 - }, - { - "expr": "node_memory_SwapCached{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "SwapCached", - "refId": "G", - "step": 1200 - }, - { - "expr": "node_memory_PageTables{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "PageTables", - "refId": "H", - "step": 1200 - }, - { - "expr": "node_memory_VmallocUsed{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "VmallocUsed", - "metric": "", - "refId": "I", - "step": 1200 - }, - { - "expr": "(node_memory_SwapTotal{instance=~'$node'} - node_memory_SwapFree{instance=~'$node'})", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Swap", - "metric": "", - "refId": "C", - "step": 1200 - }, - { - "expr": "node_memory_Committed_AS{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Committed", - "metric": "", - "refId": "J", - "step": 1200 - }, - { - "expr": "node_memory_Mapped{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Mapped", - "refId": "K", - "step": 1200 - }, - { - "expr": "node_memory_Active{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Active", - "metric": "", - "refId": "L", - "step": 1200 - }, - { - "expr": "node_memory_Inactive{instance=~'$node'}", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Inactive", - "metric": "", - "refId": "M", - "step": 1200 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "GB", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 19 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "repeatDirection": "h", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_load1{instance=~\"$node\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "load", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Load", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 3, - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 26 - }, - "id": 9, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100.0 - 100 * (node_filesystem_avail{instance=~'$node',device !~'tmpfs',device!~'by-uuid'} / node_filesystem_size{instance=~'$node',device !~'tmpfs',device!~'by-uuid'})", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{mountpoint}}", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Space Used", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": 0, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 33 - }, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_disk_io_time_ms{instance=~\"$node\"}[5m])/10", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{device}}", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Utilization per Device", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "logBase": 1, - "max": 100, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 40 - }, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [ - { - "alias": "/.*_read$/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_disk_reads_completed{instance=~'$node'}[5m])", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{device}}_read", - "metric": "", - "refId": "A", - "step": 2400, - "target": "" - }, - { - "expr": "irate(node_disk_writes_completed{instance=~'$node'}[5m])", - "intervalFactor": 2, - "legendFormat": "{{device}}_write", - "metric": "", - "refId": "B", - "step": 1200 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk IOs per Device", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "IO/second read (-) / write (+)", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 47 - }, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [ - { - "alias": "/.*_read/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_disk_sectors_read{instance=~'$node'}[5m]) * 512", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{device}}_read", - "refId": "B", - "step": 2400 - }, - { - "expr": "irate(node_disk_sectors_written{instance=~'$node'}[5m]) * 512", - "interval": "", - "intervalFactor": 4, - "legendFormat": "{{device}}_write", - "metric": "", - "refId": "A", - "step": 2400, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Disk Throughput per Device", - "tooltip": { - "msResolution": false, - "shared": false, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": "Bytes/second read (-) / write (+)", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 54 - }, - "id": 22, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_context_switches{instance=~\"$node\"}[5m])", - "interval": "", - "intervalFactor": 2, - "legendFormat": "context switches", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Context Switches", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 61 - }, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [ - { - "alias": "/.*_in/", - "transform": "negative-Y" - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_network_receive_bytes{instance=~'$node'}[5m])*8", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{device}}_in", - "metric": "", - "refId": "A", - "step": 1200, - "target": "" - }, - { - "expr": "irate(node_network_transmit_bytes{instance=~'$node'}[5m])*8", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{device}}_out", - "refId": "B", - "step": 1200 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Network Traffic", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bits", - "label": "bits in (-) / bits out (+)", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 68 - }, - "id": 21, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_netstat_Tcp_CurrEstab{instance=~'$node'}", - "intervalFactor": 2, - "legendFormat": "established", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Netstat", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 75 - }, - "id": 23, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [ - { - "alias": "/.*Out.*/", - "transform": "negative-Y" - }, - { - "alias": "Udp_NoPorts", - "yaxis": 2 - } - ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(node_netstat_Udp_InDatagrams{instance=~\"$node\"}[5m])", - "intervalFactor": 2, - "legendFormat": "Udp_InDatagrams", - "refId": "A", - "step": 1200, - "target": "" - }, - { - "expr": "irate(node_netstat_Udp_InErrors{instance=~\"$node\"}[5m])", - "intervalFactor": 2, - "legendFormat": "Udp_InErrors", - "refId": "B", - "step": 1200 - }, - { - "expr": "irate(node_netstat_Udp_OutDatagrams{instance=~\"$node\"}[5m])", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Udp_OutDatagrams", - "refId": "C", - "step": 1200 - }, - { - "expr": "irate(node_netstat_Udp_NoPorts{instance=~\"$node\"}[5m])", - "intervalFactor": 2, - "legendFormat": "Udp_NoPorts", - "refId": "D", - "step": 1200 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "UDP Stats", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 8, - "x": 0, - "y": 82 - }, - "id": 24, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "minSpan": 4, - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": "node", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_nf_conntrack_entries_limit{instance=~\"$node\"} - node_nf_conntrack_entries{instance=~\"$node\"}", - "intervalFactor": 2, - "legendFormat": "free", - "refId": "A", - "step": 1200, - "target": "" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Conntrack", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 89 - }, - "id": 88, - "panels": [], - "title": "Row title", - "type": "row" - } - ], - "refresh": "30s", - "schemaVersion": 16, - "style": "dark", - "tags": [ - "custom" - ], - "templating": { - "list": [ - { - "allFormat": "glob", - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "", - "multi": true, - "multiFormat": "regex values", - "name": "node", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - }, - { - "allFormat": "glob", - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": true, - "label": "", - "multi": true, - "multiFormat": "regex values", - "name": "node_no_port", - "options": [], - "query": "label_values(node_boot_time, instance)", - "refresh": 1, - "regex": "/^(.*)\\:.*/", - "sort": 1, - "tagValuesQuery": "", - "tags": [], - "tagsQuery": "", - "type": "query", - "useTags": false - } - ] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Node Exporter Server Metrics", - "uid": "GxJ3JT6zk", - "version": 4 -} \ No newline at end of file diff --git a/grafana-dashboards/Traefik Realtime Metrics-1520350498858.json b/grafana-dashboards/Traefik Realtime Metrics-1520350498858.json deleted file mode 100644 index 55aff5e..0000000 --- a/grafana-dashboards/Traefik Realtime Metrics-1520350498858.json +++ /dev/null @@ -1,1180 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.2.2" - }, - { - "type": "panel", - "id": "grafana-piechart-panel", - "name": "Pie Chart", - "version": "1.2.0" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "5.0.0" - } - ], - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": "-- Grafana --", - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "description": "Visualize Traefik Metrics", - "editable": true, - "gnetId": 2870, - "graphTooltip": 0, - "id": null, - "links": [], - "panels": [ - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 8, - "x": 0, - "y": 0 - }, - "id": 7, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "avg(traefik_entrypoint_request_duration_seconds_sum) / avg(traefik_entrypoint_requests_total) * 1000", - "format": "time_series", - "instant": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 60 - } - ], - "thresholds": "300,600", - "title": "Average Response Time (ms)", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "format": "s", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 8, - "x": 8, - "y": 0 - }, - "id": 3, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "min(time() - process_start_time_seconds{job=\"traefik-ingress-lb\"})", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "refId": "A", - "step": 60 - } - ], - "thresholds": "", - "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": true, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(26, 206, 22, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "format": "none", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 2, - "w": 8, - "x": 16, - "y": 0 - }, - "id": 9, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "sum(rate(traefik_entrypoint_requests_total{code=\"404\",method=\"GET\"}[5m])) * 1000", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "", - "metric": "traefik_requests_total", - "refId": "A", - "step": 60 - } - ], - "thresholds": "0,1", - "title": "404 Error Count last 5 Minutes", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "max" - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 8, - "w": 8, - "x": 0, - "y": 2 - }, - "id": 1, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(traefik_entrypoint_requests_total{entrypoint=\"http\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{http}}", - "metric": "", - "refId": "A", - "step": 20 - }, - { - "expr": "sum(traefik_entrypoint_requests_total{entrypoint=\"https\"})", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{https}}", - "refId": "B", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Total requests", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": "Count", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 1000 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "5m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "Average response time alert", - "noDataState": "no_data", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 8, - "w": 8, - "x": 8, - "y": 2 - }, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "avg(traefik_entrypoint_request_duration_seconds_sum) / avg(traefik_entrypoint_requests_total) * 1000", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Average response time (ms)", - "refId": "A", - "step": 240 - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 1000 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Average response time", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ms", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 8, - "w": 8, - "x": 16, - "y": 2 - }, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(traefik_entrypoint_requests_total[5m]))", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{requests}}", - "refId": "A", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Requests in last 5 minutes", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "fill": 1, - "gridPos": { - "h": 11, - "w": 8, - "x": 0, - "y": 10 - }, - "id": 5, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"http|https\",code=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{service}} {{method}} {{code}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Successful Status Code Count (5min)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "fill": 1, - "gridPos": { - "h": 11, - "w": 8, - "x": 8, - "y": 10 - }, - "id": 4, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"http|https\",code!=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{service}} {{method}} {{code}}", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Bad Status Code Count (5m)", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 11, - "w": 8, - "x": 16, - "y": 10 - }, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(traefik_entrypoint_requests_total) by (code)", - "format": "time_series", - "instant": false, - "intervalFactor": 2, - "legendFormat": "{{code}}", - "refId": "A", - "step": 20 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Requests by Request Code", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 21 - }, - "id": 13, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"http|https\",code=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{method}} : {{code}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Status code 200 over 5min", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": true, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 21 - }, - "id": 15, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "total": false, - "values": true - }, - "lines": false, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "rate(traefik_entrypoint_requests_total{entrypoint=~\"http|https\",code!=\"200\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ method }} : {{code}}", - "refId": "A" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Others status code over 5min", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "cacheTimeout": null, - "combine": { - "label": "Others", - "threshold": 0 - }, - "datasource": "${DS_PROMETHEUS}", - "fontSize": "80%", - "format": "short", - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 28 - }, - "id": 17, - "interval": null, - "legend": { - "percentage": true, - "show": true, - "values": false - }, - "legendType": "Right side", - "links": [], - "maxDataPoints": 3, - "nullPointMode": "connected", - "pieType": "pie", - "strokeWidth": 1, - "targets": [ - { - "expr": "sum(rate(traefik_entrypoint_requests_total[5m])) by (entrypoint)", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ entrypoint}}", - "refId": "A" - } - ], - "title": "Requests by service", - "type": "grafana-piechart-panel", - "valueName": "current" - }, - { - "aliasColors": {}, - "cacheTimeout": null, - "combine": { - "label": "Others", - "threshold": 0 - }, - "datasource": "${DS_PROMETHEUS}", - "fontSize": "80%", - "format": "short", - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 28 - }, - "id": 19, - "interval": null, - "legend": { - "percentage": true, - "show": true, - "values": false - }, - "legendType": "Right side", - "links": [], - "maxDataPoints": 3, - "nullPointMode": "connected", - "pieType": "pie", - "strokeWidth": 1, - "targets": [ - { - "expr": "sum(rate(traefik_entrypoint_requests_total[5m])) by (entrypoint) ", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "{{ entrypoint}}", - "refId": "A" - } - ], - "title": "Requests by protocol", - "type": "grafana-piechart-panel", - "valueName": "current" - } - ], - "refresh": "30s", - "schemaVersion": 16, - "style": "dark", - "tags": [ - "custom" - ], - "templating": { - "list": [] - }, - "time": { - "from": "now-3h", - "to": "now" - }, - "timepicker": { - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Traefik Realtime Metrics", - "uid": "000000010", - "version": 9 -} diff --git a/grafana-dashboards/kubernetes-cluster-dashboard.json b/grafana-dashboards/kubernetes-cluster-dashboard.json new file mode 100644 index 0000000..797dd38 --- /dev/null +++ b/grafana-dashboards/kubernetes-cluster-dashboard.json @@ -0,0 +1,1671 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitor a Kubernetes cluster using Prometheus TSDB. Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. ", + "editable": true, + "gnetId": 162, + "graphTooltip": 1, + "id": 12, + "links": [], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 4, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": true, + "colorValue": false, + "colors": [ + "rgba(0, 0, 0, 0)", + "rgb(210, 1, 1)", + "#890f02" + ], + "datasource": "prometheus", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 23, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "1.1", + "title": "Up Nodes", + "type": "singlestat", + "valueFontSize": "120%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 6, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster CPU usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [], + "datasource": "prometheus", + "fontSize": "90%", + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 2 + }, + "id": 25, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 2, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "link": false, + "pattern": "Time", + "type": "date" + }, + { + "alias": "Uptime", + "colorMode": null, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "Value", + "thresholds": [], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/endpoint|job|namespace|pod|service/", + "thresholds": [], + "type": "hidden", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "instance", + "preserveFormat": false, + "sanitize": false, + "thresholds": [], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "(time() - node_boot_time_seconds)", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Node Uptime", + "transform": "table", + "transparent": true, + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 15, + "panels": [], + "title": "Nodes", + "type": "row" + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 3500000000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "4m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Memory Usage alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 3500000000 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 90 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "15m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "max" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "CPU Usage alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)", + "format": "time_series", + "intervalFactor": 3, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 90 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "decimals": null, + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [], + "datasource": "prometheus", + "fontSize": "100%", + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 31, + "links": [], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "link": false, + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "condition|container|daemonset|endpoint|namespace|node", + "thresholds": [], + "type": "hidden", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "ALERTS{alertstate=\"firing\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + }, + { + "expr": "ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"}", + "format": "table", + "hide": true, + "intervalFactor": 1, + "refId": "B" + } + ], + "title": "Active Alerts", + "transform": "table", + "type": "table" + }, + { + "dashboardFilter": "", + "folderId": null, + "gridPos": { + "h": 9, + "w": 5, + "x": 12, + "y": 17 + }, + "id": 27, + "limit": 10, + "links": [], + "nameFilter": "", + "onlyAlertsOnDashboard": false, + "show": "current", + "sortOrder": 1, + "stateFilter": [], + "title": "Alarms", + "type": "alertlist" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": null, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 7, + "x": 17, + "y": 17 + }, + "id": 7, + "interval": null, + "isNew": true, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size_bytes{device=~\"/dev/.*\"}) - sum(node_filesystem_free_bytes{device=~\"/dev/.*\"}) ) / sum(node_filesystem_size_bytes{device=~\"/dev/.*\"}) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "metric": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster Filesystem usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C", + "5m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Node Down", + "noDataState": "alerting", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 4, + "w": 7, + "x": 17, + "y": 22 + }, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "hide": true, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Up Nodes", + "refId": "A" + }, + { + "expr": "count(up{job=\"kubelet\"})", + "format": "time_series", + "hide": true, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "B" + }, + { + "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "C" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "lt", + "value": 1 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Up Nodes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 85 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1m", + "now" + ] + }, + "reducer": { + "params": [], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "CPU Temperature alert", + "noDataState": "no_data", + "notifications": [] + }, + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rpi_cpu_temperature_celsius", + "format": "time_series", + "intervalFactor": 5, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 85 + } + ], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Temperature", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "celsius", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 17, + "panels": [], + "title": "Pods", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 270, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\"}[1m] ) ))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ container_name}}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pod CPU usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 250, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (container_name, image))", + "format": "time_series", + "hide": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "expr": "topk(10,sum(container_memory_rss{name=~\".+\"}) by (container_name))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ container_name }}", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pod memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 550, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{name=~\".+\"}[5m])) by (name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ name }}", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Sent Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 10, + "max": 8, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum(rate(container_network_receive_bytes_total{name=~\".+\"}[5m])) by (name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "- rate(container_network_transmit_bytes_total{name=~\".+\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Received Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 54 + }, + "id": 8, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\".*\"}[1m]) ))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Receive Traffic", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\".*\"}[1m]) ))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Transmit Traffic", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Pod Network i/o", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "custom" + ], + "templating": { + "list": [] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes cluster monitoring (via Prometheus)", + "uid": "82pBZCmRk", + "version": 2 + } \ No newline at end of file diff --git a/grafana-dashboards/Prometheus2.0-1520350511205.json b/grafana-dashboards/prometheus-dashboard.json similarity index 96% rename from grafana-dashboards/Prometheus2.0-1520350511205.json rename to grafana-dashboards/prometheus-dashboard.json index de29bb0..07bdd50 100644 --- a/grafana-dashboards/Prometheus2.0-1520350511205.json +++ b/grafana-dashboards/prometheus-dashboard.json @@ -1,53 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - }, - { - "name": "VAR_SCRAPE_INTERVAL", - "type": "constant", - "label": "Scrape interval seconds", - "value": "60", - "description": "" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "text", - "name": "Text", - "version": "5.0.0" - } - ], "annotations": { "list": [ { @@ -95,8 +46,8 @@ "editable": true, "gnetId": 3681, "graphTooltip": 1, - "id": null, - "iteration": 1520350506982, + "id": 13, + "iteration": 1549118131383, "links": [ { "icon": "info", @@ -141,7 +92,7 @@ "rgba(237, 129, 40, 0.89)", "#bf1b00" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": 1, "format": "s", "gauge": { @@ -223,7 +174,7 @@ "rgba(237, 129, 40, 0.89)", "#bf1b00" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "format": "short", "gauge": { "maxValue": 1000000, @@ -304,7 +255,7 @@ "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "format": "none", "gauge": { "maxValue": 100, @@ -385,7 +336,7 @@ "rgba(237, 129, 40, 0.89)", "#d44a3a" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": 2, "format": "ms", "gauge": { @@ -483,7 +434,7 @@ "rgba(237, 129, 40, 0.89)", "#299c46" ], - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "decimals": 1, "format": "none", "gauge": { @@ -620,6 +571,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Query elapsed time", "tooltip": { @@ -653,7 +605,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -717,6 +673,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Head series created/deleted", "tooltip": { @@ -750,7 +707,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -898,6 +859,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Prometheus errors", "tooltip": { @@ -931,7 +893,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -1003,6 +969,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Scrape delay (counts with 1m scrape interval)", "tooltip": { @@ -1034,7 +1001,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1096,6 +1067,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Rule evaulation duration", "tooltip": { @@ -1129,7 +1101,11 @@ "min": "0", "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -1200,6 +1176,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Request count", "tooltip": { @@ -1233,7 +1210,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1291,6 +1272,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Request duration per handler", "tooltip": { @@ -1324,7 +1306,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1380,6 +1366,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Request size by handler", "tooltip": { @@ -1413,7 +1400,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1489,6 +1480,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Cont of concurent queries", "tooltip": { @@ -1522,7 +1514,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -1610,6 +1606,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Alert queue size", "tooltip": { @@ -1643,7 +1640,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1700,6 +1701,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Count of discovered alertmanagers", "tooltip": { @@ -1733,7 +1735,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -1801,6 +1807,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Alerting errors", "tooltip": { @@ -1834,7 +1841,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -1855,7 +1866,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -1897,6 +1908,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Consul SD sync count", "tooltip": { @@ -1929,14 +1941,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -1978,6 +1994,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Marathon SD sync count", "tooltip": { @@ -2010,14 +2027,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -2059,6 +2080,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Kubernetes SD sync count", "tooltip": { @@ -2091,7 +2113,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -2189,6 +2215,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Service discovery errors", "tooltip": { @@ -2222,7 +2249,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -2243,7 +2274,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -2284,6 +2315,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Reloaded block from disk", "tooltip": { @@ -2316,7 +2348,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -2373,6 +2409,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Loaded data blocks", "tooltip": { @@ -2406,7 +2443,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -2463,6 +2504,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Time series total count", "tooltip": { @@ -2496,7 +2538,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, @@ -2548,6 +2594,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Samples Appended per second", "tooltip": { @@ -2581,7 +2628,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -2659,6 +2710,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Head chunks count", "tooltip": { @@ -2692,14 +2744,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -2740,6 +2796,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Length of head block", "tooltip": { @@ -2772,7 +2829,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -2834,6 +2895,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Head Chunks Created/Deleted per second", "tooltip": { @@ -2867,7 +2929,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -2888,7 +2954,7 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -2929,6 +2995,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Compaction duration", "tooltip": { @@ -2961,14 +3028,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -3009,6 +3080,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Go Garbage collection duration", "tooltip": { @@ -3041,14 +3113,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -3089,6 +3165,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "WAL truncate duration seconds", "tooltip": { @@ -3121,14 +3198,18 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", + "datasource": "prometheus", "fill": 1, "gridPos": { "h": 7, @@ -3169,6 +3250,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "WAL fsync duration seconds", "tooltip": { @@ -3201,7 +3283,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -3307,6 +3393,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { @@ -3340,7 +3427,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": { @@ -3399,6 +3490,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Allocations per second", "tooltip": { @@ -3432,7 +3524,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "aliasColors": {}, @@ -3488,6 +3584,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "CPU per second", "tooltip": { @@ -3523,7 +3620,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } }, { "collapsed": false, @@ -4895,6 +4996,7 @@ ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, "title": "Net errors", "tooltip": { @@ -4928,7 +5030,11 @@ "min": null, "show": true } - ] + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "refresh": "5m", @@ -5009,12 +5115,17 @@ ], "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", "refresh": 2, + "skipUrlSync": false, "type": "interval" }, { "allValue": null, - "current": {}, + "current": { + "text": "10.32.0.53:9090", + "value": "10.32.0.53:9090" + }, "datasource": "$datasource", + "definition": "", "hide": 0, "includeAll": false, "label": "Instance", @@ -5024,6 +5135,7 @@ "query": "label_values(prometheus_build_info, instance)", "refresh": 2, "regex": "", + "skipUrlSync": false, "sort": 2, "tagValuesQuery": "", "tags": [], @@ -5033,19 +5145,20 @@ }, { "current": { - "value": "${VAR_SCRAPE_INTERVAL}", - "text": "${VAR_SCRAPE_INTERVAL}" + "text": "60", + "value": "60" }, "hide": 0, "label": "Scrape interval seconds", "name": "scrape_interval", "options": [ { - "value": "${VAR_SCRAPE_INTERVAL}", - "text": "${VAR_SCRAPE_INTERVAL}" + "text": "60", + "value": "60" } ], - "query": "${VAR_SCRAPE_INTERVAL}", + "query": "60", + "skipUrlSync": false, "type": "constant" }, { @@ -5060,6 +5173,7 @@ "query": "prometheus", "refresh": 1, "regex": "", + "skipUrlSync": false, "type": "datasource" }, { @@ -5074,12 +5188,13 @@ "query": "influxdb", "refresh": 1, "regex": "", + "skipUrlSync": false, "type": "datasource" } ] }, "time": { - "from": "now-7d", + "from": "now-3h", "to": "now" }, "timepicker": { @@ -5110,5 +5225,5 @@ "timezone": "browser", "title": "Prometheus2.0", "uid": "XmsJC9mRz", - "version": 4 + "version": 1 } \ No newline at end of file diff --git a/grafana-dashboards/Kubernetes cluster monitoring.json b/grafana-dashboards/traefik-dashboard.json similarity index 53% rename from grafana-dashboards/Kubernetes cluster monitoring.json rename to grafana-dashboards/traefik-dashboard.json index aebed5c..2c0ad50 100644 --- a/grafana-dashboards/Kubernetes cluster monitoring.json +++ b/grafana-dashboards/traefik-dashboard.json @@ -1,52 +1,4 @@ { - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "panel", - "id": "alertlist", - "name": "Alert List", - "version": "5.0.0" - }, - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "5.2.2" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "5.0.0" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "5.0.0" - }, - { - "type": "panel", - "id": "table", - "name": "Table", - "version": "5.0.0" - } - ], "annotations": { "list": [ { @@ -60,42 +12,53 @@ } ] }, - "description": "Monitor a Kubernetes cluster using Prometheus TSDB. Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. ", + "description": "Traefik dashboard prometheus", "editable": true, - "gnetId": 162, - "graphTooltip": 1, - "id": null, + "gnetId": 5851, + "graphTooltip": 0, + "id": 20, + "iteration": 1549118220486, "links": [], "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 21, + "panels": [], + "title": "General", + "type": "row" + }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ - "rgba(50, 172, 45, 0.97)", + "#d44a3a", "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" + "#299c46" ], - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "format": "percent", + "datasource": "prometheus", + "format": "none", "gauge": { "maxValue": 100, "minValue": 0, - "show": true, + "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, - "w": 8, + "w": 3, "x": 0, - "y": 0 + "y": 1 }, - "id": 4, + "id": 13, "interval": null, - "isNew": true, "links": [], "mappingType": 1, "mappingTypes": [ @@ -126,21 +89,19 @@ "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", - "show": false + "show": true }, "tableColumn": "", "targets": [ { - "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100", + "expr": "count(kube_pod_status_ready{namespace=\"$namespace\",condition=\"true\",pod=~\"traefik.*\"})", "format": "time_series", - "interval": "10s", "intervalFactor": 1, - "refId": "A", - "step": 10 + "refId": "A" } ], - "thresholds": "65, 90", - "title": "Cluster memory usage", + "thresholds": "", + "title": "Traefik instances", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ @@ -154,15 +115,15 @@ }, { "cacheTimeout": null, - "colorBackground": true, + "colorBackground": false, "colorValue": false, "colors": [ - "rgba(0, 0, 0, 0)", - "rgb(210, 1, 1)", - "#890f02" + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" ], - "datasource": "${DS_PROMETHEUS}", - "format": "percentunit", + "datasource": "prometheus", + "format": "s", "gauge": { "maxValue": 100, "minValue": 0, @@ -171,12 +132,12 @@ "thresholdMarkers": true }, "gridPos": { - "h": 2, - "w": 8, - "x": 8, - "y": 0 + "h": 7, + "w": 3, + "x": 3, + "y": 1 }, - "id": 23, + "id": 37, "interval": null, "links": [], "mappingType": 1, @@ -213,18 +174,96 @@ "tableColumn": "", "targets": [ { - "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "expr": "time() - max(process_start_time_seconds{job=~\"traefik.*\"})", "format": "time_series", - "instant": true, - "intervalFactor": 1, - "legendFormat": "", + "intervalFactor": 2, "refId": "A" } ], - "thresholds": "1.1", - "title": "Up Nodes", + "thresholds": "", + "title": "Uptime", "type": "singlestat", - "valueFontSize": "120%", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "ms", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 6, + "y": 1 + }, + "id": 39, + "interval": null, + "links": [], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(traefik_entrypoint_request_duration_seconds_sum) / sum(traefik_entrypoint_requests_total) * 1000", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Average response time", + "type": "singlestat", + "valueFontSize": "80%", "valueMaps": [ { "op": "=", @@ -235,859 +274,63 @@ "valueName": "avg" }, { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true + "aliasColors": { + "Latency over 1 min": "rgb(9, 116, 190)", + "Latency over 5 min": "#bf1b00" }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 2, "gridPos": { "h": 7, - "w": 8, - "x": 16, - "y": 0 - }, - "id": 6, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "avg(100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100))", - "format": "time_series", - "interval": "10s", - "intervalFactor": 1, - "refId": "A", - "step": 10 - } - ], - "thresholds": "65, 90", - "title": "Cluster CPU usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "columns": [], - "datasource": "${DS_PROMETHEUS}", - "fontSize": "90%", - "gridPos": { - "h": 5, - "w": 8, - "x": 8, - "y": 2 - }, - "id": 25, - "links": [], - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": 2, - "desc": false - }, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "link": false, - "pattern": "Time", - "type": "date" - }, - { - "alias": "Uptime", - "colorMode": null, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "Value", - "thresholds": [], - "type": "number", - "unit": "s" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - "pattern": "/endpoint|job|namespace|pod|service/", - "thresholds": [], - "type": "hidden", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "instance", - "preserveFormat": false, - "sanitize": false, - "thresholds": [], - "type": "string", - "unit": "short" - } - ], - "targets": [ - { - "expr": "(time() - node_boot_time_seconds)", - "format": "table", - "instant": true, - "intervalFactor": 1, - "refId": "A" - } - ], - "title": "Node Uptime", - "transform": "table", - "transparent": true, - "type": "table" - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 7 - }, - "id": 15, - "panels": [], - "title": "Nodes", - "type": "row" - }, - { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 3500000000 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "4m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "Memory Usage alert", - "noDataState": "no_data", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 8 - }, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ instance }}", - "refId": "A" - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 3500000000 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Memory Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decbytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 90 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "15m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "max" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "CPU Usage alert", - "noDataState": "no_data", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 9, - "w": 12, - "x": 12, - "y": 8 + "w": 14, + "x": 10, + "y": 1 }, "id": 11, "legend": { - "avg": false, + "avg": true, "current": false, - "max": false, + "max": true, "min": false, "show": true, "total": false, - "values": false + "values": true }, "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)", - "format": "time_series", - "intervalFactor": 3, - "legendFormat": "{{instance}}", - "refId": "A" - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 90 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Usage", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "decimals": null, - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "columns": [], - "datasource": "${DS_PROMETHEUS}", - "fontSize": "100%", - "gridPos": { - "h": 9, - "w": 12, - "x": 0, - "y": 17 - }, - "id": 31, - "links": [], - "pageSize": null, - "scroll": true, - "showHeader": true, - "sort": { - "col": 0, - "desc": true - }, - "styles": [ - { - "alias": "Time", - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "link": false, - "pattern": "Time", - "type": "date" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "dateFormat": "YYYY-MM-DD HH:mm:ss", - "decimals": 2, - "pattern": "condition|container|daemonset|endpoint|namespace|node", - "thresholds": [], - "type": "hidden", - "unit": "short" - }, - { - "alias": "", - "colorMode": null, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "decimals": 2, - "pattern": "/.*/", - "thresholds": [], - "type": "number", - "unit": "short" - } - ], - "targets": [ - { - "expr": "ALERTS{alertstate=\"firing\"}", - "format": "table", - "instant": true, - "intervalFactor": 1, - "refId": "A" - }, - { - "expr": "ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"}", - "format": "table", - "hide": true, - "intervalFactor": 1, - "refId": "B" - } - ], - "title": "Active Alerts", - "transform": "table", - "type": "table" - }, - { - "dashboardFilter": "", - "folderId": null, - "gridPos": { - "h": 9, - "w": 5, - "x": 12, - "y": 17 - }, - "id": 27, - "limit": 10, - "links": [], - "nameFilter": "", - "onlyAlertsOnDashboard": false, - "show": "current", - "sortOrder": 1, - "stateFilter": [], - "title": "Alarms", - "type": "alertlist" - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(50, 172, 45, 0.97)", - "rgba(237, 129, 40, 0.89)", - "rgba(245, 54, 54, 0.9)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "format": "percent", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": true, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "gridPos": { - "h": 5, - "w": 7, - "x": 17, - "y": 17 - }, - "id": 7, - "interval": null, - "isNew": true, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "(sum(node_filesystem_size_bytes{device=~\"/dev/.*\"}) - sum(node_filesystem_free_bytes{device=~\"/dev/.*\"}) ) / sum(node_filesystem_size_bytes{device=~\"/dev/.*\"}) * 100", - "format": "time_series", - "interval": "10s", - "intervalFactor": 1, - "metric": "", - "refId": "A", - "step": 10 - } - ], - "thresholds": "65, 90", - "title": "Cluster Filesystem usage", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "current" - }, - { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 1 - ], - "type": "lt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "C", - "5m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "Node Down", - "noDataState": "alerting", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 4, - "w": 7, - "x": 17, - "y": 22 - }, - "id": 29, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, + "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", - "seriesOverrides": [], + "seriesOverrides": [ + { + "alias": "Latency over 5 min", + "yaxis": 1 + } + ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { - "expr": "sum(up{job=\"kubelet\"}) BY (job)", - "format": "time_series", - "hide": true, - "instant": false, - "intervalFactor": 1, - "legendFormat": "Up Nodes", - "refId": "A" - }, - { - "expr": "count(up{job=\"kubelet\"})", - "format": "time_series", - "hide": true, - "instant": false, - "intervalFactor": 1, - "legendFormat": "Total Nodes", - "refId": "B" - }, - { - "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "expr": "histogram_quantile(0.$percentiles, sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\", code=\"200\",method=\"GET\"}[5m])) by (le))", "format": "time_series", "hide": false, "intervalFactor": 1, - "refId": "C" - } - ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "lt", - "value": 1 - } - ], - "timeFrom": null, - "timeShift": null, - "title": "Up Nodes", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "alert": { - "conditions": [ - { - "evaluator": { - "params": [ - 85 - ], - "type": "gt" - }, - "operator": { - "type": "and" - }, - "query": { - "params": [ - "A", - "1m", - "now" - ] - }, - "reducer": { - "params": [], - "type": "avg" - }, - "type": "query" - } - ], - "executionErrorState": "alerting", - "frequency": "60s", - "handler": 1, - "name": "CPU Temperature alert", - "noDataState": "no_data", - "notifications": [] - }, - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "gridPos": { - "h": 6, - "w": 24, - "x": 0, - "y": 26 - }, - "id": 13, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rpi_cpu_temperature_celsius", - "format": "time_series", - "intervalFactor": 5, - "legendFormat": "{{instance}}", + "legendFormat": "Latency over 1 min", "refId": "A" } ], - "thresholds": [ - { - "colorMode": "critical", - "fill": true, - "line": true, - "op": "gt", - "value": 85 - } - ], + "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "CPU Temperature", + "title": "Global latency $percentiles th perc over 5 min", "tooltip": { "shared": true, "sort": 0, @@ -1103,11 +346,11 @@ }, "yaxes": [ { - "format": "celsius", + "format": "ms", "label": null, "logBase": 1, "max": null, - "min": null, + "min": "0", "show": true }, { @@ -1116,332 +359,6 @@ "logBase": 1, "max": null, "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 32 - }, - "id": 17, - "panels": [], - "title": "Pods", - "type": "row" - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 0, - "editable": true, - "error": false, - "fill": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 33 - }, - "id": 3, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": 270, - "sort": "current", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null as zero", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "topk(10,sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\"}[1m] ) ))", - "format": "time_series", - "instant": false, - "interval": "", - "intervalFactor": 1, - "legendFormat": "{{ container_name}}", - "metric": "container_cpu", - "refId": "A", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod CPU usage", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percentunit", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 0, - "grid": {}, - "gridPos": { - "h": 7, - "w": 24, - "x": 0, - "y": 40 - }, - "id": 2, - "isNew": true, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": 250, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (container_name, image))", - "format": "time_series", - "hide": true, - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "{{ container_name }}", - "metric": "container_memory_usage:sort_desc", - "refId": "A", - "step": 10 - }, - { - "expr": "topk(10,sum(container_memory_rss{name=~\".+\"}) by (container_name))", - "format": "time_series", - "intervalFactor": 1, - "legendFormat": "{{ container_name }}", - "refId": "B" - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Pod memory usage", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 2, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ], - "yaxis": { - "align": false, - "alignLevel": null - } - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 47 - }, - "id": 19, - "legend": { - "alignAsTable": true, - "avg": true, - "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "sideWidth": 550, - "sort": "avg", - "sortDesc": true, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{name=~\".+\"}[5m])) by (name))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{ name }}", - "refId": "A", - "step": 240 - }, - { - "expr": "rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "", - "refId": "B", - "step": 10 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Sent Network Traffic per Container", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 1, - "value_type": "cumulative" - }, - "transparent": false, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "Bps", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 10, - "max": 8, - "min": 0, "show": false } ], @@ -1455,36 +372,28 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, + "datasource": "prometheus", "fill": 1, - "grid": {}, "gridPos": { "h": 7, - "w": 12, - "x": 12, - "y": 47 + "w": 24, + "x": 0, + "y": 8 }, - "id": 21, + "id": 29, "legend": { "alignAsTable": true, "avg": true, "current": false, - "hideEmpty": true, - "hideZero": true, - "max": false, + "max": true, "min": false, - "rightSide": false, + "rightSide": true, "show": true, - "sideWidth": 150, - "sort": "avg", - "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, @@ -1497,34 +406,23 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum(rate(container_network_receive_bytes_total{name=~\".+\"}[5m])) by (name))", + "expr": "histogram_quantile(0.$percentiles, rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",code=\"200\",method=\"GET\"}[5m]))", "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "A", - "step": 240 - }, - { - "expr": "- rate(container_network_transmit_bytes_total{name=~\".+\"}[$interval])", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "B", - "step": 10 + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "A" } ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "Received Network Traffic per Container", + "title": "Per node latency $percentiles th perc over 5 min", "tooltip": { - "msResolution": true, "shared": true, - "sort": 1, - "value_type": "cumulative" + "sort": 0, + "value_type": "individual" }, - "transparent": false, "type": "graph", "xaxis": { "buckets": null, @@ -1535,7 +433,107 @@ }, "yaxes": [ { - "format": "Bps", + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 17, + "panels": [], + "title": "Frontends (entrypoints)", + "type": "row" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_entrypoint_requests_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total requests", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Total requests over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", "label": null, "logBase": 1, "max": null, @@ -1561,38 +559,119 @@ "bars": false, "dashLength": 10, "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": 2, - "editable": true, - "error": false, - "fill": 0, - "grid": {}, + "datasource": "prometheus", + "fill": 7, "gridPos": { "h": 7, - "w": 24, + "w": 12, "x": 0, - "y": 54 + "y": 23 }, - "id": 8, - "isNew": true, + "id": 19, "legend": { "alignAsTable": true, "avg": true, "current": true, - "max": false, + "max": true, "min": false, "rightSide": true, "show": true, - "sideWidth": 200, - "sort": "current", + "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, - "linewidth": 2, + "linewidth": 1, "links": [], - "nullPointMode": "connected", + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_entrypoint_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, @@ -1603,35 +682,691 @@ "steppedLine": false, "targets": [ { - "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=~\".*\"}[1m]) ))", + "expr": "sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_entrypoint_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", "format": "time_series", - "interval": "10s", "intervalFactor": 1, - "legendFormat": "Receive Traffic", - "metric": "network", - "refId": "A", - "step": 10 - }, - { - "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=~\".*\"}[1m]) ))", - "format": "time_series", - "interval": "10s", - "intervalFactor": 1, - "legendFormat": "Transmit Traffic", - "metric": "network", - "refId": "B", - "step": 10 + "legendFormat": "Code 200", + "refId": "A" } ], "thresholds": [], "timeFrom": null, + "timeRegions": [], "timeShift": null, - "title": "Pod Network i/o", + "title": "Apdex score (over 5 min)", "tooltip": { - "msResolution": false, "shared": true, "sort": 0, - "value_type": "cumulative" + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 24, + "panels": [], + "title": "Backends", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 7, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_backend_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_backend_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Code 200", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Apdex score (over 5 min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 15, + "panels": [], + "title": "HTTP Codes stats", + "type": "row" + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=~\"2..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Status code 2xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=~\"5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Status code 5xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\"}[1m])) by (backend)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ backend }}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Backend total requests over 1min per backend", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sortDesc": false, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code!~\"2..|5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ method }} : {{code}}", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Others status code over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 35, + "panels": [], + "title": "Pods ressources", + "type": "row" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max memory used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested memory usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit memory usage", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Traefik max memory usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" }, "type": "graph", "xaxis": { @@ -1650,6 +1385,106 @@ "min": null, "show": true }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max cpu used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested cpu usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit cpu usage", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Traefik max CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, { "format": "short", "label": null, @@ -1665,14 +1500,68 @@ } } ], - "refresh": "10s", "schemaVersion": 16, "style": "dark", "tags": [ - "custom" + "traefik" ], "templating": { - "list": [] + "list": [ + { + "allValue": null, + "current": { + "text": "kube-system", + "value": "kube-system" + }, + "datasource": "prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [], + "query": "label_values(traefik_config_reloads_total, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "tags": [], + "text": "99", + "value": "99" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "percentiles", + "options": [ + { + "selected": false, + "text": "95", + "value": "95" + }, + { + "selected": true, + "text": "99", + "value": "99" + } + ], + "query": "95,99", + "skipUrlSync": false, + "type": "custom" + } + ] }, "time": { "from": "now-3h", @@ -1703,8 +1592,8 @@ "30d" ] }, - "timezone": "browser", - "title": "Kubernetes cluster monitoring (via Prometheus)", - "uid": "82pBZCmRk", - "version": 37 -} + "timezone": "", + "title": "Traefik", + "uid": "000000113", + "version": 3 +} \ No newline at end of file diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 71150ef..42abbee 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -1,281 +1,249 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { +local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + + // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') + + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + + (import 'image_sources_versions.jsonnet') + + { _config+:: { - namespace: "monitoring", + namespace: "monitoring", - urls+:: { - prom_externalUrl: 'http://prometheus.internal.carlosedp.com', - alert_externalUrl: 'http://alertmanager.internal.carlosedp.com', - grafana_externalUrl: 'http://grafana.internal.carlosedp.com/', + urls+:: { + prom_externalUrl: 'http://prometheus.internal.carlosedp.com', + alert_externalUrl: 'http://alertmanager.internal.carlosedp.com', + grafana_externalUrl: 'http://grafana.internal.carlosedp.com/', - prom_ingress: 'prometheus.internal.carlosedp.com', - alert_ingress: 'alertmanager.internal.carlosedp.com', - grafana_ingress: 'grafana.internal.carlosedp.com', - }, - - versions+:: { - prometheus: "v2.5.0", - alertmanager: "v0.15.3", - kubeStateMetrics: "v1.5.0", - kubeRbacProxy: "v0.4.1", - addonResizer: "2.1", - nodeExporter: "v0.17.0", - prometheusOperator: "v0.28.0", - prometheusAdapter: "v0.4.1", - grafana: "v5.4.0", - configmapReloader: "v0.2.2", - prometheusConfigReloader: "v0.28.0", - }, - - imageRepos+:: { - prometheus: "carlosedp/prometheus", - alertmanager: "carlosedp/alertmanager", - kubeStateMetrics: "carlosedp/kube-state-metrics", - kubeRbacProxy: "carlosedp/kube-rbac-proxy", - addonResizer: "carlosedp/addon-resizer", - nodeExporter: "carlosedp/node_exporter", - prometheusOperator: "carlosedp/prometheus-operator", - prometheusAdapter: "carlosedp/k8s-prometheus-adapter", - grafana: "carlosedp/monitoring-grafana", - configmapReloader: "carlosedp/configmap-reload", - prometheusConfigReloader: "carlosedp/prometheus-config-reloader", - }, - - prometheus+:: { - names: 'k8s', - replicas: 1, - namespaces: ["default", "kube-system","monitoring","logging"], - }, - - alertmanager+:: { - name: 'main', - config: ||| - global: - resolve_timeout: 5m - route: - group_by: ['job'] - group_wait: 30s - group_interval: 5m - repeat_interval: 12h - receiver: 'null' - routes: - - match: - alertname: DeadMansSwitch - receiver: 'null' - receivers: - - name: 'null' - |||, - replicas: 1, - // Configure External URL's - alertmanager+: { - spec+: { - externalUrl: $._config.urls.alert_externalUrl, + prom_ingress: 'prometheus.internal.carlosedp.com', + alert_ingress: 'alertmanager.internal.carlosedp.com', + grafana_ingress: 'grafana.internal.carlosedp.com', + grafana_ingress_external: 'grafana.cloud.carlosedp.com', }, - }, - }, - kubeStateMetrics+:: { - collectors: '', // empty string gets a default set - scrapeInterval: '30s', - scrapeTimeout: '30s', + prometheus+:: { + names: 'k8s', + replicas: 1, + namespaces: ["default", "kube-system", "monitoring", "logging", "metallb-system"], + }, - baseCPU: '100m', - baseMemory: '150Mi', - cpuPerNode: '2m', - memoryPerNode: '30Mi', - }, + alertmanager+:: { + replicas: 1, + }, - grafana+:: { - config: { - sections: { - database: { path: '/data/grafana.db' }, - paths: { - data: '/var/lib/grafana', - logs: '/var/lib/grafana/log', - plugins: '/var/lib/grafana/plugins', - provisioning: '/etc/grafana/provisioning', - }, - session: { provider: 'memory' }, - 'auth.basic': {enabled: false}, - 'auth.anonymous': {enabled: false}, - smtp: { - enabled: true, - host: 'smtp-server.monitoring.svc:25', - user: '', - password: '', - from_address:'carlosedp@gmail.com', - from_name: 'Grafana Alert', - skip_verify: true + kubeStateMetrics+:: { + collectors: '', // empty string gets a default set + scrapeInterval: '30s', + scrapeTimeout: '30s', + + baseCPU: '100m', + baseMemory: '150Mi', + cpuPerNode: '2m', + memoryPerNode: '30Mi', + }, + + grafana+:: { + config: { + sections: { + // database: { path: '/data/grafana.db' }, + // paths: { + // data: '/var/lib/grafana', + // logs: '/var/lib/grafana/log', + // plugins: '/var/lib/grafana/plugins', + // provisioning: '/etc/grafana/provisioning', + // }, + session: { provider: 'memory' }, + 'auth.basic': {enabled: false}, + 'auth.anonymous': {enabled: false}, + smtp: { + enabled: true, + host: 'smtp-server.monitoring.svc:25', + user: '', + password: '', + from_address:'carlosedp@gmail.com', + from_name: 'Grafana Alert', + skip_verify: true + }, + }, }, }, - }, }, - //--------------------------------------- // End of _config //--------------------------------------- - }, prometheus+:: { - local pvc = k.core.v1.persistentVolumeClaim, - - prometheus+: { - spec+: { - retention: '15d', - externalUrl: $._config.urls.prom_externalUrl, - storage: { - volumeClaimTemplate: - pvc.new() + - pvc.mixin.spec.withAccessModes('ReadWriteOnce') + - pvc.mixin.spec.resources.withRequests({ storage: '20Gi' }) + - pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), - }, + local pvc = k.core.v1.persistentVolumeClaim, + prometheus+: { + spec+: { + retention: '15d', + externalUrl: $._config.urls.prom_externalUrl, + storage: { + volumeClaimTemplate: + pvc.new() + + pvc.mixin.spec.withAccessModes('ReadWriteOnce') + + pvc.mixin.spec.resources.withRequests({ storage: '20Gi' }) + # Uncomment below to define a StorageClass name + #+ pvc.mixin.spec.withStorageClassName('nfs-master-ssd'), + }, + }, }, - }, }, - # Override command for Grafana to load ini from correct path - // grafana+:: { - // deployment+: - // { - // local pvc = k.core.v1.persistentVolumeClaim, - // spec+: { - // volumeClaimTemplate: - // pvc.new() + - // pvc.mixin.metadata.withNamespace($._config.namespace) + - // pvc.mixin.metadata.withName("grafana-storage") + - // pvc.mixin.spec.withAccessModes('ReadWriteMany') + - // pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }) + - // pvc.mixin.spec.withStorageClassName('nfs-ssd-node1'), - // template+: { - // spec+: { - // containers: - // std.map( - // function(c) - // if c.name == 'grafana' then - // c { - // args+: [ - // '-config=/etc/grafana/grafana.ini', - // ], - // } - // else - // c, - // super.containers, - // ), - // }, - // }, - // }, - // }, - // }, + # Override command for Grafana persist data + // grafana+:: { + // local pvc = k.core.v1.persistentVolumeClaim, + // local deployment = k.apps.v1beta2.deployment, + // local container = deployment.mixin.spec.template.spec.containersType, + // local containerVolumeMount = container.volumeMountsType, + // local volume = deployment.mixin.spec.template.spec.volumesType, + // local grafanaStorage = pvc.new() + pvc.mixin.metadata.withNamespace($._config.namespace) + + // pvc.mixin.metadata.withName("grafana-storage") + + // pvc.mixin.spec.withAccessModes('ReadWriteMany') + + // pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }), + // deployment+: { + // // local storageVolumeName = grafanaStorage, + // // local storageVolume = volume.fromPersistentVolumeClaim(grafanaStorage), + // // }, + // // { + // spec+: { + // template+: { + // spec+: { + // containers: + // std.map( + // function(c) + // if c.name == 'grafana' then + // c { + // volumeMounts+: [ + // containerVolumeMount.new("grafana-storage", "/var/lib/grafana"), + // ], + // } + // else + // c, + // super.containers, + // ), + // volumes+: [ + // volume.fromPersistentVolumeClaim(grafanaStorage, "grafana-storage"), + // ], + // }, + // }, + // }, + // }, + // }, - // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset - kubeStateMetrics+:: { - deployment+: { - spec+: { - template+: { - spec+: { - containers: - std.filterMap( - function(c) c.name == 'addon-resizer', - function(c) - if std.startsWith(c.name, 'addon-resizer') then - c { - command: [ - '/pod_nanny', - '--container=kube-state-metrics', - '--cpu=' + $._config.kubeStateMetrics.baseCPU, - '--extra-cpu=' + $._config.kubeStateMetrics.cpuPerNode, - '--memory=' + $._config.kubeStateMetrics.baseMemory, - '--extra-memory=' + $._config.kubeStateMetrics.memoryPerNode, - '--acceptance-offset=5', - '--deployment=kube-state-metrics', - ], + // Add custom dashboards + grafanaDashboards+:: { + 'kubernetes-cluster-dashboard.json': (import 'grafana-dashboards/kubernetes-cluster-dashboard.json'), + 'prometheus-dashboard.json': (import 'grafana-dashboards/prometheus-dashboard.json'), + 'traefik-dashboard.json': (import 'grafana-dashboards/traefik-dashboard.json'), + }, + kubeStateMetrics+:: { + // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset + deployment+: { + spec+: { + template+: { + spec+: { + containers: + std.map( + function(c) + if std.startsWith(c.name, 'addon-resizer') then + c { + command: [ + '/pod_nanny', + '--container=kube-state-metrics', + '--cpu=100m', + '--extra-cpu=2m', + '--memory=150Mi', + '--extra-memory=30Mi', + '--acceptance-offset=5', + '--deployment=kube-state-metrics', + ], + } + else + c, + super.containers, + ), }, - super.containers, - ), - }, + }, + }, }, - }, }, - }, // Create ingress objects per application - ingress+: { - local secret = k.core.v1.secret, - local ingress = k.extensions.v1beta1.ingress, - local ingressTls = ingress.mixin.spec.tlsType, - local ingressRule = ingress.mixin.spec.rulesType, - local httpIngressPath = ingressRule.mixin.http.pathsType, + ingress+: { + local secret = k.core.v1.secret, + local ingress = k.extensions.v1beta1.ingress, + local ingressTls = ingress.mixin.spec.tlsType, + local ingressRule = ingress.mixin.spec.rulesType, + local httpIngressPath = ingressRule.mixin.http.pathsType, - 'alertmanager-main': - ingress.new() + - ingress.mixin.metadata.withName('alertmanager-main') + - ingress.mixin.metadata.withNamespace($._config.namespace) + - // ingress.mixin.metadata.withAnnotations({ - // 'nginx.ingress.kubernetes.io/auth-type': 'basic', - // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', - // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', - // }) + - ingress.mixin.spec.withRules( - ingressRule.new() + - ingressRule.withHost($._config.urls.alert_ingress) + - ingressRule.mixin.http.withPaths( - httpIngressPath.new() + - httpIngressPath.withPath('/') + - httpIngressPath.mixin.backend.withServiceName('alertmanager-main') + - httpIngressPath.mixin.backend.withServicePort('web') - ), - ), - 'grafana': - ingress.new() + - ingress.mixin.metadata.withName('grafana') + - ingress.mixin.metadata.withNamespace($._config.namespace) + - // ingress.mixin.metadata.withAnnotations({ - // 'nginx.ingress.kubernetes.io/auth-type': 'basic', - // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', - // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', - // }) + - ingress.mixin.spec.withRules( - ingressRule.new() + - ingressRule.withHost($._config.urls.grafana_ingress) + - ingressRule.mixin.http.withPaths( - httpIngressPath.new() + - httpIngressPath.withPath('/') + - httpIngressPath.mixin.backend.withServiceName('grafana') + - httpIngressPath.mixin.backend.withServicePort('http') - ), - ), - 'prometheus-k8s': - ingress.new() + - ingress.mixin.metadata.withName('prometheus-k8s') + - ingress.mixin.metadata.withNamespace($._config.namespace) + - // ingress.mixin.metadata.withAnnotations({ - // 'nginx.ingress.kubernetes.io/auth-type': 'basic', - // 'nginx.ingress.kubernetes.io/auth-secret': 'basic-auth', - // 'nginx.ingress.kubernetes.io/auth-realm': 'Authentication Required', - // }) + - ingress.mixin.spec.withRules( - ingressRule.new() + - ingressRule.withHost($._config.urls.prom_ingress) + - ingressRule.mixin.http.withPaths( - httpIngressPath.new() + - httpIngressPath.withPath('/') + - httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + - httpIngressPath.mixin.backend.withServicePort('web') - ), - ), - }, - }; - // + { - // Create basic auth secret - replace 'auth' file with your own - // Create with htpasswd -c auth [USERNAME] -// ingress+:: { -// 'basic-auth-secret': -// secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + -// secret.mixin.metadata.withNamespace($._config.namespace), -// }, -// }; + 'alertmanager-main': + ingress.new() + + ingress.mixin.metadata.withName('alertmanager-main') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.alert_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('alertmanager-main') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + 'grafana': + ingress.new() + + ingress.mixin.metadata.withName('grafana') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.grafana_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('grafana') + + httpIngressPath.mixin.backend.withServicePort('http') + ), + ), + 'prometheus-k8s': + ingress.new() + + ingress.mixin.metadata.withName('prometheus-k8s') + + ingress.mixin.metadata.withNamespace($._config.namespace) + + ingress.mixin.spec.withRules( + ingressRule.new() + + ingressRule.withHost($._config.urls.prom_ingress) + + ingressRule.mixin.http.withPaths( + httpIngressPath.new() + + httpIngressPath.withPath('/') + + httpIngressPath.mixin.backend.withServiceName('prometheus-k8s') + + httpIngressPath.mixin.backend.withServicePort('web') + ), + ), + // 'grafana-external': + // // Example external ingress with authentication + // ingress.new() + + // ingress.mixin.metadata.withName('grafana-external') + + // ingress.mixin.metadata.withNamespace($._config.namespace) + + // ingress.mixin.metadata.withLabels({'traffic-type': 'external'}) + + // ingress.mixin.metadata.withAnnotations({ + // 'ingress.kubernetes.io/auth-type': 'basic', + // 'ingress.kubernetes.io/auth-secret': 'basic-auth', + // }) + + // ingress.mixin.spec.withRules( + // ingressRule.new() + + // ingressRule.withHost($._config.urls.grafana_ingress_external) + + // ingressRule.mixin.http.withPaths( + // httpIngressPath.new() + + // httpIngressPath.withPath('/') + + // httpIngressPath.mixin.backend.withServiceName('grafana') + + // httpIngressPath.mixin.backend.withServicePort('http') + // ), + // ), + // 'basic-auth-secret': + // // First generate the auth secret with gen_auth.sh script + // secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + + // secret.mixin.metadata.withNamespace($._config.namespace), + }, + }; { ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + { ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + From 6fc585900c2ee1a7db137c5a62a8a2bb6d5a4de3 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:02:42 -0200 Subject: [PATCH 16/30] Import all jsonnet files --- main.jsonnet | 2 ++ 1 file changed, 2 insertions(+) diff --git a/main.jsonnet b/main.jsonnet index e594914..69e2344 100644 --- a/main.jsonnet +++ b/main.jsonnet @@ -1,3 +1,5 @@ (import 'operator_stack.jsonnet') + (import 'arm_exporter.jsonnet') + +(import 'smtp_server.jsonnet') + +(import 'metallb.jsonnet') + (import 'traefik.jsonnet') From 073b5169eeb3111b074da99a8283fa33e8b713ae Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:03:30 -0200 Subject: [PATCH 17/30] Generate all manifests --- manifests/alertmanager-alertmanager.yaml | 2 +- manifests/alertmanager-secret.yaml | 2 +- manifests/arm-exporter-daemonset.yaml | 2 +- manifests/grafana-config.yaml | 2 +- manifests/grafana-dashboardDefinitions.yaml | 9366 +++++++++++++++++ manifests/grafana-deployment.yaml | 20 +- manifests/kube-state-metrics-deployment.yaml | 48 + manifests/metallb-service.yaml | 16 + manifests/metallb-serviceMonitor.yaml | 19 + ...llerManagerPrometheusDiscoveryService.yaml | 15 + ...eus-kubeDnsPrometheusDiscoveryService.yaml | 15 + ...beSchedulerPrometheusDiscoveryService.yaml | 15 + manifests/prometheus-prometheus.yaml | 3 +- ...metheus-roleBindingSpecificNamespaces.yaml | 13 + .../prometheus-roleSpecificNamespaces.yaml | 16 + manifests/smtp-server-deployment.yaml | 38 + manifests/smtp-server-service.yaml | 14 + manifests/traefik-serviceMonitor.yaml | 2 +- 18 files changed, 9600 insertions(+), 8 deletions(-) create mode 100644 manifests/metallb-service.yaml create mode 100644 manifests/metallb-serviceMonitor.yaml create mode 100644 manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml create mode 100644 manifests/prometheus-kubeDnsPrometheusDiscoveryService.yaml create mode 100644 manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml create mode 100644 manifests/smtp-server-deployment.yaml create mode 100644 manifests/smtp-server-service.yaml diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index e57f39a..c8d7024 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -15,4 +15,4 @@ spec: runAsNonRoot: true runAsUser: 1000 serviceAccountName: alertmanager-main - version: v0.15.3 + version: v0.16.0 diff --git a/manifests/alertmanager-secret.yaml b/manifests/alertmanager-secret.yaml index 4a143fb..79fc7a2 100644 --- a/manifests/alertmanager-secret.yaml +++ b/manifests/alertmanager-secret.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== + alertmanager.yaml: Imdsb2JhbCI6IAogICJyZXNvbHZlX3RpbWVvdXQiOiAiNW0iCiJyZWNlaXZlcnMiOiAKLSAibmFtZSI6ICJudWxsIgoicm91dGUiOiAKICAiZ3JvdXBfYnkiOiAKICAtICJqb2IiCiAgImdyb3VwX2ludGVydmFsIjogIjVtIgogICJncm91cF93YWl0IjogIjMwcyIKICAicmVjZWl2ZXIiOiAibnVsbCIKICAicmVwZWF0X2ludGVydmFsIjogIjEyaCIKICAicm91dGVzIjogCiAgLSAibWF0Y2giOiAKICAgICAgImFsZXJ0bmFtZSI6ICJEZWFkTWFuc1N3aXRjaCIKICAgICJyZWNlaXZlciI6ICJudWxsIg== kind: Secret metadata: name: alertmanager-main diff --git a/manifests/arm-exporter-daemonset.yaml b/manifests/arm-exporter-daemonset.yaml index 11b3703..557e6fa 100644 --- a/manifests/arm-exporter-daemonset.yaml +++ b/manifests/arm-exporter-daemonset.yaml @@ -27,7 +27,7 @@ spec: - args: - --secure-listen-address=$(IP):9243 - --upstream=http://127.0.0.1:9243/ - image: carlosedp/kube-rbac-proxy:v0.4.0 + image: carlosedp/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy ports: - containerPort: 9243 diff --git a/manifests/grafana-config.yaml b/manifests/grafana-config.yaml index 51cc4b7..14966e2 100644 --- a/manifests/grafana-config.yaml +++ b/manifests/grafana-config.yaml @@ -1,6 +1,6 @@ apiVersion: v1 data: - grafana.ini: W2F1dGguYW5vbnltb3VzXQplbmFibGVkID0gZmFsc2UKW2F1dGguYmFzaWNdCmVuYWJsZWQgPSBmYWxzZQpbZGF0YWJhc2VdCnBhdGggPSAvZGF0YS9ncmFmYW5hLmRiCltwYXRoc10KZGF0YSA9IC92YXIvbGliL2dyYWZhbmEKbG9ncyA9IC92YXIvbGliL2dyYWZhbmEvbG9nCnBsdWdpbnMgPSAvdmFyL2xpYi9ncmFmYW5hL3BsdWdpbnMKcHJvdmlzaW9uaW5nID0gL2V0Yy9ncmFmYW5hL3Byb3Zpc2lvbmluZwpbc2Vzc2lvbl0KcHJvdmlkZXIgPSBtZW1vcnkKW3NtdHBdCmVuYWJsZWQgPSB0cnVlCmZyb21fYWRkcmVzcyA9IGNhcmxvc2VkcEBnbWFpbC5jb20KZnJvbV9uYW1lID0gR3JhZmFuYSBBbGVydApob3N0ID0gc210cC1zZXJ2ZXIubW9uaXRvcmluZy5zdmM6MjUKcGFzc3dvcmQgPSAKc2tpcF92ZXJpZnkgPSB0cnVlCnVzZXIgPSAK + grafana.ini: W2F1dGguYW5vbnltb3VzXQplbmFibGVkID0gZmFsc2UKW2F1dGguYmFzaWNdCmVuYWJsZWQgPSBmYWxzZQpbc2Vzc2lvbl0KcHJvdmlkZXIgPSBtZW1vcnkKW3NtdHBdCmVuYWJsZWQgPSB0cnVlCmZyb21fYWRkcmVzcyA9IGNhcmxvc2VkcEBnbWFpbC5jb20KZnJvbV9uYW1lID0gR3JhZmFuYSBBbGVydApob3N0ID0gc210cC1zZXJ2ZXIubW9uaXRvcmluZy5zdmM6MjUKcGFzc3dvcmQgPSAKc2tpcF92ZXJpZnkgPSB0cnVlCnVzZXIgPSAK kind: Secret metadata: name: grafana-config diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index f3374ed..1aec137 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4892,6 +4892,1848 @@ items: metadata: name: grafana-dashboard-k8s-resources-pod namespace: monitoring +- apiVersion: v1 + data: + kubernetes-cluster-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Monitor a Kubernetes cluster using Prometheus TSDB. Shows overall cluster CPU / Memory / Disk usage as well as individual pod statistics. ", + "editable": true, + "gnetId": 162, + "graphTooltip": 1, + "id": 12, + "links": [ + + ], + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 4, + "interval": null, + "isNew": true, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster memory usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": true, + "colorValue": false, + "colors": [ + "rgba(0, 0, 0, 0)", + "rgb(210, 1, 1)", + "#890f02" + ], + "datasource": "prometheus", + "format": "percentunit", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 2, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 23, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "instant": true, + "intervalFactor": 1, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "1.1", + "title": "Up Nodes", + "type": "singlestat", + "valueFontSize": "120%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 6, + "interval": null, + "isNew": true, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "avg(100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster CPU usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "columns": [ + + ], + "datasource": "prometheus", + "fontSize": "90%", + "gridPos": { + "h": 5, + "w": 8, + "x": 8, + "y": 2 + }, + "id": 25, + "links": [ + + ], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 2, + "desc": false + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "link": false, + "pattern": "Time", + "type": "date" + }, + { + "alias": "Uptime", + "colorMode": null, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "Value", + "thresholds": [ + + ], + "type": "number", + "unit": "s" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/endpoint|job|namespace|pod|service/", + "thresholds": [ + + ], + "type": "hidden", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "instance", + "preserveFormat": false, + "sanitize": false, + "thresholds": [ + + ], + "type": "string", + "unit": "short" + } + ], + "targets": [ + { + "expr": "(time() - node_boot_time_seconds)", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + } + ], + "title": "Node Uptime", + "transform": "table", + "transparent": true, + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 7 + }, + "id": 15, + "panels": [ + + ], + "title": "Nodes", + "type": "row" + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 3500000000 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "4m", + "now" + ] + }, + "reducer": { + "params": [ + + ], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Memory Usage alert", + "noDataState": "no_data", + "notifications": [ + + ] + }, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 3500000000 + } + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Memory Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 90 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "15m", + "now" + ] + }, + "reducer": { + "params": [ + + ], + "type": "max" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "CPU Usage alert", + "noDataState": "no_data", + "notifications": [ + + ] + }, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 11, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "100 - (avg by (instance) (irate(node_cpu_seconds_total{job=\"node-exporter\",mode=\"idle\"}[5m])) * 100)", + "format": "time_series", + "intervalFactor": 3, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 90 + } + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "decimals": null, + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "columns": [ + + ], + "datasource": "prometheus", + "fontSize": "100%", + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 17 + }, + "id": 31, + "links": [ + + ], + "pageSize": null, + "scroll": true, + "showHeader": true, + "sort": { + "col": 0, + "desc": true + }, + "styles": [ + { + "alias": "Time", + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "link": false, + "pattern": "Time", + "type": "date" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "dateFormat": "YYYY-MM-DD HH:mm:ss", + "decimals": 2, + "pattern": "condition|container|daemonset|endpoint|namespace|node", + "thresholds": [ + + ], + "type": "hidden", + "unit": "short" + }, + { + "alias": "", + "colorMode": null, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "decimals": 2, + "pattern": "/.*/", + "thresholds": [ + + ], + "type": "number", + "unit": "short" + } + ], + "targets": [ + { + "expr": "ALERTS{alertstate=\"firing\"}", + "format": "table", + "instant": true, + "intervalFactor": 1, + "refId": "A" + }, + { + "expr": "ALERTS{alertstate=\"firing\",alertname!=\"DeadMansSwitch\"}", + "format": "table", + "hide": true, + "intervalFactor": 1, + "refId": "B" + } + ], + "title": "Active Alerts", + "transform": "table", + "type": "table" + }, + { + "dashboardFilter": "", + "folderId": null, + "gridPos": { + "h": 9, + "w": 5, + "x": 12, + "y": 17 + }, + "id": 27, + "limit": 10, + "links": [ + + ], + "nameFilter": "", + "onlyAlertsOnDashboard": false, + "show": "current", + "sortOrder": 1, + "stateFilter": [ + + ], + "title": "Alarms", + "type": "alertlist" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "rgba(50, 172, 45, 0.97)", + "rgba(237, 129, 40, 0.89)", + "rgba(245, 54, 54, 0.9)" + ], + "datasource": "prometheus", + "decimals": null, + "editable": true, + "error": false, + "format": "percent", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": true, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 7, + "x": 17, + "y": 17 + }, + "id": 7, + "interval": null, + "isNew": true, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "(sum(node_filesystem_size_bytes{device=\u007e\"/dev/.*\"}) - sum(node_filesystem_free_bytes{device=\u007e\"/dev/.*\"}) ) / sum(node_filesystem_size_bytes{device=\u007e\"/dev/.*\"}) * 100", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "metric": "", + "refId": "A", + "step": 10 + } + ], + "thresholds": "65, 90", + "title": "Cluster Filesystem usage", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 1 + ], + "type": "lt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "C", + "5m", + "now" + ] + }, + "reducer": { + "params": [ + + ], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "Node Down", + "noDataState": "alerting", + "notifications": [ + + ] + }, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 4, + "w": 7, + "x": 17, + "y": 22 + }, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "hide": true, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Up Nodes", + "refId": "A" + }, + { + "expr": "count(up{job=\"kubelet\"})", + "format": "time_series", + "hide": true, + "instant": false, + "intervalFactor": 1, + "legendFormat": "Total Nodes", + "refId": "B" + }, + { + "expr": "avg(up{job=\"kubelet\"}) BY (job)", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "refId": "C" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "lt", + "value": 1 + } + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Up Nodes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "alert": { + "conditions": [ + { + "evaluator": { + "params": [ + 85 + ], + "type": "gt" + }, + "operator": { + "type": "and" + }, + "query": { + "params": [ + "A", + "1m", + "now" + ] + }, + "reducer": { + "params": [ + + ], + "type": "avg" + }, + "type": "query" + } + ], + "executionErrorState": "alerting", + "frequency": "60s", + "handler": 1, + "name": "CPU Temperature alert", + "noDataState": "no_data", + "notifications": [ + + ] + }, + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 26 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rpi_cpu_temperature_celsius", + "format": "time_series", + "intervalFactor": 5, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + { + "colorMode": "critical", + "fill": true, + "line": true, + "op": "gt", + "value": 85 + } + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "CPU Temperature", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "celsius", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 32 + }, + "id": 17, + "panels": [ + + ], + "title": "Pods", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 0, + "editable": true, + "error": false, + "fill": 0, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 33 + }, + "id": 3, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 270, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum by (container_name)( rate(container_cpu_usage_seconds_total{image!=\"\",container_name!=\"POD\"}[1m] ) ))", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{ container_name}}", + "metric": "container_cpu", + "refId": "A", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Pod CPU usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 40 + }, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 250, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum(container_memory_usage_bytes{image!=\"\"}) by (container_name, image))", + "format": "time_series", + "hide": true, + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "{{ container_name }}", + "metric": "container_memory_usage:sort_desc", + "refId": "A", + "step": 10 + }, + { + "expr": "topk(10,sum(container_memory_rss{name=\u007e\".+\"}) by (container_name))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ container_name }}", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Pod memory usage", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 47 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 550, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{name=\u007e\".+\"}[5m])) by (name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ name }}", + "refId": "A", + "step": 240 + }, + { + "expr": "rate(container_network_transmit_bytes_total{id=\"/\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "", + "refId": "B", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Sent Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": "", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": "", + "logBase": 10, + "max": 8, + "min": 0, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 47 + }, + "id": 21, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": 150, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "topk(10,sum(rate(container_network_receive_bytes_total{name=\u007e\".+\"}[5m])) by (name))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A", + "step": 240 + }, + { + "expr": "- rate(container_network_transmit_bytes_total{name=\u007e\".+\"}[$interval])", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "B", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Received Network Traffic per Container", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 1, + "value_type": "cumulative" + }, + "transparent": false, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "Bps", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "decimals": 2, + "editable": true, + "error": false, + "fill": 0, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 54 + }, + "id": 8, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": 200, + "sort": "current", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_receive_bytes_total{name!=\"\", kubernetes_pod_name=\u007e\".*\"}[1m]) ))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Receive Traffic", + "metric": "network", + "refId": "A", + "step": 10 + }, + { + "expr": "sort_desc(sum by (kubernetes_pod_name) (rate (container_network_transmit_bytes_total{name!=\"\", kubernetes_pod_name=\u007e\".*\"}[1m]) ))", + "format": "time_series", + "interval": "10s", + "intervalFactor": 1, + "legendFormat": "Transmit Traffic", + "metric": "network", + "refId": "B", + "step": 10 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Pod Network i/o", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "custom" + ], + "templating": { + "list": [ + + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Kubernetes cluster monitoring (via Prometheus)", + "uid": "82pBZCmRk", + "version": 2 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-kubernetes-cluster-dashboard + namespace: monitoring - apiVersion: v1 data: nodes.json: |- @@ -7049,6 +8891,5748 @@ items: metadata: name: grafana-dashboard-pods namespace: monitoring +- apiVersion: v1 + data: + prometheus-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + }, + { + "datasource": "$datasource", + "enable": true, + "expr": "count(sum(up{instance=\"$instance\"}) by (instance) < 1)", + "hide": false, + "iconColor": "rgb(250, 44, 18)", + "limit": 100, + "name": "downage", + "showIn": 0, + "step": "30s", + "tagKeys": "instance", + "textFormat": "prometheus down", + "titleFormat": "Downage", + "type": "alert" + }, + { + "datasource": "$datasource", + "enable": true, + "expr": "sum(changes(prometheus_config_last_reload_success_timestamp_seconds[10m])) by (instance)", + "hide": false, + "iconColor": "#fceaca", + "limit": 100, + "name": "Reload", + "showIn": 0, + "step": "5m", + "tagKeys": "instance", + "tags": [ + + ], + "titleFormat": "Reload", + "type": "tags" + } + ] + }, + "description": "Dashboard for monitoring of Prometheus v2.x.x", + "editable": true, + "gnetId": 3681, + "graphTooltip": 1, + "id": 13, + "iteration": 1549118131383, + "links": [ + { + "icon": "info", + "tags": [ + + ], + "targetBlank": true, + "title": "Dashboard's Github ", + "tooltip": "Github repo of this dashboard", + "type": "link", + "url": "https://github.com/FUSAKLA/Prometheus2-grafana-dashboard" + }, + { + "icon": "doc", + "tags": [ + + ], + "targetBlank": true, + "title": "Prometheus Docs", + "tooltip": "", + "type": "link", + "url": "http://prometheus.io/docs/introduction/overview/" + } + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 55, + "panels": [ + + ], + "repeat": null, + "title": "Header instance info", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#bf1b00" + ], + "datasource": "prometheus", + "decimals": 1, + "format": "s", + "gauge": { + "maxValue": 1000000, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 0, + "y": 1 + }, + "id": 41, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "min(time() - process_start_time_seconds{instance=\"$instance\"})", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#bf1b00" + ], + "datasource": "prometheus", + "format": "short", + "gauge": { + "maxValue": 1000000, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 8, + "x": 4, + "y": 1 + }, + "id": 42, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "prometheus_tsdb_head_series{instance=\"$instance\"}", + "format": "time_series", + "instant": false, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "500000,800000,1000000", + "title": "Total count of time series", + "type": "singlestat", + "valueFontSize": "150%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 12, + "y": 1 + }, + "id": 48, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "version", + "targets": [ + { + "expr": "prometheus_build_info{instance=\"$instance\"}", + "format": "table", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Version", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "decimals": 2, + "format": "ms", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 4, + "x": 15, + "y": 1 + }, + "id": 49, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "prometheus_tsdb_head_max_time{instance=\"$instance\"} - prometheus_tsdb_head_min_time{instance=\"$instance\"}", + "format": "time_series", + "instant": true, + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Actual head block length", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "content": "", + "gridPos": { + "h": 5, + "w": 2, + "x": 19, + "y": 1 + }, + "height": "", + "id": 50, + "links": [ + + ], + "mode": "html", + "title": "", + "transparent": true, + "type": "text" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": true, + "colors": [ + "#e6522c", + "rgba(237, 129, 40, 0.89)", + "#299c46" + ], + "datasource": "prometheus", + "decimals": 1, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 5, + "w": 3, + "x": 21, + "y": 1 + }, + "id": 52, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "2", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "10,20", + "title": "", + "transparent": true, + "type": "singlestat", + "valueFontSize": "200%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 56, + "panels": [ + + ], + "repeat": null, + "title": "Main info", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 7 + }, + "id": 15, + "legend": { + "avg": true, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "max(prometheus_engine_query_duration_seconds{instance=\"$instance\"}) by (instance, slice)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "max duration for {{slice}}", + "metric": "prometheus_local_storage_rushed_mode", + "refId": "A", + "step": 900 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Query elapsed time", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 7 + }, + "id": 17, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_tsdb_head_series_created_total{instance=\"$instance\"}[$aggregation_interval])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "created on {{ instance }}", + "metric": "prometheus_local_storage_maintain_series_duration_seconds_count", + "refId": "A", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_tsdb_head_series_removed_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) * -1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "removed on {{ instance }}", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Head series created/deleted", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 7 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "exceeded_sample_limit on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "A", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_duplicate_timestamp_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "duplicate_timestamp on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "B", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_bounds_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "out_of_bounds on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "C", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_target_scrapes_sample_out_of_order_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "out_of_order on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "D", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_rule_evaluation_failures_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "rule_evaluation_failure on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "G", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_tsdb_compactions_failed_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "tsdb_compactions_failed on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "K", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_tsdb_reloads_failures_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "tsdb_reloads_failures on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "L", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_tsdb_head_series_not_found{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "head_series_not_found on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "N", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_missed_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "evaluator_iterations_missed on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "O", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_evaluator_iterations_skipped_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "evaluator_iterations_skipped on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "P", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Prometheus errors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 14 + }, + "id": 57, + "panels": [ + + ], + "repeat": null, + "title": "Scrape & rule duration", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 15 + }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": false, + "sort": "max", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_target_interval_length_seconds{instance=\"$instance\",quantile=\"0.99\"} - $scrape_interval", + "format": "time_series", + "interval": "2m", + "intervalFactor": 1, + "legendFormat": "{{instance}}", + "metric": "", + "refId": "A", + "step": 300 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Scrape delay (counts with 1m scrape interval)", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 15 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Queue length", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_evaluator_duration_seconds{instance=\"$instance\"}) by (instance, quantile)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Queue length", + "metric": "prometheus_local_storage_indexing_queue_length", + "refId": "B", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Rule evaulation duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 22 + }, + "id": 58, + "panels": [ + + ], + "repeat": null, + "title": "Requests & queries", + "type": "row" + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 23 + }, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(http_requests_total{instance=\"$instance\"}[$aggregation_interval])) by (instance, handler) > 0", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ handler }} on {{ instance }}", + "metric": "", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Request count", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 23 + }, + "id": 16, + "legend": { + "avg": false, + "current": false, + "hideEmpty": true, + "hideZero": true, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(sum(http_request_duration_microseconds{instance=\"$instance\"}) by (instance, handler, quantile)) by (instance, handler) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{ handler }} on {{ instance }}", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Request duration per handler", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 23 + }, + "id": 19, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(http_request_size_bytes{instance=\"$instance\", quantile=\"0.99\"}[$aggregation_interval])) by (instance, handler) > 0", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "{{ handler }} in {{ instance }}", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Request size by handler", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Allocated bytes": "#F9BA8F", + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max count collector": "#bf1b00", + "Max count harvester": "#bf1b00", + "Max to persist": "#3F6833", + "RSS": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 23 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Max.*/", + "fill": 0, + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_engine_queries{instance=\"$instance\"}) by (instance, handler)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Current count ", + "metric": "last", + "refId": "A", + "step": 1800 + }, + { + "expr": "sum(prometheus_engine_queries_concurrent_max{instance=\"$instance\"}) by (instance, handler)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Max count", + "metric": "last", + "refId": "B", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Cont of concurent queries", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 59, + "panels": [ + + ], + "repeat": null, + "title": "Alerting", + "type": "row" + }, + { + "aliasColors": { + "Alert queue capacity on o collector": "#bf1b00", + "Alert queue capacity on o harvester": "#bf1b00", + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 31 + }, + "id": 20, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/.*capacity.*/", + "fill": 0, + "linewidth": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_notifications_queue_capacity{instance=\"$instance\"})by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Alert queue capacity ", + "metric": "prometheus_local_storage_checkpoint_last_size_bytes", + "refId": "A", + "step": 1800 + }, + { + "expr": "sum(prometheus_notifications_queue_length{instance=\"$instance\"})by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Alert queue size on ", + "metric": "prometheus_local_storage_checkpoint_last_size_bytes", + "refId": "B", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Alert queue size", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 31 + }, + "id": 21, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_notifications_alertmanagers_discovered{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Checkpoint chunks written/s", + "metric": "prometheus_local_storage_checkpoint_series_chunks_written_sum", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Count of discovered alertmanagers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 31 + }, + "id": 39, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_notifications_dropped_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "notifications_dropped on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "F", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_rule_evaluation_failures_total{rule_type=\"alerting\",instance=\"$instance\"}[$aggregation_interval])) by (rule_type,instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "rule_evaluation_failures on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Alerting errors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 60, + "panels": [ + + ], + "repeat": null, + "title": "Service discovery", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 39 + }, + "id": 43, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(prometheus_target_sync_length_seconds_count{scrape_job=\"consul\", instance=\"$instance\"}[$aggregation_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Consul target sync count", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Consul SD sync count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 39 + }, + "id": 44, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(prometheus_target_sync_length_seconds_count{scrape_job=\"marathon\", instance=\"$instance\"}[$aggregation_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Marathon target sync count", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Marathon SD sync count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 39 + }, + "id": 45, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "increase(prometheus_target_sync_length_seconds_count{scrape_job=\"kubernetes\"}[$aggregation_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Count of target synces", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Kubernetes SD sync count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 39 + }, + "id": 46, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_target_scrapes_exceeded_sample_limit_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "exceeded_sample_limit on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "A", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_sd_file_read_errors_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "sd_file_read_error on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "E", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_sd_consul_rpc_failures_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "sd_consul_rpc_failure on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "H", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_sd_marathon_refresh_failures_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "sd_marathon_refresh_failure on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "I", + "step": 1800 + }, + { + "expr": "sum(increase(prometheus_sd_openstack_refresh_failures_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "sd_openstack_refresh_failure on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "J", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Service discovery errors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 46 + }, + "id": 61, + "panels": [ + + ], + "repeat": null, + "title": "TSDB stats", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 47 + }, + "id": 36, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_tsdb_reloads_total{instance=\"$instance\"}[30m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Reloaded block from disk", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 47 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_tsdb_blocks_loaded{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Loaded data blocks", + "metric": "prometheus_local_storage_memory_chunkdescs", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Loaded data blocks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 47 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "prometheus_tsdb_head_series{instance=\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Time series count", + "metric": "prometheus_local_storage_memory_series", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Time series total count", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 47 + }, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total{instance=\"$instance\"}[$aggregation_interval])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "samples/s {{instance}}", + "metric": "prometheus_local_storage_ingested_samples_total", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Samples Appended per second", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": "", + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 54 + }, + "id": 62, + "panels": [ + + ], + "repeat": null, + "title": "Head block stats", + "type": "row" + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833", + "To persist": "#9AC48A" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 55 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/Max.*/", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_tsdb_head_chunks{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Head chunk count", + "metric": "prometheus_local_storage_memory_chunks", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Head chunks count", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 55 + }, + "id": 35, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(prometheus_tsdb_head_max_time{instance=\"$instance\"}) by (instance) - min(prometheus_tsdb_head_min_time{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Length of head block", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 55 + }, + "id": 4, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(prometheus_tsdb_head_chunks_created_total{instance=\"$instance\"}[$aggregation_interval])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "created on {{ instance }}", + "refId": "B" + }, + { + "expr": "sum(rate(prometheus_tsdb_head_chunks_removed_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) * -1", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "deleted on {{ instance }}", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Head Chunks Created/Deleted per second", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 62 + }, + "id": 63, + "panels": [ + + ], + "repeat": null, + "title": "Data maintenance", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 0, + "y": 63 + }, + "id": 33, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(prometheus_tsdb_compaction_duration_sum{instance=\"$instance\"}[30m]) / increase(prometheus_tsdb_compaction_duration_count{instance=\"$instance\"}[30m])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ instance }}", + "refId": "B" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Compaction duration", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 6, + "y": 63 + }, + "id": 34, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_tsdb_head_gc_duration_seconds{instance=\"$instance\"}) by (instance, quantile)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ quantile }} on {{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Go Garbage collection duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 63 + }, + "id": 37, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(prometheus_tsdb_wal_truncate_duration_seconds{instance=\"$instance\"}) by (instance, quantile)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ quantile }} on {{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "WAL truncate duration seconds", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 63 + }, + "id": 38, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(tsdb_wal_fsync_duration_seconds{instance=\"$instance\"}) by (instance, quantile)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ quantile }} {{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "WAL fsync duration seconds", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 70 + }, + "id": 64, + "panels": [ + + ], + "repeat": null, + "title": "RAM&CPU", + "type": "row" + }, + { + "aliasColors": { + "Allocated bytes": "#7EB26D", + "Allocated bytes - 1m max": "#BF1B00", + "Allocated bytes - 1m min": "#BF1B00", + "Allocated bytes - 5m max": "#BF1B00", + "Allocated bytes - 5m min": "#BF1B00", + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833", + "RSS": "#447EBC" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": null, + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 71 + }, + "id": 6, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/-/", + "fill": 0 + }, + { + "alias": "collector heap size", + "color": "#E0752D", + "fill": 0, + "linewidth": 2 + }, + { + "alias": "collector kubernetes memory limit", + "color": "#BF1B00", + "fill": 0, + "linewidth": 3 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(process_resident_memory_bytes{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Total resident memory - {{instance}}", + "metric": "process_resident_memory_bytes", + "refId": "B", + "step": 1800 + }, + { + "expr": "sum(go_memstats_alloc_bytes{instance=\"$instance\"}) by (instance)", + "format": "time_series", + "hide": false, + "intervalFactor": 2, + "legendFormat": "Total llocated bytes - {{instance}}", + "metric": "go_memstats_alloc_bytes", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Memory", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + "Allocated bytes": "#F9BA8F", + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833", + "RSS": "#890F02" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 71 + }, + "id": 7, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(go_memstats_alloc_bytes_total{instance=\"$instance\"}[$aggregation_interval])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Allocated Bytes/s", + "metric": "go_memstats_alloc_bytes", + "refId": "A", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Allocations per second", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "decimals": 2, + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 71 + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "hideEmpty": false, + "max": false, + "min": false, + "rightSide": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(process_cpu_seconds_total{instance=\"$instance\"}[$aggregation_interval])) by (instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "CPU/s", + "metric": "prometheus_local_storage_ingested_samples_total", + "refId": "B", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "CPU per second", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + "avg" + ] + }, + "yaxes": [ + { + "format": "none", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 78 + }, + "id": 65, + "panels": [ + + ], + "repeat": null, + "title": "Heapster description", + "type": "row" + }, + { + "content": "Two rows bellow can serve as example if you are running **Prometheus** in **Kubernetes** and uses **Heapster** with **InfluxDB**.\n\nThe schema and queries are very hard to generalize so you will have to tweak them but I leave them here for inspiration.", + "gridPos": { + "h": 3, + "w": 24, + "x": 0, + "y": 79 + }, + "id": 51, + "links": [ + + ], + "mode": "markdown", + "title": "Heapster rows", + "type": "text" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 82 + }, + "id": 66, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$influx_datasource", + "decimals": 2, + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 91 + }, + "id": 28, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu/limit.mean ", + "color": "#C15C17", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\")/1000 as \"usage_rate.mean\" FROM \"cpu/usage_rate\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\")/1000 as \"usage_rate.mean\" FROM \"cpu/usage_rate\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\")/1000 as \"CPU LIMIT\" FROM \"cpu/limit\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage/s", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$influx_datasource", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 91 + }, + "id": 29, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu/limit.mean ", + "color": "#C15C17", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"usage_rate.mean\" FROM \"memory/usage\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"usage_rate.mean\" FROM \"memory/usage\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"CPU LIMIT\" FROM \"memory/limit\" WHERE \"type\" = 'pod_container' AND \"container_name\" =\u007e /prometheus/ AND $timeFilter GROUP BY container_name, time($__interval) fill(null)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory usage", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "title": "Heapster RAM&CPU", + "type": "row" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 83 + }, + "id": 67, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$influx_datasource", + "fill": 1, + "gridPos": { + "h": 7, + "w": 10, + "x": 0, + "y": 92 + }, + "id": 30, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "/errors/", + "color": "#C15C17", + "pointradius": 3, + "points": true + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"rx_rate.mean \" FROM \"network/rx_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"rx_rate.mean\" FROM \"network/rx_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\")*-1 as \"tx_rate.mean \" FROM \"network/tx_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\")*-1 as \"tx_rate.mean\" FROM \"network/tx_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"rx_errors_rate.mean\" FROM \"network/rx_errors_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter AND \"value\">0 GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"rx_errors_rate.mean\" FROM \"network/rx_errors_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter AND \"value\">0 GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "F", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"tx_errors_rate.mean\" FROM \"network/rx_errors_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter AND \"value\">0 GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "G", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"tx_errors_rate.mean\" FROM \"network/rx_errors_rate\" WHERE \"labels\" =\u007e /app:prometheus/ AND $timeFilter AND \"value\">0 GROUP BY labels, time($__interval) fill(null)", + "rawQuery": true, + "refId": "H", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network rx[IN] / tx[OUT] in bytes/s", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$influx_datasource", + "fill": 1, + "gridPos": { + "h": 7, + "w": 10, + "x": 10, + "y": 92 + }, + "id": 31, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu/limit.mean", + "color": "#C15C17", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"DISK_USAGE.mean \" FROM \"filesystem/usage\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"DISK_USAGE.mean \" FROM \"filesystem/usage\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"DISK_LIMIT.mean \" FROM \"filesystem/limit\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"DISK_LIMIT.mean\" FROM \"filesystem/limit\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Disk usage", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "decbytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$influx_datasource", + "fill": 1, + "gridPos": { + "h": 7, + "w": 4, + "x": 20, + "y": 92 + }, + "id": 32, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "cpu/limit.mean ", + "color": "#C15C17", + "fill": 0 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"FREE_INODES.mean \" FROM \"filesystem/inodes_free\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": false, + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"FREE_INODES.mean\" FROM \"filesystem/inodes_free\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"USED_INODES.mean \" FROM \"filesystem/inodes\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + }, + { + "dsType": "influxdb", + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "orderByTime": "ASC", + "policy": "default", + "query": "SELECT mean(\"value\") as \"USED_INODES.mean\" FROM \"filesystem/inodes\" WHERE \"labels\" =\u007e /data-prometheus:true/ AND $timeFilter GROUP BY time($__interval) fill(null)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [ + + ], + "type": "mean" + } + ] + ], + "tags": [ + + ] + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Number of free INODES", + "tooltip": { + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "title": "Heapster host stats", + "type": "row" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 84 + }, + "id": 68, + "panels": [ + + ], + "repeat": null, + "title": "Contrac errors", + "type": "row" + }, + { + "aliasColors": { + "Chunks": "#1F78C1", + "Chunks to persist": "#508642", + "Max chunks": "#052B51", + "Max to persist": "#3F6833" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 85 + }, + "id": 47, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": false, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(net_conntrack_dialer_conn_failed_total{instance=\"$instance\"}[$aggregation_interval])) by (instance) > 0", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "conntrack_dialer_conn_failed on {{ instance }}", + "metric": "prometheus_local_storage_chunk_ops_total", + "refId": "M", + "step": 1800 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Net errors", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 2, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "5m", + "schemaVersion": 16, + "style": "dark", + "tags": [ + "custom" + ], + "templating": { + "list": [ + { + "auto": true, + "auto_count": 30, + "auto_min": "2m", + "current": { + "text": "auto", + "value": "$__auto_interval_aggregation_interval" + }, + "hide": 0, + "label": "aggregation intarval", + "name": "aggregation_interval", + "options": [ + { + "selected": true, + "text": "auto", + "value": "$__auto_interval_aggregation_interval" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "1m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "text": "10.32.0.53:9090", + "value": "10.32.0.53:9090" + }, + "datasource": "$datasource", + "definition": "", + "hide": 0, + "includeAll": false, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(prometheus_build_info, instance)", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "text": "60", + "value": "60" + }, + "hide": 0, + "label": "Scrape interval seconds", + "name": "scrape_interval", + "options": [ + { + "text": "60", + "value": "60" + } + ], + "query": "60", + "skipUrlSync": false, + "type": "constant" + }, + { + "current": { + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "label": "Prometheus datasource", + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "text": "No data sources found", + "value": "" + }, + "hide": 0, + "label": "InfluxDB datasource", + "name": "influx_datasource", + "options": [ + + ], + "query": "influxdb", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Prometheus2.0", + "uid": "XmsJC9mRz", + "version": 1 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-prometheus-dashboard + namespace: monitoring - apiVersion: v1 data: statefulset.json: |- @@ -7912,4 +15496,1786 @@ items: metadata: name: grafana-dashboard-statefulset namespace: monitoring +- apiVersion: v1 + data: + traefik-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Traefik dashboard prometheus", + "editable": true, + "gnetId": 5851, + "graphTooltip": 0, + "id": 20, + "iteration": 1549118220486, + "links": [ + + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 21, + "panels": [ + + ], + "title": "General", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#d44a3a", + "rgba(237, 129, 40, 0.89)", + "#299c46" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 13, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(kube_pod_status_ready{namespace=\"$namespace\",condition=\"true\",pod=\u007e\"traefik.*\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Traefik instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 37, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "time() - max(process_start_time_seconds{job=\u007e\"traefik.*\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "ms", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 6, + "y": 1 + }, + "id": 39, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(traefik_entrypoint_request_duration_seconds_sum) / sum(traefik_entrypoint_requests_total) * 1000", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Average response time", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + "Latency over 1 min": "rgb(9, 116, 190)", + "Latency over 5 min": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 2, + "gridPos": { + "h": 7, + "w": 14, + "x": 10, + "y": 1 + }, + "id": 11, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Latency over 5 min", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.$percentiles, sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\", code=\"200\",method=\"GET\"}[5m])) by (le))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Latency over 1 min", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Global latency $percentiles th perc over 5 min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.$percentiles, rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",code=\"200\",method=\"GET\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Per node latency $percentiles th perc over 5 min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 17, + "panels": [ + + ], + "title": "Frontends (entrypoints)", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_entrypoint_requests_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total requests", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Total requests over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 7, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_entrypoint_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_entrypoint_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Code 200", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Apdex score (over 5 min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 24, + "panels": [ + + ], + "title": "Backends", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 7, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_backend_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_backend_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Code 200", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Apdex score (over 5 min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 15, + "panels": [ + + ], + "title": "HTTP Codes stats", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=\u007e\"2..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Status code 2xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=\u007e\"5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Status code 5xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\"}[1m])) by (backend)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ backend }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Backend total requests over 1min per backend", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sortDesc": false, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code!\u007e\"2..|5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ method }} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Others status code over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 35, + "panels": [ + + ], + "title": "Pods ressources", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max memory used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested memory usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit memory usage", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Traefik max memory usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max cpu used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested cpu usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod_name=\u007e\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit cpu usage", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Traefik max CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "traefik" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "kube-system", + "value": "kube-system" + }, + "datasource": "prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(traefik_config_reloads_total, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "tags": [ + + ], + "text": "99", + "value": "99" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "percentiles", + "options": [ + { + "selected": false, + "text": "95", + "value": "95" + }, + { + "selected": true, + "text": "99", + "value": "99" + } + ], + "query": "95,99", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Traefik", + "uid": "000000113", + "version": 3 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-traefik-dashboard + namespace: monitoring kind: ConfigMapList diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 47b3034..2c3c7b2 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -16,7 +16,7 @@ spec: app: grafana spec: containers: - - image: carlosedp/monitoring-grafana:v5.4.0 + - image: grafana/grafana:5.4.3 name: grafana ports: - containerPort: 3000 @@ -57,6 +57,9 @@ spec: - mountPath: /grafana-dashboard-definitions/0/k8s-resources-pod name: grafana-dashboard-k8s-resources-pod readOnly: false + - mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard + name: grafana-dashboard-kubernetes-cluster-dashboard + readOnly: false - mountPath: /grafana-dashboard-definitions/0/nodes name: grafana-dashboard-nodes readOnly: false @@ -66,9 +69,15 @@ spec: - mountPath: /grafana-dashboard-definitions/0/pods name: grafana-dashboard-pods readOnly: false + - mountPath: /grafana-dashboard-definitions/0/prometheus-dashboard + name: grafana-dashboard-prometheus-dashboard + readOnly: false - mountPath: /grafana-dashboard-definitions/0/statefulset name: grafana-dashboard-statefulset readOnly: false + - mountPath: /grafana-dashboard-definitions/0/traefik-dashboard + name: grafana-dashboard-traefik-dashboard + readOnly: false - mountPath: /etc/grafana name: grafana-config readOnly: false @@ -102,6 +111,9 @@ spec: - configMap: name: grafana-dashboard-k8s-resources-pod name: grafana-dashboard-k8s-resources-pod + - configMap: + name: grafana-dashboard-kubernetes-cluster-dashboard + name: grafana-dashboard-kubernetes-cluster-dashboard - configMap: name: grafana-dashboard-nodes name: grafana-dashboard-nodes @@ -111,9 +123,15 @@ spec: - configMap: name: grafana-dashboard-pods name: grafana-dashboard-pods + - configMap: + name: grafana-dashboard-prometheus-dashboard + name: grafana-dashboard-prometheus-dashboard - configMap: name: grafana-dashboard-statefulset name: grafana-dashboard-statefulset + - configMap: + name: grafana-dashboard-traefik-dashboard + name: grafana-dashboard-traefik-dashboard - name: grafana-config secret: secretName: grafana-config diff --git a/manifests/kube-state-metrics-deployment.yaml b/manifests/kube-state-metrics-deployment.yaml index 32ef699..8e28a72 100644 --- a/manifests/kube-state-metrics-deployment.yaml +++ b/manifests/kube-state-metrics-deployment.yaml @@ -16,6 +16,54 @@ spec: app: kube-state-metrics spec: containers: + - args: + - --logtostderr + - --secure-listen-address=:8443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 + - --upstream=http://127.0.0.1:8081/ + image: carlosedp/kube-rbac-proxy:v0.4.1 + name: kube-rbac-proxy-main + ports: + - containerPort: 8443 + name: https-main + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + - args: + - --logtostderr + - --secure-listen-address=:9443 + - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 + - --upstream=http://127.0.0.1:8082/ + image: carlosedp/kube-rbac-proxy:v0.4.1 + name: kube-rbac-proxy-self + ports: + - containerPort: 9443 + name: https-self + resources: + limits: + cpu: 20m + memory: 40Mi + requests: + cpu: 10m + memory: 20Mi + - args: + - --host=127.0.0.1 + - --port=8081 + - --telemetry-host=127.0.0.1 + - --telemetry-port=8082 + image: carlosedp/kube-state-metrics:v1.5.0 + name: kube-state-metrics + resources: + limits: + cpu: 100m + memory: 150Mi + requests: + cpu: 100m + memory: 150Mi - command: - /pod_nanny - --container=kube-state-metrics diff --git a/manifests/metallb-service.yaml b/manifests/metallb-service.yaml new file mode 100644 index 0000000..63e08c5 --- /dev/null +++ b/manifests/metallb-service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: metallb-controller + name: metallb-controller + namespace: metallb-system +spec: + clusterIP: None + ports: + - name: http + port: 7472 + targetPort: 7472 + selector: + app: metallb + component: controller diff --git a/manifests/metallb-serviceMonitor.yaml b/manifests/metallb-serviceMonitor.yaml new file mode 100644 index 0000000..5e0cc81 --- /dev/null +++ b/manifests/metallb-serviceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: metallb-controller + name: metallb + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: http + scheme: http + jobLabel: k8s-app + namespaceSelector: + matchNames: + - metallb-system + selector: + matchLabels: + k8s-app: metallb-controller diff --git a/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml new file mode 100644 index 0000000..9506973 --- /dev/null +++ b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: kube-controller-manager + name: kube-controller-manager-prometheus-discovery + namespace: kube-system +spec: + clusterIP: None + ports: + - name: http-metrics + port: 10252 + targetPort: 10252 + selector: + component: kube-controller-manager diff --git a/manifests/prometheus-kubeDnsPrometheusDiscoveryService.yaml b/manifests/prometheus-kubeDnsPrometheusDiscoveryService.yaml new file mode 100644 index 0000000..34e746c --- /dev/null +++ b/manifests/prometheus-kubeDnsPrometheusDiscoveryService.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: kube-dns + name: kube-dns-prometheus-discovery + namespace: kube-system +spec: + clusterIP: None + ports: + - name: metrics + port: 9153 + targetPort: 9153 + selector: + k8s-app: kube-dns diff --git a/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml new file mode 100644 index 0000000..b4843c7 --- /dev/null +++ b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + k8s-app: kube-scheduler + name: kube-scheduler-prometheus-discovery + namespace: kube-system +spec: + clusterIP: None + ports: + - name: http-metrics + port: 10251 + targetPort: 10251 + selector: + component: kube-scheduler diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index 58ddfb9..db7f96d 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -41,5 +41,4 @@ spec: resources: requests: storage: 20Gi - storageClassName: nfs-ssd-node1 - version: v2.5.0 + version: v2.7.0 diff --git a/manifests/prometheus-roleBindingSpecificNamespaces.yaml b/manifests/prometheus-roleBindingSpecificNamespaces.yaml index 087cdcf..888e95a 100644 --- a/manifests/prometheus-roleBindingSpecificNamespaces.yaml +++ b/manifests/prometheus-roleBindingSpecificNamespaces.yaml @@ -52,4 +52,17 @@ items: - kind: ServiceAccount name: prometheus-k8s namespace: monitoring +- apiVersion: rbac.authorization.k8s.io/v1 + kind: RoleBinding + metadata: + name: prometheus-k8s + namespace: metallb-system + roleRef: + apiGroup: rbac.authorization.k8s.io + kind: Role + name: prometheus-k8s + subjects: + - kind: ServiceAccount + name: prometheus-k8s + namespace: monitoring kind: RoleBindingList diff --git a/manifests/prometheus-roleSpecificNamespaces.yaml b/manifests/prometheus-roleSpecificNamespaces.yaml index 56a7370..378b939 100644 --- a/manifests/prometheus-roleSpecificNamespaces.yaml +++ b/manifests/prometheus-roleSpecificNamespaces.yaml @@ -64,4 +64,20 @@ items: - get - list - watch +- apiVersion: rbac.authorization.k8s.io/v1 + kind: Role + metadata: + name: prometheus-k8s + namespace: metallb-system + rules: + - apiGroups: + - "" + resources: + - services + - endpoints + - pods + verbs: + - get + - list + - watch kind: RoleList diff --git a/manifests/smtp-server-deployment.yaml b/manifests/smtp-server-deployment.yaml new file mode 100644 index 0000000..df63d8d --- /dev/null +++ b/manifests/smtp-server-deployment.yaml @@ -0,0 +1,38 @@ +apiVersion: apps/v1beta2 +kind: Deployment +metadata: + labels: + run: smtp-server + name: smtp-server + namespace: monitoring +spec: + replicas: 1 + selector: + matchLabels: + run: smtp-server + template: + metadata: + labels: + run: smtp-server + spec: + containers: + - env: + - name: GMAIL_USER + valueFrom: + secretKeyRef: + key: username + name: smtp-account + - name: GMAIL_PASSWORD + valueFrom: + secretKeyRef: + key: password + name: smtp-account + - name: DISABLE_IPV6 + value: "True" + - name: RELAY_DOMAINS + value: :192.168.0.0/24:10.0.0.0/16 + image: carlosedp/docker-smtp:latest + name: smtp-server + ports: + - containerPort: 25 + name: smtp diff --git a/manifests/smtp-server-service.yaml b/manifests/smtp-server-service.yaml new file mode 100644 index 0000000..dc01fa3 --- /dev/null +++ b/manifests/smtp-server-service.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Service +metadata: + labels: + run: smtp-server + name: smtp-server + namespace: monitoring +spec: + ports: + - name: smtp + port: 25 + targetPort: smtp + selector: + run: smtp-server diff --git a/manifests/traefik-serviceMonitor.yaml b/manifests/traefik-serviceMonitor.yaml index fdf7d3d..fdd3333 100644 --- a/manifests/traefik-serviceMonitor.yaml +++ b/manifests/traefik-serviceMonitor.yaml @@ -9,7 +9,7 @@ spec: endpoints: - interval: 30s port: admin - scheme: https + scheme: http jobLabel: k8s-app namespaceSelector: matchNames: From 7f9f0b8b8392f643bfa275aab9a0106b00dbeeef Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:07:09 -0200 Subject: [PATCH 18/30] Remove example file --- example.jsonnet | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 example.jsonnet diff --git a/example.jsonnet b/example.jsonnet deleted file mode 100644 index 2a10509..0000000 --- a/example.jsonnet +++ /dev/null @@ -1,14 +0,0 @@ -local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + { - _config+:: { - namespace: 'monitoring', - }, -}; - -{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) } + -{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) } + -{ ['node-exporter-' + name]: kp.nodeExporter[name] for name in std.objectFields(kp.nodeExporter) } + -{ ['kube-state-metrics-' + name]: kp.kubeStateMetrics[name] for name in std.objectFields(kp.kubeStateMetrics) } + -{ ['alertmanager-' + name]: kp.alertmanager[name] for name in std.objectFields(kp.alertmanager) } + -{ ['prometheus-' + name]: kp.prometheus[name] for name in std.objectFields(kp.prometheus) } + -{ ['prometheus-adapter-' + name]: kp.prometheusAdapter[name] for name in std.objectFields(kp.prometheusAdapter) } + -{ ['grafana-' + name]: kp.grafana[name] for name in std.objectFields(kp.grafana) } From 61f5a2927bd648e55430db3a119a5ba015145914 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:07:37 -0200 Subject: [PATCH 19/30] Added anti-affinity to Prometheus and alertmanager --- .gitignore | 1 + manifests/alertmanager-alertmanager.yaml | 14 ++++++++++++++ manifests/prometheus-prometheus.yaml | 14 ++++++++++++++ operator_stack.jsonnet | 2 +- 4 files changed, 30 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 22d0d82..47a49ee 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ vendor +auth diff --git a/manifests/alertmanager-alertmanager.yaml b/manifests/alertmanager-alertmanager.yaml index c8d7024..5e1d1af 100644 --- a/manifests/alertmanager-alertmanager.yaml +++ b/manifests/alertmanager-alertmanager.yaml @@ -6,6 +6,20 @@ metadata: name: main namespace: monitoring spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: alertmanager + operator: In + values: + - main + namespaces: + - monitoring + topologyKey: kubernetes.io/hostname + weight: 100 baseImage: carlosedp/alertmanager nodeSelector: beta.kubernetes.io/os: linux diff --git a/manifests/prometheus-prometheus.yaml b/manifests/prometheus-prometheus.yaml index db7f96d..2b57e7d 100644 --- a/manifests/prometheus-prometheus.yaml +++ b/manifests/prometheus-prometheus.yaml @@ -6,6 +6,20 @@ metadata: name: k8s namespace: monitoring spec: + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: prometheus + operator: In + values: + - k8s + namespaces: + - monitoring + topologyKey: kubernetes.io/hostname + weight: 100 alerting: alertmanagers: - name: alertmanager-main diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 42abbee..c271ebd 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -1,7 +1,7 @@ local k = import 'ksonnet/ksonnet.beta.3/k.libsonnet'; local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + - // (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + + (import 'kube-prometheus/kube-prometheus-anti-affinity.libsonnet') + (import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') + (import 'image_sources_versions.jsonnet') + From 86ce17c4554f595aeeccf55d60687f71913ba2bd Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 18:22:43 -0200 Subject: [PATCH 20/30] Updated dashboard alert --- .../kubernetes-cluster-dashboard.json | 25 +++++++++++------ manifests/grafana-dashboardDefinitions.yaml | 27 +++++++++++++------ 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/grafana-dashboards/kubernetes-cluster-dashboard.json b/grafana-dashboards/kubernetes-cluster-dashboard.json index 797dd38..da58dbb 100644 --- a/grafana-dashboards/kubernetes-cluster-dashboard.json +++ b/grafana-dashboards/kubernetes-cluster-dashboard.json @@ -16,7 +16,7 @@ "editable": true, "gnetId": 162, "graphTooltip": 1, - "id": 12, + "id": 13, "links": [], "panels": [ { @@ -379,7 +379,7 @@ { "evaluator": { "params": [ - 3500000000 + 0.85 ], "type": "gt" }, @@ -388,19 +388,20 @@ }, "query": { "params": [ - "A", - "4m", + "B", + "5m", "now" ] }, "reducer": { "params": [], - "type": "avg" + "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", + "for": "0m", "frequency": "60s", "handler": 1, "name": "Memory Usage alert", @@ -448,6 +449,13 @@ "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "A" + }, + { + "expr": "(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes))/node_memory_MemTotal_bytes", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "B" } ], "thresholds": [ @@ -456,7 +464,7 @@ "fill": true, "line": true, "op": "gt", - "value": 3500000000 + "value": 0.85 } ], "timeFrom": null, @@ -705,6 +713,7 @@ }, { "dashboardFilter": "", + "dashboardTags": [], "folderId": null, "gridPos": { "h": 9, @@ -1666,6 +1675,6 @@ }, "timezone": "browser", "title": "Kubernetes cluster monitoring (via Prometheus)", - "uid": "82pBZCmRk", - "version": 2 + "uid": "82pBZCmRkasd", + "version": 1 } \ No newline at end of file diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 1aec137..305d84d 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -4913,7 +4913,7 @@ items: "editable": true, "gnetId": 162, "graphTooltip": 1, - "id": 12, + "id": 13, "links": [ ], @@ -5296,7 +5296,7 @@ items: { "evaluator": { "params": [ - 3500000000 + 0.84999999999999998 ], "type": "gt" }, @@ -5305,8 +5305,8 @@ items: }, "query": { "params": [ - "A", - "4m", + "B", + "5m", "now" ] }, @@ -5314,12 +5314,13 @@ items: "params": [ ], - "type": "avg" + "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", + "for": "0m", "frequency": "60s", "handler": 1, "name": "Memory Usage alert", @@ -5375,6 +5376,13 @@ items: "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "A" + }, + { + "expr": "(node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes))/node_memory_MemTotal_bytes", + "format": "time_series", + "hide": true, + "intervalFactor": 1, + "refId": "B" } ], "thresholds": [ @@ -5383,7 +5391,7 @@ items: "fill": true, "line": true, "op": "gt", - "value": 3500000000 + "value": 0.84999999999999998 } ], "timeFrom": null, @@ -5658,6 +5666,9 @@ items: }, { "dashboardFilter": "", + "dashboardTags": [ + + ], "folderId": null, "gridPos": { "h": 9, @@ -6727,8 +6738,8 @@ items: }, "timezone": "browser", "title": "Kubernetes cluster monitoring (via Prometheus)", - "uid": "82pBZCmRk", - "version": 2 + "uid": "82pBZCmRkasd", + "version": 1 } kind: ConfigMap metadata: From 2acf11bcd52b6131115413aa4effacd01a20d741 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 20:00:38 -0200 Subject: [PATCH 21/30] Add Grafana persistence --- manifests/grafana-deployment.yaml | 5 ++- manifests/grafana-storage.yaml | 11 +++++ operator_stack.jsonnet | 70 +++++++++++++------------------ 3 files changed, 44 insertions(+), 42 deletions(-) create mode 100644 manifests/grafana-storage.yaml diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 2c3c7b2..74bc1a2 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -88,8 +88,9 @@ spec: runAsUser: 65534 serviceAccountName: grafana volumes: - - emptyDir: {} - name: grafana-storage + - name: grafana-storage + persistentVolumeClaim: + claimName: grafana-storage - name: grafana-datasources secret: secretName: grafana-datasources diff --git a/manifests/grafana-storage.yaml b/manifests/grafana-storage.yaml new file mode 100644 index 0000000..e5a18b6 --- /dev/null +++ b/manifests/grafana-storage.yaml @@ -0,0 +1,11 @@ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: grafana-storage + namespace: monitoring +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 2Gi diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index c271ebd..d36b052 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -89,46 +89,36 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + }, }, - # Override command for Grafana persist data - // grafana+:: { - // local pvc = k.core.v1.persistentVolumeClaim, - // local deployment = k.apps.v1beta2.deployment, - // local container = deployment.mixin.spec.template.spec.containersType, - // local containerVolumeMount = container.volumeMountsType, - // local volume = deployment.mixin.spec.template.spec.volumesType, - // local grafanaStorage = pvc.new() + pvc.mixin.metadata.withNamespace($._config.namespace) + - // pvc.mixin.metadata.withName("grafana-storage") + - // pvc.mixin.spec.withAccessModes('ReadWriteMany') + - // pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }), - // deployment+: { - // // local storageVolumeName = grafanaStorage, - // // local storageVolume = volume.fromPersistentVolumeClaim(grafanaStorage), - // // }, - // // { - // spec+: { - // template+: { - // spec+: { - // containers: - // std.map( - // function(c) - // if c.name == 'grafana' then - // c { - // volumeMounts+: [ - // containerVolumeMount.new("grafana-storage", "/var/lib/grafana"), - // ], - // } - // else - // c, - // super.containers, - // ), - // volumes+: [ - // volume.fromPersistentVolumeClaim(grafanaStorage, "grafana-storage"), - // ], - // }, - // }, - // }, - // }, - // }, + # Override deployment for Grafana data persistence + grafana+:: { + deployment+: { + spec+: { + template+: { + spec+: { + volumes: + std.map( + function(v) + if v.name == 'grafana-storage' then + {'name':'grafana-storage', + 'persistentVolumeClaim': { + 'claimName': 'grafana-storage'} + } + else + v, + super.volumes + ), + }, + }, + }, + }, + storage: + local pvc = k.core.v1.persistentVolumeClaim; + local deployment = k.apps.v1beta2.deployment; + pvc.new() + pvc.mixin.metadata.withNamespace($._config.namespace) + + pvc.mixin.metadata.withName("grafana-storage") + + pvc.mixin.spec.withAccessModes('ReadWriteMany') + + pvc.mixin.spec.resources.withRequests({ storage: '2Gi' }), + }, // Add custom dashboards grafanaDashboards+:: { From 0af6b6eb2ec553c69d74cd0127c13472862ae689 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 20:20:03 -0200 Subject: [PATCH 22/30] Cleanup --- operator_stack.jsonnet | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index d36b052..821583b 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -10,10 +10,6 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + namespace: "monitoring", urls+:: { - prom_externalUrl: 'http://prometheus.internal.carlosedp.com', - alert_externalUrl: 'http://alertmanager.internal.carlosedp.com', - grafana_externalUrl: 'http://grafana.internal.carlosedp.com/', - prom_ingress: 'prometheus.internal.carlosedp.com', alert_ingress: 'alertmanager.internal.carlosedp.com', grafana_ingress: 'grafana.internal.carlosedp.com', @@ -76,7 +72,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + prometheus+: { spec+: { retention: '15d', - externalUrl: $._config.urls.prom_externalUrl, + externalUrl: 'http://' + $._config.urls.prom_ingress, storage: { volumeClaimTemplate: pvc.new() + @@ -113,7 +109,6 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + }, storage: local pvc = k.core.v1.persistentVolumeClaim; - local deployment = k.apps.v1beta2.deployment; pvc.new() + pvc.mixin.metadata.withNamespace($._config.namespace) + pvc.mixin.metadata.withName("grafana-storage") + pvc.mixin.spec.withAccessModes('ReadWriteMany') + @@ -208,8 +203,8 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + httpIngressPath.mixin.backend.withServicePort('web') ), ), - // 'grafana-external': // // Example external ingress with authentication + // 'grafana-external': // ingress.new() + // ingress.mixin.metadata.withName('grafana-external') + // ingress.mixin.metadata.withNamespace($._config.namespace) + From ac8f720239d5a98b2a13326e17213ec2c5569580 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Mon, 4 Feb 2019 20:54:28 -0200 Subject: [PATCH 23/30] Add CoreDNS dashboard --- grafana-dashboards/coredns-dashboard.json | 1316 +++++++++++++++++ manifests/grafana-dashboardDefinitions.yaml | 1479 +++++++++++++++++++ manifests/grafana-deployment.yaml | 6 + operator_stack.jsonnet | 1 + 4 files changed, 2802 insertions(+) create mode 100644 grafana-dashboards/coredns-dashboard.json diff --git a/grafana-dashboards/coredns-dashboard.json b/grafana-dashboards/coredns-dashboard.json new file mode 100644 index 0000000..c0d906c --- /dev/null +++ b/grafana-dashboards/coredns-dashboard.json @@ -0,0 +1,1316 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A dashboard for the CoreDNS DNS server.", + "editable": true, + "gnetId": 5926, + "graphTooltip": 0, + "id": 14, + "iteration": 1549319226130, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=~\"$instance\"}[5m])) by (proto)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{proto}}", + "refId": "A", + "step": 60 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (total)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + }, + { + "alias": "other", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_type_count_total{instance=~\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (by qtype)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=~\"$instance\"}[5m])) by (zone)", + "intervalFactor": 2, + "legendFormat": "{{zone}}", + "refId": "A", + "step": 60 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=~\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (by zone)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_do_count_total{instance=~\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "DO", + "refId": "A", + "step": 40 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=~\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (DO bit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "tcp:90", + "yaxis": 2 + }, + { + "alias": "tcp:99 ", + "yaxis": 2 + }, + { + "alias": "tcp:50", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99 ", + "refId": "A", + "step": 60 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90", + "refId": "B", + "step": 60 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50", + "refId": "C", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (size, udp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 7 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "tcp:90", + "yaxis": 1 + }, + { + "alias": "tcp:99 ", + "yaxis": 1 + }, + { + "alias": "tcp:50", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99 ", + "refId": "A", + "step": 60 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90", + "refId": "B", + "step": 60 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50", + "refId": "C", + "step": 60 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Requests (size,tcp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_response_rcode_count_total{instance=~\"$instance\"}[5m])) by (rcode)", + "intervalFactor": 2, + "legendFormat": "{{rcode}}", + "refId": "A", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Responses (by rcode)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=~\"$instance\"}[5m])) by (le, job))", + "intervalFactor": 2, + "legendFormat": "99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=~\"$instance\"}[5m])) by (le))", + "intervalFactor": 2, + "legendFormat": "90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=~\"$instance\"}[5m])) by (le))", + "intervalFactor": 2, + "legendFormat": "50%", + "refId": "C", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Responses (duration)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "ms", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "udp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:50%", + "yaxis": 2 + }, + { + "alias": "tcp:90%", + "yaxis": 2 + }, + { + "alias": "tcp:99%", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50%", + "metric": "", + "refId": "C", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Responses (size, udp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "udp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:90%", + "yaxis": 1 + }, + { + "alias": "tcp:99%", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_response_size_bytes_bucket{instance=~\"$instance\",proto=\"tcp\"}[5m])) by (le, proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50%", + "metric": "", + "refId": "C", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Responses (size, tcp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(coredns_cache_size{instance=~\"$instance\"}) by (type)", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cache (size)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": {}, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "misses", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_cache_hits_total{instance=~\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "hits:{{type}}", + "refId": "A", + "step": 40 + }, + { + "expr": "sum(rate(coredns_cache_misses_total{instance=~\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "misses", + "refId": "B", + "step": 40 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Cache (hitrate)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "dns", + "coredns" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [], + "query": "up{job=\"coredns\"}", + "refresh": 1, + "regex": ".*instance=\"(.*?)\".*", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "CoreDNS", + "uid": "q-QViRumz", + "version": 1 +} \ No newline at end of file diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 305d84d..25998ba 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1,5 +1,1484 @@ apiVersion: v1 items: +- apiVersion: v1 + data: + coredns-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "A dashboard for the CoreDNS DNS server.", + "editable": true, + "gnetId": 5926, + "graphTooltip": 0, + "id": 14, + "iteration": 1549319226130, + "links": [ + + ], + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 0 + }, + "id": 1, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=\u007e\"$instance\"}[5m])) by (proto)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{proto}}", + "refId": "A", + "step": 60 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=\u007e\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 60 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (total)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 0 + }, + "id": 12, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + }, + { + "alias": "other", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_type_count_total{instance=\u007e\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 60 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (by qtype)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 2, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=\u007e\"$instance\"}[5m])) by (zone)", + "intervalFactor": 2, + "legendFormat": "{{zone}}", + "refId": "A", + "step": 60 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=\u007e\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 60 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (by zone)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 10, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "total", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_request_do_count_total{instance=\u007e\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "DO", + "refId": "A", + "step": 40 + }, + { + "expr": "sum(rate(coredns_dns_request_count_total{instance=\u007e\"$instance\"}[5m]))", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (DO bit)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 12, + "y": 7 + }, + "id": 9, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "tcp:90", + "yaxis": 2 + }, + { + "alias": "tcp:99 ", + "yaxis": 2 + }, + { + "alias": "tcp:50", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99 ", + "refId": "A", + "step": 60 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90", + "refId": "B", + "step": 60 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"udp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50", + "refId": "C", + "step": 60 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (size, udp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 6, + "x": 18, + "y": 7 + }, + "id": 14, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "tcp:90", + "yaxis": 1 + }, + { + "alias": "tcp:99 ", + "yaxis": 1 + }, + { + "alias": "tcp:50", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99 ", + "refId": "A", + "step": 60 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90", + "refId": "B", + "step": 60 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le,proto))", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50", + "refId": "C", + "step": 60 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Requests (size,tcp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "id": 5, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_dns_response_rcode_count_total{instance=\u007e\"$instance\"}[5m])) by (rcode)", + "intervalFactor": 2, + "legendFormat": "{{rcode}}", + "refId": "A", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Responses (by rcode)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "id": 3, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=\u007e\"$instance\"}[5m])) by (le, job))", + "intervalFactor": 2, + "legendFormat": "99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=\u007e\"$instance\"}[5m])) by (le))", + "intervalFactor": 2, + "legendFormat": "90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_request_duration_milliseconds_bucket{instance=\u007e\"$instance\"}[5m])) by (le))", + "intervalFactor": 2, + "legendFormat": "50%", + "refId": "C", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Responses (duration)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 8, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "udp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:50%", + "yaxis": 2 + }, + { + "alias": "tcp:90%", + "yaxis": 2 + }, + { + "alias": "tcp:99%", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"udp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50%", + "metric": "", + "refId": "C", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Responses (size, udp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 21 + }, + "id": 13, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "udp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:50%", + "yaxis": 1 + }, + { + "alias": "tcp:90%", + "yaxis": 1 + }, + { + "alias": "tcp:99%", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:99%", + "refId": "A", + "step": 40 + }, + { + "expr": "histogram_quantile(0.90, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le,proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:90%", + "refId": "B", + "step": 40 + }, + { + "expr": "histogram_quantile(0.50, sum(rate(coredns_dns_response_size_bytes_bucket{instance=\u007e\"$instance\",proto=\"tcp\"}[5m])) by (le, proto)) ", + "intervalFactor": 2, + "legendFormat": "{{proto}}:50%", + "metric": "", + "refId": "C", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Responses (size, tcp)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 28 + }, + "id": 15, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(coredns_cache_size{instance=\u007e\"$instance\"}) by (type)", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "A", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Cache (size)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "editable": true, + "error": false, + "fill": 1, + "grid": { + + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 28 + }, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "connected", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "misses", + "yaxis": 2 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(coredns_cache_hits_total{instance=\u007e\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "hits:{{type}}", + "refId": "A", + "step": 40 + }, + { + "expr": "sum(rate(coredns_cache_misses_total{instance=\u007e\"$instance\"}[5m])) by (type)", + "intervalFactor": 2, + "legendFormat": "misses", + "refId": "B", + "step": 40 + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Cache (hitrate)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "pps", + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "dns", + "coredns" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "text": "All", + "value": "$__all" + }, + "datasource": "prometheus", + "definition": "", + "hide": 0, + "includeAll": true, + "label": "Instance", + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "up{job=\"coredns\"}", + "refresh": 1, + "regex": ".*instance=\"(.*?)\".*", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "now": true, + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "CoreDNS", + "uid": "q-QViRumz", + "version": 1 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-coredns-dashboard + namespace: monitoring - apiVersion: v1 data: k8s-cluster-rsrc-use.json: |- diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 74bc1a2..e4e6e77 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -42,6 +42,9 @@ spec: - mountPath: /etc/grafana/provisioning/dashboards name: grafana-dashboards readOnly: false + - mountPath: /grafana-dashboard-definitions/0/coredns-dashboard + name: grafana-dashboard-coredns-dashboard + readOnly: false - mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use name: grafana-dashboard-k8s-cluster-rsrc-use readOnly: false @@ -97,6 +100,9 @@ spec: - configMap: name: grafana-dashboards name: grafana-dashboards + - configMap: + name: grafana-dashboard-coredns-dashboard + name: grafana-dashboard-coredns-dashboard - configMap: name: grafana-dashboard-k8s-cluster-rsrc-use name: grafana-dashboard-k8s-cluster-rsrc-use diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 821583b..8e596c8 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -120,6 +120,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + 'kubernetes-cluster-dashboard.json': (import 'grafana-dashboards/kubernetes-cluster-dashboard.json'), 'prometheus-dashboard.json': (import 'grafana-dashboards/prometheus-dashboard.json'), 'traefik-dashboard.json': (import 'grafana-dashboards/traefik-dashboard.json'), + 'coredns-dashboard.json': (import 'grafana-dashboards/coredns-dashboard.json'), }, kubeStateMetrics+:: { // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset From 4902887205795020f7267fe02664323c3576c5cf Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Tue, 5 Feb 2019 11:57:43 -0200 Subject: [PATCH 24/30] Update Grafana dashboard --- .../kubernetes-cluster-dashboard.json | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/grafana-dashboards/kubernetes-cluster-dashboard.json b/grafana-dashboards/kubernetes-cluster-dashboard.json index da58dbb..9f42016 100644 --- a/grafana-dashboards/kubernetes-cluster-dashboard.json +++ b/grafana-dashboards/kubernetes-cluster-dashboard.json @@ -16,7 +16,7 @@ "editable": true, "gnetId": 162, "graphTooltip": 1, - "id": 13, + "id": 7, "links": [], "panels": [ { @@ -1358,10 +1358,10 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{name=~\".+\"}[5m])) by (name))", + "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{pod_name=~\".+\"}[5m])) by (pod_name))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ name }}", + "legendFormat": "{{ pod_name }}", "refId": "A", "step": 240 }, @@ -1383,7 +1383,7 @@ "tooltip": { "msResolution": true, "shared": true, - "sort": 1, + "sort": 2, "value_type": "cumulative" }, "transparent": false, @@ -1465,19 +1465,19 @@ "steppedLine": false, "targets": [ { - "expr": "topk(10,sum(rate(container_network_receive_bytes_total{name=~\".+\"}[5m])) by (name))", + "expr": "topk(10,sum(rate(container_network_receive_bytes_total{pod_name=~\".+\"}[5m])) by (pod_name))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{name}}", + "legendFormat": "{{pod_name}}", "refId": "A", "step": 240 }, { - "expr": "- rate(container_network_transmit_bytes_total{name=~\".+\"}[$interval])", + "expr": "- rate(container_network_transmit_bytes_total{pod_name=~\".+\"}[$interval])", "format": "time_series", "hide": true, "intervalFactor": 2, - "legendFormat": "{{name}}", + "legendFormat": "{{pod_name}}", "refId": "B", "step": 10 } @@ -1490,7 +1490,7 @@ "tooltip": { "msResolution": true, "shared": true, - "sort": 1, + "sort": 2, "value_type": "cumulative" }, "transparent": false, From 645ac5333f04c6932bc7a60372ba3e07d7a25dd0 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Tue, 5 Feb 2019 16:52:36 -0200 Subject: [PATCH 25/30] Fix arm-exporter proxy listen port --- arm_exporter.jsonnet | 2 +- manifests/arm-exporter-daemonset.yaml | 2 +- manifests/grafana-dashboardDefinitions.yaml | 18 +++++++++--------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/arm_exporter.jsonnet b/arm_exporter.jsonnet index e5a0792..47d3ac0 100644 --- a/arm_exporter.jsonnet +++ b/arm_exporter.jsonnet @@ -22,7 +22,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + local proxy = container.new('kube-rbac-proxy', $._config.imageRepos.kubeRbacProxy + ':' + $._config.versions.kubeRbacProxy) + container.withArgs([ - '--secure-listen-address=$(IP):9243', + '--secure-listen-address=:9243', '--upstream=http://127.0.0.1:9243/', ]) + container.withPorts(containerPort.new(9243) + containerPort.withHostPort(9243) + containerPort.withName('https')) + diff --git a/manifests/arm-exporter-daemonset.yaml b/manifests/arm-exporter-daemonset.yaml index 557e6fa..de2cdf1 100644 --- a/manifests/arm-exporter-daemonset.yaml +++ b/manifests/arm-exporter-daemonset.yaml @@ -25,7 +25,7 @@ spec: cpu: 50m memory: 50Mi - args: - - --secure-listen-address=$(IP):9243 + - --secure-listen-address=:9243 - --upstream=http://127.0.0.1:9243/ image: carlosedp/kube-rbac-proxy:v0.4.1 name: kube-rbac-proxy diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 25998ba..3f93c1b 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -6392,7 +6392,7 @@ items: "editable": true, "gnetId": 162, "graphTooltip": 1, - "id": 13, + "id": 7, "links": [ ], @@ -7864,10 +7864,10 @@ items: "steppedLine": false, "targets": [ { - "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{name=\u007e\".+\"}[5m])) by (name))", + "expr": "topk(10,sum(rate(container_network_transmit_bytes_total{pod_name=\u007e\".+\"}[5m])) by (pod_name))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{ name }}", + "legendFormat": "{{ pod_name }}", "refId": "A", "step": 240 }, @@ -7893,7 +7893,7 @@ items: "tooltip": { "msResolution": true, "shared": true, - "sort": 1, + "sort": 2, "value_type": "cumulative" }, "transparent": false, @@ -7985,19 +7985,19 @@ items: "steppedLine": false, "targets": [ { - "expr": "topk(10,sum(rate(container_network_receive_bytes_total{name=\u007e\".+\"}[5m])) by (name))", + "expr": "topk(10,sum(rate(container_network_receive_bytes_total{pod_name=\u007e\".+\"}[5m])) by (pod_name))", "format": "time_series", "intervalFactor": 2, - "legendFormat": "{{name}}", + "legendFormat": "{{pod_name}}", "refId": "A", "step": 240 }, { - "expr": "- rate(container_network_transmit_bytes_total{name=\u007e\".+\"}[$interval])", + "expr": "- rate(container_network_transmit_bytes_total{pod_name=\u007e\".+\"}[$interval])", "format": "time_series", "hide": true, "intervalFactor": 2, - "legendFormat": "{{name}}", + "legendFormat": "{{pod_name}}", "refId": "B", "step": 10 } @@ -8014,7 +8014,7 @@ items: "tooltip": { "msResolution": true, "shared": true, - "sort": 1, + "sort": 2, "value_type": "cumulative" }, "transparent": false, From cb0c14feeddd92cb99e9ca97af2a2baf60ca899a Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Tue, 5 Feb 2019 16:54:41 -0200 Subject: [PATCH 26/30] Tag arm_exporter to arm64 only nodes --- arm_exporter.jsonnet | 2 +- manifests/arm-exporter-daemonset.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arm_exporter.jsonnet b/arm_exporter.jsonnet index 47d3ac0..5eb02ce 100644 --- a/arm_exporter.jsonnet +++ b/arm_exporter.jsonnet @@ -36,7 +36,7 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + daemonset.mixin.metadata.withLabels(podLabels) + daemonset.mixin.spec.selector.withMatchLabels(podLabels) + daemonset.mixin.spec.template.metadata.withLabels(podLabels) + - daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/os': 'linux' }) + + daemonset.mixin.spec.template.spec.withNodeSelector({ 'beta.kubernetes.io/arch': 'arm64' }) + daemonset.mixin.spec.template.spec.withContainers(c), serviceMonitor: { diff --git a/manifests/arm-exporter-daemonset.yaml b/manifests/arm-exporter-daemonset.yaml index de2cdf1..453048c 100644 --- a/manifests/arm-exporter-daemonset.yaml +++ b/manifests/arm-exporter-daemonset.yaml @@ -41,4 +41,4 @@ spec: cpu: 10m memory: 20Mi nodeSelector: - beta.kubernetes.io/os: linux + beta.kubernetes.io/arch: arm64 From 8f3d7c448d55fff415323773f7064755acb44061 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Tue, 5 Feb 2019 17:41:01 -0200 Subject: [PATCH 27/30] Updated README --- Readme.md | 84 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/Readme.md b/Readme.md index 55929ee..39ca7e0 100644 --- a/Readme.md +++ b/Readme.md @@ -1,32 +1,66 @@ -# Prometheus Operator for ARM platform +# Prometheus Operator for ARM / X86-64 platforms The Prometheus Operator for Kubernetes provides easy monitoring definitions for Kubernetes services and deployment and management of Prometheus instances. -This project aims on porting the [official manifests](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) and images to the ARM platform. This have been tested on a ARM64 Kubernetes cluster deployed as [this article](https://medium.com/@carlosedp/building-an-arm-kubernetes-cluster-ef31032636f9). +This have been tested on a hybrid ARM64 / X84-64 Kubernetes cluster deployed as [this article](https://medium.com/@carlosedp/building-a-hybrid-x86-64-and-arm-kubernetes-cluster-e7f94ff6e51d). -## Changes to Kubeadm for Prometheus Operator +This repository collects Kubernetes manifests, Grafana dashboards, and Prometheus rules combined with documentation and scripts to provide easy to operate end-to-end Kubernetes cluster monitoring with Prometheus using the Prometheus Operator. -According to the official deployment documentation [here](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/docs/kube-prometheus-on-kubeadm.md), a couple of changes on the cluster are required: +The content of this project is written in jsonnet and is an extension of the fantastic [kube-prometheus](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus) project. -We need to expose the cadvisor that is installed and managed by the kubelet daemon and allow webhook token authentication. To do so, we do the following on **all the masters and nodes**: +Components included in this package: -```bash -# Enable cadvisor port -sudo sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +* The Prometheus Operator +* Highly available Prometheus +* Highly available Alertmanager +* Prometheus node-exporter +* ARM_exporter to generate temperature metrics +* MetalLB metrics +* Traefik metrics +* kube-state-metrics +* Grafana -# Enable Webhook authorization -sudo perl -pi -e "s/(?:--authentication-token-webhook=true )*--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/g" /etc/systemd/system/kubelet.service.d/10-kubeadm.conf +## Quickstart -sudo systemctl daemon-reload -sudo systemctl restart kubelet +The repository already provides a set of compiled manifests to be applied into the cluster. The deployment can be customized thru the jsonnet files. + +To simply deploy the stack, run: + +``` +$ kubectl apply -f manifests/ + +# It can take a few seconds for the above 'create manifests' command to fully create the following resources, so verify the resources are ready before proceeding. +$ until kubectl get customresourcedefinitions servicemonitors.monitoring.coreos.com ; do date; sleep 1; echo ""; done +$ until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""; done + +$ kubectl apply -f manifests/ # This command sometimes may need to be done twice (to workaround a race condition). ``` -In case you already have a Kubernetes deployed with kubeadm, change the address kube-controller-manager and kube-scheduler listens **on master node** in addition to previous kubelet change: +## Customizing -```bash -# Make kube-controller ad kube-scheduler listen on all addresses -sudo sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml -sudo sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml +The content of this project consists of a set of jsonnet files making up a library to be consumed. + +### Pre-reqs + +The project requires json-bundler and the jsonnet compiler. The Makefile does the heavy-lifting of installing: + +``` +git clone https://github.com/carlosedp/prometheus-operator-ARM +cd prometheus-operator-ARM +make vendor +# Change the jsonnet files... +make +``` +After this, a new customized set of manifests is built into the `manifests` dir. To apply to your cluster, run: + +``` +make deploy +``` + +To uninstall, run: + +``` +make teardown ``` ## Images @@ -60,11 +94,17 @@ This project depends on the following images: * Autobuild: No autobuild yet. Use provided `build_images.sh` script. * Images: https://hub.docker.com/r/carlosedp/prometheus-operator +**Prometheus-adapter** + +* Source: https://github.com/DirectXMan12/k8s-prometheus-adapter +* Autobuild: No autobuild yet. Use provided `build_images.sh` script. +* Images: https://hub.docker.com/r/carlosedp/k8s-prometheus-adapter + **Grafana** * Source: https://github.com/carlosedp/grafana-ARM * Autobuild: https://travis-ci.org/carlosedp/grafana-ARM -* Images: https://hub.docker.com/r/carlosedp/monitoring-grafana/ +* Images: https://hub.docker.com/r/grafana/grafana/ **Kube-state-metrics** @@ -74,7 +114,7 @@ This project depends on the following images: **Addon-resizer** -* Source: +* Source: https://github.com/kubernetes/autoscaler/tree/master/addon-resizer * Autobuild: No autobuild yet. Use provided `build_images.sh` script. * Images: https://hub.docker.com/r/carlosedp/addon-resizer @@ -97,3 +137,9 @@ This project depends on the following images: Source: https://github.com/carlosedp/docker-smtp Autobuild: https://travis-ci.org/carlosedp/docker-smtp Images: https://hub.docker.com/r/carlosedp/docker-smtp + +**Kube-rbac-proxy** + +Source: https://github.com/brancz/kube-rbac-proxy +Autobuild: No autobuild yet. Use provided `build_images.sh` script. +Images: https://hub.docker.com/r/carlosedp/kube-rbac-proxy From 57047c06c5366272ade26f93e5eed94921e56d32 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Fri, 15 Feb 2019 10:22:45 -0200 Subject: [PATCH 28/30] Cleanup after merging addon-resizer PR to upstream. Update libs --- jsonnetfile.lock.json | 10 +- ...r-0prometheusCustomResourceDefinition.yaml | 61 +++++++++- .../0prometheus-operator-clusterRole.yaml | 2 + manifests/grafana-dashboardDefinitions.yaml | 112 +++++++++--------- manifests/prometheus-rules.yaml | 46 ++++--- .../prometheus-serviceMonitorApiserver.yaml | 8 ++ .../prometheus-serviceMonitorKubelet.yaml | 10 ++ operator_stack.jsonnet | 31 ----- 8 files changed, 169 insertions(+), 111 deletions(-) diff --git a/jsonnetfile.lock.json b/jsonnetfile.lock.json index c726a9a..1fb7373 100644 --- a/jsonnetfile.lock.json +++ b/jsonnetfile.lock.json @@ -8,7 +8,7 @@ "subdir": "contrib/kube-prometheus/jsonnet/kube-prometheus" } }, - "version": "0e22050ca3ebe6b7033f8c0f1cb1605e3abe63c5" + "version": "685bc278917085efe30ef6ff7aecc532387da693" }, { "name": "ksonnet", @@ -28,7 +28,7 @@ "subdir": "" } }, - "version": "668950e4af13f0153fa1d7b58ebe7023b33f2217" + "version": "5525c8cc8a4a52d272bdaf481dd77b53a0c0f051" }, { "name": "grafonnet", @@ -48,7 +48,7 @@ "subdir": "grafana-builder" } }, - "version": "ec3d4f943df01f517a083305666cd1c87bcc7e94" + "version": "403b7d0120d2903d21854eae217b4e4863c454d1" }, { "name": "grafana", @@ -68,7 +68,7 @@ "subdir": "jsonnet/prometheus-operator" } }, - "version": "174c9bf17bec78b055e8e63c6ae3a3dc9bb0a3a8" + "version": "338addbabc8a29b46840df0bb0355c12b96a6f21" }, { "name": "etcd-mixin", @@ -78,7 +78,7 @@ "subdir": "Documentation/etcd-mixin" } }, - "version": "1fe6f109c87c4fa47775426a6a60c3b954ed5c33" + "version": "4cd0bf8ea846a0d158761d55899f631eb2a423cf" } ] } diff --git a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml index 627ce96..158c5cb 100644 --- a/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml +++ b/manifests/0prometheus-operator-0prometheusCustomResourceDefinition.yaml @@ -1538,6 +1538,14 @@ spec: required: - name type: array + enableAdminAPI: + description: 'Enable access to prometheus web admin API. Defaults to + the value of `false`. WARNING: Enabling the admin APIs enables mutating + endpoints, to delete data, shutdown Prometheus, and more. Enabling + this should be done with care and the user is advised to add additional + authentication authorization via a proxy to ensure only clients authorized + to perform these actions can do so. For more information see https://prometheus.io/docs/prometheus/latest/querying/api/#tsdb-admin-apis' + type: boolean evaluationInterval: description: Interval between consecutive evaluations. type: string @@ -1572,6 +1580,9 @@ spec: description: ListenLocal makes the Prometheus server listen on loopback, so that it does not bind against the Pod IP. type: boolean + logFormat: + description: Log format for Prometheus to be configured with. + type: string logLevel: description: Log level for Prometheus to be configured with. type: string @@ -2059,6 +2070,11 @@ spec: description: MinBackoff is the initial retry delay. Gets doubled for every retry. type: string + minShards: + description: MinShards is the minimum number of shards, i.e. + amount of concurrency. + format: int32 + type: integer remoteTimeout: description: Timeout for requests to the remote write endpoint. type: string @@ -2243,6 +2259,25 @@ spec: "In", and the values array contains only "value". The requirements are ANDed. type: object + rules: + description: /--rules.*/ command-line arguments + properties: + alert: + description: /--rules.alert.*/ command-line arguments + properties: + forGracePeriod: + description: Minimum duration between alert and restored 'for' + state. This is maintained only for alerts with configured + 'for' time greater than grace period. + type: string + forOutageTolerance: + description: Max time to tolerate prometheus outage for restoring + 'for' state of alert. + type: string + resendDelay: + description: Minimum amount of time to wait before resending + an alert to Alertmanager. + type: string scrapeInterval: description: Interval between consecutive scrapes. type: string @@ -2941,8 +2976,9 @@ spec: description: Thanos base image if other than default. type: string gcs: - description: ThanosGCSSpec defines parameters for use of Google - Cloud Storage (GCS) with Thanos. + description: 'Deprecated: ThanosGCSSpec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosGCSSpec + will be removed.' properties: bucket: description: Google Cloud Storage bucket name for stored blocks. @@ -2970,6 +3006,22 @@ spec: to ensure the Prometheus Operator knows what version of Thanos is being configured. type: string + objectStorageConfig: + description: SecretKeySelector selects a key of a Secret. + properties: + key: + description: The key of the secret to select from. Must be + a valid secret key. + type: string + name: + description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' + type: string + optional: + description: Specify whether the Secret or it's key must be + defined + type: boolean + required: + - key peers: description: Peers is a DNS name for Thanos to discover peers through. type: string @@ -2988,8 +3040,9 @@ spec: to an implementation-defined value. More info: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/' type: object s3: - description: ThanosS3Spec defines parameters for of AWS Simple Storage - Service (S3) with Thanos. (S3 compatible services apply as well) + description: 'Deprecated: ThanosS3Spec should be configured with + an ObjectStorageConfig secret starting with Thanos v0.2.0. ThanosS3Spec + will be removed.' properties: accessKey: description: SecretKeySelector selects a key of a Secret. diff --git a/manifests/0prometheus-operator-clusterRole.yaml b/manifests/0prometheus-operator-clusterRole.yaml index e0ac283..123f78e 100644 --- a/manifests/0prometheus-operator-clusterRole.yaml +++ b/manifests/0prometheus-operator-clusterRole.yaml @@ -44,11 +44,13 @@ rules: - "" resources: - services + - services/finalizers - endpoints verbs: - get - create - update + - delete - apiGroups: - "" resources: diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index 3f93c1b..610ffd5 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -1510,7 +1510,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -1539,7 +1539,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_cpu_utilisation:avg1m * node:node_num_cpu:sum / scalar(sum(node:node_num_cpu:sum))", + "expr": "node:cluster_cpu_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1596,7 +1596,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -1694,7 +1694,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -1723,7 +1723,7 @@ items: "steppedLine": false, "targets": [ { - "expr": "node:node_memory_utilisation:ratio", + "expr": "node:cluster_memory_utilisation:ratio", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{node}}", @@ -1780,7 +1780,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -1878,7 +1878,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -1964,7 +1964,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -2062,7 +2062,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -2148,7 +2148,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -2246,7 +2246,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -2335,7 +2335,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -2387,7 +2387,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Cluster", + "title": "Kubernetes / USE Method / Cluster", "uid": "a6e7d1362e1ddbb79db21d5bb40d7137", "version": 0 } @@ -2426,7 +2426,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -2512,7 +2512,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -2610,7 +2610,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -2696,7 +2696,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -2794,7 +2794,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -2880,7 +2880,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -2978,7 +2978,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -3064,7 +3064,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -3162,7 +3162,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -3251,7 +3251,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -3330,7 +3330,7 @@ items: ] }, "timezone": "", - "title": "K8s / USE Method / Node", + "title": "Kubernetes / USE Method / Node", "uid": "4ac4f123aae0ff6dbaf4f4f66120033b", "version": 0 } @@ -3370,7 +3370,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -3454,7 +3454,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -3538,7 +3538,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -3622,7 +3622,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -3706,7 +3706,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 4, + "id": 5, "legend": { "avg": false, "current": false, @@ -3790,7 +3790,7 @@ items: "datasource": "$datasource", "fill": 1, "format": "percentunit", - "id": 5, + "id": 6, "legend": { "avg": false, "current": false, @@ -3885,7 +3885,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 6, + "id": 7, "legend": { "avg": false, "current": false, @@ -3983,7 +3983,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 7, + "id": 8, "legend": { "avg": false, "current": false, @@ -4250,7 +4250,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 8, + "id": 9, "legend": { "avg": false, "current": false, @@ -4348,7 +4348,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 9, + "id": 10, "legend": { "avg": false, "current": false, @@ -4606,7 +4606,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -4658,7 +4658,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Cluster", + "title": "Kubernetes / Compute Resources / Cluster", "uid": "efa86fd1d0c121a26444b636a3f509a8", "version": 0 } @@ -4697,7 +4697,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -4795,7 +4795,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -5062,7 +5062,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -5160,7 +5160,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -5418,7 +5418,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -5497,7 +5497,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Namespace", + "title": "Kubernetes / Compute Resources / Namespace", "uid": "85a562078cdf77779eaa1add43ccec1e", "version": 0 } @@ -5536,7 +5536,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 0, + "id": 1, "legend": { "avg": false, "current": false, @@ -5634,7 +5634,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 1, + "id": 2, "legend": { "avg": false, "current": false, @@ -5901,7 +5901,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 10, - "id": 2, + "id": 3, "legend": { "avg": false, "current": false, @@ -5999,7 +5999,7 @@ items: "dashes": false, "datasource": "$datasource", "fill": 1, - "id": 3, + "id": 4, "legend": { "avg": false, "current": false, @@ -6257,7 +6257,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -6363,7 +6363,7 @@ items: ] }, "timezone": "", - "title": "K8s / Compute Resources / Pod", + "title": "Kubernetes / Compute Resources / Pod", "uid": "6581e46e4e5c7ba40a07646395ef7b23", "version": 0 } @@ -9456,7 +9456,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -9534,7 +9534,7 @@ items: ] }, "timezone": "", - "title": "Nodes", + "title": "Kubernetes / Nodes", "uid": "fa49a4706d07a042595b664c87fb33ea", "version": 0 } @@ -9779,7 +9779,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -9883,7 +9883,7 @@ items: ] }, "timezone": "", - "title": "Persistent Volumes", + "title": "Kubernetes / Persistent Volumes", "uid": "919b92a8e8041bd567af9edab12c840c", "version": 0 } @@ -10243,7 +10243,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -10373,7 +10373,7 @@ items: ] }, "timezone": "", - "title": "Pods", + "title": "Kubernetes / Pods", "uid": "ab4f13a9892a76a4d21ce8c2445bf4ea", "version": 0 } @@ -16874,7 +16874,7 @@ items: "schemaVersion": 14, "style": "dark", "tags": [ - + "kubernetes-mixin" ], "templating": { "list": [ @@ -16978,7 +16978,7 @@ items: ] }, "timezone": "", - "title": "StatefulSets", + "title": "Kubernetes / StatefulSets", "uid": "a31c1f46e6f727cb37c0d731a7245005", "version": 0 } diff --git a/manifests/prometheus-rules.yaml b/manifests/prometheus-rules.yaml index 443943c..19432b5 100644 --- a/manifests/prometheus-rules.yaml +++ b/manifests/prometheus-rules.yaml @@ -136,6 +136,13 @@ spec: * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:) record: node:node_cpu_utilisation:avg1m + - expr: | + node:node_cpu_utilisation:avg1m + * + node:node_num_cpu:sum + / + scalar(sum(node:node_num_cpu:sum)) + record: node:cluster_cpu_utilisation:ratio - expr: | sum(node_load1{job="node-exporter"}) / @@ -179,8 +186,13 @@ spec: - expr: | (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) / - scalar(sum(node:node_memory_bytes_total:sum)) + node:node_memory_bytes_total:sum record: node:node_memory_utilisation:ratio + - expr: | + (node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum) + / + scalar(sum(node:node_memory_bytes_total:sum)) + record: node:cluster_memory_utilisation:ratio - expr: | 1e3 * sum( (rate(node_vmstat_pgpgin{job="node-exporter"}[1m]) @@ -241,25 +253,25 @@ spec: max by (namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}) record: 'node:node_filesystem_avail:' - expr: | - sum(irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_utilisation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_bytes_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_bytes_total{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) record: node:node_net_utilisation:sum_irate - expr: | - sum(irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m])) + - sum(irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) + + sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) record: :node_net_saturation:sum_irate - expr: | sum by (node) ( - (irate(node_network_receive_drop_total{job="node-exporter",device="eth0"}[1m]) + - irate(node_network_transmit_drop_total{job="node-exporter",device="eth0"}[1m])) + (irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) + + irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m])) * on (namespace, pod) group_left(node) node_namespace_pod:kube_pod_info: ) @@ -691,11 +703,11 @@ spec: severity: warning - alert: KubeVersionMismatch annotations: - message: There are {{ $value }} different versions of Kubernetes components - running. + message: There are {{ $value }} different semantic versions of Kubernetes + components running. runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch expr: | - count(count(kubernetes_build_info{job!="kube-dns"}) by (gitVersion)) > 1 + count(count by (gitVersion) (label_replace(kubernetes_build_info{job!="kube-dns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1 for: 1h labels: severity: warning @@ -831,10 +843,14 @@ spec: for: 10m labels: severity: warning - - alert: DeadMansSwitch + - alert: Watchdog annotations: - message: This is a DeadMansSwitch meant to ensure that the entire alerting - pipeline is functional. + message: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. expr: vector(1) labels: severity: none diff --git a/manifests/prometheus-serviceMonitorApiserver.yaml b/manifests/prometheus-serviceMonitorApiserver.yaml index 6d884a2..5dea38e 100644 --- a/manifests/prometheus-serviceMonitorApiserver.yaml +++ b/manifests/prometheus-serviceMonitorApiserver.yaml @@ -14,6 +14,14 @@ spec: regex: etcd_(debugging|disk|request|server).* sourceLabels: - __name__ + - action: drop + regex: apiserver_admission_controller_admission_latencies_seconds_.* + sourceLabels: + - __name__ + - action: drop + regex: apiserver_admission_step_admission_latencies_seconds_.* + sourceLabels: + - __name__ port: https scheme: https tlsConfig: diff --git a/manifests/prometheus-serviceMonitorKubelet.yaml b/manifests/prometheus-serviceMonitorKubelet.yaml index 97d7f1a..590a5cd 100644 --- a/manifests/prometheus-serviceMonitorKubelet.yaml +++ b/manifests/prometheus-serviceMonitorKubelet.yaml @@ -17,6 +17,16 @@ spec: - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token honorLabels: true interval: 30s + metricRelabelings: + - action: drop + regex: container_([a-z_]+); + sourceLabels: + - __name__ + - image + - action: drop + regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s) + sourceLabels: + - __name__ path: /metrics/cadvisor port: https-metrics scheme: https diff --git a/operator_stack.jsonnet b/operator_stack.jsonnet index 8e596c8..4791816 100644 --- a/operator_stack.jsonnet +++ b/operator_stack.jsonnet @@ -122,37 +122,6 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + 'traefik-dashboard.json': (import 'grafana-dashboards/traefik-dashboard.json'), 'coredns-dashboard.json': (import 'grafana-dashboards/coredns-dashboard.json'), }, - kubeStateMetrics+:: { - // Override command for addon-resizer due to change from parameter --threshold to --acceptance-offset - deployment+: { - spec+: { - template+: { - spec+: { - containers: - std.map( - function(c) - if std.startsWith(c.name, 'addon-resizer') then - c { - command: [ - '/pod_nanny', - '--container=kube-state-metrics', - '--cpu=100m', - '--extra-cpu=2m', - '--memory=150Mi', - '--extra-memory=30Mi', - '--acceptance-offset=5', - '--deployment=kube-state-metrics', - ], - } - else - c, - super.containers, - ), - }, - }, - }, - }, - }, // Create ingress objects per application ingress+: { From fcf2c50783e04f69e1218c80848febb93009a8b7 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Fri, 15 Feb 2019 10:33:59 -0200 Subject: [PATCH 29/30] Update SMTP image version --- image_sources_versions.jsonnet | 2 +- manifests/smtp-server-deployment.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/image_sources_versions.jsonnet b/image_sources_versions.jsonnet index 9151408..6de5246 100644 --- a/image_sources_versions.jsonnet +++ b/image_sources_versions.jsonnet @@ -13,7 +13,7 @@ configmapReloader: "v0.2.2", prometheusConfigReloader: "v0.28.0", armExporter: 'latest', - smtpServer: 'latest', + smtpServer: 'v1.0.1', }, imageRepos+:: { diff --git a/manifests/smtp-server-deployment.yaml b/manifests/smtp-server-deployment.yaml index df63d8d..3839e25 100644 --- a/manifests/smtp-server-deployment.yaml +++ b/manifests/smtp-server-deployment.yaml @@ -31,7 +31,7 @@ spec: value: "True" - name: RELAY_DOMAINS value: :192.168.0.0/24:10.0.0.0/16 - image: carlosedp/docker-smtp:latest + image: carlosedp/docker-smtp:v1.0.1 name: smtp-server ports: - containerPort: 25 From 44dc9d46a6fd48639f3ca4b2e80b1e3d8023fc46 Mon Sep 17 00:00:00 2001 From: CarlosEDP Date: Fri, 22 Feb 2019 12:43:19 -0300 Subject: [PATCH 30/30] Updated readme --- Readme.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Readme.md b/Readme.md index 39ca7e0..afbec9f 100644 --- a/Readme.md +++ b/Readme.md @@ -8,6 +8,8 @@ This repository collects Kubernetes manifests, Grafana dashboards, and Prometheu The content of this project is written in jsonnet and is an extension of the fantastic [kube-prometheus](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus) project. +To continue using my previous stack with manifests and previous versions of the operator and components, use the legacy repo tag from: https://github.com/carlosedp/prometheus-operator-ARM/tree/legacy. + Components included in this package: * The Prometheus Operator