Mirror of https://github.com/carlosedp/cluster-monitoring.git (synced 2025-10-26 10:23:04 +01:00)

Commit 063e040dac: Initial import
							
								
								
									
Readme.md (new file, 23 lines)
									
								
@@ -0,0 +1,23 @@
# Prometheus Operator for ARM platform

The Prometheus Operator for Kubernetes provides easy monitoring definitions for Kubernetes services and deployment and management of Prometheus instances.

This project aims at porting the [official manifests](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) and images to the ARM platform. It has been tested on an ARM64 Kubernetes cluster deployed as described in [this article](https://medium.com/@carlosedp/building-an-arm-kubernetes-cluster-ef31032636f9).

## Changes to Kubeadm for Prometheus Operator

According to the official deployment documentation [here](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/docs/kube-prometheus-on-kubeadm.md), a couple of changes to the cluster are required.

We need to expose the cAdvisor that is installed and managed by the kubelet daemon and allow webhook token authentication. To do so, run the following on **all masters and nodes**:

    sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
    sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
    systemctl daemon-reload
    systemctl restart kubelet

If you already have a Kubernetes cluster deployed with kubeadm, also change the address that kube-controller-manager and kube-scheduler listen on (**on the master node**), in addition to the kubelet change above:

    sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml
    sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml
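As a quick sanity check (a sketch, not from the original docs: it assumes cAdvisor comes back on the kubelet's default port 4194 once the `cadvisor-port=0` flag is removed, and `NODE_IP` is a placeholder for one of your node addresses), the exposed endpoints can be probed with curl; ports 10252 and 10251 match the discovery services under `manifests/k8s`:

    NODE_IP=192.168.1.10   # placeholder: any master/node address in your cluster
    curl -s http://$NODE_IP:4194/metrics | head -n 3    # cAdvisor served by the kubelet (assumed default port)
    curl -s http://$NODE_IP:10252/metrics | head -n 3   # kube-controller-manager metrics (master only)
    curl -s http://$NODE_IP:10251/metrics | head -n 3   # kube-scheduler metrics (master only)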
							
								
								
									
deploy (new executable file, 42 lines)
									
								
@@ -0,0 +1,42 @@
#!/usr/bin/env bash

if [ -z "${KUBECONFIG}" ]; then
    export KUBECONFIG=~/.kube/config
fi

# CAUTION - setting NAMESPACE will deploy most components to the given namespace
# however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests.

if [ -z "${NAMESPACE}" ]; then
    NAMESPACE=monitoring
fi

kubectl create namespace "$NAMESPACE"

kctl() {
    kubectl --namespace "$NAMESPACE" "$@"
}

kubectl apply -f manifests/k8s

kctl apply -f manifests/prometheus-operator

# Wait for CRDs to be ready.
printf "Waiting for Operator to register custom resource definitions..."
until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
echo "done!"

kctl apply -f manifests/node-exporter
kctl apply -f manifests/kube-state-metrics
kctl apply -f manifests/grafana/grafana-credentials.yaml
kctl apply -f manifests/grafana
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl apply -f manifests/alertmanager/
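A minimal usage sketch (the kubeconfig path and namespace here are only examples; KUBECONFIG and NAMESPACE are the variables the script honors, with the caveat from its own comments about some components being hardcoded to 'monitoring'):

    export KUBECONFIG=$HOME/.kube/config   # same default the script falls back to
    NAMESPACE=monitoring ./deploy          # run from the repository root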
							
								
								
									
manifests/alertmanager/alertmanager-config.yaml (new file, 6 lines)
									
								
							| @ -0,0 +1,6 @@ | ||||
| apiVersion: v1 | ||||
| kind: Secret | ||||
| metadata: | ||||
|   name: alertmanager-main | ||||
| data: | ||||
|   alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg== | ||||
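The alertmanager.yaml value above is simply the base64-encoded Alertmanager configuration. A sketch for inspecting or replacing it (assumes GNU base64 and a plain-text config saved locally as alertmanager.yaml):

    # decode the configuration currently stored in the secret
    kubectl -n monitoring get secret alertmanager-main -o jsonpath='{.data.alertmanager\.yaml}' | base64 -d

    # re-create the secret from an edited file
    kubectl -n monitoring create secret generic alertmanager-main \
        --from-file=alertmanager.yaml --dry-run -o yaml | kubectl apply -f -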
							
								
								
									
manifests/alertmanager/alertmanager-ingress.yaml (new file, 15 lines)
									
								
							| @ -0,0 +1,15 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: alertmanager | ||||
|   namespace: monitoring | ||||
| spec: | ||||
|   rules: | ||||
|   - host: alertmanager.internal.carlosedp.com | ||||
|     http: | ||||
|       paths: | ||||
|       - path: / | ||||
|         backend: | ||||
|           serviceName: alertmanager-main | ||||
|           servicePort: web | ||||
| 
 | ||||
							
								
								
									
manifests/alertmanager/alertmanager-service.yaml (new file, 16 lines)
									
								
							| @ -0,0 +1,16 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     alertmanager: main | ||||
|   name: alertmanager-main | ||||
| spec: | ||||
|   type: NodePort | ||||
|   ports: | ||||
|   - name: web | ||||
|     nodePort: 30903 | ||||
|     port: 9093 | ||||
|     protocol: TCP | ||||
|     targetPort: web | ||||
|   selector: | ||||
|     alertmanager: main | ||||
							
								
								
									
manifests/alertmanager/alertmanager.yaml (new file, 11 lines)
									
								
							| @ -0,0 +1,11 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Alertmanager | ||||
| metadata: | ||||
|   name: main | ||||
|   labels: | ||||
|     alertmanager: main | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   baseImage: carlosedp/alertmanager | ||||
|   version: v0.14.0 | ||||
| 
 | ||||
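Once these manifests are applied, the Alertmanager UI should be reachable through the NodePort service above (30903) or the ingress host; a quick check (NODE_IP is a placeholder for any cluster node address):

    NODE_IP=192.168.1.10   # placeholder
    curl -s http://$NODE_IP:30903/api/v1/status   # Alertmanager API via the NodePort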
							
								
								
									
manifests/armexporter/daemonset.yaml (new file, 22 lines)
									
								
							| @ -0,0 +1,22 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: DaemonSet | ||||
| metadata: | ||||
|   name: arm-exporter | ||||
|   namespace: monitoring | ||||
|   labels: | ||||
|     k8s-app: arm-exporter | ||||
| spec: | ||||
|   template: | ||||
|     metadata: | ||||
|       name: arm-exporter | ||||
|       labels: | ||||
|         k8s-app: arm-exporter | ||||
|     spec: | ||||
|       hostNetwork: true | ||||
|       containers: | ||||
|       - image: carlosedp/arm_exporter | ||||
|         name: arm-exporter | ||||
|         ports: | ||||
|         - name: http | ||||
|           containerPort: 9243 | ||||
|           hostPort: 9243 | ||||
							
								
								
									
manifests/armexporter/node-exporter-service.yaml (new file, 17 lines)
									
								
							| @ -0,0 +1,17 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     app: node-exporter | ||||
|     k8s-app: node-exporter | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http | ||||
|     port: 9100 | ||||
|     protocol: TCP | ||||
|   selector: | ||||
|     app: node-exporter | ||||
| 
 | ||||
							
								
								
									
manifests/armexporter/service.yaml (new file, 14 lines)
									
								
							| @ -0,0 +1,14 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     k8s-app: arm-exporter | ||||
|   name: arm-exporter | ||||
| spec: | ||||
|   ports: | ||||
|   - name: http | ||||
|     port: 9243 | ||||
|     protocol: TCP | ||||
|   selector: | ||||
|     k8s-app: arm-exporter | ||||
| 
 | ||||
							
								
								
									
manifests/grafana/grafana-claim.yaml (new file, 12 lines)
									
								
							| @ -0,0 +1,12 @@ | ||||
| kind: PersistentVolumeClaim | ||||
| apiVersion: v1 | ||||
| metadata: | ||||
|   name: grafana-claim | ||||
|   namespace: monitoring | ||||
| spec: | ||||
|   accessModes: | ||||
|     - ReadWriteMany | ||||
|   resources: | ||||
|     requests: | ||||
|       storage: 5Gi | ||||
| 
 | ||||
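This claim needs a storage class or pre-provisioned PersistentVolume that supports ReadWriteMany; a sketch to confirm it gets bound after deployment (names taken from the manifest above):

    kubectl -n monitoring get pvc grafana-claim   # STATUS should be Bound before Grafana can start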
							
								
								
									
manifests/grafana/grafana-configmap.yaml (new file, 23 lines)
									
								
							| @ -0,0 +1,23 @@ | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: grafana-config | ||||
|   namespace: monitoring | ||||
| data: | ||||
|   config.ini: | | ||||
|     [database] | ||||
|     path = /data/grafana.db | ||||
| 
 | ||||
|     [paths] | ||||
|     data = /data | ||||
|     logs = /data/log | ||||
|     plugins = /data/plugins | ||||
| 
 | ||||
|     [session] | ||||
|     provider = memory | ||||
| 
 | ||||
|     [auth.basic] | ||||
|     enabled = false | ||||
| 
 | ||||
|     [auth.anonymous] | ||||
|     enabled = false | ||||
							
								
								
									
manifests/grafana/grafana-credentials.yaml (new file, 7 lines)
									
								
							| @ -0,0 +1,7 @@ | ||||
| apiVersion: v1 | ||||
| kind: Secret | ||||
| metadata: | ||||
|   name: grafana-credentials | ||||
| data: | ||||
|   user: YWRtaW4= | ||||
|   password: YWRtaW4= | ||||
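The values above are base64 for admin / admin. To use your own credentials, encode them the same way and substitute the user and password fields before deploying:

    echo -n 'myuser' | base64       # replace the user: field with this output
    echo -n 'mypassword' | base64   # replace the password: field with this output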
							
								
								
									
manifests/grafana/grafana-dashboard-definitions.yaml (new file, 7360 lines; diff suppressed because it is too large)
											
										
									
								
							
							
								
								
									
manifests/grafana/grafana-dashboards.yaml (new file, 12 lines)
									
								
							| @ -0,0 +1,12 @@ | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: grafana-dashboards | ||||
| data: | ||||
|   dashboards.yaml: |+ | ||||
|     - name: '0' | ||||
|       org_id: 1 | ||||
|       folder: '' | ||||
|       type: file | ||||
|       options: | ||||
|         folder: /grafana-dashboard-definitions/0 | ||||
							
								
								
									
manifests/grafana/grafana-datasources.yaml (new file, 15 lines)
									
								
							| @ -0,0 +1,15 @@ | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: grafana-datasources | ||||
| data: | ||||
|   prometheus.yaml: |+ | ||||
|     datasources: | ||||
|      - name: prometheus | ||||
|        type: prometheus | ||||
|        access: proxy | ||||
|        org_id: 1 | ||||
|        url: http://prometheus-k8s.monitoring.svc:9090 | ||||
|        version: 1 | ||||
|        editable: false | ||||
| 
 | ||||
							
								
								
									
manifests/grafana/grafana-deployment.yaml (new file, 60 lines)
									
								
							| @ -0,0 +1,60 @@ | ||||
| apiVersion: apps/v1beta1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: grafana | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: grafana | ||||
|     spec: | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       containers: | ||||
|       - name: grafana | ||||
|         image: carlosedp/monitoring-grafana:latest | ||||
|         volumeMounts: | ||||
|         - name: grafana-config | ||||
|           mountPath: /grafana/conf/config.ini | ||||
|           subPath: config.ini | ||||
|         - name: grafana-storage | ||||
|           mountPath: /data | ||||
|         - name: grafana-datasources | ||||
|           mountPath: /grafana/conf/provisioning/datasources | ||||
|         - name: grafana-dashboards | ||||
|           mountPath: /grafana/conf/provisioning/dashboards | ||||
|         - name: grafana-dashboard-definitions-0 | ||||
|           mountPath: /grafana-dashboard-definitions/0 | ||||
|         ports: | ||||
|         - name: web | ||||
|           containerPort: 3000 | ||||
|         env: | ||||
|         - name: GF_INSTALL_PLUGINS | ||||
|           value: "grafana-clock-panel,grafana-piechart-panel" | ||||
|         - name: GF_PATHS_PLUGINS | ||||
|           value: "/data/plugins" | ||||
|         resources: | ||||
|           requests: | ||||
|             memory: 100Mi | ||||
|             cpu: 100m | ||||
|           limits: | ||||
|             memory: 200Mi | ||||
|             cpu: 200m | ||||
|       volumes: | ||||
|       - name: grafana-config | ||||
|         configMap: | ||||
|           name: grafana-config | ||||
|       - name: grafana-storage | ||||
|         persistentVolumeClaim: | ||||
|           claimName: grafana-claim | ||||
|       - name: grafana-datasources | ||||
|         configMap: | ||||
|           name: grafana-datasources | ||||
|       - name: grafana-dashboards | ||||
|         configMap: | ||||
|           name: grafana-dashboards | ||||
|       - name: grafana-dashboard-definitions-0 | ||||
|         configMap: | ||||
|           name: grafana-dashboard-definitions-0 | ||||
							
								
								
									
manifests/grafana/grafana-ingress.yaml (new file, 15 lines)
									
								
							| @ -0,0 +1,15 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: grafana | ||||
|   namespace: monitoring | ||||
| spec: | ||||
|   rules: | ||||
|   - host: grafana.internal.carlosedp.com | ||||
|     http: | ||||
|       paths: | ||||
|       - path: / | ||||
|         backend: | ||||
|           serviceName: grafana | ||||
|           servicePort: 3000 | ||||
| 
 | ||||
							
								
								
									
manifests/grafana/grafana-service.yaml (new file, 15 lines)
									
								
							| @ -0,0 +1,15 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: grafana | ||||
|   labels: | ||||
|     app: grafana | ||||
| spec: | ||||
|   type: NodePort | ||||
|   ports: | ||||
|   - port: 3000 | ||||
|     protocol: TCP | ||||
|     nodePort: 30902 | ||||
|     targetPort: web | ||||
|   selector: | ||||
|     app: grafana | ||||
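With the NodePort service above, Grafana should answer on port 30902 of any node (or via the ingress host), using the credentials from grafana-credentials.yaml; for example:

    NODE_IP=192.168.1.10   # placeholder: any cluster node address
    curl -s http://$NODE_IP:30902/api/health   # Grafana health endpoint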
							
								
								
									
manifests/k8s/kube-controller-manager.yaml (new file, 17 lines)
									
								
							| @ -0,0 +1,17 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   namespace: kube-system | ||||
|   name: kube-controller-manager-prometheus-discovery | ||||
|   labels: | ||||
|     k8s-app: kube-controller-manager | ||||
| spec: | ||||
|   selector: | ||||
|     component: kube-controller-manager | ||||
|   type: ClusterIP | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http-metrics | ||||
|     port: 10252 | ||||
|     targetPort: 10252 | ||||
|     protocol: TCP | ||||
							
								
								
									
manifests/k8s/kube-scheduler.yaml (new file, 17 lines)
									
								
							| @ -0,0 +1,17 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   namespace: kube-system | ||||
|   name: kube-scheduler-prometheus-discovery | ||||
|   labels: | ||||
|     k8s-app: kube-scheduler | ||||
| spec: | ||||
|   selector: | ||||
|     component: kube-scheduler | ||||
|   type: ClusterIP | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http-metrics | ||||
|     port: 10251 | ||||
|     targetPort: 10251 | ||||
|     protocol: TCP | ||||
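These two headless services only give Prometheus reachable targets if kube-controller-manager and kube-scheduler listen on 0.0.0.0 (the kubeadm change from the Readme); a sketch to confirm endpoints are being discovered behind them:

    kubectl -n kube-system get endpoints kube-controller-manager-prometheus-discovery kube-scheduler-prometheus-discovery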
| @ -0,0 +1,12 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: kube-state-metrics | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: kube-state-metrics | ||||
|   namespace: monitoring | ||||
| @ -0,0 +1,45 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - nodes | ||||
|   - pods | ||||
|   - services | ||||
|   - resourcequotas | ||||
|   - replicationcontrollers | ||||
|   - limitranges | ||||
|   - persistentvolumeclaims | ||||
|   - persistentvolumes | ||||
|   - namespaces | ||||
|   - endpoints | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: ["extensions"] | ||||
|   resources: | ||||
|   - daemonsets | ||||
|   - deployments | ||||
|   - replicasets | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: ["apps"] | ||||
|   resources: | ||||
|   - statefulsets | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: ["batch"] | ||||
|   resources: | ||||
|   - cronjobs | ||||
|   - jobs | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: ["autoscaling"] | ||||
|   resources: | ||||
|   - horizontalpodautoscalers | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: ["authentication.k8s.io"] | ||||
|   resources: | ||||
|   - tokenreviews | ||||
|   verbs: ["create"] | ||||
| - apiGroups: ["authorization.k8s.io"] | ||||
|   resources: | ||||
|   - subjectaccessreviews | ||||
|   verbs: ["create"] | ||||
| @ -0,0 +1,55 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: kube-state-metrics | ||||
|     spec: | ||||
|       serviceAccountName: kube-state-metrics | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       containers: | ||||
|       - name: kube-state-metrics | ||||
|         image: carlosedp/kube-state-metrics-arm64:v1.2.0 | ||||
|         args: | ||||
|         - "--host=0.0.0.0" | ||||
|         - "--port=8443" | ||||
|         - "--telemetry-host=0.0.0.0" | ||||
|         - "--telemetry-port=9443" | ||||
|         ports: | ||||
|         - name: http-main | ||||
|           containerPort: 8443 | ||||
|         - name: http-self | ||||
|           containerPort: 9443 | ||||
|       - name: addon-resizer | ||||
|         image: gcr.io/google-containers/addon-resizer-arm64:2.1 | ||||
|         resources: | ||||
|           limits: | ||||
|             cpu: 100m | ||||
|             memory: 30Mi | ||||
|           requests: | ||||
|             cpu: 100m | ||||
|             memory: 30Mi | ||||
|         env: | ||||
|           - name: MY_POD_NAME | ||||
|             valueFrom: | ||||
|               fieldRef: | ||||
|                 fieldPath: metadata.name | ||||
|           - name: MY_POD_NAMESPACE | ||||
|             valueFrom: | ||||
|               fieldRef: | ||||
|                 fieldPath: metadata.namespace | ||||
|         command: | ||||
|           - /pod_nanny | ||||
|           - --container=kube-state-metrics | ||||
|           - --cpu=100m | ||||
|           - --extra-cpu=2m | ||||
|           - --memory=150Mi | ||||
|           - --extra-memory=30Mi | ||||
|           #- --threshold=5 | ||||
|           - --deployment=kube-state-metrics | ||||
| @ -0,0 +1,12 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: RoleBinding | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: Role | ||||
|   name: kube-state-metrics-resizer | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: kube-state-metrics | ||||
| 
 | ||||
							
								
								
									
manifests/kube-state-metrics/kube-state-metrics-role.yaml (new file, 15 lines)
									
								
							| @ -0,0 +1,15 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: Role | ||||
| metadata: | ||||
|   name: kube-state-metrics-resizer | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - pods | ||||
|   verbs: ["get"] | ||||
| - apiGroups: ["extensions"] | ||||
|   resources: | ||||
|   - deployments | ||||
|   resourceNames: ["kube-state-metrics"] | ||||
|   verbs: ["get", "update"] | ||||
| 
 | ||||
| @ -0,0 +1,4 @@ | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
							
								
								
									
manifests/kube-state-metrics/kube-state-metrics-service.yaml (new file, 21 lines)
									
								
							| @ -0,0 +1,21 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     app: kube-state-metrics | ||||
|     k8s-app: kube-state-metrics | ||||
|   name: kube-state-metrics | ||||
| spec: | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http-main | ||||
|     port: 8443 | ||||
|     targetPort: http-main | ||||
|     protocol: TCP | ||||
|   - name: http-self | ||||
|     port: 9443 | ||||
|     targetPort: http-self | ||||
|     protocol: TCP | ||||
|   selector: | ||||
|     app: kube-state-metrics | ||||
| 
 | ||||
| @ -0,0 +1,12 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: node-exporter | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: node-exporter | ||||
|   namespace: monitoring | ||||
							
								
								
									
manifests/node-exporter/node-exporter-cluster-role.yaml (new file, 13 lines)
									
								
							| @ -0,0 +1,13 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| rules: | ||||
| - apiGroups: ["authentication.k8s.io"] | ||||
|   resources: | ||||
|   - tokenreviews | ||||
|   verbs: ["create"] | ||||
| - apiGroups: ["authorization.k8s.io"] | ||||
|   resources: | ||||
|   - subjectaccessreviews | ||||
|   verbs: ["create"] | ||||
							
								
								
									
manifests/node-exporter/node-exporter-daemonset.yaml (new file, 57 lines)
									
								
							| @ -0,0 +1,57 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: DaemonSet | ||||
| metadata: | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   updateStrategy: | ||||
|     rollingUpdate: | ||||
|       maxUnavailable: 1 | ||||
|     type: RollingUpdate | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         app: node-exporter | ||||
|       name: node-exporter | ||||
|     spec: | ||||
|       serviceAccountName: node-exporter | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       hostNetwork: true | ||||
|       hostPID: true | ||||
|       containers: | ||||
|       - image: napnap75/rpi-prometheus:node_exporter | ||||
|         args: | ||||
|         #- "--web.listen-address=0.0.0.1:9100" | ||||
|         - "--path.procfs=/host/proc" | ||||
|         - "--path.sysfs=/host/sys" | ||||
|         name: node-exporter | ||||
|         ports: | ||||
|         - containerPort: 9100 | ||||
|           hostPort: 9100 | ||||
|           name: http | ||||
|         resources: | ||||
|           requests: | ||||
|             memory: 30Mi | ||||
|             cpu: 100m | ||||
|           limits: | ||||
|             memory: 50Mi | ||||
|             cpu: 200m | ||||
|         volumeMounts: | ||||
|         - name: proc | ||||
|           readOnly:  true | ||||
|           mountPath: /host/proc | ||||
|         - name: sys | ||||
|           readOnly: true | ||||
|           mountPath: /host/sys | ||||
|       tolerations: | ||||
|         - effect: NoSchedule | ||||
|           operator: Exists | ||||
|       volumes: | ||||
|       - name: proc | ||||
|         hostPath: | ||||
|           path: /proc | ||||
|       - name: sys | ||||
|         hostPath: | ||||
|           path: /sys | ||||
| 
 | ||||
| @ -0,0 +1,4 @@ | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: node-exporter | ||||
							
								
								
									
manifests/node-exporter/node-exporter-service.yaml (new file, 17 lines)
									
								
							| @ -0,0 +1,17 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     app: node-exporter | ||||
|     k8s-app: node-exporter | ||||
|   name: node-exporter | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   clusterIP: None | ||||
|   ports: | ||||
|   - name: http | ||||
|     port: 9100 | ||||
|     protocol: TCP | ||||
|   selector: | ||||
|     app: node-exporter | ||||
| 
 | ||||
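Because the node-exporter and arm-exporter daemonsets above use hostNetwork with hostPort, their metrics can be checked directly on any node; a sketch (it assumes both exporters serve the usual /metrics path, and NODE_IP is a placeholder):

    NODE_IP=192.168.1.10   # placeholder: any cluster node address
    curl -s http://$NODE_IP:9100/metrics | head -n 3   # node-exporter
    curl -s http://$NODE_IP:9243/metrics | head -n 3   # arm-exporter (port from manifests/armexporter)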
| @ -0,0 +1,12 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: prometheus-operator | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: prometheus-operator | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus-operator | ||||
|   namespace: monitoring | ||||
| @ -0,0 +1,54 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: prometheus-operator | ||||
| rules: | ||||
| - apiGroups: | ||||
|   - extensions | ||||
|   resources: | ||||
|   - thirdpartyresources | ||||
|   verbs: | ||||
|   - "*" | ||||
| - apiGroups: | ||||
|   - apiextensions.k8s.io | ||||
|   resources: | ||||
|   - customresourcedefinitions | ||||
|   verbs: | ||||
|   - "*" | ||||
| - apiGroups: | ||||
|   - monitoring.coreos.com | ||||
|   resources: | ||||
|   - alertmanagers | ||||
|   - prometheuses | ||||
|   - prometheuses/finalizers | ||||
|   - alertmanagers/finalizers | ||||
|   - servicemonitors | ||||
|   verbs: | ||||
|   - "*" | ||||
| - apiGroups: | ||||
|   - apps | ||||
|   resources: | ||||
|   - statefulsets | ||||
|   verbs: ["*"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - configmaps | ||||
|   - secrets | ||||
|   verbs: ["*"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - pods | ||||
|   verbs: ["list", "delete"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - services | ||||
|   - endpoints | ||||
|   verbs: ["get", "create", "update"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - nodes | ||||
|   verbs: ["list", "watch"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - namespaces | ||||
|   verbs: ["list"] | ||||
| @ -0,0 +1,4 @@ | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: prometheus-operator | ||||
| @ -0,0 +1,15 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   name: prometheus-operator | ||||
|   labels: | ||||
|     k8s-app: prometheus-operator | ||||
| spec: | ||||
|   type: ClusterIP | ||||
|   ports: | ||||
|   - name: http | ||||
|     port: 8080 | ||||
|     targetPort: http | ||||
|     protocol: TCP | ||||
|   selector: | ||||
|     k8s-app: prometheus-operator | ||||
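After deploy applies manifests/prometheus-operator, the operator pod should come up and register the custom resource definitions the script waits for; a quick check:

    kubectl -n monitoring get pods -l k8s-app=prometheus-operator
    kubectl get customresourcedefinitions | grep monitoring.coreos.com   # prometheuses, alertmanagers, servicemonitors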
							
								
								
									
manifests/prometheus-operator/prometheus-operator.yaml (new file, 34 lines)
									
								
							| @ -0,0 +1,34 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: Deployment | ||||
| metadata: | ||||
|   labels: | ||||
|     k8s-app: prometheus-operator | ||||
|   name: prometheus-operator | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   template: | ||||
|     metadata: | ||||
|       labels: | ||||
|         k8s-app: prometheus-operator | ||||
|     spec: | ||||
|       containers: | ||||
|       - args: | ||||
|         - --kubelet-service=kube-system/kubelet | ||||
|         - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.0.2 | ||||
|         - --config-reloader-image=carlosedp/configmap-reload:latest | ||||
|         image: carlosedp/prometheus-operator:v0.17.0 | ||||
|         name: prometheus-operator | ||||
|         ports: | ||||
|         - containerPort: 8080 | ||||
|           name: http | ||||
|         resources: | ||||
|           limits: | ||||
|             cpu: 200m | ||||
|             memory: 100Mi | ||||
|           requests: | ||||
|             cpu: 100m | ||||
|             memory: 50Mi | ||||
|       securityContext: | ||||
|         runAsNonRoot: true | ||||
|         runAsUser: 65534 | ||||
|       serviceAccountName: prometheus-operator | ||||
							
								
								
									
manifests/prometheus/prometheus-k8s-ingress.yaml (new file, 14 lines)
									
								
							| @ -0,0 +1,14 @@ | ||||
| apiVersion: extensions/v1beta1 | ||||
| kind: Ingress | ||||
| metadata: | ||||
|   name: prometheus | ||||
|   namespace: monitoring | ||||
| spec: | ||||
|   rules: | ||||
|   - host: prometheus.internal.carlosedp.com | ||||
|     http: | ||||
|       paths: | ||||
|       - path: / | ||||
|         backend: | ||||
|           serviceName: prometheus-k8s | ||||
|           servicePort: web | ||||
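Besides the ingress host above, Prometheus can be reached through the prometheus-k8s service referenced in the backend; a sketch, assuming your kubectl version supports port-forwarding to services:

    kubectl -n monitoring port-forward svc/prometheus-k8s 9090:9090 &
    curl -s http://localhost:9090/-/healthy   # Prometheus health endpoint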
							
								
								
									
manifests/prometheus/prometheus-k8s-role-bindings.yaml (new file, 54 lines)
									
								
							| @ -0,0 +1,54 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: RoleBinding | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: Role | ||||
|   name: prometheus-k8s | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: RoleBinding | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: kube-system | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: Role | ||||
|   name: prometheus-k8s | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: RoleBinding | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: default | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: Role | ||||
|   name: prometheus-k8s | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRoleBinding | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
| roleRef: | ||||
|   apiGroup: rbac.authorization.k8s.io | ||||
|   kind: ClusterRole | ||||
|   name: prometheus-k8s | ||||
| subjects: | ||||
| - kind: ServiceAccount | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
							
								
								
									
manifests/prometheus/prometheus-k8s-roles.yaml (new file, 55 lines)
									
								
							| @ -0,0 +1,55 @@ | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: Role | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: monitoring | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - nodes | ||||
|   - services | ||||
|   - endpoints | ||||
|   - pods | ||||
|   verbs: ["get", "list", "watch"] | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - configmaps | ||||
|   verbs: ["get"] | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: Role | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: kube-system | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - services | ||||
|   - endpoints | ||||
|   - pods | ||||
|   verbs: ["get", "list", "watch"] | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: Role | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
|   namespace: default | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - services | ||||
|   - endpoints | ||||
|   - pods | ||||
|   verbs: ["get", "list", "watch"] | ||||
| --- | ||||
| apiVersion: rbac.authorization.k8s.io/v1beta1 | ||||
| kind: ClusterRole | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
| rules: | ||||
| - apiGroups: [""] | ||||
|   resources: | ||||
|   - nodes/metrics | ||||
|   verbs: ["get"] | ||||
| - nonResourceURLs: ["/metrics"] | ||||
|   verbs: ["get"] | ||||
							
								
								
									
manifests/prometheus/prometheus-k8s-rules.yaml (new file, 602 lines)
									
								
							| @ -0,0 +1,602 @@ | ||||
| apiVersion: v1 | ||||
| kind: ConfigMap | ||||
| metadata: | ||||
|   name: prometheus-k8s-rules | ||||
|   labels: | ||||
|     role: prometheus-rulefiles | ||||
|     prometheus: k8s | ||||
| data: | ||||
|   alertmanager.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: alertmanager.rules | ||||
|       rules: | ||||
|       - alert: AlertmanagerConfigInconsistent | ||||
|         expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service) | ||||
|           GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service", | ||||
|           "alertmanager-$1", "alertmanager", "(.*)") != 1 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: The configuration of the instances of the Alertmanager cluster | ||||
|             `{{$labels.service}}` are out of sync. | ||||
|       - alert: AlertmanagerDownOrMissing | ||||
|         expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1", | ||||
|           "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: An unexpected number of Alertmanagers are scraped or Alertmanagers | ||||
|             disappeared from discovery. | ||||
|       - alert: AlertmanagerFailedReload | ||||
|         expr: alertmanager_config_last_reload_successful == 0 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace | ||||
|             }}/{{ $labels.pod}}. | ||||
|   etcd3.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: ./etcd3.rules | ||||
|       rules: | ||||
|       - alert: InsufficientMembers | ||||
|         expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1) | ||||
|         for: 3m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: If one more etcd member goes down the cluster will be unavailable | ||||
|           summary: etcd cluster insufficient members | ||||
|       - alert: NoLeader | ||||
|         expr: etcd_server_has_leader{job="etcd"} == 0 | ||||
|         for: 1m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: etcd member {{ $labels.instance }} has no leader | ||||
|           summary: etcd member has no leader | ||||
|       - alert: HighNumberOfLeaderChanges | ||||
|         expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3 | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader | ||||
|             changes within the last hour | ||||
|           summary: a high number of leader changes within the etcd cluster are happening | ||||
|       - alert: HighNumberOfFailedGRPCRequests | ||||
|         expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) | ||||
|           / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed | ||||
|             on etcd instance {{ $labels.instance }}' | ||||
|           summary: a high number of gRPC requests are failing | ||||
|       - alert: HighNumberOfFailedGRPCRequests | ||||
|         expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method) | ||||
|           / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed | ||||
|             on etcd instance {{ $labels.instance }}' | ||||
|           summary: a high number of gRPC requests are failing | ||||
|       - alert: GRPCRequestsSlow | ||||
|         expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) | ||||
|           > 0.15 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method | ||||
|             }} are slow | ||||
|           summary: slow gRPC requests | ||||
|       - alert: HighNumberOfFailedHTTPRequests | ||||
|         expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) | ||||
|           BY (method) > 0.01 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd | ||||
|             instance {{ $labels.instance }}' | ||||
|           summary: a high number of HTTP requests are failing | ||||
|       - alert: HighNumberOfFailedHTTPRequests | ||||
|         expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m])) | ||||
|           BY (method) > 0.05 | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd | ||||
|             instance {{ $labels.instance }}' | ||||
|           summary: a high number of HTTP requests are failing | ||||
|       - alert: HTTPRequestsSlow | ||||
|         expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) | ||||
|           > 0.15 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method | ||||
|             }} are slow | ||||
|           summary: slow HTTP requests | ||||
|       - alert: EtcdMemberCommunicationSlow | ||||
|         expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) | ||||
|           > 0.15 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: etcd instance {{ $labels.instance }} member communication with | ||||
|             {{ $labels.To }} is slow | ||||
|           summary: etcd member communication is slow | ||||
|       - alert: HighNumberOfFailedProposals | ||||
|         expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5 | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal | ||||
|             failures within the last hour | ||||
|           summary: a high number of proposals within the etcd cluster are failing | ||||
|       - alert: HighFsyncDurations | ||||
|         expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) | ||||
|           > 0.5 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: etcd instance {{ $labels.instance }} fsync durations are high | ||||
|           summary: high fsync durations | ||||
|       - alert: HighCommitDurations | ||||
|         expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m])) | ||||
|           > 0.25 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: etcd instance {{ $labels.instance }} commit durations are high | ||||
|           summary: high commit durations | ||||
|   general.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: general.rules | ||||
|       rules: | ||||
|       - alert: TargetDown | ||||
|         expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of {{ $labels.job }} targets are down.' | ||||
|           summary: Targets are down | ||||
|       - alert: DeadMansSwitch | ||||
|         expr: vector(1) | ||||
|         labels: | ||||
|           severity: none | ||||
|         annotations: | ||||
|           description: This is a DeadMansSwitch meant to ensure that the entire Alerting | ||||
|             pipeline is functional. | ||||
|           summary: Alerting DeadMansSwitch | ||||
|       - record: fd_utilization | ||||
|         expr: process_open_fds / process_max_fds | ||||
|       - alert: FdExhaustionClose | ||||
|         expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||
|             will exhaust in file/socket descriptors within the next 4 hours' | ||||
|           summary: file descriptors soon exhausted | ||||
|       - alert: FdExhaustionClose | ||||
|         expr: predict_linear(fd_utilization[10m], 3600) > 1 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance | ||||
|             will exhaust in file/socket descriptors within the next hour' | ||||
|           summary: file descriptors soon exhausted | ||||
|   kube-controller-manager.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: kube-controller-manager.rules | ||||
|       rules: | ||||
|       - alert: K8SControllerManagerDown | ||||
|         expr: absent(up{job="kube-controller-manager"} == 1) | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: There is no running K8S controller manager. Deployments and replication | ||||
|             controllers are not making progress. | ||||
|           runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager | ||||
|           summary: Controller manager is down | ||||
|   kube-scheduler.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: kube-scheduler.rules | ||||
|       rules: | ||||
|       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.99" | ||||
|       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.9" | ||||
|       - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.5" | ||||
|       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.99" | ||||
|       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.9" | ||||
|       - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.5" | ||||
|       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.99" | ||||
|       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.9" | ||||
|       - record: cluster:scheduler_binding_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket) | ||||
|           BY (le, cluster)) / 1e+06 | ||||
|         labels: | ||||
|           quantile: "0.5" | ||||
|       - alert: K8SSchedulerDown | ||||
|         expr: absent(up{job="kube-scheduler"} == 1) | ||||
|         for: 5m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: There is no running K8S scheduler. New pods are not being assigned | ||||
|             to nodes. | ||||
|           runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler | ||||
|           summary: Scheduler is down | ||||
|   kube-state-metrics.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: kube-state-metrics.rules | ||||
|       rules: | ||||
|       - alert: DeploymentGenerationMismatch | ||||
|         expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation | ||||
|         for: 15m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Observed deployment generation does not match expected one for | ||||
|             deployment {{$labels.namespace}}/{{$labels.deployment}} | ||||
|           summary: Deployment is outdated | ||||
|       - alert: DeploymentReplicasNotUpdated | ||||
|         expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas) | ||||
|           or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas)) | ||||
|           unless (kube_deployment_spec_paused == 1) | ||||
|         for: 15m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Replicas are not updated and available for deployment {{$labels.namespace}}/{{$labels.deployment}} | ||||
|           summary: Deployment replicas are outdated | ||||
|       - alert: DaemonSetRolloutStuck | ||||
|         expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled | ||||
|           * 100 < 100 | ||||
|         for: 15m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Only {{$value}}% of desired pods scheduled and ready for daemon | ||||
|             set {{$labels.namespace}}/{{$labels.daemonset}} | ||||
|           summary: DaemonSet is missing pods | ||||
|       - alert: K8SDaemonSetsNotScheduled | ||||
|         expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled | ||||
|           > 0 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: A number of daemonsets are not scheduled. | ||||
|           summary: Daemonsets are not scheduled correctly | ||||
|       - alert: DaemonSetsMissScheduled | ||||
|         expr: kube_daemonset_status_number_misscheduled > 0 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: A number of daemonsets are running where they are not supposed | ||||
|             to run. | ||||
|           summary: Daemonsets are not scheduled correctly | ||||
|       - alert: PodFrequentlyRestarting | ||||
|         expr: increase(kube_pod_container_status_restarts_total[1h]) > 5 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Pod {{$labels.namespace}}/{{$labels.pod}} was restarted {{$value}} | ||||
|             times within the last hour | ||||
|           summary: Pod is restarting frequently | ||||
|   kubelet.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: kubelet.rules | ||||
|       rules: | ||||
|       - alert: K8SNodeNotReady | ||||
|         expr: kube_node_status_condition{condition="Ready",status="true"} == 0 | ||||
|         for: 1h | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: The Kubelet on {{ $labels.node }} has not checked in with the API, | ||||
|             or has set itself to NotReady, for more than an hour | ||||
|           summary: Node status is NotReady | ||||
|       - alert: K8SManyNodesNotReady | ||||
|         expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0) | ||||
|           > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} == | ||||
|           0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2 | ||||
|         for: 1m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: '{{ $value }}% of Kubernetes nodes are not ready' | ||||
|       - alert: K8SKubeletDown | ||||
|         expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3 | ||||
|         for: 1h | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Prometheus failed to scrape {{ $value }}% of kubelets. | ||||
|       - alert: K8SKubeletDown | ||||
|         expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"})) | ||||
|           * 100 > 1 | ||||
|         for: 1h | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets | ||||
|             have disappeared from service discovery. | ||||
|           summary: Many Kubelets cannot be scraped | ||||
|       - alert: K8SKubeletTooManyPods | ||||
|         expr: kubelet_running_pod_count > 100 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Kubelet {{$labels.instance}} is running {{$value}} pods, close | ||||
|             to the limit of 110 | ||||
|           summary: Kubelet is close to pod limit | ||||
|   kubernetes.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: kubernetes.rules | ||||
|       rules: | ||||
|       - record: pod_name:container_memory_usage_bytes:sum | ||||
|         expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||
|           (pod_name) | ||||
|       - record: pod_name:container_spec_cpu_shares:sum | ||||
|         expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||
|       - record: pod_name:container_cpu_usage:sum | ||||
|         expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) | ||||
|           BY (pod_name) | ||||
|       - record: pod_name:container_fs_usage_bytes:sum | ||||
|         expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name) | ||||
|       - record: namespace:container_memory_usage_bytes:sum | ||||
|         expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace) | ||||
|       - record: namespace:container_spec_cpu_shares:sum | ||||
|         expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace) | ||||
|       - record: namespace:container_cpu_usage:sum | ||||
|         expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m])) | ||||
|           BY (namespace) | ||||
|       - record: cluster:memory_usage:ratio | ||||
|         expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY | ||||
|           (cluster) / sum(machine_memory_bytes) BY (cluster) | ||||
|       - record: cluster:container_spec_cpu_shares:ratio | ||||
|         expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000 | ||||
|           / sum(machine_cpu_cores) | ||||
|       - record: cluster:container_cpu_usage:ratio | ||||
|         expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m])) | ||||
|           / sum(machine_cpu_cores) | ||||
|       - record: apiserver_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) / | ||||
|           1e+06 | ||||
|         labels: | ||||
|           quantile: "0.99" | ||||
|       - record: apiserver_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) / | ||||
|           1e+06 | ||||
|         labels: | ||||
|           quantile: "0.9" | ||||
|       - record: apiserver_latency_seconds:quantile | ||||
|         expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) / | ||||
|           1e+06 | ||||
|         labels: | ||||
|           quantile: "0.5" | ||||
|       - alert: APIServerLatencyHigh | ||||
|         expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||
|           > 1 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||
|             for {{$labels.verb}} {{$labels.resource}} | ||||
|       - alert: APIServerLatencyHigh | ||||
|         expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"} | ||||
|           > 4 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: the API server has a 99th percentile latency of {{ $value }} seconds | ||||
|             for {{$labels.verb}} {{$labels.resource}} | ||||
|       - alert: APIServerErrorsHigh | ||||
|         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||
|           * 100 > 2 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: API server returns errors for {{ $value }}% of requests | ||||
|       - alert: APIServerErrorsHigh | ||||
|         expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m]) | ||||
|           * 100 > 5 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: API server returns errors for {{ $value }}% of requests | ||||
|       - alert: K8SApiserverDown | ||||
|         expr: absent(up{job="apiserver"} == 1) | ||||
|         for: 20m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: No API servers are reachable or all have disappeared from service | ||||
|             discovery | ||||
|      | ||||
|       - alert: K8sCertificateExpirationNotice | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Kubernetes API Certificate is expiring soon (less than 7 days) | ||||
|         expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0 | ||||
|      | ||||
|       - alert: K8sCertificateExpirationNotice | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: Kubernetes API Certificate is expiring in less than 1 day | ||||
|         expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0 | ||||
|   node.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: node.rules | ||||
|       rules: | ||||
|       - record: instance:node_cpu:rate:sum | ||||
|         expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m])) | ||||
|           BY (instance) | ||||
|       - record: instance:node_filesystem_usage:sum | ||||
|         expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"})) | ||||
|           BY (instance) | ||||
|       - record: instance:node_network_receive_bytes:rate:sum | ||||
|         expr: sum(rate(node_network_receive_bytes[3m])) BY (instance) | ||||
|       - record: instance:node_network_transmit_bytes:rate:sum | ||||
|         expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance) | ||||
|       - record: instance:node_cpu:ratio | ||||
|         expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance) | ||||
|           GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance) | ||||
|       - record: cluster:node_cpu:sum_rate5m | ||||
|         expr: sum(rate(node_cpu{mode!="idle"}[5m])) | ||||
|       - record: cluster:node_cpu:ratio | ||||
|         expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu)) | ||||
|       - alert: NodeExporterDown | ||||
|         expr: absent(up{job="node-exporter"} == 1) | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Prometheus could not scrape a node-exporter for more than 10m, | ||||
|             or node-exporters have disappeared from discovery | ||||
|       - alert: NodeDiskRunningFull | ||||
|         expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0 | ||||
|         for: 30m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||
|             full within the next 24 hours (mounted at {{$labels.mountpoint}}) | ||||
|       - alert: NodeDiskRunningFull | ||||
|         expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: device {{$labels.device}} on node {{$labels.instance}} is running | ||||
|             full within the next 2 hours (mounted at {{$labels.mountpoint}}) | ||||
|   prometheus.rules.yaml: |+ | ||||
|     groups: | ||||
|     - name: prometheus.rules | ||||
|       rules: | ||||
|       - alert: PrometheusConfigReloadFailed | ||||
|         expr: prometheus_config_last_reload_successful == 0 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}} | ||||
|       - alert: PrometheusNotificationQueueRunningFull | ||||
|         expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{ | ||||
|             $labels.pod}} | ||||
|       - alert: PrometheusErrorSendingAlerts | ||||
|         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||
|           > 0.01 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||
|             $labels.pod}} to Alertmanager {{$labels.alertmanager}} | ||||
|       - alert: PrometheusErrorSendingAlerts | ||||
|         expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m]) | ||||
|           > 0.03 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: critical | ||||
|         annotations: | ||||
|           description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{ | ||||
|             $labels.pod}} to Alertmanager {{$labels.alertmanager}} | ||||
|       - alert: PrometheusNotConnectedToAlertmanagers | ||||
|         expr: prometheus_notifications_alertmanagers_discovered < 1 | ||||
|         for: 10m | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected | ||||
|             to any Alertmanagers | ||||
|       - alert: PrometheusTSDBReloadsFailing | ||||
|         expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0 | ||||
|         for: 12h | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} | ||||
|             reload failures over the last two hours.' | ||||
|           summary: Prometheus has issues reloading data blocks from disk | ||||
|       - alert: PrometheusTSDBCompactionsFailing | ||||
|         expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0 | ||||
|         for: 12h | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}} | ||||
|             compaction failures over the last two hours.' | ||||
|           summary: Prometheus has issues compacting sample blocks | ||||
|       - alert: PrometheusTSDBWALCorruptions | ||||
|         expr: tsdb_wal_corruptions_total > 0 | ||||
|         for: 4h | ||||
|         labels: | ||||
|           severity: warning | ||||
|         annotations: | ||||
|           description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead | ||||
|             log (WAL).' | ||||
|           summary: Prometheus write-ahead log is corrupted | ||||
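The recording rules above can be spot-checked once Prometheus has picked up the ConfigMap. A minimal sketch, assuming the operator created a pod named prometheus-k8s-0 in the monitoring namespace (the pod name is an assumption and may differ in your cluster):

    # Forward the Prometheus web port and query one of the recording rules
    # defined in the ConfigMap above.
    kubectl --namespace monitoring port-forward prometheus-k8s-0 9090:9090 &
    sleep 2
    curl -s 'http://localhost:9090/api/v1/query?query=instance:node_cpu:rate:sum'
    # Stop the port-forward started in the background.
    kill %1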
4   manifests/prometheus/prometheus-k8s-service-account.yaml   Normal file
							| @ -0,0 +1,4 @@ | ||||
| apiVersion: v1 | ||||
| kind: ServiceAccount | ||||
| metadata: | ||||
|   name: prometheus-k8s | ||||
| @ -0,0 +1,16 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: alertmanager | ||||
|   labels: | ||||
|     k8s-app: alertmanager | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       alertmanager: main | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - monitoring | ||||
|   endpoints: | ||||
|   - port: web | ||||
|     interval: 30s | ||||
| @ -0,0 +1,23 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kube-apiserver | ||||
|   labels: | ||||
|     k8s-app: apiserver | ||||
| spec: | ||||
|   jobLabel: component | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       component: apiserver | ||||
|       provider: kubernetes | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - default | ||||
|   endpoints: | ||||
|   - port: https | ||||
|     interval: 30s | ||||
|     scheme: https | ||||
|     tlsConfig: | ||||
|       caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt | ||||
|       serverName: kubernetes | ||||
|     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
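This ServiceMonitor selects the built-in kubernetes service in the default namespace by its component/provider labels and scrapes it over HTTPS using the in-cluster CA and service account token. A quick sanity check that the expected labels are present:

    # The built-in API server service should carry component=apiserver and
    # provider=kubernetes, which the selector above matches on.
    kubectl --namespace default get service kubernetes --show-labels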
| @ -0,0 +1,18 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: arm-exporter | ||||
|   namespace: monitoring | ||||
|   labels: | ||||
|     k8s-app: arm-exporter | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: arm-exporter | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - monitoring | ||||
|   endpoints: | ||||
|   - port: http | ||||
|     interval: 30s | ||||
| @ -0,0 +1,17 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kube-controller-manager | ||||
|   labels: | ||||
|     k8s-app: kube-controller-manager | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   endpoints: | ||||
|   - port: http-metrics | ||||
|     interval: 30s | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: kube-controller-manager | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
| @ -0,0 +1,17 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kube-scheduler | ||||
|   labels: | ||||
|     k8s-app: kube-scheduler | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   endpoints: | ||||
|   - port: http-metrics | ||||
|     interval: 30s | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: kube-scheduler | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
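The kube-controller-manager and kube-scheduler ServiceMonitors select services in kube-system by their k8s-app labels; kubeadm does not create those services itself, and they are assumed here to come from the manifests/k8s directory applied by the deploy script. A hedged check:

    # Verify that the selected kube-system services exist and expose the
    # http-metrics port referenced by the two ServiceMonitors above.
    kubectl --namespace kube-system get services \
        -l 'k8s-app in (kube-controller-manager, kube-scheduler)'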
| @ -0,0 +1,28 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kube-state-metrics | ||||
|   labels: | ||||
|     k8s-app: kube-state-metrics | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: kube-state-metrics | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - monitoring | ||||
|   endpoints: | ||||
|   - port: http-main | ||||
|     #scheme: https | ||||
|     interval: 30s | ||||
|     honorLabels: true | ||||
|     #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     #tlsConfig: | ||||
|     #  insecureSkipVerify: true | ||||
|   - port: http-self | ||||
|   #  scheme: https | ||||
|     interval: 30s | ||||
|   #  bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|   #  tlsConfig: | ||||
|   #    insecureSkipVerify: true | ||||
| @ -0,0 +1,29 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: kubelet | ||||
|   labels: | ||||
|     k8s-app: kubelet | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   endpoints: | ||||
|   - port: https-metrics | ||||
|     scheme: https | ||||
|     interval: 30s | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|   - port: https-metrics | ||||
|     scheme: https | ||||
|     path: /metrics/cadvisor | ||||
|     interval: 30s | ||||
|     honorLabels: true | ||||
|     tlsConfig: | ||||
|       insecureSkipVerify: true | ||||
|     bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: kubelet | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
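The two endpoints of the kubelet ServiceMonitor (plain kubelet metrics and /metrics/cadvisor) rely on the webhook token authentication change described in the Readme. One way to confirm both paths respond is to go through the API server proxy; a sketch, where the first node is picked arbitrarily and the cadvisor path assumes Kubernetes 1.7 or newer:

    # Fetch kubelet and cadvisor metrics for one node via the API server proxy.
    NODE=$(kubectl get nodes -o jsonpath='{.items[0].metadata.name}')
    kubectl get --raw "/api/v1/nodes/${NODE}/proxy/metrics" | head
    kubectl get --raw "/api/v1/nodes/${NODE}/proxy/metrics/cadvisor" | head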
| @ -0,0 +1,21 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: node-exporter | ||||
|   labels: | ||||
|     k8s-app: node-exporter | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: node-exporter | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - monitoring | ||||
|   endpoints: | ||||
|   - port: http | ||||
|     #scheme: http | ||||
|     interval: 30s | ||||
|     #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token | ||||
|     #tlsConfig: | ||||
|     #  insecureSkipVerify: true | ||||
| @ -0,0 +1,12 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: prometheus-operator | ||||
|   labels: | ||||
|     k8s-app: prometheus-operator | ||||
| spec: | ||||
|   endpoints: | ||||
|   - port: http | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: prometheus-operator | ||||
| @ -0,0 +1,16 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: prometheus | ||||
|   labels: | ||||
|     k8s-app: prometheus | ||||
| spec: | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       prometheus: k8s | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - monitoring | ||||
|   endpoints: | ||||
|   - port: web | ||||
|     interval: 30s | ||||
| @ -0,0 +1,17 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: ServiceMonitor | ||||
| metadata: | ||||
|   name: traefik-ingress-lb | ||||
|   labels: | ||||
|     k8s-app: traefik-ingress-lb | ||||
| spec: | ||||
|   jobLabel: k8s-app | ||||
|   endpoints: | ||||
|   - port: admin | ||||
|     interval: 30s | ||||
|   selector: | ||||
|     matchLabels: | ||||
|       k8s-app: traefik-ingress-lb | ||||
|   namespaceSelector: | ||||
|     matchNames: | ||||
|     - kube-system | ||||
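Each ServiceMonitor above should translate into a scrape configuration and, once the matching services exist, into targets in Prometheus. A minimal check, assuming everything was deployed to the monitoring namespace:

    # List the ServiceMonitor objects the operator will act on.
    kubectl --namespace monitoring get servicemonitors
    # The resulting targets show up under Status > Targets in the Prometheus
    # web UI, reachable through the NodePort service defined below.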
16   manifests/prometheus/prometheus-k8s-service.yaml   Normal file
							| @ -0,0 +1,16 @@ | ||||
| apiVersion: v1 | ||||
| kind: Service | ||||
| metadata: | ||||
|   labels: | ||||
|     prometheus: k8s | ||||
|   name: prometheus-k8s | ||||
| spec: | ||||
|   type: NodePort | ||||
|   ports: | ||||
|   - name: web | ||||
|     nodePort: 30900 | ||||
|     port: 9090 | ||||
|     protocol: TCP | ||||
|     targetPort: web | ||||
|   selector: | ||||
|     prometheus: k8s | ||||
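Because the service is of type NodePort, the Prometheus UI is reachable on port 30900 of any node, in addition to the in-cluster service. A sketch, with <node-ip> standing in for the address of one of your nodes:

    # Open the Prometheus web UI exposed by the NodePort service above.
    curl -s http://<node-ip>:30900/graph | head
    # or simply point a browser at http://<node-ip>:30900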
39   manifests/prometheus/prometheus-k8s.yaml   Normal file
							| @ -0,0 +1,39 @@ | ||||
| apiVersion: monitoring.coreos.com/v1 | ||||
| kind: Prometheus | ||||
| metadata: | ||||
|   name: k8s | ||||
|   labels: | ||||
|     prometheus: k8s | ||||
| spec: | ||||
|   replicas: 1 | ||||
|   baseImage: carlosedp/prometheus | ||||
|   version: v2.1.0 | ||||
|   retention: "168h" | ||||
|   serviceAccountName: prometheus-k8s | ||||
|   serviceMonitorSelector: | ||||
|     matchExpressions: | ||||
|     - {key: k8s-app, operator: Exists} | ||||
|   ruleSelector: | ||||
|     matchLabels: | ||||
|       role: prometheus-rulefiles | ||||
|       prometheus: k8s | ||||
|   resources: | ||||
|     requests: | ||||
|       # 2Gi is default, but won't schedule if you don't have a node with >2Gi | ||||
|       # memory. Modify based on your target and time-series count for | ||||
|       # production use. This value is mainly meant for demonstration/testing | ||||
|       # purposes. | ||||
|       memory: 400Mi | ||||
|   alerting: | ||||
|     alertmanagers: | ||||
|     - namespace: monitoring | ||||
|       name: alertmanager-main | ||||
|       port: web | ||||
|   storage: | ||||
|     volumeClaimTemplate: | ||||
|       spec: | ||||
|         accessModes: | ||||
|         - ReadWriteOnce | ||||
|         resources: | ||||
|           requests: | ||||
|             storage: 50Gi | ||||
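The 400Mi memory request and the 50Gi volume claim in this manifest are starting points rather than recommendations; on a small ARM cluster both may need to be reduced before applying. A sketch of one way to do that (the values are examples, adjust to your nodes and storage class):

    # Shrink the persistent volume request, then re-apply the manifest.
    sed -i 's/storage: 50Gi/storage: 20Gi/' manifests/prometheus/prometheus-k8s.yaml
    kubectl --namespace monitoring apply -f manifests/prometheus/prometheus-k8s.yaml
    # Inspect what the operator created from it.
    kubectl --namespace monitoring get prometheuses,statefulsets,pvc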
30   teardown   Executable file
							| @ -0,0 +1,30 @@ | ||||
| #!/usr/bin/env bash | ||||
| 
 | ||||
| if [ -z "${KUBECONFIG}" ]; then | ||||
|     export KUBECONFIG=~/.kube/config | ||||
| fi | ||||
| 
 | ||||
| # CAUTION - NAMESPACE must match the value used when the deploy script was run. | ||||
| # Some resources are always deployed to the monitoring namespace (usage example follows the script). | ||||
| 
 | ||||
| if [ -z "${NAMESPACE}" ]; then | ||||
|     NAMESPACE=monitoring | ||||
| fi | ||||
| 
 | ||||
| kctl() { | ||||
|     kubectl --namespace "$NAMESPACE" "$@" | ||||
| } | ||||
| 
 | ||||
| kctl delete -f manifests/node-exporter | ||||
| kctl delete -f manifests/kube-state-metrics | ||||
| kctl delete -f manifests/grafana | ||||
| find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \; | ||||
| kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml | ||||
| kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml | ||||
| kctl delete -f manifests/alertmanager | ||||
| 
 | ||||
| # Hack: wait a bit to let the controller delete the deployed Prometheus server. | ||||
| sleep 5 | ||||
| 
 | ||||
| kctl delete -f manifests/prometheus-operator | ||||
| 
 | ||||
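Usage mirrors the deploy script; a sketch, assuming it is run from the repository root:

    # Tear down a default deployment.
    ./teardown
    # Tear down a deployment that was installed into a custom namespace.
    NAMESPACE=my-monitoring ./teardown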