Mirror of https://github.com/carlosedp/cluster-monitoring.git (synced 2025-07-16 13:46:36 +02:00)

Commit 063e040dac: Initial import

Readme.md (new file, 23 lines)

# Prometheus Operator for ARM platform

The Prometheus Operator for Kubernetes provides easy monitoring definitions for Kubernetes services and deployment and management of Prometheus instances.

This project aims to port the [official manifests](https://github.com/coreos/prometheus-operator/tree/master/contrib/kube-prometheus) and images to the ARM platform. It has been tested on an ARM64 Kubernetes cluster deployed as described in [this article](https://medium.com/@carlosedp/building-an-arm-kubernetes-cluster-ef31032636f9).

## Changes to Kubeadm for Prometheus Operator

According to the official deployment documentation [here](https://github.com/coreos/prometheus-operator/blob/master/contrib/kube-prometheus/docs/kube-prometheus-on-kubeadm.md), a couple of changes to the cluster are required.

We need to expose cAdvisor, which is installed and managed by the kubelet daemon, and allow webhook token authentication. To do so, run the following on **all masters and nodes**:

    sed -e "/cadvisor-port=0/d" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
    sed -e "s/--authorization-mode=Webhook/--authentication-token-webhook=true --authorization-mode=Webhook/" -i /etc/systemd/system/kubelet.service.d/10-kubeadm.conf
    systemctl daemon-reload
    systemctl restart kubelet

If you already have a Kubernetes cluster deployed with kubeadm, also change the address that kube-controller-manager and kube-scheduler listen on, **on the master node**, in addition to the previous kubelet change:

    sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-controller-manager.yaml
    sed -e "s/- --address=127.0.0.1/- --address=0.0.0.0/" -i /etc/kubernetes/manifests/kube-scheduler.yaml
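
A quick way to confirm the kubelet change took effect (a sketch, not part of the original docs; assumes it runs on a node and that `TOKEN` holds a ServiceAccount token authorized for `nodes/metrics`):

    # The kubelet's secure port should now serve cAdvisor metrics to a bearer
    # token that is authenticated through the webhook:
    curl -sk -H "Authorization: Bearer ${TOKEN}" https://localhost:10250/metrics/cadvisor | head -n 5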

deploy (new executable file, 42 lines)

#!/usr/bin/env bash

if [ -z "${KUBECONFIG}" ]; then
    export KUBECONFIG=~/.kube/config
fi

# CAUTION - setting NAMESPACE will deploy most components to the given namespace
# however some are hardcoded to 'monitoring'. Only use if you have reviewed all manifests.

if [ -z "${NAMESPACE}" ]; then
    NAMESPACE=monitoring
fi

kubectl create namespace "$NAMESPACE"

kctl() {
    kubectl --namespace "$NAMESPACE" "$@"
}

kubectl apply -f manifests/k8s

kctl apply -f manifests/prometheus-operator

# Wait for CRDs to be ready.
printf "Waiting for Operator to register custom resource definitions..."
until kctl get customresourcedefinitions servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get customresourcedefinitions prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get customresourcedefinitions alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get servicemonitors.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get prometheuses.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
until kctl get alertmanagers.monitoring.coreos.com > /dev/null 2>&1; do sleep 1; printf "."; done
echo "done!"

kctl apply -f manifests/node-exporter
kctl apply -f manifests/kube-state-metrics
kctl apply -f manifests/grafana/grafana-credentials.yaml
kctl apply -f manifests/grafana
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" apply -f {} \;
kubectl apply -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl apply -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl apply -f manifests/alertmanager/
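
A usage sketch (assumes the script is run from the repository root):

    # Deploy into the default 'monitoring' namespace:
    ./deploy

    # Or into a custom namespace - only after reviewing the manifests that
    # hardcode 'monitoring', as the CAUTION comment warns:
    NAMESPACE=observability ./deploy

Note that prometheus-k8s-roles.yaml and prometheus-k8s-role-bindings.yaml are applied with plain `kubectl` rather than `kctl` because those manifests declare their own target namespaces.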

manifests/alertmanager/alertmanager-config.yaml (new file, 6 lines)

apiVersion: v1
kind: Secret
metadata:
  name: alertmanager-main
data:
  alertmanager.yaml: Z2xvYmFsOgogIHJlc29sdmVfdGltZW91dDogNW0Kcm91dGU6CiAgZ3JvdXBfYnk6IFsnam9iJ10KICBncm91cF93YWl0OiAzMHMKICBncm91cF9pbnRlcnZhbDogNW0KICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJlY2VpdmVyOiAnbnVsbCcKICByb3V0ZXM6CiAgLSBtYXRjaDoKICAgICAgYWxlcnRuYW1lOiBEZWFkTWFuc1N3aXRjaAogICAgcmVjZWl2ZXI6ICdudWxsJwpyZWNlaXZlcnM6Ci0gbmFtZTogJ251bGwnCg==
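
The `alertmanager.yaml` key holds the base64-encoded Alertmanager configuration. A sketch for inspecting or replacing it (assumes an edited local file named `alertmanager.yaml`; `--dry-run=client` needs a recent kubectl, older releases use plain `--dry-run`):

    # Decode the embedded configuration:
    kubectl -n monitoring get secret alertmanager-main \
      -o jsonpath='{.data.alertmanager\.yaml}' | base64 -d

    # Rebuild the Secret from the edited local file:
    kubectl -n monitoring create secret generic alertmanager-main \
      --from-file=alertmanager.yaml --dry-run=client -o yaml | kubectl apply -f -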

manifests/alertmanager/alertmanager-ingress.yaml (new file, 15 lines)

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: alertmanager
  namespace: monitoring
spec:
  rules:
  - host: alertmanager.internal.carlosedp.com
    http:
      paths:
      - path: /
        backend:
          serviceName: alertmanager-main
          servicePort: web

manifests/alertmanager/alertmanager-service.yaml (new file, 16 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    alertmanager: main
  name: alertmanager-main
spec:
  type: NodePort
  ports:
  - name: web
    nodePort: 30903
    port: 9093
    protocol: TCP
    targetPort: web
  selector:
    alertmanager: main
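
With `type: NodePort`, Alertmanager is reachable on every cluster node at port 30903. A quick health probe (sketch; `<node-ip>` is a placeholder for any node's address):

    curl -s http://<node-ip>:30903/-/healthy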

manifests/alertmanager/alertmanager.yaml (new file, 11 lines)

apiVersion: monitoring.coreos.com/v1
kind: Alertmanager
metadata:
  name: main
  labels:
    alertmanager: main
spec:
  replicas: 1
  baseImage: carlosedp/alertmanager
  version: v0.14.0

manifests/armexporter/daemonset.yaml (new file, 22 lines)

apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: arm-exporter
  namespace: monitoring
  labels:
    k8s-app: arm-exporter
spec:
  template:
    metadata:
      name: arm-exporter
      labels:
        k8s-app: arm-exporter
    spec:
      hostNetwork: true
      containers:
      - image: carlosedp/arm_exporter
        name: arm-exporter
        ports:
        - name: http
          containerPort: 9243
          hostPort: 9243

manifests/armexporter/node-exporter-service.yaml (new file, 17 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    app: node-exporter
    k8s-app: node-exporter
  name: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http
    port: 9100
    protocol: TCP
  selector:
    app: node-exporter

manifests/armexporter/service.yaml (new file, 14 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    k8s-app: arm-exporter
  name: arm-exporter
spec:
  ports:
  - name: http
    port: 9243
    protocol: TCP
  selector:
    k8s-app: arm-exporter

manifests/grafana/grafana-claim.yaml (new file, 12 lines)

kind: PersistentVolumeClaim
apiVersion: v1
metadata:
  name: grafana-claim
  namespace: monitoring
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 5Gi

manifests/grafana/grafana-configmap.yaml (new file, 23 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-config
  namespace: monitoring
data:
  config.ini: |
    [database]
    path = /data/grafana.db

    [paths]
    data = /data
    logs = /data/log
    plugins = /data/plugins

    [session]
    provider = memory

    [auth.basic]
    enabled = false

    [auth.anonymous]
    enabled = false

manifests/grafana/grafana-credentials.yaml (new file, 7 lines)

apiVersion: v1
kind: Secret
metadata:
  name: grafana-credentials
data:
  user: YWRtaW4=
  password: YWRtaW4=
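
Both values decode to `admin`. To use your own credentials, generate new base64 values and substitute them into the Secret (sketch):

    # -n suppresses the trailing newline, which would otherwise be encoded too:
    echo -n 'admin' | base64              # YWRtaW4=
    echo -n 'my-grafana-password' | base64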

manifests/grafana/grafana-dashboard-definitions.yaml (new file, 7360 lines)

(file contents omitted: diff suppressed because it is too large)

manifests/grafana/grafana-dashboards.yaml (new file, 12 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-dashboards
data:
  dashboards.yaml: |+
    - name: '0'
      org_id: 1
      folder: ''
      type: file
      options:
        folder: /grafana-dashboard-definitions/0

manifests/grafana/grafana-datasources.yaml (new file, 15 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-datasources
data:
  prometheus.yaml: |+
    datasources:
    - name: prometheus
      type: prometheus
      access: proxy
      org_id: 1
      url: http://prometheus-k8s.monitoring.svc:9090
      version: 1
      editable: false

manifests/grafana/grafana-deployment.yaml (new file, 60 lines)

apiVersion: apps/v1beta1
kind: Deployment
metadata:
  name: grafana
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      containers:
      - name: grafana
        image: carlosedp/monitoring-grafana:latest
        volumeMounts:
        - name: grafana-config
          mountPath: /grafana/conf/config.ini
          subPath: config.ini
        - name: grafana-storage
          mountPath: /data
        - name: grafana-datasources
          mountPath: /grafana/conf/provisioning/datasources
        - name: grafana-dashboards
          mountPath: /grafana/conf/provisioning/dashboards
        - name: grafana-dashboard-definitions-0
          mountPath: /grafana-dashboard-definitions/0
        ports:
        - name: web
          containerPort: 3000
        env:
        - name: GF_INSTALL_PLUGINS
          value: "grafana-clock-panel,grafana-piechart-panel"
        - name: GF_PATHS_PLUGINS
          value: "/data/plugins"
        resources:
          requests:
            memory: 100Mi
            cpu: 100m
          limits:
            memory: 200Mi
            cpu: 200m
      volumes:
      - name: grafana-config
        configMap:
          name: grafana-config
      - name: grafana-storage
        persistentVolumeClaim:
          claimName: grafana-claim
      - name: grafana-datasources
        configMap:
          name: grafana-datasources
      - name: grafana-dashboards
        configMap:
          name: grafana-dashboards
      - name: grafana-dashboard-definitions-0
        configMap:
          name: grafana-dashboard-definitions-0

manifests/grafana/grafana-ingress.yaml (new file, 15 lines)

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: grafana
  namespace: monitoring
spec:
  rules:
  - host: grafana.internal.carlosedp.com
    http:
      paths:
      - path: /
        backend:
          serviceName: grafana
          servicePort: 3000

manifests/grafana/grafana-service.yaml (new file, 15 lines)

apiVersion: v1
kind: Service
metadata:
  name: grafana
  labels:
    app: grafana
spec:
  type: NodePort
  ports:
  - port: 3000
    protocol: TCP
    nodePort: 30902
    targetPort: web
  selector:
    app: grafana

manifests/k8s/kube-controller-manager.yaml (new file, 17 lines)

apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-controller-manager-prometheus-discovery
  labels:
    k8s-app: kube-controller-manager
spec:
  selector:
    component: kube-controller-manager
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10252
    targetPort: 10252
    protocol: TCP

manifests/k8s/kube-scheduler.yaml (new file, 17 lines)

apiVersion: v1
kind: Service
metadata:
  namespace: kube-system
  name: kube-scheduler-prometheus-discovery
  labels:
    k8s-app: kube-scheduler
spec:
  selector:
    component: kube-scheduler
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http-metrics
    port: 10251
    targetPort: 10251
    protocol: TCP

@@ -0,0 +1,12 @@

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: kube-state-metrics
subjects:
- kind: ServiceAccount
  name: kube-state-metrics
  namespace: monitoring

@@ -0,0 +1,45 @@

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: kube-state-metrics
rules:
- apiGroups: [""]
  resources:
  - nodes
  - pods
  - services
  - resourcequotas
  - replicationcontrollers
  - limitranges
  - persistentvolumeclaims
  - persistentvolumes
  - namespaces
  - endpoints
  verbs: ["list", "watch"]
- apiGroups: ["extensions"]
  resources:
  - daemonsets
  - deployments
  - replicasets
  verbs: ["list", "watch"]
- apiGroups: ["apps"]
  resources:
  - statefulsets
  verbs: ["list", "watch"]
- apiGroups: ["batch"]
  resources:
  - cronjobs
  - jobs
  verbs: ["list", "watch"]
- apiGroups: ["autoscaling"]
  resources:
  - horizontalpodautoscalers
  verbs: ["list", "watch"]
- apiGroups: ["authentication.k8s.io"]
  resources:
  - tokenreviews
  verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
  resources:
  - subjectaccessreviews
  verbs: ["create"]

@@ -0,0 +1,55 @@

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  name: kube-state-metrics
spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: kube-state-metrics
    spec:
      serviceAccountName: kube-state-metrics
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      containers:
      - name: kube-state-metrics
        image: carlosedp/kube-state-metrics-arm64:v1.2.0
        args:
        - "--host=0.0.0.0"
        - "--port=8443"
        - "--telemetry-host=0.0.0.0"
        - "--telemetry-port=9443"
        ports:
        - name: http-main
          containerPort: 8443
        - name: http-self
          containerPort: 9443
      - name: addon-resizer
        image: gcr.io/google-containers/addon-resizer-arm64:2.1
        resources:
          limits:
            cpu: 100m
            memory: 30Mi
          requests:
            cpu: 100m
            memory: 30Mi
        env:
        - name: MY_POD_NAME
          valueFrom:
            fieldRef:
              fieldPath: metadata.name
        - name: MY_POD_NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: metadata.namespace
        command:
        - /pod_nanny
        - --container=kube-state-metrics
        - --cpu=100m
        - --extra-cpu=2m
        - --memory=150Mi
        - --extra-memory=30Mi
        #- --threshold=5
        - --deployment=kube-state-metrics

@@ -0,0 +1,12 @@

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: kube-state-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: kube-state-metrics-resizer
subjects:
- kind: ServiceAccount
  name: kube-state-metrics

manifests/kube-state-metrics/kube-state-metrics-role.yaml (new file, 15 lines)

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: kube-state-metrics-resizer
rules:
- apiGroups: [""]
  resources:
  - pods
  verbs: ["get"]
- apiGroups: ["extensions"]
  resources:
  - deployments
  resourceNames: ["kube-state-metrics"]
  verbs: ["get", "update"]

@@ -0,0 +1,4 @@

apiVersion: v1
kind: ServiceAccount
metadata:
  name: kube-state-metrics

manifests/kube-state-metrics/kube-state-metrics-service.yaml (new file, 21 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    app: kube-state-metrics
    k8s-app: kube-state-metrics
  name: kube-state-metrics
spec:
  clusterIP: None
  ports:
  - name: http-main
    port: 8443
    targetPort: http-main
    protocol: TCP
  - name: http-self
    port: 9443
    targetPort: http-self
    protocol: TCP
  selector:
    app: kube-state-metrics

@@ -0,0 +1,12 @@

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: node-exporter
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: node-exporter
subjects:
- kind: ServiceAccount
  name: node-exporter
  namespace: monitoring

manifests/node-exporter/node-exporter-cluster-role.yaml (new file, 13 lines)

apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: node-exporter
rules:
- apiGroups: ["authentication.k8s.io"]
  resources:
  - tokenreviews
  verbs: ["create"]
- apiGroups: ["authorization.k8s.io"]
  resources:
  - subjectaccessreviews
  verbs: ["create"]

manifests/node-exporter/node-exporter-daemonset.yaml (new file, 57 lines)

apiVersion: extensions/v1beta1
kind: DaemonSet
metadata:
  name: node-exporter
spec:
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 1
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: node-exporter
      name: node-exporter
    spec:
      serviceAccountName: node-exporter
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      hostNetwork: true
      hostPID: true
      containers:
      - image: napnap75/rpi-prometheus:node_exporter
        args:
        #- "--web.listen-address=0.0.0.1:9100"
        - "--path.procfs=/host/proc"
        - "--path.sysfs=/host/sys"
        name: node-exporter
        ports:
        - containerPort: 9100
          hostPort: 9100
          name: http
        resources:
          requests:
            memory: 30Mi
            cpu: 100m
          limits:
            memory: 50Mi
            cpu: 200m
        volumeMounts:
        - name: proc
          readOnly: true
          mountPath: /host/proc
        - name: sys
          readOnly: true
          mountPath: /host/sys
      tolerations:
      - effect: NoSchedule
        operator: Exists
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: sys
        hostPath:
          path: /sys

@@ -0,0 +1,4 @@

apiVersion: v1
kind: ServiceAccount
metadata:
  name: node-exporter

manifests/node-exporter/node-exporter-service.yaml (new file, 17 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    app: node-exporter
    k8s-app: node-exporter
  name: node-exporter
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: http
    port: 9100
    protocol: TCP
  selector:
    app: node-exporter

@@ -0,0 +1,12 @@

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-operator
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-operator
subjects:
- kind: ServiceAccount
  name: prometheus-operator
  namespace: monitoring

@@ -0,0 +1,54 @@

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-operator
rules:
- apiGroups:
  - extensions
  resources:
  - thirdpartyresources
  verbs:
  - "*"
- apiGroups:
  - apiextensions.k8s.io
  resources:
  - customresourcedefinitions
  verbs:
  - "*"
- apiGroups:
  - monitoring.coreos.com
  resources:
  - alertmanagers
  - prometheuses
  - prometheuses/finalizers
  - alertmanagers/finalizers
  - servicemonitors
  verbs:
  - "*"
- apiGroups:
  - apps
  resources:
  - statefulsets
  verbs: ["*"]
- apiGroups: [""]
  resources:
  - configmaps
  - secrets
  verbs: ["*"]
- apiGroups: [""]
  resources:
  - pods
  verbs: ["list", "delete"]
- apiGroups: [""]
  resources:
  - services
  - endpoints
  verbs: ["get", "create", "update"]
- apiGroups: [""]
  resources:
  - nodes
  verbs: ["list", "watch"]
- apiGroups: [""]
  resources:
  - namespaces
  verbs: ["list"]

@@ -0,0 +1,4 @@

apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-operator

@@ -0,0 +1,15 @@

apiVersion: v1
kind: Service
metadata:
  name: prometheus-operator
  labels:
    k8s-app: prometheus-operator
spec:
  type: ClusterIP
  ports:
  - name: http
    port: 8080
    targetPort: http
    protocol: TCP
  selector:
    k8s-app: prometheus-operator

manifests/prometheus-operator/prometheus-operator.yaml (new file, 34 lines)

apiVersion: extensions/v1beta1
kind: Deployment
metadata:
  labels:
    k8s-app: prometheus-operator
  name: prometheus-operator
spec:
  replicas: 1
  template:
    metadata:
      labels:
        k8s-app: prometheus-operator
    spec:
      containers:
      - args:
        - --kubelet-service=kube-system/kubelet
        - --prometheus-config-reloader=carlosedp/prometheus-config-reloader:v0.0.2
        - --config-reloader-image=carlosedp/configmap-reload:latest
        image: carlosedp/prometheus-operator:v0.17.0
        name: prometheus-operator
        ports:
        - containerPort: 8080
          name: http
        resources:
          limits:
            cpu: 200m
            memory: 100Mi
          requests:
            cpu: 100m
            memory: 50Mi
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      serviceAccountName: prometheus-operator

manifests/prometheus/prometheus-k8s-ingress.yaml (new file, 14 lines)

apiVersion: extensions/v1beta1
kind: Ingress
metadata:
  name: prometheus
  namespace: monitoring
spec:
  rules:
  - host: prometheus.internal.carlosedp.com
    http:
      paths:
      - path: /
        backend:
          serviceName: prometheus-k8s
          servicePort: web

manifests/prometheus/prometheus-k8s-role-bindings.yaml (new file, 54 lines)

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: monitoring
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: kube-system
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: RoleBinding
metadata:
  name: prometheus-k8s
  namespace: default
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRoleBinding
metadata:
  name: prometheus-k8s
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus-k8s
subjects:
- kind: ServiceAccount
  name: prometheus-k8s
  namespace: monitoring

manifests/prometheus/prometheus-k8s-roles.yaml (new file, 55 lines)

apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: monitoring
rules:
- apiGroups: [""]
  resources:
  - nodes
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups: [""]
  resources:
  - configmaps
  verbs: ["get"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: kube-system
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: Role
metadata:
  name: prometheus-k8s
  namespace: default
rules:
- apiGroups: [""]
  resources:
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1beta1
kind: ClusterRole
metadata:
  name: prometheus-k8s
rules:
- apiGroups: [""]
  resources:
  - nodes/metrics
  verbs: ["get"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]

manifests/prometheus/prometheus-k8s-rules.yaml (new file, 602 lines)

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-k8s-rules
  labels:
    role: prometheus-rulefiles
    prometheus: k8s
data:
  alertmanager.rules.yaml: |+
    groups:
    - name: alertmanager.rules
      rules:
      - alert: AlertmanagerConfigInconsistent
        expr: count_values("config_hash", alertmanager_config_hash) BY (service) / ON(service)
          GROUP_LEFT() label_replace(prometheus_operator_alertmanager_spec_replicas, "service",
          "alertmanager-$1", "alertmanager", "(.*)") != 1
        for: 5m
        labels:
          severity: critical
        annotations:
          description: The configuration of the instances of the Alertmanager cluster
            `{{$labels.service}}` are out of sync.
      - alert: AlertmanagerDownOrMissing
        expr: label_replace(prometheus_operator_alertmanager_spec_replicas, "job", "alertmanager-$1",
          "alertmanager", "(.*)") / ON(job) GROUP_RIGHT() sum(up) BY (job) != 1
        for: 5m
        labels:
          severity: warning
        annotations:
          description: An unexpected number of Alertmanagers are scraped or Alertmanagers
            disappeared from discovery.
      - alert: AlertmanagerFailedReload
        expr: alertmanager_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Alertmanager's configuration has failed for {{ $labels.namespace
            }}/{{ $labels.pod}}.
  etcd3.rules.yaml: |+
    groups:
    - name: ./etcd3.rules
      rules:
      - alert: InsufficientMembers
        expr: count(up{job="etcd"} == 0) > (count(up{job="etcd"}) / 2 - 1)
        for: 3m
        labels:
          severity: critical
        annotations:
          description: If one more etcd member goes down the cluster will be unavailable
          summary: etcd cluster insufficient members
      - alert: NoLeader
        expr: etcd_server_has_leader{job="etcd"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: etcd member {{ $labels.instance }} has no leader
          summary: etcd member has no leader
      - alert: HighNumberOfLeaderChanges
        expr: increase(etcd_server_leader_changes_seen_total{job="etcd"}[1h]) > 3
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} leader
            changes within the last hour
          summary: a high number of leader changes within the etcd cluster are happening
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: HighNumberOfFailedGRPCRequests
        expr: sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd"}[5m])) BY (grpc_service, grpc_method)
          / sum(rate(grpc_server_handled_total{job="etcd"}[5m])) BY (grpc_service, grpc_method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.grpc_method }} failed
            on etcd instance {{ $labels.instance }}'
          summary: a high number of gRPC requests are failing
      - alert: GRPCRequestsSlow
        expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le))
          > 0.15
        for: 10m
        labels:
          severity: critical
        annotations:
          description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
            }} are slow
          summary: slow gRPC requests
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HighNumberOfFailedHTTPRequests
        expr: sum(rate(etcd_http_failed_total{job="etcd"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job="etcd"}[5m]))
          BY (method) > 0.05
        for: 5m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of requests for {{ $labels.method }} failed on etcd
            instance {{ $labels.instance }}'
          summary: a high number of HTTP requests are failing
      - alert: HTTPRequestsSlow
        expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: on etcd instance {{ $labels.instance }} HTTP requests to {{ $labels.method
            }} are slow
          summary: slow HTTP requests
      - alert: EtcdMemberCommunicationSlow
        expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
          > 0.15
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} member communication with
            {{ $labels.To }} is slow
          summary: etcd member communication is slow
      - alert: HighNumberOfFailedProposals
        expr: increase(etcd_server_proposals_failed_total{job="etcd"}[1h]) > 5
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} has seen {{ $value }} proposal
            failures within the last hour
          summary: a high number of proposals within the etcd cluster are failing
      - alert: HighFsyncDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m]))
          > 0.5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} fsync durations are high
          summary: high fsync durations
      - alert: HighCommitDurations
        expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[5m]))
          > 0.25
        for: 10m
        labels:
          severity: warning
        annotations:
          description: etcd instance {{ $labels.instance }} commit durations are high
          summary: high commit durations
  general.rules.yaml: |+
    groups:
    - name: general.rules
      rules:
      - alert: TargetDown
        expr: 100 * (count(up == 0) BY (job) / count(up) BY (job)) > 10
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $value }}% of {{ $labels.job }} targets are down.'
          summary: Targets are down
      - alert: DeadMansSwitch
        expr: vector(1)
        labels:
          severity: none
        annotations:
          description: This is a DeadMansSwitch meant to ensure that the entire Alerting
            pipeline is functional.
          summary: Alerting DeadMansSwitch
      - record: fd_utilization
        expr: process_open_fds / process_max_fds
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[1h], 3600 * 4) > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next 4 hours'
          summary: file descriptors soon exhausted
      - alert: FdExhaustionClose
        expr: predict_linear(fd_utilization[10m], 3600) > 1
        for: 10m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.job }}: {{ $labels.namespace }}/{{ $labels.pod }} instance
            will exhaust in file/socket descriptors within the next hour'
          summary: file descriptors soon exhausted
  kube-controller-manager.rules.yaml: |+
    groups:
    - name: kube-controller-manager.rules
      rules:
      - alert: K8SControllerManagerDown
        expr: absent(up{job="kube-controller-manager"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S controller manager. Deployments and replication
            controllers are not making progress.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-controller-manager
          summary: Controller manager is down
  kube-scheduler.rules.yaml: |+
    groups:
    - name: kube-scheduler.rules
      rules:
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_e2e_scheduling_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_e2e_scheduling_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_scheduling_algorithm_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_scheduling_algorithm_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.99, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.99"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.9, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.9"
      - record: cluster:scheduler_binding_latency_seconds:quantile
        expr: histogram_quantile(0.5, sum(scheduler_binding_latency_microseconds_bucket)
          BY (le, cluster)) / 1e+06
        labels:
          quantile: "0.5"
      - alert: K8SSchedulerDown
        expr: absent(up{job="kube-scheduler"} == 1)
        for: 5m
        labels:
          severity: critical
        annotations:
          description: There is no running K8S scheduler. New pods are not being assigned
            to nodes.
          runbook: https://coreos.com/tectonic/docs/latest/troubleshooting/controller-recovery.html#recovering-a-scheduler
          summary: Scheduler is down
  kube-state-metrics.rules.yaml: |+
    groups:
    - name: kube-state-metrics.rules
      rules:
      - alert: DeploymentGenerationMismatch
        expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Observed deployment generation does not match expected one for
            deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment is outdated
      - alert: DeploymentReplicasNotUpdated
        expr: ((kube_deployment_status_replicas_updated != kube_deployment_spec_replicas)
          or (kube_deployment_status_replicas_available != kube_deployment_spec_replicas))
          unless (kube_deployment_spec_paused == 1)
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Replicas are not updated and available for deployment {{$labels.namespaces}}/{{$labels.deployment}}
          summary: Deployment replicas are outdated
      - alert: DaemonSetRolloutStuck
        expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled
          * 100 < 100
        for: 15m
        labels:
          severity: warning
        annotations:
          description: Only {{$value}}% of desired pods scheduled and ready for daemon
            set {{$labels.namespaces}}/{{$labels.daemonset}}
          summary: DaemonSet is missing pods
      - alert: K8SDaemonSetsNotScheduled
        expr: kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled
          > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are not scheduled.
          summary: Daemonsets are not scheduled correctly
      - alert: DaemonSetsMissScheduled
        expr: kube_daemonset_status_number_misscheduled > 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: A number of daemonsets are running where they are not supposed
            to run.
          summary: Daemonsets are not scheduled correctly
      - alert: PodFrequentlyRestarting
        expr: increase(kube_pod_container_status_restarts_total[1h]) > 5
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Pod {{$labels.namespaces}}/{{$labels.pod}} was restarted {{$value}}
            times within the last hour
          summary: Pod is restarting frequently
  kubelet.rules.yaml: |+
    groups:
    - name: kubelet.rules
      rules:
      - alert: K8SNodeNotReady
        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
        for: 1h
        labels:
          severity: warning
        annotations:
          description: The Kubelet on {{ $labels.node }} has not checked in with the API,
            or has set itself to NotReady, for more than an hour
          summary: Node status is NotReady
      - alert: K8SManyNodesNotReady
        expr: count(kube_node_status_condition{condition="Ready",status="true"} == 0)
          > 1 and (count(kube_node_status_condition{condition="Ready",status="true"} ==
          0) / count(kube_node_status_condition{condition="Ready",status="true"})) > 0.2
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $value }}% of Kubernetes nodes are not ready'
      - alert: K8SKubeletDown
        expr: count(up{job="kubelet"} == 0) / count(up{job="kubelet"}) * 100 > 3
        for: 1h
        labels:
          severity: warning
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets.
      - alert: K8SKubeletDown
        expr: (absent(up{job="kubelet"} == 1) or count(up{job="kubelet"} == 0) / count(up{job="kubelet"}))
          * 100 > 1
        for: 1h
        labels:
          severity: critical
        annotations:
          description: Prometheus failed to scrape {{ $value }}% of kubelets, or all Kubelets
            have disappeared from service discovery.
          summary: Many Kubelets cannot be scraped
      - alert: K8SKubeletTooManyPods
        expr: kubelet_running_pod_count > 100
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Kubelet {{$labels.instance}} is running {{$value}} pods, close
            to the limit of 110
          summary: Kubelet is close to pod limit
  kubernetes.rules.yaml: |+
    groups:
    - name: kubernetes.rules
      rules:
      - record: pod_name:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (pod_name)
      - record: pod_name:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: pod_name:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          BY (pod_name)
      - record: pod_name:container_fs_usage_bytes:sum
        expr: sum(container_fs_usage_bytes{container_name!="POD",pod_name!=""}) BY (pod_name)
      - record: namespace:container_memory_usage_bytes:sum
        expr: sum(container_memory_usage_bytes{container_name!=""}) BY (namespace)
      - record: namespace:container_spec_cpu_shares:sum
        expr: sum(container_spec_cpu_shares{container_name!=""}) BY (namespace)
      - record: namespace:container_cpu_usage:sum
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD"}[5m]))
          BY (namespace)
      - record: cluster:memory_usage:ratio
        expr: sum(container_memory_usage_bytes{container_name!="POD",pod_name!=""}) BY
          (cluster) / sum(machine_memory_bytes) BY (cluster)
      - record: cluster:container_spec_cpu_shares:ratio
        expr: sum(container_spec_cpu_shares{container_name!="POD",pod_name!=""}) / 1000
          / sum(machine_cpu_cores)
      - record: cluster:container_cpu_usage:ratio
        expr: sum(rate(container_cpu_usage_seconds_total{container_name!="POD",pod_name!=""}[5m]))
          / sum(machine_cpu_cores)
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.99, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.99"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.9, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.9"
      - record: apiserver_latency_seconds:quantile
        expr: histogram_quantile(0.5, rate(apiserver_request_latencies_bucket[5m])) /
          1e+06
        labels:
          quantile: "0.5"
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerLatencyHigh
        expr: apiserver_latency_seconds:quantile{quantile="0.99",subresource!="log",verb!~"^(?:WATCH|WATCHLIST|PROXY|CONNECT)$"}
          > 4
        for: 10m
        labels:
          severity: critical
        annotations:
          description: the API server has a 99th percentile latency of {{ $value }} seconds
            for {{$labels.verb}} {{$labels.resource}}
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 2
        for: 10m
        labels:
          severity: warning
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: APIServerErrorsHigh
        expr: rate(apiserver_request_count{code=~"^(?:5..)$"}[5m]) / rate(apiserver_request_count[5m])
          * 100 > 5
        for: 10m
        labels:
          severity: critical
        annotations:
          description: API server returns errors for {{ $value }}% of requests
      - alert: K8SApiserverDown
        expr: absent(up{job="apiserver"} == 1)
        for: 20m
        labels:
          severity: critical
        annotations:
          description: No API servers are reachable or all have disappeared from service
            discovery

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: warning
        annotations:
          description: Kubernetes API Certificate is expiring soon (less than 7 days)
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="604800"}) > 0

      - alert: K8sCertificateExpirationNotice
        labels:
          severity: critical
        annotations:
          description: Kubernetes API Certificate is expiring in less than 1 day
        expr: sum(apiserver_client_certificate_expiration_seconds_bucket{le="86400"}) > 0
  node.rules.yaml: |+
    groups:
    - name: node.rules
      rules:
      - record: instance:node_cpu:rate:sum
        expr: sum(rate(node_cpu{mode!="idle",mode!="iowait",mode!~"^(?:guest.*)$"}[3m]))
          BY (instance)
      - record: instance:node_filesystem_usage:sum
        expr: sum((node_filesystem_size{mountpoint="/"} - node_filesystem_free{mountpoint="/"}))
          BY (instance)
      - record: instance:node_network_receive_bytes:rate:sum
        expr: sum(rate(node_network_receive_bytes[3m])) BY (instance)
      - record: instance:node_network_transmit_bytes:rate:sum
        expr: sum(rate(node_network_transmit_bytes[3m])) BY (instance)
      - record: instance:node_cpu:ratio
        expr: sum(rate(node_cpu{mode!="idle"}[5m])) WITHOUT (cpu, mode) / ON(instance)
          GROUP_LEFT() count(sum(node_cpu) BY (instance, cpu)) BY (instance)
      - record: cluster:node_cpu:sum_rate5m
        expr: sum(rate(node_cpu{mode!="idle"}[5m]))
      - record: cluster:node_cpu:ratio
        expr: cluster:node_cpu:sum_rate5m / count(sum(node_cpu) BY (instance, cpu))
      - alert: NodeExporterDown
        expr: absent(up{job="node-exporter"} == 1)
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus could not scrape a node-exporter for more than 10m,
            or node-exporters have disappeared from discovery
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[6h], 3600 * 24) < 0
        for: 30m
        labels:
          severity: warning
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 24 hours (mounted at {{$labels.mountpoint}})
      - alert: NodeDiskRunningFull
        expr: predict_linear(node_filesystem_free[30m], 3600 * 2) < 0
        for: 10m
        labels:
          severity: critical
        annotations:
          description: device {{$labels.device}} on node {{$labels.instance}} is running
            full within the next 2 hours (mounted at {{$labels.mountpoint}})
  prometheus.rules.yaml: |+
    groups:
    - name: prometheus.rules
      rules:
      - alert: PrometheusConfigReloadFailed
        expr: prometheus_config_last_reload_successful == 0
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Reloading Prometheus' configuration has failed for {{$labels.namespace}}/{{$labels.pod}}
      - alert: PrometheusNotificationQueueRunningFull
        expr: predict_linear(prometheus_notifications_queue_length[5m], 60 * 30) > prometheus_notifications_queue_capacity
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus' alert notification queue is running full for {{$labels.namespace}}/{{
            $labels.pod}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.01
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusErrorSendingAlerts
        expr: rate(prometheus_notifications_errors_total[5m]) / rate(prometheus_notifications_sent_total[5m])
          > 0.03
        for: 10m
        labels:
          severity: critical
        annotations:
          description: Errors while sending alerts from Prometheus {{$labels.namespace}}/{{
            $labels.pod}} to Alertmanager {{$labels.Alertmanager}}
      - alert: PrometheusNotConnectedToAlertmanagers
        expr: prometheus_notifications_alertmanagers_discovered < 1
        for: 10m
        labels:
          severity: warning
        annotations:
          description: Prometheus {{ $labels.namespace }}/{{ $labels.pod}} is not connected
            to any Alertmanagers
      - alert: PrometheusTSDBReloadsFailing
        expr: increase(prometheus_tsdb_reloads_failures_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            reload failures over the last four hours.'
          summary: Prometheus has issues reloading data blocks from disk
      - alert: PrometheusTSDBCompactionsFailing
        expr: increase(prometheus_tsdb_compactions_failed_total[2h]) > 0
        for: 12h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} had {{$value | humanize}}
            compaction failures over the last four hours.'
          summary: Prometheus has issues compacting sample blocks
      - alert: PrometheusTSDBWALCorruptions
        expr: tsdb_wal_corruptions_total > 0
        for: 4h
        labels:
          severity: warning
        annotations:
          description: '{{$labels.job}} at {{$labels.instance}} has a corrupted write-ahead
            log (WAL).'
          summary: Prometheus write-ahead log is corrupted

manifests/prometheus/prometheus-k8s-service-account.yaml (new file, 4 lines)

apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus-k8s

@@ -0,0 +1,16 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: alertmanager
  labels:
    k8s-app: alertmanager
spec:
  selector:
    matchLabels:
      alertmanager: main
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: web
    interval: 30s

@@ -0,0 +1,23 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-apiserver
  labels:
    k8s-app: apiserver
spec:
  jobLabel: component
  selector:
    matchLabels:
      component: apiserver
      provider: kubernetes
  namespaceSelector:
    matchNames:
    - default
  endpoints:
  - port: https
    interval: 30s
    scheme: https
    tlsConfig:
      caFile: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      serverName: kubernetes
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token

@@ -0,0 +1,18 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: arm-exporter
  namespace: monitoring
  labels:
    k8s-app: arm-exporter
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: arm-exporter
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: http
    interval: 30s

@@ -0,0 +1,17 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-controller-manager
  labels:
    k8s-app: kube-controller-manager
spec:
  jobLabel: k8s-app
  endpoints:
  - port: http-metrics
    interval: 30s
  selector:
    matchLabels:
      k8s-app: kube-controller-manager
  namespaceSelector:
    matchNames:
    - kube-system

@@ -0,0 +1,17 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-scheduler
  labels:
    k8s-app: kube-scheduler
spec:
  jobLabel: k8s-app
  endpoints:
  - port: http-metrics
    interval: 30s
  selector:
    matchLabels:
      k8s-app: kube-scheduler
  namespaceSelector:
    matchNames:
    - kube-system

@@ -0,0 +1,28 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kube-state-metrics
  labels:
    k8s-app: kube-state-metrics
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: kube-state-metrics
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: http-main
    #scheme: https
    interval: 30s
    honorLabels: true
    #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    #tlsConfig:
    #  insecureSkipVerify: true
  - port: http-self
    # scheme: https
    interval: 30s
    # bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    # tlsConfig:
    #   insecureSkipVerify: true

@@ -0,0 +1,29 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: kubelet
  labels:
    k8s-app: kubelet
spec:
  jobLabel: k8s-app
  endpoints:
  - port: https-metrics
    scheme: https
    interval: 30s
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
  - port: https-metrics
    scheme: https
    path: /metrics/cadvisor
    interval: 30s
    honorLabels: true
    tlsConfig:
      insecureSkipVerify: true
    bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
  selector:
    matchLabels:
      k8s-app: kubelet
  namespaceSelector:
    matchNames:
    - kube-system

@@ -0,0 +1,21 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: node-exporter
  labels:
    k8s-app: node-exporter
spec:
  jobLabel: k8s-app
  selector:
    matchLabels:
      k8s-app: node-exporter
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: http
    #scheme: http
    interval: 30s
    #bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
    #tlsConfig:
    #  insecureSkipVerify: true

@@ -0,0 +1,12 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus-operator
  labels:
    k8s-app: prometheus-operator
spec:
  endpoints:
  - port: http
  selector:
    matchLabels:
      k8s-app: prometheus-operator

@@ -0,0 +1,16 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: prometheus
  labels:
    k8s-app: prometheus
spec:
  selector:
    matchLabels:
      prometheus: k8s
  namespaceSelector:
    matchNames:
    - monitoring
  endpoints:
  - port: web
    interval: 30s

@@ -0,0 +1,17 @@

apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: traefik-ingress-lb
  labels:
    k8s-app: traefik-ingress-lb
spec:
  jobLabel: k8s-app
  endpoints:
  - port: admin
    interval: 30s
  selector:
    matchLabels:
      k8s-app: traefik-ingress-lb
  namespaceSelector:
    matchNames:
    - kube-system

manifests/prometheus/prometheus-k8s-service.yaml (new file, 16 lines)

apiVersion: v1
kind: Service
metadata:
  labels:
    prometheus: k8s
  name: prometheus-k8s
spec:
  type: NodePort
  ports:
  - name: web
    nodePort: 30900
    port: 9090
    protocol: TCP
    targetPort: web
  selector:
    prometheus: k8s

manifests/prometheus/prometheus-k8s.yaml (new file, 39 lines)

apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: k8s
  labels:
    prometheus: k8s
spec:
  replicas: 1
  baseImage: carlosedp/prometheus
  version: v2.1.0
  retention: "168h"
  serviceAccountName: prometheus-k8s
  serviceMonitorSelector:
    matchExpressions:
    - {key: k8s-app, operator: Exists}
  ruleSelector:
    matchLabels:
      role: prometheus-rulefiles
      prometheus: k8s
  resources:
    requests:
      # 2Gi is default, but won't schedule if you don't have a node with >2Gi
      # memory. Modify based on your target and time-series count for
      # production use. This value is mainly meant for demonstration/testing
      # purposes.
      memory: 400Mi
  alerting:
    alertmanagers:
    - namespace: monitoring
      name: alertmanager-main
      port: web
  storage:
    volumeClaimTemplate:
      spec:
        accessModes:
        - ReadWriteOnce
        resources:
          requests:
            storage: 50Gi
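
Once applied, the Operator materializes this resource as a StatefulSet named `prometheus-k8s` plus the PVC from the volume claim template. A status-check sketch (the `prometheus=k8s` pod label is an assumption based on the labels above):

    kubectl -n monitoring get statefulset prometheus-k8s
    kubectl -n monitoring get pvc
    kubectl -n monitoring get pods -l prometheus=k8s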

teardown (new executable file, 30 lines)

#!/usr/bin/env bash

if [ -z "${KUBECONFIG}" ]; then
    export KUBECONFIG=~/.kube/config
fi

# CAUTION - NAMESPACE must match its value when the deploy script was run.
# Some resources are always deployed to the monitoring namespace.

if [ -z "${NAMESPACE}" ]; then
    NAMESPACE=monitoring
fi

kctl() {
    kubectl --namespace "$NAMESPACE" "$@"
}

kctl delete -f manifests/node-exporter
kctl delete -f manifests/kube-state-metrics
kctl delete -f manifests/grafana
find manifests/prometheus -type f ! -name prometheus-k8s-roles.yaml ! -name prometheus-k8s-role-bindings.yaml -exec kubectl --namespace "$NAMESPACE" delete -f {} \;
kubectl delete -f manifests/prometheus/prometheus-k8s-roles.yaml
kubectl delete -f manifests/prometheus/prometheus-k8s-role-bindings.yaml
kctl delete -f manifests/alertmanager

# Hack: wait a bit to let the controller delete the deployed Prometheus server.
sleep 5

kctl delete -f manifests/prometheus-operator
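
Teardown mirrors the deploy invocation (sketch; pass the same `NAMESPACE` used at deploy time and run from the repository root):

    NAMESPACE=monitoring ./teardown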