mirror of
https://github.com/carlosedp/cluster-monitoring.git
synced 2024-11-20 19:07:17 +01:00
commit
8a357ac414
7
Makefile
7
Makefile
@ -46,3 +46,10 @@ ifeq (, $(shell which jsonnet))
|
|||||||
@go get github.com/google/go-jsonnet/cmd/jsonnet
|
@go get github.com/google/go-jsonnet/cmd/jsonnet
|
||||||
@go get github.com/brancz/gojsontoyaml
|
@go get github.com/brancz/gojsontoyaml
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
change_suffix:
|
||||||
|
# Refuse to run without IP; an empty value would write hosts such as "grafana..nip.io".
@test -n "${IP}" || { echo "Usage: make change_suffix IP=<address> [K3S=k3s]" >&2; exit 1; }
# Rewrite each ingress separately so every service keeps its own subdomain
# (alertmanager/prometheus/grafana) instead of all three pointing at alertmanager.
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager-main.yaml
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: prometheus.${IP}.nip.io/g' manifests/ingress-prometheus-k8s.yaml
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: grafana.${IP}.nip.io/g' manifests/ingress-grafana.yaml
|
||||||
|
@echo "Ingress IPs changed to [service].${IP}.nip.io"
|
||||||
|
${K3S} kubectl apply -f manifests/ingress-alertmanager-main.yaml
|
||||||
|
${K3S} kubectl apply -f manifests/ingress-grafana.yaml
|
||||||
|
${K3S} kubectl apply -f manifests/ingress-prometheus-k8s.yaml
|
||||||
|
26
Readme.md
26
Readme.md
@ -55,6 +55,32 @@ $ until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo ""
|
|||||||
$ kubectl apply -f manifests/ # This command sometimes may need to be done twice (to workaround a race condition).
|
$ kubectl apply -f manifests/ # This command sometimes may need to be done twice (to workaround a race condition).
|
||||||
```
|
```
|
||||||
|
|
||||||
|
If you get an error while applying the manifests, run `make deploy` or `kubectl apply -f manifests/` again. Sometimes the resources required to apply the CRDs are not deployed yet.
|
||||||
|
|
||||||
|
## Customizing for K3s
|
||||||
|
|
||||||
|
To run the monitoring stack on a [K3s](https://github.com/rancher/k3s) cluster, first deploy K3s with `curl -sfL https://get.k3s.io | sh -`.
|
||||||
|
|
||||||
|
Now, to deploy the monitoring stack on your K3s cluster, there are three parameters to configure in `vars.jsonnet`:
|
||||||
|
|
||||||
|
1. Set `k3s.enabled` to `true`.
|
||||||
|
2. Set `k3s.master_ip` to your K3s master node IP (your VM or host IP).
|
||||||
|
3. Edit `suffixDomain` to have your node IP with the `.nip.io` suffix. This will be your ingress URL suffix.
|
||||||
|
|
||||||
|
After changing these values, run `make` to build the manifests and `k3s kubectl apply -f manifests/` to apply the stack to your cluster. In case of errors on some resources, re-run the command.
|
||||||
|
|
||||||
|
Now you can open the applications:
|
||||||
|
|
||||||
|
* Grafana on [https://grafana.[your_node_ip].nip.io](https://grafana.[your_node_ip].nip.io)
|
||||||
|
* Prometheus on [https://prometheus.[your_node_ip].nip.io](https://prometheus.[your_node_ip].nip.io)
|
||||||
|
* Alertmanager on [https://alertmanager.[your_node_ip].nip.io](https://alertmanager.[your_node_ip].nip.io)
|
||||||
|
|
||||||
|
Some dashboards show no values because some cadvisor metrics do not have the complete metadata. Check the open issues for more information.
|
||||||
|
|
||||||
|
## Updating the ingress suffixes
|
||||||
|
|
||||||
|
To avoid rebuilding all manifests, there is a make target that updates the Ingress URL suffix to a different suffix (using nip.io) to match your host IP. Run `make change_suffix IP="[IP-ADDRESS]"` to change the ingress route IP for Grafana, Prometheus and Alertmanager and reapply the manifests. If you have a K3s cluster, run `make change_suffix IP="[IP-ADDRESS]" K3S=k3s`.
|
||||||
|
|
||||||
## Customizing
|
## Customizing
|
||||||
|
|
||||||
The content of this project consists of a set of jsonnet files making up a library to be consumed.
|
The content of this project consists of a set of jsonnet files making up a library to be consumed.
|
||||||
|
@ -8,7 +8,7 @@
|
|||||||
"subdir": "jsonnet/kube-prometheus"
|
"subdir": "jsonnet/kube-prometheus"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "4adb70b017e9a4ecb884a636dfef6fcae7d4bed8"
|
"version": "da959c643657c7d2aac6f5ddd68582a949283c49"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "ksonnet",
|
"name": "ksonnet",
|
||||||
@ -28,7 +28,7 @@
|
|||||||
"subdir": ""
|
"subdir": ""
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "0afc72e70df6048c6b65fd3e4968e53b0812b30c"
|
"version": "193d4934f85c9ff596d1f3e4dce7bd2da62a4d5e"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafonnet",
|
"name": "grafonnet",
|
||||||
@ -48,7 +48,7 @@
|
|||||||
"subdir": "grafana-builder"
|
"subdir": "grafana-builder"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "3c44dfa9bfe2b66985733d4b16e0afd29094b4a0"
|
"version": "565bf6b51d636e0efe4add39f2ab8e2b1abb731f"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "grafana",
|
"name": "grafana",
|
||||||
@ -58,7 +58,7 @@
|
|||||||
"subdir": "grafana"
|
"subdir": "grafana"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "c27d2792764867cdaf6484f067cc875cb8aef2f6"
|
"version": "7fadaf2274d5cbe4ac6fbaf8786e4b7ecf3c1713"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "prometheus-operator",
|
"name": "prometheus-operator",
|
||||||
@ -78,7 +78,7 @@
|
|||||||
"subdir": "Documentation/etcd-mixin"
|
"subdir": "Documentation/etcd-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "43ce2eefaa0a4bdd5c1e825ff08a32e6e46f3343"
|
"version": "8037e6e08727d4a17649f782cb4dbc482b8fe780"
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"name": "prometheus",
|
"name": "prometheus",
|
||||||
@ -88,7 +88,7 @@
|
|||||||
"subdir": "documentation/prometheus-mixin"
|
"subdir": "documentation/prometheus-mixin"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"version": "ff40de7ca6084f5aab1f3971025c00c217615589"
|
"version": "f0bb8129c3e6ffc6906bdc130f5625110643f168"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
250
k3s-overrides.jsonnet
Normal file
250
k3s-overrides.jsonnet
Normal file
@ -0,0 +1,250 @@
|
|||||||
|
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
|
||||||
|
local vars = import 'vars.jsonnet';
|
||||||
|
local service = k.core.v1.service;
|
||||||
|
local servicePort = k.core.v1.service.mixin.spec.portsType;
|
||||||
|
|
||||||
|
{
|
||||||
|
prometheus+:: {
|
||||||
|
kubeControllerManagerPrometheusDiscoveryService:
|
||||||
|
service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) +
|
||||||
|
service.mixin.metadata.withNamespace('kube-system') +
|
||||||
|
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||||
|
service.mixin.spec.withClusterIp('None'),
|
||||||
|
kubeControllerManagerPrometheusDiscoveryEndpoints:
|
||||||
|
local endpoints = k.core.v1.endpoints;
|
||||||
|
local endpointSubset = endpoints.subsetsType;
|
||||||
|
local endpointPort = endpointSubset.portsType;
|
||||||
|
|
||||||
|
local Port = endpointPort.new() +
|
||||||
|
endpointPort.withName('http-metrics') +
|
||||||
|
endpointPort.withPort(10252) +
|
||||||
|
endpointPort.withProtocol('TCP');
|
||||||
|
|
||||||
|
local subset = endpointSubset.new() +
|
||||||
|
endpointSubset.withAddresses([
|
||||||
|
{ ip: vars.k3s.master_ip }]) +
|
||||||
|
endpointSubset.withPorts(Port);
|
||||||
|
|
||||||
|
endpoints.new() +
|
||||||
|
endpoints.mixin.metadata.withName('kube-controller-manager-prometheus-discovery') +
|
||||||
|
endpoints.mixin.metadata.withNamespace('kube-system') +
|
||||||
|
endpoints.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) +
|
||||||
|
endpoints.withSubsets(subset),
|
||||||
|
|
||||||
|
kubeSchedulerPrometheusDiscoveryService:
|
||||||
|
service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) +
|
||||||
|
service.mixin.metadata.withNamespace('kube-system') +
|
||||||
|
service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||||
|
service.mixin.spec.withClusterIp('None'),
|
||||||
|
|
||||||
|
kubeSchedulerPrometheusDiscoveryEndpoints:
|
||||||
|
local endpoints = k.core.v1.endpoints;
|
||||||
|
local endpointSubset = endpoints.subsetsType;
|
||||||
|
local endpointPort = endpointSubset.portsType;
|
||||||
|
|
||||||
|
local Port = endpointPort.new() +
|
||||||
|
endpointPort.withName('http-metrics') +
|
||||||
|
endpointPort.withPort(10251) +
|
||||||
|
endpointPort.withProtocol('TCP');
|
||||||
|
|
||||||
|
local subset = endpointSubset.new() +
|
||||||
|
endpointSubset.withAddresses([
|
||||||
|
{ ip: vars.k3s.master_ip }]) +
|
||||||
|
endpointSubset.withPorts(Port);
|
||||||
|
|
||||||
|
endpoints.new() +
|
||||||
|
endpoints.mixin.metadata.withName('kube-scheduler-prometheus-discovery') +
|
||||||
|
endpoints.mixin.metadata.withNamespace('kube-system') +
|
||||||
|
endpoints.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) +
|
||||||
|
endpoints.withSubsets(subset),
|
||||||
|
|
||||||
|
serviceMonitorKubelet+:
|
||||||
|
{
|
||||||
|
spec+: {
|
||||||
|
endpoints: [
|
||||||
|
{
|
||||||
|
port: 'https-metrics',
|
||||||
|
scheme: 'https',
|
||||||
|
interval: '30s',
|
||||||
|
honorLabels: true,
|
||||||
|
tlsConfig: {
|
||||||
|
insecureSkipVerify: true,
|
||||||
|
},
|
||||||
|
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
||||||
|
},
|
||||||
|
{
|
||||||
|
port: 'https-metrics',
|
||||||
|
scheme: 'https',
|
||||||
|
path: '/metrics/cadvisor',
|
||||||
|
interval: '30s',
|
||||||
|
honorLabels: true,
|
||||||
|
tlsConfig: {
|
||||||
|
insecureSkipVerify: true,
|
||||||
|
},
|
||||||
|
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
|
||||||
|
metricRelabelings: [
|
||||||
|
// Drop a bunch of metrics which are disabled but still sent, see
|
||||||
|
// https://github.com/google/cadvisor/issues/1925.
|
||||||
|
{
|
||||||
|
sourceLabels: ['__name__'],
|
||||||
|
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
|
||||||
|
action: 'drop',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
nodeExporter+:: {
|
||||||
|
daemonset+: {
|
||||||
|
spec+: {
|
||||||
|
template+: {
|
||||||
|
spec+: {
|
||||||
|
containers:
|
||||||
|
std.filterMap(
|
||||||
|
function(c) std.startsWith(c.name, 'kube-rbac') != true,
|
||||||
|
function(c)
|
||||||
|
if std.startsWith(c.name, 'node-exporter') then
|
||||||
|
c {
|
||||||
|
args: [
|
||||||
|
'--web.listen-address=:' + $._config.nodeExporter.port,
|
||||||
|
'--path.procfs=/host/proc',
|
||||||
|
'--path.sysfs=/host/sys',
|
||||||
|
'--path.rootfs=/host/root',
|
||||||
|
// The following settings have been taken from
|
||||||
|
// https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
|
||||||
|
// Once node exporter is being released with those settings, this can be removed.
|
||||||
|
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)',
|
||||||
|
'--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$',
|
||||||
|
],
|
||||||
|
ports: [
|
||||||
|
{
|
||||||
|
containerPort: 9100,
|
||||||
|
name: 'http'
|
||||||
|
}],
|
||||||
|
|
||||||
|
}
|
||||||
|
else
|
||||||
|
c,
|
||||||
|
super.containers,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
service+:
|
||||||
|
{
|
||||||
|
spec+: {
|
||||||
|
ports: [{
|
||||||
|
name: 'http',
|
||||||
|
port: 9100,
|
||||||
|
targetPort: 'http'
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
serviceMonitor+:
|
||||||
|
{
|
||||||
|
spec+: {
|
||||||
|
endpoints: [
|
||||||
|
{
|
||||||
|
port: 'http',
|
||||||
|
scheme: 'http',
|
||||||
|
interval: '30s',
|
||||||
|
relabelings: [
|
||||||
|
{
|
||||||
|
action: 'replace',
|
||||||
|
regex: '(.*)',
|
||||||
|
// The relabeling field is spelled `replacement`; the previous key `replacment`
// is not a recognized relabel-config field.
replacement: '$1',
|
||||||
|
sourceLabels: ['__meta_kubernetes_pod_node_name'],
|
||||||
|
targetLabel: 'instance',
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
|
||||||
|
kubeStateMetrics+:: {
|
||||||
|
deployment+: {
|
||||||
|
spec+: {
|
||||||
|
template+: {
|
||||||
|
spec+: {
|
||||||
|
containers:
|
||||||
|
std.filterMap(
|
||||||
|
function(c) std.startsWith(c.name, 'kube-rbac') != true,
|
||||||
|
function(c)
|
||||||
|
if std.startsWith(c.name, 'kube-state-metrics') then
|
||||||
|
c {
|
||||||
|
args: [
|
||||||
|
'--port=8080',
|
||||||
|
'--telemetry-port=8081',
|
||||||
|
],
|
||||||
|
ports: [
|
||||||
|
{
|
||||||
|
containerPort: 8080,
|
||||||
|
name: 'http-main'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
containerPort: 8081,
|
||||||
|
name: 'http-self'
|
||||||
|
}],
|
||||||
|
}
|
||||||
|
else
|
||||||
|
c,
|
||||||
|
super.containers,
|
||||||
|
),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
service+:
|
||||||
|
{
|
||||||
|
spec+: {
|
||||||
|
ports: [{
|
||||||
|
name: 'http-main',
|
||||||
|
port: 8080,
|
||||||
|
targetPort: 'http-main'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: 'http-self',
|
||||||
|
port: 8081,
|
||||||
|
targetPort: 'http-self'
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
},
|
||||||
|
|
||||||
|
serviceMonitor+:
|
||||||
|
{
|
||||||
|
spec+: {
|
||||||
|
endpoints: [
|
||||||
|
{
|
||||||
|
port: 'http-main',
|
||||||
|
scheme: 'http',
|
||||||
|
interval: $._config.kubeStateMetrics.scrapeInterval,
|
||||||
|
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
|
||||||
|
honorLabels: true,
|
||||||
|
tlsConfig: {
|
||||||
|
insecureSkipVerify: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
port: 'http-self',
|
||||||
|
scheme: 'http',
|
||||||
|
interval: '30s',
|
||||||
|
tlsConfig: {
|
||||||
|
insecureSkipVerify: true,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
|
||||||
|
}
|
@ -15,13 +15,17 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet')
|
|||||||
+ (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet')
|
+ (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet')
|
||||||
// Use http Kubelet targets. Comment to revert to https
|
// Use http Kubelet targets. Comment to revert to https
|
||||||
+ (import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet')
|
+ (import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet')
|
||||||
+ (import 'base_operator_stack.jsonnet')
|
|
||||||
+ (import 'smtp_server.jsonnet')
|
+ (import 'smtp_server.jsonnet')
|
||||||
// Additional modules are loaded dynamically from vars.jsonnet
|
// Additional modules are loaded dynamically from vars.jsonnet
|
||||||
+ join_objects([module.file for module in vars.modules if module.enabled])
|
+ join_objects([module.file for module in vars.modules if module.enabled])
|
||||||
|
// Load K3s customized modules
|
||||||
|
+ join_objects([m for m in [import 'k3s-overrides.jsonnet'] if vars.k3s.enabled])
|
||||||
|
// Base stack is loaded at the end to override previous definitions
|
||||||
|
+ (import 'base_operator_stack.jsonnet')
|
||||||
// Load image versions last to override default from modules
|
// Load image versions last to override default from modules
|
||||||
+ (import 'image_sources_versions.jsonnet');
|
+ (import 'image_sources_versions.jsonnet');
|
||||||
|
|
||||||
|
|
||||||
// Generate core modules
|
// Generate core modules
|
||||||
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) }
|
{ ['00namespace-' + name]: kp.kubePrometheus[name] for name in std.objectFields(kp.kubePrometheus) }
|
||||||
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) }
|
{ ['0prometheus-operator-' + name]: kp.prometheusOperator[name] for name in std.objectFields(kp.prometheusOperator) }
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,4 @@
|
|||||||
apiVersion: apps/v1beta2
|
apiVersion: apps/v1
|
||||||
kind: Deployment
|
kind: Deployment
|
||||||
metadata:
|
metadata:
|
||||||
labels:
|
labels:
|
||||||
@ -51,12 +51,6 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/coredns-dashboard
|
- mountPath: /grafana-dashboard-definitions/0/coredns-dashboard
|
||||||
name: grafana-dashboard-coredns-dashboard
|
name: grafana-dashboard-coredns-dashboard
|
||||||
readOnly: false
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-cluster-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
readOnly: false
|
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-node-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
readOnly: false
|
|
||||||
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
- mountPath: /grafana-dashboard-definitions/0/k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@ -78,9 +72,6 @@ spec:
|
|||||||
- mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
|
- mountPath: /grafana-dashboard-definitions/0/kubernetes-cluster-dashboard
|
||||||
name: grafana-dashboard-kubernetes-cluster-dashboard
|
name: grafana-dashboard-kubernetes-cluster-dashboard
|
||||||
readOnly: false
|
readOnly: false
|
||||||
- mountPath: /grafana-dashboard-definitions/0/nodes
|
|
||||||
name: grafana-dashboard-nodes
|
|
||||||
readOnly: false
|
|
||||||
- mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
|
- mountPath: /grafana-dashboard-definitions/0/persistentvolumesusage
|
||||||
name: grafana-dashboard-persistentvolumesusage
|
name: grafana-dashboard-persistentvolumesusage
|
||||||
readOnly: false
|
readOnly: false
|
||||||
@ -132,12 +123,6 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-coredns-dashboard
|
name: grafana-dashboard-coredns-dashboard
|
||||||
name: grafana-dashboard-coredns-dashboard
|
name: grafana-dashboard-coredns-dashboard
|
||||||
- configMap:
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-cluster-rsrc-use
|
|
||||||
- configMap:
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
name: grafana-dashboard-k8s-node-rsrc-use
|
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
name: grafana-dashboard-k8s-resources-cluster
|
name: grafana-dashboard-k8s-resources-cluster
|
||||||
@ -159,9 +144,6 @@ spec:
|
|||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-kubernetes-cluster-dashboard
|
name: grafana-dashboard-kubernetes-cluster-dashboard
|
||||||
name: grafana-dashboard-kubernetes-cluster-dashboard
|
name: grafana-dashboard-kubernetes-cluster-dashboard
|
||||||
- configMap:
|
|
||||||
name: grafana-dashboard-nodes
|
|
||||||
name: grafana-dashboard-nodes
|
|
||||||
- configMap:
|
- configMap:
|
||||||
name: grafana-dashboard-persistentvolumesusage
|
name: grafana-dashboard-persistentvolumesusage
|
||||||
name: grafana-dashboard-persistentvolumesusage
|
name: grafana-dashboard-persistentvolumesusage
|
||||||
|
@ -153,177 +153,9 @@ spec:
|
|||||||
node_namespace_pod:kube_pod_info:
|
node_namespace_pod:kube_pod_info:
|
||||||
))
|
))
|
||||||
record: node:node_num_cpu:sum
|
record: node:node_num_cpu:sum
|
||||||
- expr: |
|
|
||||||
1 - avg(rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m]))
|
|
||||||
record: :node_cpu_utilisation:avg1m
|
|
||||||
- expr: |
|
|
||||||
1 - avg by (node) (
|
|
||||||
rate(node_cpu_seconds_total{job="node-exporter",mode="idle"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:)
|
|
||||||
record: node:node_cpu_utilisation:avg1m
|
|
||||||
- expr: |
|
|
||||||
node:node_cpu_utilisation:avg1m
|
|
||||||
*
|
|
||||||
node:node_num_cpu:sum
|
|
||||||
/
|
|
||||||
scalar(sum(node:node_num_cpu:sum))
|
|
||||||
record: node:cluster_cpu_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
sum(node_load1{job="node-exporter"})
|
|
||||||
/
|
|
||||||
sum(node:node_num_cpu:sum)
|
|
||||||
record: ':node_cpu_saturation_load1:'
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
node_load1{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
/
|
|
||||||
node:node_num_cpu:sum
|
|
||||||
record: 'node:node_cpu_saturation_load1:'
|
|
||||||
- expr: |
|
|
||||||
1 -
|
|
||||||
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
/
|
|
||||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
|
||||||
record: ':node_memory_utilisation:'
|
|
||||||
- expr: |
|
- expr: |
|
||||||
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
sum(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
||||||
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
record: :node_memory_MemFreeCachedBuffers_bytes:sum
|
||||||
- expr: |
|
|
||||||
sum(node_memory_MemTotal_bytes{job="node-exporter"})
|
|
||||||
record: :node_memory_MemTotal_bytes:sum
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_bytes_available:sum
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_bytes_total:sum
|
|
||||||
- expr: |
|
|
||||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
||||||
/
|
|
||||||
node:node_memory_bytes_total:sum
|
|
||||||
record: node:node_memory_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
(node:node_memory_bytes_total:sum - node:node_memory_bytes_available:sum)
|
|
||||||
/
|
|
||||||
scalar(sum(node:node_memory_bytes_total:sum))
|
|
||||||
record: node:cluster_memory_utilisation:ratio
|
|
||||||
- expr: |
|
|
||||||
1e3 * sum(
|
|
||||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
|
||||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
|
||||||
)
|
|
||||||
record: :node_memory_swap_io_bytes:sum_rate
|
|
||||||
- expr: |
|
|
||||||
1 -
|
|
||||||
sum by (node) (
|
|
||||||
(node_memory_MemFree_bytes{job="node-exporter"} + node_memory_Cached_bytes{job="node-exporter"} + node_memory_Buffers_bytes{job="node-exporter"})
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
/
|
|
||||||
sum by (node) (
|
|
||||||
node_memory_MemTotal_bytes{job="node-exporter"}
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: 'node:node_memory_utilisation:'
|
|
||||||
- expr: |
|
|
||||||
1 - (node:node_memory_bytes_available:sum / node:node_memory_bytes_total:sum)
|
|
||||||
record: 'node:node_memory_utilisation_2:'
|
|
||||||
- expr: |
|
|
||||||
1e3 * sum by (node) (
|
|
||||||
(rate(node_vmstat_pgpgin{job="node-exporter"}[1m])
|
|
||||||
+ rate(node_vmstat_pgpgout{job="node-exporter"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_memory_swap_io_bytes:sum_rate
|
|
||||||
- expr: |
|
|
||||||
avg(irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
||||||
record: :node_disk_utilisation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg by (node) (
|
|
||||||
irate(node_disk_io_time_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_disk_utilisation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg(irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]))
|
|
||||||
record: :node_disk_saturation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
avg by (node) (
|
|
||||||
irate(node_disk_io_time_weighted_seconds_total{job="node-exporter",device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m])
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_disk_saturation:avg_irate
|
|
||||||
- expr: |
|
|
||||||
max by (instance, namespace, pod, device) ((node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"}
|
|
||||||
- node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
/ node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
record: 'node:node_filesystem_usage:'
|
|
||||||
- expr: |
|
|
||||||
max by (instance, namespace, pod, device) (node_filesystem_avail_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"} / node_filesystem_size_bytes{fstype=~"ext[234]|btrfs|xfs|zfs"})
|
|
||||||
record: 'node:node_filesystem_avail:'
|
|
||||||
- expr: |
|
|
||||||
sum(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
|
||||||
sum(irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
record: :node_net_utilisation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(irate(node_network_receive_bytes_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
|
||||||
irate(node_network_transmit_bytes_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_net_utilisation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m])) +
|
|
||||||
sum(irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
record: :node_net_saturation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
sum by (node) (
|
|
||||||
(irate(node_network_receive_drop_total{job="node-exporter",device!~"veth.+"}[1m]) +
|
|
||||||
irate(node_network_transmit_drop_total{job="node-exporter",device!~"veth.+"}[1m]))
|
|
||||||
* on (namespace, pod) group_left(node)
|
|
||||||
node_namespace_pod:kube_pod_info:
|
|
||||||
)
|
|
||||||
record: node:node_net_saturation:sum_irate
|
|
||||||
- expr: |
|
|
||||||
max(
|
|
||||||
max(
|
|
||||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
|
||||||
) by (node, host_ip)
|
|
||||||
* on (host_ip) group_right (node)
|
|
||||||
label_replace(
|
|
||||||
(max(node_filesystem_files{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
||||||
)
|
|
||||||
) by (node)
|
|
||||||
record: 'node:node_inodes_total:'
|
|
||||||
- expr: |
|
|
||||||
max(
|
|
||||||
max(
|
|
||||||
kube_pod_info{job="kube-state-metrics", host_ip!=""}
|
|
||||||
) by (node, host_ip)
|
|
||||||
* on (host_ip) group_right (node)
|
|
||||||
label_replace(
|
|
||||||
(max(node_filesystem_files_free{job="node-exporter", mountpoint="/"}) by (instance)), "host_ip", "$1", "instance", "(.*):.*"
|
|
||||||
)
|
|
||||||
) by (node)
|
|
||||||
record: 'node:node_inodes_free:'
|
|
||||||
- name: kube-prometheus-node-recording.rules
|
- name: kube-prometheus-node-recording.rules
|
||||||
rules:
|
rules:
|
||||||
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
|
||||||
@ -446,17 +278,17 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping
|
||||||
expr: |
|
expr: |
|
||||||
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
rate(kube_pod_container_status_restarts_total{job="kube-state-metrics"}[15m]) * 60 * 5 > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePodNotReady
|
- alert: KubePodNotReady
|
||||||
annotations:
|
annotations:
|
||||||
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
message: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||||
state for longer than an hour.
|
state for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
|
||||||
expr: |
|
expr: |
|
||||||
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
|
sum by (namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
- alert: KubeDeploymentGenerationMismatch
|
||||||
@ -475,13 +307,13 @@ spec:
|
|||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
message: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has not
|
||||||
matched the expected number of replicas for longer than an hour.
|
matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
|
||||||
expr: |
|
expr: |
|
||||||
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
kube_deployment_spec_replicas{job="kube-state-metrics"}
|
||||||
!=
|
!=
|
||||||
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
kube_deployment_status_replicas_available{job="kube-state-metrics"}
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
- alert: KubeStatefulSetReplicasMismatch
|
||||||
@ -589,7 +421,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubejobfailed
|
||||||
expr: |
|
expr: |
|
||||||
kube_job_status_failed{job="kube-state-metrics"} > 0
|
kube_job_status_failed{job="kube-state-metrics"} > 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- name: kubernetes-resources
|
- name: kubernetes-resources
|
||||||
@ -723,7 +555,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodenotready
|
||||||
expr: |
|
expr: |
|
||||||
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
kube_node_status_condition{job="kube-state-metrics",condition="Ready",status="true"} == 0
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeVersionMismatch
|
- alert: KubeVersionMismatch
|
||||||
@ -733,7 +565,7 @@ spec:
|
|||||||
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeversionmismatch
|
||||||
expr: |
|
expr: |
|
||||||
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
count(count by (gitVersion) (label_replace(kubernetes_build_info{job!~"kube-dns|coredns"},"gitVersion","$1","gitVersion","(v[0-9]*.[0-9]*.[0-9]*).*"))) > 1
|
||||||
for: 1h
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeClientErrors
|
- alert: KubeClientErrors
|
||||||
@ -949,17 +781,6 @@ spec:
|
|||||||
for: 4h
|
for: 4h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: PrometheusTSDBWALCorruptions
|
|
||||||
annotations:
|
|
||||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} has detected
|
|
||||||
{{$value | humanize}} corruptions of the write-ahead log (WAL) over the
|
|
||||||
last 3h.
|
|
||||||
summary: Prometheus is detecting WAL corruptions.
|
|
||||||
expr: |
|
|
||||||
increase(tsdb_wal_corruptions_total{job="prometheus-k8s",namespace="monitoring"}[3h]) > 0
|
|
||||||
for: 4h
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: PrometheusNotIngestingSamples
|
- alert: PrometheusNotIngestingSamples
|
||||||
annotations:
|
annotations:
|
||||||
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
|
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} is not ingesting
|
||||||
@ -1054,7 +875,7 @@ spec:
|
|||||||
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
message: The configuration of the instances of the Alertmanager cluster `{{$labels.service}}`
|
||||||
are out of sync.
|
are out of sync.
|
||||||
expr: |
|
expr: |
|
||||||
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}, "service", "alertmanager-$1", "name", "(.*)") != 1
|
count_values("config_hash", alertmanager_config_hash{job="alertmanager-main",namespace="monitoring"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="prometheus-operator",namespace="monitoring",controller="alertmanager"}) by (name, job, namespace, controller), "service", "alertmanager-$1", "name", "(.*)") != 1
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
|
11
vars.jsonnet
11
vars.jsonnet
@ -28,15 +28,20 @@
|
|||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
|
||||||
|
k3s: {
|
||||||
|
enabled: false,
|
||||||
|
master_ip: '192.168.99.100'
|
||||||
|
},
|
||||||
|
|
||||||
|
// Domain suffix for the ingresses
|
||||||
|
suffixDomain: '192.168.99.100.nip.io',
|
||||||
|
|
||||||
// Setting these to false, defaults to emptyDirs
|
// Setting these to false, defaults to emptyDirs
|
||||||
enablePersistence: {
|
enablePersistence: {
|
||||||
prometheus: false,
|
prometheus: false,
|
||||||
grafana: false,
|
grafana: false,
|
||||||
},
|
},
|
||||||
|
|
||||||
// Domain suffix for the ingresses
|
|
||||||
suffixDomain: '192.168.99.100.nip.io',
|
|
||||||
|
|
||||||
// Grafana "from" email
|
// Grafana "from" email
|
||||||
grafana: {
|
grafana: {
|
||||||
from_address: 'myemail@gmail.com',
|
from_address: 'myemail@gmail.com',
|
||||||
|
Loading…
Reference in New Issue
Block a user