diff --git a/Makefile b/Makefile index 5d4b5c4..ca262f4 100644 --- a/Makefile +++ b/Makefile @@ -27,6 +27,10 @@ deploy: manifests sleep 40 kubectl apply -f ./manifests/ +ingressip: + @perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager-main.yaml manifests/ingress-prometheus-k8s.yaml manifests/ingress-grafana.yaml + @echo "Ingress IPs changed to [service].${IP}.nip.io" + teardown: kubectl delete -f ./manifests/ diff --git a/Readme.md b/Readme.md index d8a0b79..fa23aaa 100644 --- a/Readme.md +++ b/Readme.md @@ -55,6 +55,27 @@ $ until kubectl get servicemonitors --all-namespaces ; do date; sleep 1; echo "" $ kubectl apply -f manifests/ # This command sometimes may need to be done twice (to workaround a race condition). ``` +If you get an error from applying the manifests, run the `make deploy` or `kubectl apply -f manifests/` again. Sometimes the resources required to apply the CRDs are not deployed yet. + +## Customizing for K3s + +To have your [K3s](https://github.com/rancher/k3s) cluster and the monitoring stack on it, deploy K3s with `curl -sfL https://get.k3s.io | sh -`. + +Now to deploy the monitoring stack on your K3s cluster, there are three parameters to be configured on `vars.jsonnet`: + +1. Set `k3s.enabled` to `true`. +2. Change your K3s master node IP(your VM or host IP) on `k3s.master_ip`. +3. Edit `suffixDomain` to have your node IP with the `.nip.io` suffix. This will be your ingress URL suffix. + +After changing these values, run `make` to build the manifests and `k3s kubectl apply -f manifests/` to apply the stack to your cluster. In case of errors on some resources, re-run the command. + +If you already have the manifests and don't want to rebuilt from Jsonnet, just run `make ingressip IP="[IP-ADDRESS]"` to change the IP for the ingress routes for Grafana, Prometheus and Alertmanager. Re-apply the ingress manifests after this. + +Now you can open the applications: +* Grafana on [https://grafana.[your_node_ip].nip.io](https://grafana.[your_node_ip].nip.io), +* Prometheus on [https://prometheus.[your_node_ip].nip.io](https://prometheus.[your_node_ip].nip.io) +* Alertmanager on [https://alertmanager.[your_node_ip].nip.io](https://alertmanager.[your_node_ip].nip.io) + ## Customizing The content of this project consists of a set of jsonnet files making up a library to be consumed. diff --git a/k3s-overrides.jsonnet b/k3s-overrides.jsonnet index dd6be1d..8d8672e 100644 --- a/k3s-overrides.jsonnet +++ b/k3s-overrides.jsonnet @@ -1,7 +1,102 @@ local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; local vars = import 'vars.jsonnet'; +local service = k.core.v1.service; +local servicePort = k.core.v1.service.mixin.spec.portsType; { + prometheus+:: { + kubeControllerManagerPrometheusDiscoveryService: + service.new('kube-controller-manager-prometheus-discovery', { 'k8s-app': 'kube-controller-manager' }, servicePort.newNamed('http-metrics', 10252, 10252)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + service.mixin.spec.withClusterIp('None'), + kubeControllerManagerPrometheusDiscoveryEndpoints: + local endpoints = k.core.v1.endpoints; + local endpointSubset = endpoints.subsetsType; + local endpointPort = endpointSubset.portsType; + + local Port = endpointPort.new() + + endpointPort.withName('http-metrics') + + endpointPort.withPort(10252) + + endpointPort.withProtocol('TCP'); + + local subset = endpointSubset.new() + + endpointSubset.withAddresses([ + { ip: vars.k3s.master_ip }]) + + endpointSubset.withPorts(Port); + + endpoints.new() + + endpoints.mixin.metadata.withName('kube-controller-manager-prometheus-discovery') + + endpoints.mixin.metadata.withNamespace('kube-system') + + endpoints.mixin.metadata.withLabels({ 'k8s-app': 'kube-controller-manager' }) + + endpoints.withSubsets(subset), + + kubeSchedulerPrometheusDiscoveryService: + service.new('kube-scheduler-prometheus-discovery', { 'k8s-app': 'kube-scheduler' }, servicePort.newNamed('http-metrics', 10251, 10251)) + + service.mixin.metadata.withNamespace('kube-system') + + service.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + service.mixin.spec.withClusterIp('None'), + + kubeSchedulerPrometheusDiscoveryEndpoints: + local endpoints = k.core.v1.endpoints; + local endpointSubset = endpoints.subsetsType; + local endpointPort = endpointSubset.portsType; + + local Port = endpointPort.new() + + endpointPort.withName('http-metrics') + + endpointPort.withPort(10251) + + endpointPort.withProtocol('TCP'); + + local subset = endpointSubset.new() + + endpointSubset.withAddresses([ + { ip: vars.k3s.master_ip }]) + + endpointSubset.withPorts(Port); + + endpoints.new() + + endpoints.mixin.metadata.withName('kube-scheduler-prometheus-discovery') + + endpoints.mixin.metadata.withNamespace('kube-system') + + endpoints.mixin.metadata.withLabels({ 'k8s-app': 'kube-scheduler' }) + + endpoints.withSubsets(subset), + + serviceMonitorKubelet+: + { + spec+: { + endpoints: [ + { + port: 'https-metrics', + scheme: 'https', + interval: '30s', + honorLabels: true, + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + }, + { + port: 'https-metrics', + scheme: 'https', + path: '/metrics/cadvisor', + interval: '30s', + honorLabels: true, + tlsConfig: { + insecureSkipVerify: true, + }, + bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token', + metricRelabelings: [ + // Drop a bunch of metrics which are disabled but still sent, see + // https://github.com/google/cadvisor/issues/1925. + { + sourceLabels: ['__name__'], + regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)', + action: 'drop', + }, + ], + }, + ], + }, + }, + }, + nodeExporter+:: { daemonset+: { spec+: { @@ -24,6 +119,12 @@ local vars = import 'vars.jsonnet'; '--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)', '--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$', ], + ports: [ + { + containerPort: 9100, + name: 'http' + }], + } else c, @@ -135,7 +236,7 @@ local vars = import 'vars.jsonnet'; }, { port: 'http-self', - scheme: 'https', + scheme: 'http', interval: '30s', tlsConfig: { insecureSkipVerify: true, diff --git a/main.jsonnet b/main.jsonnet index 8621058..094a403 100644 --- a/main.jsonnet +++ b/main.jsonnet @@ -14,12 +14,12 @@ local kp = (import 'kube-prometheus/kube-prometheus.libsonnet') + (import 'kube-prometheus/kube-prometheus-kops-coredns.libsonnet') + (import 'kube-prometheus/kube-prometheus-kubeadm.libsonnet') // Use http Kubelet targets. Comment to revert to https - + join_objects([m for m in [import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet'] if vars.k3s == false]) + + (import 'kube-prometheus/kube-prometheus-insecure-kubelet.libsonnet') + (import 'smtp_server.jsonnet') // Additional modules are loaded dynamically from vars.jsonnet + join_objects([module.file for module in vars.modules if module.enabled]) // Load K3s customized modules - + join_objects([m for m in [import 'k3s-overrides.jsonnet'] if vars.k3s]) + + join_objects([m for m in [import 'k3s-overrides.jsonnet'] if vars.k3s.enabled]) // Base stack is loaded at the end to override previous definitions + (import 'base_operator_stack.jsonnet') // Load image versions last to override default from modules diff --git a/manifests/grafana-dashboardDefinitions.yaml b/manifests/grafana-dashboardDefinitions.yaml index f61c7ac..a23c662 100644 --- a/manifests/grafana-dashboardDefinitions.yaml +++ b/manifests/grafana-dashboardDefinitions.yaml @@ -29023,4 +29023,1786 @@ items: metadata: name: grafana-dashboard-statefulset namespace: monitoring +- apiVersion: v1 + data: + traefik-dashboard.json: |- + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Traefik dashboard prometheus", + "editable": true, + "gnetId": 5851, + "graphTooltip": 0, + "id": 20, + "iteration": 1549118220486, + "links": [ + + ], + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 21, + "panels": [ + + ], + "title": "General", + "type": "row" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#d44a3a", + "rgba(237, 129, 40, 0.89)", + "#299c46" + ], + "datasource": "prometheus", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 0, + "y": 1 + }, + "id": 13, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "count(kube_pod_status_ready{namespace=\"$namespace\",condition=\"true\",pod=~\"traefik.*\"})", + "format": "time_series", + "intervalFactor": 1, + "refId": "A" + } + ], + "thresholds": "", + "title": "Traefik instances", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "s", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 3, + "x": 3, + "y": 1 + }, + "id": 37, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "time() - max(process_start_time_seconds{job=~\"traefik.*\"})", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Uptime", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "current" + }, + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "prometheus", + "format": "ms", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 7, + "w": 4, + "x": 6, + "y": 1 + }, + "id": 39, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(traefik_entrypoint_request_duration_seconds_sum) / sum(traefik_entrypoint_requests_total) * 1000", + "format": "time_series", + "intervalFactor": 2, + "refId": "A" + } + ], + "thresholds": "", + "title": "Average response time", + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { + "Latency over 1 min": "rgb(9, 116, 190)", + "Latency over 5 min": "#bf1b00" + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 2, + "gridPos": { + "h": 7, + "w": 14, + "x": 10, + "y": 1 + }, + "id": 11, + "legend": { + "avg": true, + "current": false, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 2, + "links": [ + + ], + "nullPointMode": "null as zero", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + { + "alias": "Latency over 5 min", + "yaxis": 1 + } + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.$percentiles, sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\", code=\"200\",method=\"GET\"}[5m])) by (le))", + "format": "time_series", + "hide": false, + "intervalFactor": 1, + "legendFormat": "Latency over 1 min", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Global latency $percentiles th perc over 5 min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 8 + }, + "id": 29, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.$percentiles, rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",code=\"200\",method=\"GET\"}[5m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ instance }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Per node latency $percentiles th perc over 5 min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ms", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 15 + }, + "id": 17, + "panels": [ + + ], + "title": "Frontends (entrypoints)", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 16 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_entrypoint_requests_total[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "Total requests", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Total requests over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 7, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 19, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_entrypoint_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 23 + }, + "id": 22, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_entrypoint_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_entrypoint_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Code 200", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Apdex score (over 5 min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 30 + }, + "id": 24, + "panels": [ + + ], + "title": "Backends", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 7, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 31 + }, + "id": 25, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(traefik_backend_open_connections{namespace=\"$namespace\"}) by (method)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "{{ method }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Open Connections", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 31 + }, + "id": 26, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_request_duration_seconds_bucket{namespace=\"$namespace\",le=\"0.1\",code=\"200\"}[5m])) by (job) / sum(rate(traefik_backend_request_duration_seconds_count{namespace=\"$namespace\",code=\"200\"}[5m])) by (job)", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Code 200", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Apdex score (over 5 min)", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 38 + }, + "id": 15, + "panels": [ + + ], + "title": "HTTP Codes stats", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 39 + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=~\"2..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Status code 2xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 39 + }, + "id": 27, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code=~\"5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{method}} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Status code 5xx over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 48 + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sort": "avg", + "sortDesc": true, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\"}[1m])) by (backend)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ backend }}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Backend total requests over 1min per backend", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": true, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 48 + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sortDesc": false, + "total": false, + "values": true + }, + "lines": false, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": true, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(traefik_backend_requests_total{namespace=\"$namespace\",code!~\"2..|5..\"}[5m])) by (method, code)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{ method }} : {{code}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Others status code over 5min", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 57 + }, + "id": 35, + "panels": [ + + ], + "title": "Pods ressources", + "type": "row" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 58 + }, + "id": 31, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(container_memory_usage_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max memory used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_memory_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested memory usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_memory_bytes{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit memory usage", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Traefik max memory usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": false + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "prometheus", + "fill": 1, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 58 + }, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ + + ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "max(rate(container_cpu_usage_seconds_total{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"}[1m]))", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Max cpu used", + "refId": "A" + }, + { + "expr": "avg(kube_pod_container_resource_requests_cpu_cores{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Requested cpu usage", + "refId": "B" + }, + { + "expr": "avg(kube_pod_container_resource_limits_cpu_cores{namespace=\"$namespace\", pod_name=~\"traefik-ingress-controller.*\"})", + "format": "time_series", + "intervalFactor": 1, + "legendFormat": "Limit cpu usage", + "refId": "C" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeRegions": [ + + ], + "timeShift": null, + "title": "Traefik max CPU usage", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "schemaVersion": 16, + "style": "dark", + "tags": [ + "traefik" + ], + "templating": { + "list": [ + { + "allValue": null, + "current": { + "text": "kube-system", + "value": "kube-system" + }, + "datasource": "prometheus", + "definition": "", + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "namespace", + "options": [ + + ], + "query": "label_values(traefik_config_reloads_total, namespace)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + "selected": true, + "tags": [ + + ], + "text": "99", + "value": "99" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "percentiles", + "options": [ + { + "selected": false, + "text": "95", + "value": "95" + }, + { + "selected": true, + "text": "99", + "value": "99" + } + ], + "query": "95,99", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "", + "title": "Traefik", + "uid": "000000113", + "version": 3 + } + kind: ConfigMap + metadata: + name: grafana-dashboard-traefik-dashboard + namespace: monitoring kind: ConfigMapList diff --git a/manifests/grafana-deployment.yaml b/manifests/grafana-deployment.yaml index 225f654..14266df 100644 --- a/manifests/grafana-deployment.yaml +++ b/manifests/grafana-deployment.yaml @@ -105,6 +105,9 @@ spec: - mountPath: /grafana-dashboard-definitions/0/statefulset name: grafana-dashboard-statefulset readOnly: false + - mountPath: /grafana-dashboard-definitions/0/traefik-dashboard + name: grafana-dashboard-traefik-dashboard + readOnly: false - mountPath: /etc/grafana name: grafana-config readOnly: false @@ -186,6 +189,9 @@ spec: - configMap: name: grafana-dashboard-statefulset name: grafana-dashboard-statefulset + - configMap: + name: grafana-dashboard-traefik-dashboard + name: grafana-dashboard-traefik-dashboard - name: grafana-config secret: secretName: grafana-config diff --git a/manifests/kube-state-metrics-serviceMonitor.yaml b/manifests/kube-state-metrics-serviceMonitor.yaml index cef7d5b..6b4fc15 100644 --- a/manifests/kube-state-metrics-serviceMonitor.yaml +++ b/manifests/kube-state-metrics-serviceMonitor.yaml @@ -16,7 +16,7 @@ spec: insecureSkipVerify: true - interval: 30s port: http-self - scheme: https + scheme: http tlsConfig: insecureSkipVerify: true jobLabel: k8s-app diff --git a/manifests/node-exporter-daemonset.yaml b/manifests/node-exporter-daemonset.yaml index 672969b..d261e01 100644 --- a/manifests/node-exporter-daemonset.yaml +++ b/manifests/node-exporter-daemonset.yaml @@ -24,6 +24,9 @@ spec: - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ image: prom/node-exporter:v0.18.1 name: node-exporter + ports: + - containerPort: 9100 + name: http resources: limits: cpu: 250m diff --git a/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryEndpoints.yaml b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryEndpoints.yaml new file mode 100644 index 0000000..2739fe8 --- /dev/null +++ b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryEndpoints.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Endpoints +metadata: + labels: + k8s-app: kube-controller-manager + name: kube-controller-manager-prometheus-discovery + namespace: kube-system +subsets: +- addresses: + - ip: 192.168.99.100 + ports: + - name: http-metrics + port: 10252 + protocol: TCP diff --git a/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml index 9506973..7e5707b 100644 --- a/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml +++ b/manifests/prometheus-kubeControllerManagerPrometheusDiscoveryService.yaml @@ -12,4 +12,4 @@ spec: port: 10252 targetPort: 10252 selector: - component: kube-controller-manager + k8s-app: kube-controller-manager diff --git a/manifests/prometheus-kubeSchedulerPrometheusDiscoveryEndpoints.yaml b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryEndpoints.yaml new file mode 100644 index 0000000..8aafbac --- /dev/null +++ b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryEndpoints.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Endpoints +metadata: + labels: + k8s-app: kube-scheduler + name: kube-scheduler-prometheus-discovery + namespace: kube-system +subsets: +- addresses: + - ip: 192.168.99.100 + ports: + - name: http-metrics + port: 10251 + protocol: TCP diff --git a/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml index b4843c7..70f1551 100644 --- a/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml +++ b/manifests/prometheus-kubeSchedulerPrometheusDiscoveryService.yaml @@ -12,4 +12,4 @@ spec: port: 10251 targetPort: 10251 selector: - component: kube-scheduler + k8s-app: kube-scheduler diff --git a/manifests/traefikexporter-serviceMonitor.yaml b/manifests/traefikexporter-serviceMonitor.yaml new file mode 100644 index 0000000..fdd3333 --- /dev/null +++ b/manifests/traefikexporter-serviceMonitor.yaml @@ -0,0 +1,19 @@ +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + k8s-app: traefik-ingress-lb + name: traefik-ingress-lb + namespace: monitoring +spec: + endpoints: + - interval: 30s + port: admin + scheme: http + jobLabel: k8s-app + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + k8s-app: traefik-ingress-lb diff --git a/vars.jsonnet b/vars.jsonnet index 8bc80eb..fcb9347 100644 --- a/vars.jsonnet +++ b/vars.jsonnet @@ -18,7 +18,7 @@ }, { name: 'traefikExporter', - enabled: false, + enabled: true, file: import 'traefik.jsonnet', }, { @@ -28,7 +28,14 @@ }, ], - k3s: true, + k3s: { + enabled: true, + master_ip: '192.168.99.100' + + }, + + // Domain suffix for the ingresses + suffixDomain: '192.168.99.100.nip.io', // Setting these to false, defaults to emptyDirs enablePersistence: { @@ -36,9 +43,6 @@ grafana: false, }, - // Domain suffix for the ingresses - suffixDomain: '192.168.99.100.nip.io', - // Grafana "from" email grafana: { from_address: 'myemail@gmail.com',