166 lines
5.6 KiB
YAML
166 lines
5.6 KiB
YAML
---
# yaml-language-server: $schema=https://schemas.tholinka.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
# Flux HelmRelease for kube-prometheus-stack: Prometheus operator, Prometheus,
# Alertmanager, node-exporter and kube-state-metrics, plus custom alert rules.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  interval: 1h
  chartRef:
    kind: OCIRepository
    name: kube-prometheus-stack
  # Extra chart values are merged in from a ConfigMap before the inline values.
  valuesFrom:
    - kind: ConfigMap
      name: flux-metrics-configmap
      valuesKey: flux-metrics.yaml
  values:
    crds:
      enabled: true
      upgradeJob:
        enabled: true
        forceConflicts: true
    cleanPrometheusOperatorObjectNames: true

    alertmanager:
      # Gateway API route exposing Alertmanager through the internal gateway.
      route:
        main:
          enabled: true
          hostnames:
            - alertmanager.tholinka.dev
          parentRefs:
            - name: envoy-internal
              namespace: network
          # Homepage dashboard discovery annotations.
          annotations:
            gethomepage.dev/enabled: 'true'
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Alertmanager
            gethomepage.dev/icon: alertmanager.svg
            gethomepage.dev/pod-selector: app.kubernetes.io/name=alertmanager
      alertmanagerSpec:
        # Alertmanager configuration is taken from the object named below
        # rather than from inline chart config.
        alertmanagerConfiguration:
          name: alertmanager
          global:
            resolveTimeout: 5m
        externalUrl: https://alertmanager.tholinka.dev
        resources:
          requests:
            cpu: 10m
            memory: 64Mi
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 1Gi

    kubeEtcd:
      service:
        selector:
          component: kube-apiserver  # etcd runs on control plane nodes

    kubeProxy:
      enabled: false

    prometheusOperator:
      resources:
        requests:
          cpu: 1m
          memory: 64Mi

    prometheus:
      # Gateway API route exposing Prometheus through the internal gateway.
      route:
        main:
          enabled: true
          hostnames:
            - prometheus.tholinka.dev
          parentRefs:
            - name: envoy-internal
              namespace: network
          # Homepage dashboard discovery and widget annotations.
          annotations:
            gethomepage.dev/enabled: 'true'
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Prometheus
            gethomepage.dev/icon: prometheus.svg
            gethomepage.dev/pod-selector: operator.prometheus.io/name=kube-prometheus-stack
            gethomepage.dev/widget.type: prometheus
            gethomepage.dev/widget.url: 'http://kube-prometheus-stack-prometheus.observability.svc.cluster.local.:9090'
      prometheusSpec:
        externalUrl: https://prometheus.tholinka.dev
        version: v2.55.1  # override 'unsupported Prometheus version' error for prompp
        # Runs the prompp image in place of the chart's default Prometheus image.
        image:
          registry: mirror.gcr.io
          repository: prompp/prompp
          tag: 0.7.5
        securityContext:
          runAsNonRoot: true
          runAsUser: 64535
          runAsGroup: 64535
          fsGroup: 64535
        # With these set to false the chart does not restrict the selectors to
        # this release's labels (see the chart's values documentation).
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        retention: 14d
        retentionSize: 50GB
        resources:
          requests:
            cpu: 800m
          limits:
            memory: 3000Mi
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 50Gi

    prometheus-node-exporter:
      fullnameOverride: node-exporter
      resources:
        requests:
          cpu: 10m
          memory: 32Mi

    kube-state-metrics:
      fullnameOverride: kube-state-metrics
      resources:
        requests:
          cpu: 10m
          memory: 64Mi

    # Grafana is not deployed by this release; dashboards are still rendered.
    grafana:
      enabled: false
      forceDeployDashboards: true

    # Custom alerting rules, one PrometheusRule per map key.
    additionalPrometheusRulesMap:
      dockerhub-rules:
        groups:
          - name: dockerhub
            rules:
              - alert: DockerhubRateLimitRisk
                annotations:
                  summary: Kubernetes cluster Dockerhub rate limit risk
                expr: count (count by (image) (time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30)) > 25
                labels:
                  severity: critical
      oom-rules:
        groups:
          - name: oom
            rules:
              - alert: OomKilled
                annotations:
                  summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
                expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
                labels:
                  severity: critical
      btrfs-rules:
        groups:
          - name: btrfs
            rules:
              - alert: BtrfsDeviceErrorsDetected
                annotations:
                  summary: BTRFS device {{$labels.device}} on {{$labels.kubernetes_node}}@{{$labels.instance}} detected an error of type {{$labels.type}}
                expr: node_btrfs_device_errors_total > 0
                labels:
                  severity: critical
              - alert: BtrfsDeviceAlmostFull
                annotations:
                  summary: BTRFS device {{$labels.device}}@{{$labels.instance}} has less than 1% of free space left
                expr: (node_btrfs_device_unused_bytes{job="node-exporter"} / node_btrfs_device_size_bytes{job="node-exporter"} * 100) < 1
                labels:
                  severity: critical