---
# yaml-language-server: $schema=https://schemas.tholinka.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
# Flux HelmRelease for kube-prometheus-stack. The chart is referenced through
# an OCIRepository object of the same name and reconciled every hour.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  interval: 1h
  chartRef:
    kind: OCIRepository
    name: kube-prometheus-stack
  # Additional values are read from the `flux-metrics.yaml` key of the
  # `flux-metrics-configmap` ConfigMap, alongside the inline `values` below.
  valuesFrom:
    - kind: ConfigMap
      name: flux-metrics-configmap
      valuesKey: flux-metrics.yaml
  values:
    crds:
      enabled: true
      upgradeJob:
        enabled: true
        forceConflicts: true
    cleanPrometheusOperatorObjectNames: true

    alertmanager:
      # Route for alertmanager.laurivan.com, attached to the `envoy-internal`
      # parent in the `network` namespace.
      route:
        main:
          enabled: true
          hostnames:
            - alertmanager.laurivan.com
          parentRefs:
            - name: envoy-internal
              namespace: network
          # gethomepage.dev/* annotations (presumably consumed by a Homepage
          # dashboard for service discovery; confirm).
          annotations:
            gethomepage.dev/enabled: "true"
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Alertmanager
            gethomepage.dev/icon: alertmanager.svg
            gethomepage.dev/pod-selector: app.kubernetes.io/name=alertmanager
      alertmanagerSpec:
        # Alertmanager configuration is taken from the object named
        # `alertmanager`; only the global resolve timeout is set here.
        alertmanagerConfiguration:
          name: alertmanager
          global:
            resolveTimeout: 5m
        externalUrl: https://alertmanager.laurivan.com
        resources:
          requests:
            cpu: 10m
            memory: 64Mi
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 1Gi

    kubeEtcd:
      service:
        selector:
          component: kube-apiserver # etcd runs on control plane nodes

    kubeProxy:
      enabled: false

    prometheusOperator:
      resources:
        requests:
          cpu: 1m
          memory: 64Mi

    prometheus:
      # Route for prometheus.laurivan.com, attached to the `envoy-internal`
      # parent in the `network` namespace.
      route:
        main:
          enabled: true
          hostnames:
            - prometheus.laurivan.com
          parentRefs:
            - name: envoy-internal
              namespace: network
          annotations:
            gethomepage.dev/enabled: "true"
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Prometheus
            gethomepage.dev/icon: prometheus.svg
            gethomepage.dev/pod-selector: operator.prometheus.io/name=kube-prometheus-stack
            gethomepage.dev/widget.type: prometheus
            gethomepage.dev/widget.url: "http://kube-prometheus-stack-prometheus.observability.svc.cluster.local.:9090"
      prometheusSpec:
        externalUrl: https://prometheus.laurivan.com
        version: v2.55.1 # override 'unsupported Prometheus version' error for prompp
        # The server image is replaced with prompp/prompp, pulled through
        # mirror.gcr.io.
        image:
          registry: mirror.gcr.io
          repository: prompp/prompp
          tag: 0.7.5
        securityContext:
          runAsNonRoot: true
          runAsUser: 64535
          runAsGroup: 64535
          fsGroup: 64535
        # All "...SelectorNilUsesHelmValues" switches are turned off
        # (presumably so objects not labelled by this release are still
        # selected; confirm against the chart's values documentation).
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        retention: 14d
        # NOTE(review): retentionSize matches the 50Gi volume request below
        # with no visible headroom; confirm this is intended.
        retentionSize: 50GB
        resources:
          requests:
            cpu: 800m
          limits:
            memory: 3000Mi
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 50Gi

    prometheus-node-exporter:
      fullnameOverride: node-exporter
      resources:
        requests:
          cpu: 10m
          memory: 32Mi

    kube-state-metrics:
      fullnameOverride: kube-state-metrics
      resources:
        requests:
          cpu: 10m
          memory: 64Mi

    # Grafana itself is disabled, but dashboards are still force-deployed
    # (presumably for a Grafana installed elsewhere; confirm).
    grafana:
      enabled: false
      forceDeployDashboards: true

    # Extra alerting rules, one map entry per rule group. Every rule carries a
    # `summary` annotation and `severity: critical`.
    additionalPrometheusRulesMap:
      dockerhub-rules:
        groups:
          - name: dockerhub
            rules:
              - alert: DockerhubRateLimitRisk
                annotations:
                  summary: Kubernetes cluster Dockerhub rate limit risk
                # Fires when more than 25 distinct docker.io images have
                # containers last seen within the past 30 seconds.
                expr: >-
                  count (count by (image) (time() -
                  container_last_seen{image=~"(docker.io).*",container!=""} < 30)) > 25
                labels:
                  severity: critical
      oom-rules:
        groups:
          - name: oom
            rules:
              - alert: OomKilled
                annotations:
                  summary: >-
                    Container {{ $labels.container }} in pod
                    {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled
                    {{ $value }} times in the last 10 minutes.
                # At least one restart in the last 10 minutes, joined with
                # containers whose last termination reason was OOMKilled
                # throughout that window.
                expr: >-
                  (kube_pod_container_status_restarts_total -
                  kube_pod_container_status_restarts_total offset 10m >= 1)
                  and ignoring (reason)
                  min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
                labels:
                  severity: critical
      btrfs-rules:
        groups:
          - name: btrfs
            rules:
              - alert: BtrfsDeviceErrorsDetected
                annotations:
                  # Key is `summary`, the same annotation key the other rules
                  # in this file use.
                  summary: >-
                    BTRFS device {{$labels.device}} on
                    {{$labels.kubernetes_node}}@{{$labels.instance}} detected an error
                    of type {{$labels.type}}
                expr: node_btrfs_device_errors_total > 0
                labels:
                  severity: critical
              - alert: BtrfsDeviceAlmostFull
                annotations:
                  summary: BTRFS device {{$labels.device}}@{{$labels.instance}} has less than 1% of free space left
                # Unused bytes as a percentage of device size, below 1%.
                expr: >-
                  (node_btrfs_device_unused_bytes{job="node-exporter"} /
                  node_btrfs_device_size_bytes{job="node-exporter"} * 100) < 1
                labels:
                  severity: critical