Files
talos-cluster/kubernetes/apps/observability/kube-prometheus-stack/app/helmrelease.yaml

166 lines
5.6 KiB
YAML

---
# yaml-language-server: $schema=https://schemas.tholinka.dev/helm.toolkit.fluxcd.io/helmrelease_v2.json
# Flux HelmRelease that installs the kube-prometheus-stack chart (Prometheus Operator,
# Prometheus, Alertmanager, node-exporter, kube-state-metrics) plus a few custom alert rules.
apiVersion: helm.toolkit.fluxcd.io/v2
kind: HelmRelease
metadata:
  name: kube-prometheus-stack
spec:
  # Reconcile the release once per hour.
  interval: 1h
  # The chart is resolved through an OCIRepository object of the same name.
  chartRef:
    kind: OCIRepository
    name: kube-prometheus-stack
  # Additional values are read from the flux-metrics.yaml key of this ConfigMap;
  # the inline `values` below are applied on top of them.
  valuesFrom:
    - kind: ConfigMap
      name: flux-metrics-configmap
      valuesKey: flux-metrics.yaml
  values:
    # Install the CRDs with the chart and keep them current through the upgrade job.
    crds:
      enabled: true
      upgradeJob:
        enabled: true
        forceConflicts: true
    cleanPrometheusOperatorObjectNames: true

    alertmanager:
      # Route for the Alertmanager UI, attached to the envoy-internal parent in the
      # network namespace.
      route:
        main:
          enabled: true
          hostnames:
            - alertmanager.tholinka.dev
          parentRefs:
            - name: envoy-internal
              namespace: network
          # gethomepage.dev/* annotations; presumably consumed by a Homepage dashboard.
          annotations:
            gethomepage.dev/enabled: 'true'
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Alertmanager
            gethomepage.dev/icon: alertmanager.svg
            gethomepage.dev/pod-selector: app.kubernetes.io/name=alertmanager
      alertmanagerSpec:
        # Configuration is taken from the object named "alertmanager" instead of
        # being defined inline in these values.
        alertmanagerConfiguration:
          name: alertmanager
          global:
            resolveTimeout: 5m
        externalUrl: https://alertmanager.tholinka.dev
        resources:
          requests:
            cpu: 10m
            memory: 64Mi
        storage:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 1Gi

    kubeEtcd:
      service:
        selector:
          component: kube-apiserver  # etcd runs on control plane nodes

    kubeProxy:
      enabled: false

    prometheusOperator:
      resources:
        requests:
          cpu: 1m
          memory: 64Mi

    prometheus:
      # Route for the Prometheus UI, attached to the envoy-internal parent in the
      # network namespace.
      route:
        main:
          enabled: true
          hostnames:
            - prometheus.tholinka.dev
          parentRefs:
            - name: envoy-internal
              namespace: network
          # gethomepage.dev/* annotations; the widget URL targets the in-cluster
          # Prometheus service on port 9090.
          annotations:
            gethomepage.dev/enabled: 'true'
            gethomepage.dev/group: Observability
            gethomepage.dev/name: Prometheus
            gethomepage.dev/icon: prometheus.svg
            gethomepage.dev/pod-selector: operator.prometheus.io/name=kube-prometheus-stack
            gethomepage.dev/widget.type: prometheus
            gethomepage.dev/widget.url: 'http://kube-prometheus-stack-prometheus.observability.svc.cluster.local.:9090'
      prometheusSpec:
        externalUrl: https://prometheus.tholinka.dev
        version: v2.55.1  # override 'unsupported Prometheus version' error for prompp
        # The prompp image is run in place of the stock Prometheus image.
        image:
          registry: mirror.gcr.io
          repository: prompp/prompp
          tag: 0.7.5
        securityContext:
          runAsNonRoot: true
          runAsUser: 64535
          runAsGroup: 64535
          fsGroup: 64535
        # With these set to false the chart does not restrict the selectors to objects
        # labelled for this Helm release (per the chart's values documentation).
        podMonitorSelectorNilUsesHelmValues: false
        probeSelectorNilUsesHelmValues: false
        ruleSelectorNilUsesHelmValues: false
        scrapeConfigSelectorNilUsesHelmValues: false
        serviceMonitorSelectorNilUsesHelmValues: false
        retention: 14d
        # NOTE(review): retentionSize equals the 50Gi volume request below, leaving no
        # headroom for WAL/compaction; upstream Prometheus suggests ~80-85% of the disk.
        # Confirm this is intended for prompp.
        retentionSize: 50GB
        resources:
          requests:
            cpu: 800m
          limits:
            memory: 3000Mi
        storageSpec:
          volumeClaimTemplate:
            spec:
              storageClassName: ceph-block
              resources:
                requests:
                  storage: 50Gi

    prometheus-node-exporter:
      fullnameOverride: node-exporter
      resources:
        requests:
          cpu: 10m
          memory: 32Mi

    kube-state-metrics:
      fullnameOverride: kube-state-metrics
      resources:
        requests:
          cpu: 10m
          memory: 64Mi

    # Grafana is not deployed by this release; dashboards are still force-deployed.
    grafana:
      enabled: false
      forceDeployDashboards: true

    # Custom alerting rules, keyed by rule-set name.
    additionalPrometheusRulesMap:
      dockerhub-rules:
        groups:
          - name: dockerhub
            rules:
              # Fires when more than 25 distinct images matching the docker.io pattern
              # belong to containers seen within the last 30 seconds.
              - alert: DockerhubRateLimitRisk
                annotations:
                  summary: Kubernetes cluster Dockerhub rate limit risk
                expr: count (count by (image) (time() - container_last_seen{image=~"(docker.io).*",container!=""} < 30)) > 25
                labels:
                  severity: critical
      oom-rules:
        groups:
          - name: oom
            rules:
              # Fires when a container restarted at least once in the last 10 minutes and
              # its last termination reason was OOMKilled throughout that window.
              - alert: OomKilled
                annotations:
                  summary: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
                expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
                labels:
                  severity: critical
      btrfs-rules:
        groups:
          - name: btrfs
            rules:
              # Fires as soon as any BTRFS device error counter is non-zero.
              - alert: BtrfsDeviceErrorsDetected
                annotations:
                  summary: BTRFS device {{$labels.device}} on {{$labels.kubernetes_node}}@{{$labels.instance}} detected an error of type {{$labels.type}}
                expr: node_btrfs_device_errors_total > 0
                labels:
                  severity: critical
              # Fires when less than 1% of a BTRFS device's size is unused.
              - alert: BtrfsDeviceAlmostFull
                annotations:
                  summary: BTRFS device {{$labels.device}}@{{$labels.instance}} has less than 1% of free space left
                expr: (node_btrfs_device_unused_bytes{job="node-exporter"} / node_btrfs_device_size_bytes{job="node-exporter"} * 100) < 1
                labels:
                  severity: critical