Bump libraries and use kube-rbac-proxy in K3s

This commit is contained in:
Carlos de Paula 2020-03-18 11:13:47 -03:00
parent 10b82768c2
commit cdb23a0bcc
39 changed files with 10886 additions and 4623 deletions

View File

@ -50,8 +50,8 @@ ifeq (, $(shell which jsonnet))
endif endif
change_suffix: change_suffix:
@perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager-main.yaml manifests/ingress-prometheus-k8s.yaml manifests/ingress-grafana.yaml @perl -p -i -e 's/^(\s*)\-\ host:.*/\1- host: alertmanager.${IP}.nip.io/g' manifests/ingress-alertmanager.yaml manifests/ingress-prometheus.yaml manifests/ingress-grafana.yaml
@echo "Ingress IPs changed to [service].${IP}.nip.io" @echo "Ingress IPs changed to [service].${IP}.nip.io"
${K3S} kubectl apply -f manifests/ingress-alertmanager-main.yaml ${K3S} kubectl apply -f manifests/ingress-alertmanager.yaml
${K3S} kubectl apply -f manifests/ingress-grafana.yaml ${K3S} kubectl apply -f manifests/ingress-grafana.yaml
${K3S} kubectl apply -f manifests/ingress-prometheus-k8s.yaml ${K3S} kubectl apply -f manifests/ingress-prometheus-k8s.yaml

View File

@ -1,6 +1,6 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local vars = import 'vars.jsonnet';
local utils = import 'utils.libsonnet'; local utils = import 'utils.libsonnet';
local vars = import 'vars.jsonnet';
{ {
_config+:: { _config+:: {
@ -65,7 +65,7 @@ local utils = import 'utils.libsonnet';
//--------------------------------------- //---------------------------------------
prometheus+:: { prometheus+:: {
# Add option (from vars.yaml) to enable persistence // Add option (from vars.yaml) to enable persistence
local pvc = k.core.v1.persistentVolumeClaim, local pvc = k.core.v1.persistentVolumeClaim,
prometheus+: { prometheus+: {
spec+: { spec+: {
@ -177,7 +177,7 @@ local utils = import 'utils.libsonnet';
// secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) + // secret.new('basic-auth', { auth: std.base64(importstr 'auth') }) +
// secret.mixin.metadata.withNamespace($._config.namespace), // secret.mixin.metadata.withNamespace($._config.namespace),
} + if vars.UseProvidedCerts then { } + if vars.UseProvidedCerts then {
secret: secret:
utils.newTLSSecret('ingress-TLS-secret', $._config.namespace, vars.TLSCertificate, vars.TLSKey) utils.newTLSSecret('ingress-TLS-secret', $._config.namespace, vars.TLSCertificate, vars.TLSKey),
} else {}, } else {},
} }

View File

@ -8,8 +8,8 @@
"subdir": "Documentation/etcd-mixin" "subdir": "Documentation/etcd-mixin"
} }
}, },
"version": "63dd73c1869f1784f907b922f61571176a2802e8", "version": "07a74d61cb6c07965c5b594748dc999d1644862b",
"sum": "bkp18AxkOUYnVC15Gh9EoIi+mMAn0IT3hMzb8mlzpSw=" "sum": "Ko3qhNfC2vN/houLh6C0Ryacjv70gl0DVPGU/PQ4OD0="
}, },
{ {
"name": "grafana", "name": "grafana",
@ -19,8 +19,8 @@
"subdir": "grafana" "subdir": "grafana"
} }
}, },
"version": "539a90dbf63c812ad0194d8078dd776868a11c81", "version": "57b4365eacda291b82e0d55ba7eec573a8198dda",
"sum": "b8faWX1qqLGyN67sA36oRqYZ5HX+tHBRMPtrWRqIysE=" "sum": "92DWADwGjnCfpZaL7Q07C0GZayxBziGla/O03qWea34="
}, },
{ {
"name": "grafana-builder", "name": "grafana-builder",
@ -30,8 +30,8 @@
"subdir": "grafana-builder" "subdir": "grafana-builder"
} }
}, },
"version": "250bf5499d81e5e77e1e5ed2242c89ad27485aec", "version": "c19a92e586a6752f11745b47f309b13f02ef7147",
"sum": "ELsYwK+kGdzX1mee2Yy+/b2mdO4Y503BOCDkFzwmGbE=" "sum": "slxrtftVDiTlQK22ertdfrg4Epnq97gdrLI63ftUfaE="
}, },
{ {
"name": "grafonnet", "name": "grafonnet",
@ -41,8 +41,8 @@
"subdir": "grafonnet" "subdir": "grafonnet"
} }
}, },
"version": "cb9e43f59558ff6338a76ae1806a0fb4b70b1b16", "version": "7a932c9cfc6ccdb1efca9535f165e055949be42a",
"sum": "YIo2bziNlqzZtlnpLtoi9qa1aztcAX7j1UuKRjjEzVY=" "sum": "HbCbHRvgA9a6K5FlOAYOUnErDHnNPWOCYPvDFU++bQE="
}, },
{ {
"name": "ksonnet", "name": "ksonnet",
@ -63,8 +63,30 @@
"subdir": "jsonnet/kube-prometheus" "subdir": "jsonnet/kube-prometheus"
} }
}, },
"version": "ce5fe790ee9f1772ad52d935b320d545c8f88722", "version": "502f81b235a84484b55493af5cf96623ae37ef80",
"sum": "AerKgmCkb6FsMAjPHqExxYSr6C/uYYq5qV9pAZULaxY=" "sum": "weorIzfuzEqgRWW5mtt/p8cXMRhmilW20ppYruOpSZs="
},
{
"name": "kube-state-metrics",
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics"
}
},
"version": "fdd2ef120e5d9b56a29e7c3eeeda153acfb446ce",
"sum": "cJjGZaLBjcIGrLHZLjRPU9c3KL+ep9rZTb9dbALSKqA="
},
{
"name": "kube-state-metrics-mixin",
"source": {
"git": {
"remote": "https://github.com/kubernetes/kube-state-metrics",
"subdir": "jsonnet/kube-state-metrics-mixin"
}
},
"version": "fdd2ef120e5d9b56a29e7c3eeeda153acfb446ce",
"sum": "E1GGavnf9PCWBm4WVrxWnc0FIj72UcbcweqGioWrOdU="
}, },
{ {
"name": "kubernetes-mixin", "name": "kubernetes-mixin",
@ -74,8 +96,8 @@
"subdir": "" "subdir": ""
} }
}, },
"version": "23416282b86f75bff14bf94732c397764d7ba9db", "version": "16ff3841fea16a0f2151479ab67d8d34893759f3",
"sum": "NIE+/dQ5RelahRoLKLfxqAZEel8OTpDbDr1y05glb0E=" "sum": "UdI7A4jYc5PxmUHZBIGymx9Hk3eStqYSzXuUHot4oTQ="
}, },
{ {
"name": "node-mixin", "name": "node-mixin",
@ -85,8 +107,8 @@
"subdir": "docs/node-mixin" "subdir": "docs/node-mixin"
} }
}, },
"version": "c4c5f1f062b141d0111b4068a52eb5917048b3ea", "version": "0107bc794204f50d887898da60032da890637471",
"sum": "7vEamDTP9AApeiF4Zu9ZyXzDIs3rYHzwf9k7g8X+wsg=" "sum": "VKdF0zPMSCiuIuXWblSz2VOeBaXzQ7fp40vz9sxj+Bo="
}, },
{ {
"name": "prometheus", "name": "prometheus",
@ -96,8 +118,8 @@
"subdir": "documentation/prometheus-mixin" "subdir": "documentation/prometheus-mixin"
} }
}, },
"version": "23c0299d85bfeb5d9b59e994861553a25ca578e5", "version": "012161d90d6a8a6bb930b90601fb89ff6cc3ae60",
"sum": "wSDLAXS5Xzla9RFRE2IW5mRToeRFULHb7dSYYBDfEsM=" "sum": "5EUgr6Spr1zNR8Y2/NevjvEkGV9WMvKo6nEScNER1Lc="
}, },
{ {
"name": "prometheus-operator", "name": "prometheus-operator",
@ -107,8 +129,8 @@
"subdir": "jsonnet/prometheus-operator" "subdir": "jsonnet/prometheus-operator"
} }
}, },
"version": "8d44e0990230144177f97cf62ae4f43b1c4e3168", "version": "59bdf55453ba08b4ed7c271cb3c6627058945ed5",
"sum": "5U7/8MD3pF9O0YDTtUhg4vctkUBRVFxZxWUyhtNiBM8=" "sum": "qwMbUQkdPhAn9Sl4OVLgzmNOuOTnRLUmvv14I0unsa8="
}, },
{ {
"name": "promgrafonnet", "name": "promgrafonnet",
@ -118,8 +140,19 @@
"subdir": "lib/promgrafonnet" "subdir": "lib/promgrafonnet"
} }
}, },
"version": "23416282b86f75bff14bf94732c397764d7ba9db", "version": "16ff3841fea16a0f2151479ab67d8d34893759f3",
"sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc=" "sum": "VhgBM39yv0f4bKv8VfGg4FXkg573evGDRalip9ypKbc="
},
{
"name": "slo-libsonnet",
"source": {
"git": {
"remote": "https://github.com/metalmatze/slo-libsonnet",
"subdir": "slo-libsonnet"
}
},
"version": "5ddd7ffc39e7a54c9aca997c2c389a8046fab0ff",
"sum": "S7/+tnAkzVh8Li7sg7Hu4aeIQAWHCtxhRQ+k1OKjoQk="
} }
] ]
} }

View File

@ -1,6 +1,6 @@
local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet'; local k = import 'ksonnet/ksonnet.beta.4/k.libsonnet';
local vars = import 'vars.jsonnet';
local utils = import 'utils.libsonnet'; local utils = import 'utils.libsonnet';
local vars = import 'vars.jsonnet';
{ {
prometheus+:: { prometheus+:: {
@ -9,143 +9,20 @@ local utils = import 'utils.libsonnet';
kubeSchedulerPrometheusDiscoveryEndpoints: kubeSchedulerPrometheusDiscoveryEndpoints:
utils.newEndpoint('kube-scheduler-prometheus-discovery', 'kube-system', vars.k3s.master_ip, 'http-metrics', 10251), utils.newEndpoint('kube-scheduler-prometheus-discovery', 'kube-system', vars.k3s.master_ip, 'http-metrics', 10251),
serviceMonitorKubelet+:
{
spec+: {
endpoints: [
{
port: 'https-metrics',
scheme: 'https',
interval: '30s',
honorLabels: true,
tlsConfig: {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
},
{
port: 'https-metrics',
scheme: 'https',
path: '/metrics/cadvisor',
interval: '30s',
honorLabels: true,
tlsConfig: {
insecureSkipVerify: true,
},
bearerTokenFile: '/var/run/secrets/kubernetes.io/serviceaccount/token',
metricRelabelings: [
// Drop a bunch of metrics which are disabled but still sent, see
// https://github.com/google/cadvisor/issues/1925.
{
sourceLabels: ['__name__'],
regex: 'container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)',
action: 'drop',
},
],
},
],
},
},
}, },
nodeExporter+:: { // Temporary workaround until merge of https://github.com/coreos/kube-prometheus/pull/456
daemonset+: {
spec+: {
template+: {
spec+: {
containers:
std.filterMap(
function(c) std.startsWith(c.name, 'kube-rbac') != true,
function(c)
if std.startsWith(c.name, 'node-exporter') then
c {
args: [
'--web.listen-address=:' + $._config.nodeExporter.port,
'--path.procfs=/host/proc',
'--path.sysfs=/host/sys',
'--path.rootfs=/host/root',
// The following settings have been taken from
// https://github.com/prometheus/node_exporter/blob/0662673/collector/filesystem_linux.go#L30-L31
// Once node exporter is being released with those settings, this can be removed.
'--collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)',
'--collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$',
],
ports: [
{
containerPort: 9100,
name: 'http'
}],
}
else
c,
super.containers,
),
},
},
},
},
service+:
{
spec+: {
ports: [{
name: 'http',
port: 9100,
targetPort: 'http'
}]
}
},
serviceMonitor+:
{
spec+: {
endpoints: [
{
port: 'http',
scheme: 'http',
interval: '30s',
relabelings: [
{
action: 'replace',
regex: '(.*)',
replacment: '$1',
sourceLabels: ['__meta_kubernetes_pod_node_name'],
targetLabel: 'instance',
},
],
},
],
},
},
},
kubeStateMetrics+:: { kubeStateMetrics+:: {
deployment+: { deployment+: {
spec+: { spec+: {
template+: { template+: {
spec+: { spec+: {
containers: containers:
std.filterMap( std.map(
function(c) std.startsWith(c.name, 'kube-rbac') != true,
function(c) function(c)
if std.startsWith(c.name, 'kube-state-metrics') then if std.startsWith(c.name, 'kube-state-metrics') then
c { c {
args: [ image: $._config.imageRepos.kubeStateMetrics + ':' + $._config.versions.kubeStateMetrics,
'--port=8080',
'--telemetry-port=8081',
],
ports: [
{
containerPort: 8080,
name: 'http-main'
},
{
containerPort: 8081,
name: 'http-self'
}],
} }
else else
c, c,
@ -155,48 +32,5 @@ local utils = import 'utils.libsonnet';
}, },
}, },
}, },
service+:
{
spec+: {
ports: [{
name: 'http-main',
port: 8080,
targetPort: 'http-main'
},
{
name: 'http-self',
port: 8081,
targetPort: 'http-self'
}]
}
},
serviceMonitor+:
{
spec+: {
endpoints: [
{
port: 'http-main',
scheme: 'http',
interval: $._config.kubeStateMetrics.scrapeInterval,
scrapeTimeout: $._config.kubeStateMetrics.scrapeTimeout,
honorLabels: true,
tlsConfig: {
insecureSkipVerify: true,
},
},
{
port: 'http-self',
scheme: 'http',
interval: '30s',
tlsConfig: {
insecureSkipVerify: true,
},
},
],
},
},
}, },
} }

View File

@ -1,16 +1,21 @@
apiVersion: apiextensions.k8s.io/v1beta1 apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition kind: CustomResourceDefinition
metadata: metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.2.4
creationTimestamp: null creationTimestamp: null
name: podmonitors.monitoring.coreos.com name: podmonitors.monitoring.coreos.com
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
kind: PodMonitor kind: PodMonitor
listKind: PodMonitorList
plural: podmonitors plural: podmonitors
singular: podmonitor
scope: Namespaced scope: Namespaced
validation: validation:
openAPIV3Schema: openAPIV3Schema:
description: PodMonitor defines monitoring for a set of pods.
properties: properties:
apiVersion: apiVersion:
description: 'APIVersion defines the versioned schema of this representation description: 'APIVersion defines the versioned schema of this representation
@ -22,15 +27,18 @@ spec:
object represents. Servers may infer this from the endpoint the client object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string type: string
metadata:
type: object
spec: spec:
description: PodMonitorSpec contains specification parameters for a PodMonitor. description: Specification of desired Pod selection for target discovery
by Prometheus.
properties: properties:
jobLabel: jobLabel:
description: The label to use to retrieve the job name from. description: The label to use to retrieve the job name from.
type: string type: string
namespaceSelector: namespaceSelector:
description: NamespaceSelector is a selector for selecting either all description: Selector to select which namespaces the Endpoints objects
namespaces or a list of namespaces. are discovered from.
properties: properties:
any: any:
description: Boolean describing whether all namespaces are selected description: Boolean describing whether all namespaces are selected
@ -78,7 +86,7 @@ spec:
type: integer type: integer
regex: regex:
description: Regular expression against which the extracted description: Regular expression against which the extracted
value is matched. defailt is '(.*)' value is matched. Default is '(.*)'
type: string type: string
replacement: replacement:
description: Replacement value against which a regex replace description: Replacement value against which a regex replace
@ -105,6 +113,10 @@ spec:
type: object type: object
type: array type: array
params: params:
additionalProperties:
items:
type: string
type: array
description: Optional HTTP URL parameters description: Optional HTTP URL parameters
type: object type: object
path: path:
@ -138,7 +150,7 @@ spec:
type: integer type: integer
regex: regex:
description: Regular expression against which the extracted description: Regular expression against which the extracted
value is matched. defailt is '(.*)' value is matched. Default is '(.*)'
type: string type: string
replacement: replacement:
description: Replacement value against which a regex replace description: Replacement value against which a regex replace
@ -172,8 +184,11 @@ spec:
type: string type: string
targetPort: targetPort:
anyOf: anyOf:
- type: string
- type: integer - type: integer
- type: string
description: Name or number of the target port of the endpoint.
Mutually exclusive with port.
x-kubernetes-int-or-string: true
type: object type: object
type: array type: array
podTargetLabels: podTargetLabels:
@ -188,10 +203,7 @@ spec:
format: int64 format: int64
type: integer type: integer
selector: selector:
description: A label selector is a label query over a set of resources. description: Selector to select Pod objects.
The result of matchLabels and matchExpressions are ANDed. An empty
label selector matches all objects. A null label selector matches
no objects.
properties: properties:
matchExpressions: matchExpressions:
description: matchExpressions is a list of label selector requirements. description: matchExpressions is a list of label selector requirements.
@ -224,6 +236,8 @@ spec:
type: object type: object
type: array type: array
matchLabels: matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element {key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator is of matchExpressions, whose key field is "key", the operator is
@ -235,5 +249,17 @@ spec:
- podMetricsEndpoints - podMetricsEndpoints
- selector - selector
type: object type: object
required:
- spec
type: object type: object
version: v1 version: v1
versions:
- name: v1
served: true
storage: true
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []

View File

@ -1,16 +1,21 @@
apiVersion: apiextensions.k8s.io/v1beta1 apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition kind: CustomResourceDefinition
metadata: metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.2.4
creationTimestamp: null creationTimestamp: null
name: prometheusrules.monitoring.coreos.com name: prometheusrules.monitoring.coreos.com
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
kind: PrometheusRule kind: PrometheusRule
listKind: PrometheusRuleList
plural: prometheusrules plural: prometheusrules
singular: prometheusrule
scope: Namespaced scope: Namespaced
validation: validation:
openAPIV3Schema: openAPIV3Schema:
description: PrometheusRule defines alerting rules for a Prometheus instance
properties: properties:
apiVersion: apiVersion:
description: 'APIVersion defines the versioned schema of this representation description: 'APIVersion defines the versioned schema of this representation
@ -23,201 +28,24 @@ spec:
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string type: string
metadata: metadata:
description: ObjectMeta is metadata that all persisted resources must have,
which includes all objects users must create.
properties:
annotations:
description: 'Annotations is an unstructured key value map stored with
a resource that may be set by external tools to store and retrieve
arbitrary metadata. They are not queryable and should be preserved
when modifying objects. More info: http://kubernetes.io/docs/user-guide/annotations'
type: object
clusterName:
description: The name of the cluster which the object belongs to. This
is used to distinguish resources with same name and namespace in different
clusters. This field is not set anywhere right now and apiserver is
going to ignore it if set in create or update request.
type: string
creationTimestamp:
description: Time is a wrapper around time.Time which supports correct
marshaling to YAML and JSON. Wrappers are provided for many of the
factory methods that the time package offers.
format: date-time
type: string
deletionGracePeriodSeconds:
description: Number of seconds allowed for this object to gracefully
terminate before it will be removed from the system. Only set when
deletionTimestamp is also set. May only be shortened. Read-only.
format: int64
type: integer
deletionTimestamp:
description: Time is a wrapper around time.Time which supports correct
marshaling to YAML and JSON. Wrappers are provided for many of the
factory methods that the time package offers.
format: date-time
type: string
finalizers:
description: Must be empty before the object is deleted from the registry.
Each entry is an identifier for the responsible component that will
remove the entry from the list. If the deletionTimestamp of the object
is non-nil, entries in this list can only be removed.
items:
type: string
type: array
generateName:
description: |-
GenerateName is an optional prefix, used by the server, to generate a unique name ONLY IF the Name field has not been provided. If this field is used, the name returned to the client will be different than the name passed. This value will also be combined with a unique suffix. The provided value has the same validation rules as the Name field, and may be truncated by the length of the suffix required to make the value unique on the server.
If this field is specified and the generated name exists, the server will NOT return a 409 - instead, it will either return 201 Created or 500 with Reason ServerTimeout indicating a unique name could not be found in the time allotted, and the client should retry (optionally after the time indicated in the Retry-After header).
Applied only if Name is not specified. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#idempotency
type: string
generation:
description: A sequence number representing a specific generation of
the desired state. Populated by the system. Read-only.
format: int64
type: integer
labels:
description: 'Map of string keys and values that can be used to organize
and categorize (scope and select) objects. May match selectors of
replication controllers and services. More info: http://kubernetes.io/docs/user-guide/labels'
type: object
managedFields:
description: ManagedFields maps workflow-id and version to the set of
fields that are managed by that workflow. This is mostly for internal
housekeeping, and users typically shouldn't need to set or understand
this field. A workflow can be the user's name, a controller's name,
or the name of a specific apply path like "ci-cd". The set of fields
is always in the version that the workflow used when modifying the
object.
items:
description: ManagedFieldsEntry is a workflow-id, a FieldSet and the
group version of the resource that the fieldset applies to.
properties:
apiVersion:
description: APIVersion defines the version of this resource that
this field set applies to. The format is "group/version" just
like the top-level APIVersion field. It is necessary to track
the version of a field set because it cannot be automatically
converted.
type: string
fieldsType:
description: 'FieldsType is the discriminator for the different
fields format and version. There is currently only one possible
value: "FieldsV1"'
type: string
fieldsV1:
description: |-
FieldsV1 stores a set of fields in a data structure like a Trie, in JSON format.
Each key is either a '.' representing the field itself, and will always map to an empty set, or a string representing a sub-field or item. The string will follow one of these four formats: 'f:<name>', where <name> is the name of a field in a struct, or key in a map 'v:<value>', where <value> is the exact json formatted value of a list item 'i:<index>', where <index> is position of a item in a list 'k:<keys>', where <keys> is a map of a list item's key fields to their unique values If a key maps to an empty Fields value, the field that key represents is part of the set.
The exact format is defined in sigs.k8s.io/structured-merge-diff
type: object
manager:
description: Manager is an identifier of the workflow managing
these fields.
type: string
operation:
description: Operation is the type of operation which lead to
this ManagedFieldsEntry being created. The only valid values
for this field are 'Apply' and 'Update'.
type: string
time:
description: Time is a wrapper around time.Time which supports
correct marshaling to YAML and JSON. Wrappers are provided
for many of the factory methods that the time package offers.
format: date-time
type: string
type: object
type: array
name:
description: 'Name must be unique within a namespace. Is required when
creating resources, although some resources may allow a client to
request the generation of an appropriate name automatically. Name
is primarily intended for creation idempotence and configuration definition.
Cannot be updated. More info: http://kubernetes.io/docs/user-guide/identifiers#names'
type: string
namespace:
description: |-
Namespace defines the space within each name must be unique. An empty namespace is equivalent to the "default" namespace, but "default" is the canonical representation. Not all objects are required to be scoped to a namespace - the value of this field for those objects will be empty.
Must be a DNS_LABEL. Cannot be updated. More info: http://kubernetes.io/docs/user-guide/namespaces
type: string
ownerReferences:
description: List of objects depended by this object. If ALL objects
in the list have been deleted, this object will be garbage collected.
If this object is managed by a controller, then an entry in this list
will point to this controller, with the controller field set to true.
There cannot be more than one managing controller.
items:
description: OwnerReference contains enough information to let you
identify an owning object. An owning object must be in the same
namespace as the dependent, or be cluster-scoped, so there is no
namespace field.
properties:
apiVersion:
description: API version of the referent.
type: string
blockOwnerDeletion:
description: If true, AND if the owner has the "foregroundDeletion"
finalizer, then the owner cannot be deleted from the key-value
store until this reference is removed. Defaults to false. To
set this field, a user needs "delete" permission of the owner,
otherwise 422 (Unprocessable Entity) will be returned.
type: boolean
controller:
description: If true, this reference points to the managing controller.
type: boolean
kind:
description: 'Kind of the referent. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string
name:
description: 'Name of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#names'
type: string
uid:
description: 'UID of the referent. More info: http://kubernetes.io/docs/user-guide/identifiers#uids'
type: string
required:
- apiVersion
- kind
- name
- uid
type: object
type: array
resourceVersion:
description: |-
An opaque value that represents the internal version of this object that can be used by clients to determine when objects have changed. May be used for optimistic concurrency, change detection, and the watch operation on a resource or set of resources. Clients must treat these values as opaque and passed unmodified back to the server. They may only be valid for a particular resource or set of resources.
Populated by the system. Read-only. Value must be treated as opaque by clients and . More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency
type: string
selfLink:
description: |-
SelfLink is a URL representing this object. Populated by the system. Read-only.
DEPRECATED Kubernetes will stop propagating this field in 1.20 release and the field is planned to be removed in 1.21 release.
type: string
uid:
description: |-
UID is the unique in time and space value for this object. It is typically generated by the server on successful creation of a resource and is not allowed to change on PUT operations.
Populated by the system. Read-only. More info: http://kubernetes.io/docs/user-guide/identifiers#uids
type: string
type: object type: object
spec: spec:
description: PrometheusRuleSpec contains specification parameters for a description: Specification of desired alerting rule definitions for Prometheus.
Rule.
properties: properties:
groups: groups:
description: Content of Prometheus rule file description: Content of Prometheus rule file
items: items:
description: RuleGroup is a list of sequentially evaluated recording description: 'RuleGroup is a list of sequentially evaluated recording
and alerting rules. and alerting rules. Note: PartialResponseStrategy is only used by
ThanosRuler and will be ignored by Prometheus instances. Valid
values for this field are ''warn'' or ''abort''. More info: https://github.com/thanos-io/thanos/blob/master/docs/components/rule.md#partial-response'
properties: properties:
interval: interval:
type: string type: string
name: name:
type: string type: string
partial_response_strategy:
type: string
rules: rules:
items: items:
description: Rule describes an alerting or recording rule. description: Rule describes an alerting or recording rule.
@ -225,14 +53,19 @@ spec:
alert: alert:
type: string type: string
annotations: annotations:
additionalProperties:
type: string
type: object type: object
expr: expr:
anyOf: anyOf:
- type: string
- type: integer - type: integer
- type: string
x-kubernetes-int-or-string: true
for: for:
type: string type: string
labels: labels:
additionalProperties:
type: string
type: object type: object
record: record:
type: string type: string
@ -246,5 +79,17 @@ spec:
type: object type: object
type: array type: array
type: object type: object
required:
- spec
type: object type: object
version: v1 version: v1
versions:
- name: v1
served: true
storage: true
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []

View File

@ -1,16 +1,21 @@
apiVersion: apiextensions.k8s.io/v1beta1 apiVersion: apiextensions.k8s.io/v1beta1
kind: CustomResourceDefinition kind: CustomResourceDefinition
metadata: metadata:
annotations:
controller-gen.kubebuilder.io/version: v0.2.4
creationTimestamp: null creationTimestamp: null
name: servicemonitors.monitoring.coreos.com name: servicemonitors.monitoring.coreos.com
spec: spec:
group: monitoring.coreos.com group: monitoring.coreos.com
names: names:
kind: ServiceMonitor kind: ServiceMonitor
listKind: ServiceMonitorList
plural: servicemonitors plural: servicemonitors
singular: servicemonitor
scope: Namespaced scope: Namespaced
validation: validation:
openAPIV3Schema: openAPIV3Schema:
description: ServiceMonitor defines monitoring for a set of services.
properties: properties:
apiVersion: apiVersion:
description: 'APIVersion defines the versioned schema of this representation description: 'APIVersion defines the versioned schema of this representation
@ -22,9 +27,11 @@ spec:
object represents. Servers may infer this from the endpoint the client object represents. Servers may infer this from the endpoint the client
submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds' submits requests to. Cannot be updated. In CamelCase. More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds'
type: string type: string
metadata:
type: object
spec: spec:
description: ServiceMonitorSpec contains specification parameters for a description: Specification of desired Service selection for target discovery
ServiceMonitor. by Prometheus.
properties: properties:
endpoints: endpoints:
description: A list of endpoints allowed as part of this ServiceMonitor. description: A list of endpoints allowed as part of this ServiceMonitor.
@ -37,14 +44,16 @@ spec:
basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints' basic authentication More info: https://prometheus.io/docs/operating/configuration/#endpoints'
properties: properties:
password: password:
description: SecretKeySelector selects a key of a Secret. description: The secret in the service monitor namespace that
contains the password for authentication.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
be a valid secret key. be a valid secret key.
type: string type: string
name: name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string type: string
optional: optional:
description: Specify whether the Secret or its key must description: Specify whether the Secret or its key must
@ -54,14 +63,16 @@ spec:
- key - key
type: object type: object
username: username:
description: SecretKeySelector selects a key of a Secret. description: The secret in the service monitor namespace that
contains the username for authentication.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
be a valid secret key. be a valid secret key.
type: string type: string
name: name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string type: string
optional: optional:
description: Specify whether the Secret or its key must description: Specify whether the Secret or its key must
@ -75,14 +86,17 @@ spec:
description: File to read bearer token for scraping targets. description: File to read bearer token for scraping targets.
type: string type: string
bearerTokenSecret: bearerTokenSecret:
description: SecretKeySelector selects a key of a Secret. description: Secret to mount to read bearer token for scraping
targets. The secret needs to be in the same namespace as the
service monitor and accessible by the Prometheus Operator.
properties: properties:
key: key:
description: The key of the secret to select from. Must be description: The key of the secret to select from. Must be
a valid secret key. a valid secret key.
type: string type: string
name: name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string type: string
optional: optional:
description: Specify whether the Secret or its key must be description: Specify whether the Secret or its key must be
@ -121,7 +135,7 @@ spec:
type: integer type: integer
regex: regex:
description: Regular expression against which the extracted description: Regular expression against which the extracted
value is matched. defailt is '(.*)' value is matched. Default is '(.*)'
type: string type: string
replacement: replacement:
description: Replacement value against which a regex replace description: Replacement value against which a regex replace
@ -148,6 +162,10 @@ spec:
type: object type: object
type: array type: array
params: params:
additionalProperties:
items:
type: string
type: array
description: Optional HTTP URL parameters description: Optional HTTP URL parameters
type: object type: object
path: path:
@ -181,7 +199,7 @@ spec:
type: integer type: integer
regex: regex:
description: Regular expression against which the extracted description: Regular expression against which the extracted
value is matched. defailt is '(.*)' value is matched. Default is '(.*)'
type: string type: string
replacement: replacement:
description: Replacement value against which a regex replace description: Replacement value against which a regex replace
@ -215,17 +233,103 @@ spec:
type: string type: string
targetPort: targetPort:
anyOf: anyOf:
- type: string
- type: integer - type: integer
- type: string
description: Name or number of the target port of the endpoint.
Mutually exclusive with port.
x-kubernetes-int-or-string: true
tlsConfig: tlsConfig:
description: TLSConfig specifies TLS configuration parameters. description: TLS configuration to use when scraping the endpoint
properties: properties:
ca: {} ca:
description: Stuct containing the CA cert to use for the targets.
properties:
configMap:
description: ConfigMap containing data to use for the
targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or its
key must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
type: object
caFile: caFile:
description: Path to the CA cert in the Prometheus container description: Path to the CA cert in the Prometheus container
to use for the targets. to use for the targets.
type: string type: string
cert: {} cert:
description: Struct containing the client cert file for the
targets.
properties:
configMap:
description: ConfigMap containing data to use for the
targets.
properties:
key:
description: The key to select.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the ConfigMap or its
key must be defined
type: boolean
required:
- key
type: object
secret:
description: Secret containing data to use for the targets.
properties:
key:
description: The key of the secret to select from. Must
be a valid secret key.
type: string
name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind,
uid?'
type: string
optional:
description: Specify whether the Secret or its key
must be defined
type: boolean
required:
- key
type: object
type: object
certFile: certFile:
description: Path to the client cert file in the Prometheus description: Path to the client cert file in the Prometheus
container for the targets. container for the targets.
@ -238,14 +342,16 @@ spec:
container for the targets. container for the targets.
type: string type: string
keySecret: keySecret:
description: SecretKeySelector selects a key of a Secret. description: Secret containing the client key file for the
targets.
properties: properties:
key: key:
description: The key of the secret to select from. Must description: The key of the secret to select from. Must
be a valid secret key. be a valid secret key.
type: string type: string
name: name:
description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names' description: 'Name of the referent. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names
TODO: Add other useful fields. apiVersion, kind, uid?'
type: string type: string
optional: optional:
description: Specify whether the Secret or its key must description: Specify whether the Secret or its key must
@ -264,8 +370,8 @@ spec:
description: The label to use to retrieve the job name from. description: The label to use to retrieve the job name from.
type: string type: string
namespaceSelector: namespaceSelector:
description: NamespaceSelector is a selector for selecting either all description: Selector to select which namespaces the Endpoints objects
namespaces or a list of namespaces. are discovered from.
properties: properties:
any: any:
description: Boolean describing whether all namespaces are selected description: Boolean describing whether all namespaces are selected
@ -289,10 +395,7 @@ spec:
format: int64 format: int64
type: integer type: integer
selector: selector:
description: A label selector is a label query over a set of resources. description: Selector to select Endpoints objects.
The result of matchLabels and matchExpressions are ANDed. An empty
label selector matches all objects. A null label selector matches
no objects.
properties: properties:
matchExpressions: matchExpressions:
description: matchExpressions is a list of label selector requirements. description: matchExpressions is a list of label selector requirements.
@ -325,6 +428,8 @@ spec:
type: object type: object
type: array type: array
matchLabels: matchLabels:
additionalProperties:
type: string
description: matchLabels is a map of {key,value} pairs. A single description: matchLabels is a map of {key,value} pairs. A single
{key,value} in the matchLabels map is equivalent to an element {key,value} in the matchLabels map is equivalent to an element
of matchExpressions, whose key field is "key", the operator is of matchExpressions, whose key field is "key", the operator is
@ -342,5 +447,17 @@ spec:
- endpoints - endpoints
- selector - selector
type: object type: object
required:
- spec
type: object type: object
version: v1 version: v1
versions:
- name: v1
served: true
storage: true
status:
acceptedNames:
kind: ""
plural: ""
conditions: []
storedVersions: []

File diff suppressed because it is too large Load Diff

View File

@ -12,14 +12,30 @@ rules:
resources: resources:
- customresourcedefinitions - customresourcedefinitions
verbs: verbs:
- '*' - create
- apiGroups:
- apiextensions.k8s.io
resourceNames:
- alertmanagers.monitoring.coreos.com
- podmonitors.monitoring.coreos.com
- prometheuses.monitoring.coreos.com
- prometheusrules.monitoring.coreos.com
- servicemonitors.monitoring.coreos.com
- thanosrulers.monitoring.coreos.com
resources:
- customresourcedefinitions
verbs:
- get
- update
- apiGroups: - apiGroups:
- monitoring.coreos.com - monitoring.coreos.com
resources: resources:
- alertmanagers - alertmanagers
- alertmanagers/finalizers
- prometheuses - prometheuses
- prometheuses/finalizers - prometheuses/finalizers
- alertmanagers/finalizers - thanosrulers
- thanosrulers/finalizers
- servicemonitors - servicemonitors
- podmonitors - podmonitors
- prometheusrules - prometheusrules

View File

@ -20,7 +20,7 @@ spec:
- monitoring - monitoring
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
weight: 100 weight: 100
baseImage: prom/alertmanager image: prom/alertmanager:v0.20.0
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
replicas: 1 replicas: 1
@ -29,4 +29,4 @@ spec:
runAsNonRoot: true runAsNonRoot: true
runAsUser: 1000 runAsUser: 1000
serviceAccountName: alertmanager-main serviceAccountName: alertmanager-main
version: v0.18.0 version: v0.20.0

View File

@ -1,8 +1,44 @@
apiVersion: v1 apiVersion: v1
data: data: {}
alertmanager.yaml: Imdsb2JhbCI6CiAgInJlc29sdmVfdGltZW91dCI6ICI1bSIKInJlY2VpdmVycyI6Ci0gIm5hbWUiOiAibnVsbCIKInJvdXRlIjoKICAiZ3JvdXBfYnkiOgogIC0gImpvYiIKICAiZ3JvdXBfaW50ZXJ2YWwiOiAiNW0iCiAgImdyb3VwX3dhaXQiOiAiMzBzIgogICJyZWNlaXZlciI6ICJudWxsIgogICJyZXBlYXRfaW50ZXJ2YWwiOiAiMTJoIgogICJyb3V0ZXMiOgogIC0gIm1hdGNoIjoKICAgICAgImFsZXJ0bmFtZSI6ICJXYXRjaGRvZyIKICAgICJyZWNlaXZlciI6ICJudWxsIg==
kind: Secret kind: Secret
metadata: metadata:
name: alertmanager-main name: alertmanager-main
namespace: monitoring namespace: monitoring
stringData:
alertmanager.yaml: |-
"global":
"resolve_timeout": "5m"
"inhibit_rules":
- "equal":
- "namespace"
- "alertname"
"source_match":
"severity": "critical"
"target_match_re":
"severity": "warning|info"
- "equal":
- "namespace"
- "alertname"
"source_match":
"severity": "warning"
"target_match_re":
"severity": "info"
"receivers":
- "name": "Default"
- "name": "Watchdog"
- "name": "Critical"
"route":
"group_by":
- "namespace"
"group_interval": "5m"
"group_wait": "30s"
"receiver": "Default"
"repeat_interval": "12h"
"routes":
- "match":
"alertname": "Watchdog"
"receiver": "Watchdog"
- "match":
"severity": "critical"
"receiver": "Critical"
type: Opaque type: Opaque

File diff suppressed because it is too large Load Diff

View File

@ -5,7 +5,7 @@ data:
"apiVersion": 1, "apiVersion": 1,
"providers": [ "providers": [
{ {
"folder": "", "folder": "Default",
"name": "0", "name": "0",
"options": { "options": {
"path": "/grafana-dashboard-definitions/0" "path": "/grafana-dashboard-definitions/0"

View File

@ -16,7 +16,8 @@ spec:
app: grafana app: grafana
spec: spec:
containers: containers:
- image: grafana/grafana:6.3.2 - env: []
image: grafana/grafana:6.6.2
name: grafana name: grafana
ports: ports:
- containerPort: 3000 - containerPort: 3000
@ -99,9 +100,6 @@ spec:
- mountPath: /grafana-dashboard-definitions/0/pod-total - mountPath: /grafana-dashboard-definitions/0/pod-total
name: grafana-dashboard-pod-total name: grafana-dashboard-pod-total
readOnly: false readOnly: false
- mountPath: /grafana-dashboard-definitions/0/pods
name: grafana-dashboard-pods
readOnly: false
- mountPath: /grafana-dashboard-definitions/0/prometheus-dashboard - mountPath: /grafana-dashboard-definitions/0/prometheus-dashboard
name: grafana-dashboard-prometheus-dashboard name: grafana-dashboard-prometheus-dashboard
readOnly: false readOnly: false
@ -198,9 +196,6 @@ spec:
- configMap: - configMap:
name: grafana-dashboard-pod-total name: grafana-dashboard-pod-total
name: grafana-dashboard-pod-total name: grafana-dashboard-pod-total
- configMap:
name: grafana-dashboard-pods
name: grafana-dashboard-pods
- configMap: - configMap:
name: grafana-dashboard-prometheus-dashboard name: grafana-dashboard-prometheus-dashboard
name: grafana-dashboard-prometheus-dashboard name: grafana-dashboard-prometheus-dashboard

View File

@ -5,7 +5,7 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
rules: rules:
- host: alertmanager.192.168.99.100.nip.io - host: alertmanager.192.168.15.15.nip.io
http: http:
paths: paths:
- backend: - backend:
@ -14,4 +14,4 @@ spec:
path: / path: /
tls: tls:
- hosts: - hosts:
- alertmanager.192.168.99.100.nip.io - alertmanager.192.168.15.15.nip.io

View File

@ -5,7 +5,7 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
rules: rules:
- host: grafana.192.168.99.100.nip.io - host: grafana.192.168.15.15.nip.io
http: http:
paths: paths:
- backend: - backend:
@ -14,4 +14,4 @@ spec:
path: / path: /
tls: tls:
- hosts: - hosts:
- grafana.192.168.99.100.nip.io - grafana.192.168.15.15.nip.io

View File

@ -5,7 +5,7 @@ metadata:
namespace: monitoring namespace: monitoring
spec: spec:
rules: rules:
- host: prometheus.192.168.99.100.nip.io - host: prometheus.192.168.15.15.nip.io
http: http:
paths: paths:
- backend: - backend:
@ -14,4 +14,4 @@ spec:
path: / path: /
tls: tls:
- hosts: - hosts:
- prometheus.192.168.99.100.nip.io - prometheus.192.168.15.15.nip.io

View File

@ -1,6 +1,9 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole kind: ClusterRole
metadata: metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
rules: rules:
- apiGroups: - apiGroups:
@ -86,6 +89,29 @@ rules:
- storage.k8s.io - storage.k8s.io
resources: resources:
- storageclasses - storageclasses
- volumeattachments
verbs:
- list
- watch
- apiGroups:
- admissionregistration.k8s.io
resources:
- mutatingwebhookconfigurations
- validatingwebhookconfigurations
verbs:
- list
- watch
- apiGroups:
- networking.k8s.io
resources:
- networkpolicies
verbs:
- list
- watch
- apiGroups:
- coordination.k8s.io
resources:
- leases
verbs: verbs:
- list - list
- watch - watch

View File

@ -1,6 +1,9 @@
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
metadata: metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
roleRef: roleRef:
apiGroup: rbac.authorization.k8s.io apiGroup: rbac.authorization.k8s.io

View File

@ -2,20 +2,31 @@ apiVersion: apps/v1
kind: Deployment kind: Deployment
metadata: metadata:
labels: labels:
app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: monitoring
spec: spec:
replicas: 1 replicas: 1
selector: selector:
matchLabels: matchLabels:
app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
template: template:
metadata: metadata:
labels: labels:
app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
spec: spec:
containers: containers:
- args:
- --host=127.0.0.1
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: quay.io/coreos/kube-state-metrics:v1.9.5
name: kube-state-metrics
securityContext:
runAsUser: 65534
- args: - args:
- --logtostderr - --logtostderr
- --secure-listen-address=:8443 - --secure-listen-address=:8443
@ -26,13 +37,6 @@ spec:
ports: ports:
- containerPort: 8443 - containerPort: 8443
name: https-main name: https-main
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
- args: - args:
- --logtostderr - --logtostderr
- --secure-listen-address=:9443 - --secure-listen-address=:9443
@ -43,30 +47,6 @@ spec:
ports: ports:
- containerPort: 9443 - containerPort: 9443
name: https-self name: https-self
resources:
limits:
cpu: 20m
memory: 40Mi
requests:
cpu: 10m
memory: 20Mi
- args:
- --host=127.0.0.1
- --port=8081
- --telemetry-host=127.0.0.1
- --telemetry-port=8082
image: carlosedp/kube-state-metrics:v1.7.2
name: kube-state-metrics
resources:
limits:
cpu: 100m
memory: 150Mi
requests:
cpu: 100m
memory: 150Mi
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
securityContext:
runAsNonRoot: true
runAsUser: 65534
serviceAccountName: kube-state-metrics serviceAccountName: kube-state-metrics

View File

@ -1,30 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: kube-state-metrics
namespace: monitoring
rules:
- apiGroups:
- ""
resources:
- pods
verbs:
- get
- apiGroups:
- extensions
resourceNames:
- kube-state-metrics
resources:
- deployments
verbs:
- get
- update
- apiGroups:
- apps
resourceNames:
- kube-state-metrics
resources:
- deployments
verbs:
- get
- update

View File

@ -1,12 +0,0 @@
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: kube-state-metrics
namespace: monitoring
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: kube-state-metrics
subjects:
- kind: ServiceAccount
name: kube-state-metrics

View File

@ -2,7 +2,8 @@ apiVersion: v1
kind: Service kind: Service
metadata: metadata:
labels: labels:
k8s-app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: monitoring
spec: spec:
@ -15,4 +16,4 @@ spec:
port: 9443 port: 9443
targetPort: https-self targetPort: https-self
selector: selector:
app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics

View File

@ -1,5 +1,8 @@
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
metadata: metadata:
labels:
app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: monitoring

View File

@ -2,7 +2,8 @@ apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor kind: ServiceMonitor
metadata: metadata:
labels: labels:
k8s-app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics
app.kubernetes.io/version: 1.9.5
name: kube-state-metrics name: kube-state-metrics
namespace: monitoring namespace: monitoring
spec: spec:
@ -24,7 +25,7 @@ spec:
scheme: https scheme: https
tlsConfig: tlsConfig:
insecureSkipVerify: true insecureSkipVerify: true
jobLabel: k8s-app jobLabel: app.kubernetes.io/name
selector: selector:
matchLabels: matchLabels:
k8s-app: kube-state-metrics app.kubernetes.io/name: kube-state-metrics

View File

@ -20,6 +20,8 @@ spec:
- --path.procfs=/host/proc - --path.procfs=/host/proc
- --path.sysfs=/host/sys - --path.sysfs=/host/sys
- --path.rootfs=/host/root - --path.rootfs=/host/root
- --no-collector.wifi
- --no-collector.hwmon
- --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/) - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+)($|/)
- --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$ - --collector.filesystem.ignored-fs-types=^(autofs|binfmt_misc|cgroup|configfs|debugfs|devpts|devtmpfs|fusectl|hugetlbfs|mqueue|overlay|proc|procfs|pstore|rpc_pipefs|securityfs|sysfs|tracefs)$
image: prom/node-exporter:v0.18.1 image: prom/node-exporter:v0.18.1
@ -44,7 +46,7 @@ spec:
readOnly: true readOnly: true
- args: - args:
- --logtostderr - --logtostderr
- --secure-listen-address=$(IP):9100 - --secure-listen-address=[$(IP)]:9100
- --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256 - --tls-cipher-suites=TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256,TLS_RSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA256,TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA256
- --upstream=http://127.0.0.1:9100/ - --upstream=http://127.0.0.1:9100/
env: env:

View File

@ -8,7 +8,7 @@ metadata:
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s interval: 15s
port: https port: https
relabelings: relabelings:
- action: replace - action: replace

View File

@ -11,6 +11,7 @@ rules:
- metrics.k8s.io - metrics.k8s.io
resources: resources:
- pods - pods
- nodes
verbs: verbs:
- get - get
- list - list

View File

@ -3,8 +3,8 @@ data:
config.yaml: | config.yaml: |
resourceRules: resourceRules:
cpu: cpu:
containerQuery: sum(rate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>) containerQuery: sum(irate(container_cpu_usage_seconds_total{<<.LabelMatchers>>,container!="POD",container!="",pod!=""}[5m])) by (<<.GroupBy>>)
nodeQuery: sum(1 - rate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>) nodeQuery: sum(1 - irate(node_cpu_seconds_total{mode="idle"}[5m]) * on(namespace, pod) group_left(node) node_namespace_pod:kube_pod_info:{<<.LabelMatchers>>}) by (<<.GroupBy>>)
resources: resources:
overrides: overrides:
node: node:

View File

@ -25,10 +25,11 @@ spec:
- name: alertmanager-main - name: alertmanager-main
namespace: monitoring namespace: monitoring
port: web port: web
baseImage: prom/prometheus externalUrl: http://prometheus.192.168.15.15.nip.io
externalUrl: http://prometheus.192.168.99.100.nip.io image: prom/prometheus:v2.16.0
nodeSelector: nodeSelector:
kubernetes.io/os: linux kubernetes.io/os: linux
podMonitorNamespaceSelector: {}
podMonitorSelector: {} podMonitorSelector: {}
replicas: 1 replicas: 1
resources: resources:
@ -46,4 +47,4 @@ spec:
serviceAccountName: prometheus-k8s serviceAccountName: prometheus-k8s
serviceMonitorNamespaceSelector: {} serviceMonitorNamespaceSelector: {}
serviceMonitorSelector: {} serviceMonitorSelector: {}
version: v2.11.1 version: v2.16.0

View File

@ -40,10 +40,10 @@ spec:
rate(node_vmstat_pgmajfault{job="node-exporter"}[1m]) rate(node_vmstat_pgmajfault{job="node-exporter"}[1m])
record: instance:node_vmstat_pgmajfault:rate1m record: instance:node_vmstat_pgmajfault:rate1m
- expr: | - expr: |
rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) rate(node_disk_io_time_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_seconds:rate1m record: instance_device:node_disk_io_time_seconds:rate1m
- expr: | - expr: |
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+"}[1m]) rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|dasd.+"}[1m])
record: instance_device:node_disk_io_time_weighted_seconds:rate1m record: instance_device:node_disk_io_time_weighted_seconds:rate1m
- expr: | - expr: |
sum without (device) ( sum without (device) (
@ -65,96 +65,237 @@ spec:
rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m]) rate(node_network_transmit_drop_total{job="node-exporter", device!="lo"}[1m])
) )
record: instance:node_network_transmit_drop_excluding_lo:rate1m record: instance:node_network_transmit_drop_excluding_lo:rate1m
- name: kube-apiserver-error
rules:
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[5m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate5m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[30m]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate30m
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[2h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate2h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[6h]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate6h
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[1d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate1d
- expr: |
sum by (status_class) (
label_replace(
rate(apiserver_request_total{job="apiserver"}[3d]
), "status_class", "${1}xx", "code", "([0-9])..")
)
labels:
job: apiserver
record: status_class:apiserver_request_total:rate3d
- expr: |
sum(status_class:apiserver_request_total:rate5m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate5m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate5m
- expr: |
sum(status_class:apiserver_request_total:rate30m{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate30m{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate30m
- expr: |
sum(status_class:apiserver_request_total:rate1h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1h
- expr: |
sum(status_class:apiserver_request_total:rate2h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate2h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate2h
- expr: |
sum(status_class:apiserver_request_total:rate6h{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate6h{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate6h
- expr: |
sum(status_class:apiserver_request_total:rate1d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate1d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate1d
- expr: |
sum(status_class:apiserver_request_total:rate3d{job="apiserver",status_class="5xx"})
/
sum(status_class:apiserver_request_total:rate3d{job="apiserver"})
labels:
job: apiserver
record: status_class_5xx:apiserver_request_total:ratio_rate3d
- name: kube-apiserver.rules - name: kube-apiserver.rules
rules: rules:
- expr: | - expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) sum(rate(apiserver_request_duration_seconds_sum{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
/
sum(rate(apiserver_request_duration_seconds_count{subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod)
record: cluster:apiserver_request_duration_seconds:mean5m
- expr: |
histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels: labels:
quantile: "0.99" quantile: "0.99"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: | - expr: |
histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) histogram_quantile(0.9, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels: labels:
quantile: "0.9" quantile: "0.9"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- expr: | - expr: |
histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver"}[5m])) without(instance, pod)) histogram_quantile(0.5, sum(rate(apiserver_request_duration_seconds_bucket{job="apiserver",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|DELETECOLLECTION|PROXY|CONNECT"}[5m])) without(instance, pod))
labels: labels:
quantile: "0.5" quantile: "0.5"
record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile record: cluster_quantile:apiserver_request_duration_seconds:histogram_quantile
- name: k8s.rules - name: k8s.rules
rules: rules:
- expr: | - expr: |
sum(rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m])) by (namespace) sum(rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])) by (namespace)
record: namespace:container_cpu_usage_seconds_total:sum_rate record: namespace:container_cpu_usage_seconds_total:sum_rate
- expr: | - expr: |
sum by (namespace, pod, container) ( sum by (cluster, namespace, pod, container) (
rate(container_cpu_usage_seconds_total{job="kubelet", image!="", container!="POD"}[5m]) rate(container_cpu_usage_seconds_total{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}[5m])
) * on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) ) * on (cluster, namespace, pod) group_left(node) topk by (cluster, namespace, pod) (
1, max by(cluster, namespace, pod, node) (kube_pod_info)
)
record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate record: node_namespace_pod_container:container_cpu_usage_seconds_total:sum_rate
- expr: | - expr: |
container_memory_working_set_bytes{job="kubelet", image!=""} container_memory_working_set_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
)
record: node_namespace_pod_container:container_memory_working_set_bytes record: node_namespace_pod_container:container_memory_working_set_bytes
- expr: | - expr: |
container_memory_rss{job="kubelet", image!=""} container_memory_rss{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
)
record: node_namespace_pod_container:container_memory_rss record: node_namespace_pod_container:container_memory_rss
- expr: | - expr: |
container_memory_cache{job="kubelet", image!=""} container_memory_cache{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
)
record: node_namespace_pod_container:container_memory_cache record: node_namespace_pod_container:container_memory_cache
- expr: | - expr: |
container_memory_swap{job="kubelet", image!=""} container_memory_swap{job="kubelet", metrics_path="/metrics/cadvisor", image!=""}
* on (namespace, pod) group_left(node) max by(namespace, pod, node) (kube_pod_info) * on (namespace, pod) group_left(node) topk by(namespace, pod) (1,
max by(namespace, pod, node) (kube_pod_info)
)
record: node_namespace_pod_container:container_memory_swap record: node_namespace_pod_container:container_memory_swap
- expr: | - expr: |
sum(container_memory_usage_bytes{job="kubelet", image!="", container!="POD"}) by (namespace) sum(container_memory_usage_bytes{job="kubelet", metrics_path="/metrics/cadvisor", image!="", container!="POD"}) by (namespace)
record: namespace:container_memory_usage_bytes:sum record: namespace:container_memory_usage_bytes:sum
- expr: | - expr: |
sum by (namespace, label_name) ( sum by (namespace) (
sum(kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod) sum by (namespace, pod) (
* on (namespace, pod) max by (namespace, pod, container) (
group_left(label_name) kube_pod_labels{job="kube-state-metrics"} kube_pod_container_resource_requests_memory_bytes{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
) )
record: namespace:kube_pod_container_resource_requests_memory_bytes:sum record: namespace:kube_pod_container_resource_requests_memory_bytes:sum
- expr: | - expr: |
sum by (namespace, label_name) ( sum by (namespace) (
sum(kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"} * on (endpoint, instance, job, namespace, pod, service) group_left(phase) (kube_pod_status_phase{phase=~"Pending|Running"} == 1)) by (namespace, pod) sum by (namespace, pod) (
* on (namespace, pod) max by (namespace, pod, container) (
group_left(label_name) kube_pod_labels{job="kube-state-metrics"} kube_pod_container_resource_requests_cpu_cores{job="kube-state-metrics"}
) * on(namespace, pod) group_left() max by (namespace, pod) (
kube_pod_status_phase{phase=~"Pending|Running"} == 1
)
)
) )
record: namespace:kube_pod_container_resource_requests_cpu_cores:sum record: namespace:kube_pod_container_resource_requests_cpu_cores:sum
- expr: | - expr: |
sum( max by (cluster, namespace, workload, pod) (
label_replace( label_replace(
label_replace( label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"}, kube_pod_owner{job="kube-state-metrics", owner_kind="ReplicaSet"},
"replicaset", "$1", "owner_name", "(.*)" "replicaset", "$1", "owner_name", "(.*)"
) * on(replicaset, namespace) group_left(owner_name) kube_replicaset_owner{job="kube-state-metrics"}, ) * on(replicaset, namespace) group_left(owner_name) topk by(replicaset, namespace) (
1, max by (replicaset, namespace, owner_name) (
kube_replicaset_owner{job="kube-state-metrics"}
)
),
"workload", "$1", "owner_name", "(.*)" "workload", "$1", "owner_name", "(.*)"
) )
) by (namespace, workload, pod) )
labels: labels:
workload_type: deployment workload_type: deployment
record: mixin_pod_workload record: mixin_pod_workload
- expr: | - expr: |
sum( max by (cluster, namespace, workload, pod) (
label_replace( label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"}, kube_pod_owner{job="kube-state-metrics", owner_kind="DaemonSet"},
"workload", "$1", "owner_name", "(.*)" "workload", "$1", "owner_name", "(.*)"
) )
) by (namespace, workload, pod) )
labels: labels:
workload_type: daemonset workload_type: daemonset
record: mixin_pod_workload record: mixin_pod_workload
- expr: | - expr: |
sum( max by (cluster, namespace, workload, pod) (
label_replace( label_replace(
kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"}, kube_pod_owner{job="kube-state-metrics", owner_kind="StatefulSet"},
"workload", "$1", "owner_name", "(.*)" "workload", "$1", "owner_name", "(.*)"
) )
) by (namespace, workload, pod) )
labels: labels:
workload_type: statefulset workload_type: statefulset
record: mixin_pod_workload record: mixin_pod_workload
@ -207,13 +348,17 @@ spec:
record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile record: cluster_quantile:scheduler_binding_duration_seconds:histogram_quantile
- name: node.rules - name: node.rules
rules: rules:
- expr: sum(min(kube_pod_info) by (node)) - expr: |
sum(min(kube_pod_info) by (cluster, node))
record: ':kube_pod_info_node_count:' record: ':kube_pod_info_node_count:'
- expr: | - expr: |
max(label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")) by (node, namespace, pod) topk by(namespace, pod) (1,
max by (node, namespace, pod) (
label_replace(kube_pod_info{job="kube-state-metrics"}, "pod", "$1", "pod", "(.*)")
))
record: 'node_namespace_pod:kube_pod_info:' record: 'node_namespace_pod:kube_pod_info:'
- expr: | - expr: |
count by (node) (sum by (node, cpu) ( count by (cluster, node) (sum by (node, cpu) (
node_cpu_seconds_total{job="node-exporter"} node_cpu_seconds_total{job="node-exporter"}
* on (namespace, pod) group_left(node) * on (namespace, pod) group_left(node)
node_namespace_pod:kube_pod_info: node_namespace_pod:kube_pod_info:
@ -228,8 +373,25 @@ spec:
node_memory_MemFree_bytes{job="node-exporter"} + node_memory_MemFree_bytes{job="node-exporter"} +
node_memory_Slab_bytes{job="node-exporter"} node_memory_Slab_bytes{job="node-exporter"}
) )
) ) by (cluster)
record: :node_memory_MemAvailable_bytes:sum record: :node_memory_MemAvailable_bytes:sum
- name: kubelet.rules
rules:
- expr: |
histogram_quantile(0.99, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.99"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.9, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.9"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- expr: |
histogram_quantile(0.5, sum(rate(kubelet_pleg_relist_duration_seconds_bucket[5m])) by (instance, le) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"})
labels:
quantile: "0.5"
record: node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile
- name: kube-prometheus-node-recording.rules - name: kube-prometheus-node-recording.rules
rules: rules:
- expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY - expr: sum(rate(node_cpu_seconds_total{mode!="idle",mode!="iowait"}[3m])) BY
@ -251,6 +413,42 @@ spec:
- expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total) - expr: cluster:node_cpu_seconds_total:rate5m / count(sum(node_cpu_seconds_total)
BY (instance, cpu)) BY (instance, cpu))
record: cluster:node_cpu:ratio record: cluster:node_cpu:ratio
- name: kube-prometheus-general.rules
rules:
- expr: count without(instance, pod, node) (up == 1)
record: count:up1
- expr: count without(instance, pod, node) (up == 0)
record: count:up0
- name: kube-state-metrics
rules:
- alert: KubeStateMetricsListErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
list operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricslisterrors
expr: |
(sum(rate(kube_state_metrics_list_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_list_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- alert: KubeStateMetricsWatchErrors
annotations:
message: kube-state-metrics is experiencing errors at an elevated rate in
watch operations. This is likely causing it to not be able to expose metrics
about Kubernetes objects correctly or at all.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatemetricswatcherrors
expr: |
(sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics",result="error"}[5m]))
/
sum(rate(kube_state_metrics_watch_total{job="kube-state-metrics"}[5m])))
> 0.01
for: 15m
labels:
severity: critical
- name: node-exporter - name: node-exporter
rules: rules:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
@ -280,7 +478,7 @@ spec:
summary: Filesystem is predicted to run out of space within the next 4 hours. summary: Filesystem is predicted to run out of space within the next 4 hours.
expr: | expr: |
( (
node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 20 node_filesystem_avail_bytes{job="node-exporter",fstype!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!=""} * 100 < 15
and and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0 predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!=""}[6h], 4*60*60) < 0
and and
@ -425,7 +623,7 @@ spec:
state for longer than 15 minutes. state for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodnotready
expr: | expr: |
sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Failed|Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0 sum by (namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics", phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) > 0
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -448,9 +646,15 @@ spec:
matched the expected number of replicas for longer than 15 minutes. matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedeploymentreplicasmismatch
expr: | expr: |
kube_deployment_spec_replicas{job="kube-state-metrics"} (
!= kube_deployment_spec_replicas{job="kube-state-metrics"}
kube_deployment_status_replicas_available{job="kube-state-metrics"} !=
kube_deployment_status_replicas_available{job="kube-state-metrics"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics"}[5m])
==
0
)
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -460,9 +664,15 @@ spec:
not matched the expected number of replicas for longer than 15 minutes. not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubestatefulsetreplicasmismatch
expr: | expr: |
kube_statefulset_status_replicas_ready{job="kube-state-metrics"} (
!= kube_statefulset_status_replicas_ready{job="kube-state-metrics"}
kube_statefulset_status_replicas{job="kube-state-metrics"} !=
kube_statefulset_status_replicas{job="kube-state-metrics"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics"}[5m])
==
0
)
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -606,7 +816,7 @@ spec:
tolerate node failure. tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubecpuovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum) sum(namespace:kube_pod_container_resource_requests_cpu_cores:sum{})
/ /
sum(kube_node_status_allocatable_cpu_cores) sum(kube_node_status_allocatable_cpu_cores)
> >
@ -620,7 +830,7 @@ spec:
tolerate node failure. tolerate node failure.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubememovercommit
expr: | expr: |
sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum) sum(namespace:kube_pod_container_resource_requests_memory_bytes:sum{})
/ /
sum(kube_node_status_allocatable_memory_bytes) sum(kube_node_status_allocatable_memory_bytes)
> >
@ -690,9 +900,9 @@ spec:
}} free. }} free.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumeusagecritical
expr: | expr: |
kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/ /
kubelet_volume_stats_capacity_bytes{job="kubelet"} kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
< 0.03 < 0.03
for: 1m for: 1m
labels: labels:
@ -705,12 +915,12 @@ spec:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepersistentvolumefullinfourdays
expr: | expr: |
( (
kubelet_volume_stats_available_bytes{job="kubelet"} kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}
/ /
kubelet_volume_stats_capacity_bytes{job="kubelet"} kubelet_volume_stats_capacity_bytes{job="kubelet", metrics_path="/metrics"}
) < 0.15 ) < 0.15
and and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet"}[6h], 4 * 24 * 3600) < 0 predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h for: 1h
labels: labels:
severity: critical severity: critical
@ -749,16 +959,72 @@ spec:
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
- name: kube-apiserver-error-alerts
rules:
- alert: ErrorBudgetBurn
annotations:
message: 'High requests error budget burn for job=apiserver (current value:
{{ $value }})'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1h{job="apiserver"} > (14.4*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate5m{job="apiserver"} > (14.4*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (6*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate30m{job="apiserver"} > (6*0.010000)
)
labels:
job: apiserver
severity: critical
- alert: ErrorBudgetBurn
annotations:
message: 'High requests error budget burn for job=apiserver (current value:
{{ $value }})'
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-errorbudgetburn
expr: |
(
status_class_5xx:apiserver_request_total:ratio_rate1d{job="apiserver"} > (3*0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate2h{job="apiserver"} > (3*0.010000)
)
or
(
status_class_5xx:apiserver_request_total:ratio_rate3d{job="apiserver"} > (0.010000)
and
status_class_5xx:apiserver_request_total:ratio_rate6h{job="apiserver"} > (0.010000)
)
labels:
job: apiserver
severity: warning
- name: kubernetes-system-apiserver - name: kubernetes-system-apiserver
rules: rules:
- alert: KubeAPILatencyHigh - alert: KubeAPILatencyHigh
annotations: annotations:
message: The API server has a 99th percentile latency of {{ $value }} seconds message: The API server has an abnormal latency of {{ $value }} seconds for
for {{ $labels.verb }} {{ $labels.resource }}. {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: | expr: |
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 1 (
for: 10m cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"}
>
on (verb) group_left()
(
avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
+
2*stddev by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
)
) > on (verb) group_left()
1.2 * avg by (verb) (cluster:apiserver_request_duration_seconds:mean5m{job="apiserver"} >= 0)
and on (verb,resource)
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"}
>
1
for: 5m
labels: labels:
severity: warning severity: warning
- alert: KubeAPILatencyHigh - alert: KubeAPILatencyHigh
@ -767,34 +1033,10 @@ spec:
for {{ $labels.verb }} {{ $labels.resource }}. for {{ $labels.verb }} {{ $labels.resource }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapilatencyhigh
expr: | expr: |
cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99",subresource!="log",verb!~"LIST|WATCH|WATCHLIST|PROXY|CONNECT"} > 4 cluster_quantile:apiserver_request_duration_seconds:histogram_quantile{job="apiserver",quantile="0.99"} > 4
for: 10m for: 10m
labels: labels:
severity: critical severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.03
for: 10m
labels:
severity: critical
- alert: KubeAPIErrorsHigh
annotations:
message: API server is returning errors for {{ $value | humanizePercentage
}} of requests.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeapierrorshigh
expr: |
sum(rate(apiserver_request_total{job="apiserver",code=~"5.."}[5m]))
/
sum(rate(apiserver_request_total{job="apiserver"}[5m])) > 0.01
for: 10m
labels:
severity: warning
- alert: KubeAPIErrorsHigh - alert: KubeAPIErrorsHigh
annotations: annotations:
message: API server is returning errors for {{ $value | humanizePercentage message: API server is returning errors for {{ $value | humanizePercentage
@ -827,7 +1069,7 @@ spec:
in less than 7.0 days. in less than 7.0 days.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 604800
labels: labels:
severity: warning severity: warning
- alert: KubeClientCertificateExpiration - alert: KubeClientCertificateExpiration
@ -836,9 +1078,30 @@ spec:
in less than 24.0 hours. in less than 24.0 hours.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeclientcertificateexpiration
expr: | expr: |
apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400 apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and on(job) histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 86400
labels: labels:
severity: critical severity: critical
- alert: AggregatedAPIErrors
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} has
reported errors. The number of errors have increased for it in the past
five minutes. High values indicate that the availability of the service
changes too often.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapierrors
expr: |
sum by(name, namespace)(increase(aggregator_unavailable_apiservice_count[5m])) > 2
labels:
severity: warning
- alert: AggregatedAPIDown
annotations:
message: An aggregated API {{ $labels.name }}/{{ $labels.namespace }} is down.
It has not been available at least for the past five minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-aggregatedapidown
expr: |
sum by(name, namespace)(sum_over_time(aggregator_unavailable_apiservice[5m])) > 0
for: 5m
labels:
severity: warning
- alert: KubeAPIDown - alert: KubeAPIDown
annotations: annotations:
message: KubeAPI has disappeared from Prometheus target discovery. message: KubeAPI has disappeared from Prometheus target discovery.
@ -865,6 +1128,7 @@ spec:
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodeunreachable
expr: | expr: |
kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1 kube_node_spec_taint{job="kube-state-metrics",key="node.kubernetes.io/unreachable",effect="NoSchedule"} == 1
for: 2m
labels: labels:
severity: warning severity: warning
- alert: KubeletTooManyPods - alert: KubeletTooManyPods
@ -873,7 +1137,37 @@ spec:
}} of its Pod capacity. }} of its Pod capacity.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
expr: | expr: |
max(max(kubelet_running_pod_count{job="kubelet"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95 max(max(kubelet_running_pod_count{job="kubelet", metrics_path="/metrics"}) by(instance) * on(instance) group_left(node) kubelet_node_name{job="kubelet", metrics_path="/metrics"}) by(node) / max(kube_node_status_capacity_pods{job="kube-state-metrics"}) by(node) > 0.95
for: 15m
labels:
severity: warning
- alert: KubeNodeReadinessFlapping
annotations:
message: The readiness status of node {{ $labels.node }} has changed {{ $value
}} times in the last 15 minutes.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubenodereadinessflapping
expr: |
sum(changes(kube_node_status_condition{status="true",condition="Ready"}[15m])) by (node) > 2
for: 15m
labels:
severity: warning
- alert: KubeletPlegDurationHigh
annotations:
message: The Kubelet Pod Lifecycle Event Generator has a 99th percentile duration
of {{ $value }} seconds on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletplegdurationhigh
expr: |
node_quantile:kubelet_pleg_relist_duration_seconds:histogram_quantile{quantile="0.99"} >= 10
for: 5m
labels:
severity: warning
- alert: KubeletPodStartUpLatencyHigh
annotations:
message: Kubelet Pod startup 99th percentile latency is {{ $value }} seconds
on node {{ $labels.node }}.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletpodstartuplatencyhigh
expr: |
histogram_quantile(0.99, sum(rate(kubelet_pod_worker_duration_seconds_bucket{job="kubelet", metrics_path="/metrics"}[5m])) by (instance, le)) * on(instance) group_left(node) kubelet_node_name > 60
for: 15m for: 15m
labels: labels:
severity: warning severity: warning
@ -882,7 +1176,7 @@ spec:
message: Kubelet has disappeared from Prometheus target discovery. message: Kubelet has disappeared from Prometheus target discovery.
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubeletdown
expr: | expr: |
absent(up{job="kubelet"} == 1) absent(up{job="kubelet", metrics_path="/metrics"} == 1)
for: 15m for: 15m
labels: labels:
severity: critical severity: critical
@ -1038,7 +1332,8 @@ spec:
- alert: PrometheusRemoteStorageFailures - alert: PrometheusRemoteStorageFailures
annotations: annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send description: Prometheus {{$labels.namespace}}/{{$labels.pod}} failed to send
{{ printf "%.1f" $value }}% of the samples to queue {{$labels.queue}}. {{ printf "%.1f" $value }}% of the samples to {{ if $labels.queue }}{{ $labels.queue
}}{{ else }}{{ $labels.url }}{{ end }}.
summary: Prometheus fails to send samples to remote storage. summary: Prometheus fails to send samples to remote storage.
expr: | expr: |
( (
@ -1058,7 +1353,8 @@ spec:
- alert: PrometheusRemoteWriteBehind - alert: PrometheusRemoteWriteBehind
annotations: annotations:
description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write description: Prometheus {{$labels.namespace}}/{{$labels.pod}} remote write
is {{ printf "%.1f" $value }}s behind for queue {{$labels.queue}}. is {{ printf "%.1f" $value }}s behind for {{ if $labels.queue }}{{ $labels.queue
}}{{ else }}{{ $labels.url }}{{ end }}.
summary: Prometheus remote write is behind. summary: Prometheus remote write is behind.
expr: | expr: |
# Without max_over_time, failed scrapes could create false negatives, see # Without max_over_time, failed scrapes could create false negatives, see
@ -1145,8 +1441,8 @@ spec:
rules: rules:
- alert: TargetDown - alert: TargetDown
annotations: annotations:
message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }} targets in message: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
{{ $labels.namespace }} namespace are down.' }} targets in {{ $labels.namespace }} namespace are down.'
expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job, expr: 100 * (count(up == 0) BY (job, namespace, service) / count(up) BY (job,
namespace, service)) > 10 namespace, service)) > 10
for: 10m for: 10m

View File

@ -10,6 +10,38 @@ spec:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
interval: 30s interval: 30s
metricRelabelings: metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop - action: drop
regex: etcd_(debugging|disk|request|server).* regex: etcd_(debugging|disk|request|server).*
sourceLabels: sourceLabels:
@ -22,6 +54,11 @@ spec:
regex: apiserver_admission_step_admission_latencies_seconds_.* regex: apiserver_admission_step_admission_latencies_seconds_.*
sourceLabels: sourceLabels:
- __name__ - __name__
- action: drop
regex: apiserver_request_duration_seconds_bucket;(0.15|0.25|0.3|0.35|0.4|0.45|0.6|0.7|0.8|0.9|1.25|1.5|1.75|2.5|3|3.5|4.5|6|7|8|9|15|25|30|50)
sourceLabels:
- __name__
- le
port: https port: https
scheme: https scheme: https
tlsConfig: tlsConfig:

View File

@ -9,6 +9,38 @@ spec:
endpoints: endpoints:
- interval: 30s - interval: 30s
metricRelabelings: metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
- action: drop - action: drop
regex: etcd_(debugging|disk|request|server).* regex: etcd_(debugging|disk|request|server).*
sourceLabels: sourceLabels:

View File

@ -8,13 +8,49 @@ metadata:
spec: spec:
endpoints: endpoints:
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true
interval: 30s interval: 30s
port: http-metrics metricRelabelings:
- action: drop
regex: kubelet_(pod_worker_latency_microseconds|pod_start_latency_microseconds|cgroup_manager_latency_microseconds|pod_worker_start_latency_microseconds|pleg_relist_latency_microseconds|pleg_relist_interval_microseconds|runtime_operations|runtime_operations_latency_microseconds|runtime_operations_errors|eviction_stats_age_microseconds|device_plugin_registration_count|device_plugin_alloc_latency_microseconds|network_plugin_operations_latency_microseconds)
sourceLabels:
- __name__
- action: drop
regex: scheduler_(e2e_scheduling_latency_microseconds|scheduling_algorithm_predicate_evaluation|scheduling_algorithm_priority_evaluation|scheduling_algorithm_preemption_evaluation|scheduling_algorithm_latency_microseconds|binding_latency_microseconds|scheduling_latency_seconds)
sourceLabels:
- __name__
- action: drop
regex: apiserver_(request_count|request_latencies|request_latencies_summary|dropped_requests|storage_data_key_generation_latencies_microseconds|storage_transformation_failures_total|storage_transformation_latencies_microseconds|proxy_tunnel_sync_latency_secs)
sourceLabels:
- __name__
- action: drop
regex: kubelet_docker_(operations|operations_latency_microseconds|operations_errors|operations_timeout)
sourceLabels:
- __name__
- action: drop
regex: reflector_(items_per_list|items_per_watch|list_duration_seconds|lists_total|short_watches_total|watch_duration_seconds|watches_total)
sourceLabels:
- __name__
- action: drop
regex: etcd_(helper_cache_hit_count|helper_cache_miss_count|helper_cache_entry_count|request_cache_get_latencies_summary|request_cache_add_latencies_summary|request_latencies_summary)
sourceLabels:
- __name__
- action: drop
regex: transformation_(transformation_latencies_microseconds|failures_total)
sourceLabels:
- __name__
- action: drop
regex: (admission_quota_controller_adds|crd_autoregistration_controller_work_duration|APIServiceOpenAPIAggregationControllerQueue1_adds|AvailableConditionController_retries|crd_openapi_controller_unfinished_work_seconds|APIServiceRegistrationController_retries|admission_quota_controller_longest_running_processor_microseconds|crdEstablishing_longest_running_processor_microseconds|crdEstablishing_unfinished_work_seconds|crd_openapi_controller_adds|crd_autoregistration_controller_retries|crd_finalizer_queue_latency|AvailableConditionController_work_duration|non_structural_schema_condition_controller_depth|crd_autoregistration_controller_unfinished_work_seconds|AvailableConditionController_adds|DiscoveryController_longest_running_processor_microseconds|autoregister_queue_latency|crd_autoregistration_controller_adds|non_structural_schema_condition_controller_work_duration|APIServiceRegistrationController_adds|crd_finalizer_work_duration|crd_naming_condition_controller_unfinished_work_seconds|crd_openapi_controller_longest_running_processor_microseconds|DiscoveryController_adds|crd_autoregistration_controller_longest_running_processor_microseconds|autoregister_unfinished_work_seconds|crd_naming_condition_controller_queue_latency|crd_naming_condition_controller_retries|non_structural_schema_condition_controller_queue_latency|crd_naming_condition_controller_depth|AvailableConditionController_longest_running_processor_microseconds|crdEstablishing_depth|crd_finalizer_longest_running_processor_microseconds|crd_naming_condition_controller_adds|APIServiceOpenAPIAggregationControllerQueue1_longest_running_processor_microseconds|DiscoveryController_queue_latency|DiscoveryController_unfinished_work_seconds|crd_openapi_controller_depth|APIServiceOpenAPIAggregationControllerQueue1_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_unfinished_work_seconds|DiscoveryController_work_duration|autoregister_adds|crd_autoregistration_controller_queue_latency|crd_finalizer_retries|AvailableConditionController_unfinished_work_seconds|autoregister_longest_running_processor_microseconds|non_structural_schema_condition_controller_unfinished_work_seconds|APIServiceOpenAPIAggregationControllerQueue1_depth|AvailableConditionController_depth|DiscoveryController_retries|admission_quota_controller_depth|crdEstablishing_adds|APIServiceOpenAPIAggregationControllerQueue1_retries|crdEstablishing_queue_latency|non_structural_schema_condition_controller_longest_running_processor_microseconds|autoregister_work_duration|crd_openapi_controller_retries|APIServiceRegistrationController_work_duration|crdEstablishing_work_duration|crd_finalizer_adds|crd_finalizer_depth|crd_openapi_controller_queue_latency|APIServiceOpenAPIAggregationControllerQueue1_work_duration|APIServiceRegistrationController_queue_latency|crd_autoregistration_controller_depth|AvailableConditionController_queue_latency|admission_quota_controller_queue_latency|crd_naming_condition_controller_work_duration|crd_openapi_controller_work_duration|DiscoveryController_depth|crd_naming_condition_controller_longest_running_processor_microseconds|APIServiceRegistrationController_depth|APIServiceRegistrationController_longest_running_processor_microseconds|crd_finalizer_unfinished_work_seconds|crdEstablishing_retries|admission_quota_controller_unfinished_work_seconds|non_structural_schema_condition_controller_adds|APIServiceRegistrationController_unfinished_work_seconds|admission_quota_controller_work_duration|autoregister_depth|autoregister_retries|kubeproxy_sync_proxy_rules_latency_microseconds|rest_client_request_latency_seconds|non_structural_schema_condition_controller_retries)
sourceLabels:
- __name__
port: https-metrics
relabelings: relabelings:
- sourceLabels: - sourceLabels:
- __metrics_path__ - __metrics_path__
targetLabel: metrics_path targetLabel: metrics_path
scheme: http scheme: https
tlsConfig:
insecureSkipVerify: true
- bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
honorLabels: true honorLabels: true
interval: 30s interval: 30s
@ -24,12 +60,14 @@ spec:
sourceLabels: sourceLabels:
- __name__ - __name__
path: /metrics/cadvisor path: /metrics/cadvisor
port: http-metrics port: https-metrics
relabelings: relabelings:
- sourceLabels: - sourceLabels:
- __metrics_path__ - __metrics_path__
targetLabel: metrics_path targetLabel: metrics_path
scheme: http scheme: https
tlsConfig:
insecureSkipVerify: true
jobLabel: k8s-app jobLabel: k8s-app
namespaceSelector: namespaceSelector:
matchNames: matchNames:

View File

@ -1,38 +0,0 @@
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
run: smtp-server
name: smtp-server
namespace: monitoring
spec:
replicas: 1
selector:
matchLabels:
run: smtp-server
template:
metadata:
labels:
run: smtp-server
spec:
containers:
- env:
- name: GMAIL_USER
valueFrom:
secretKeyRef:
key: username
name: smtp-account
- name: GMAIL_PASSWORD
valueFrom:
secretKeyRef:
key: password
name: smtp-account
- name: DISABLE_IPV6
value: "True"
- name: RELAY_DOMAINS
value: :192.168.0.0/24:10.0.0.0/16
image: carlosedp/docker-smtp:v1.0.1
name: smtp-server
ports:
- containerPort: 25
name: smtp

View File

@ -1,14 +0,0 @@
apiVersion: v1
kind: Service
metadata:
labels:
run: smtp-server
name: smtp-server
namespace: monitoring
spec:
ports:
- name: smtp
port: 25
targetPort: smtp
selector:
run: smtp-server