fix(rules/bootstrap): Format yaml

This commit is contained in:
2025-12-29 13:23:04 +01:00
parent f81a4b2732
commit 957526a6bc
22 changed files with 2142 additions and 2044 deletions
+1
View File
@@ -1,6 +1,7 @@
#!/usr/bin/env bash
# the shebang is ignored, but nice for editors
watch_file nix/sources.json
watch_file nix/checks.nix
# Load .env file if it exists
dotenv_if_exists
+1 -2
View File
@@ -8,7 +8,7 @@ stages:
release:
stage: release
rules:
- if: '$CI_COMMIT_BRANCH =~ /^main/'
- if: "$CI_COMMIT_BRANCH =~ /^main/"
when: always
- when: never
script:
@@ -43,4 +43,3 @@ rebuild:
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
fi
done
+5 -5
View File
@@ -4,15 +4,15 @@ metadata:
name: argocd-cluster-admin
rules:
- apiGroups:
- '*'
- "*"
resources:
- '*'
- "*"
verbs:
- '*'
- "*"
- nonResourceURLs:
- '*'
- "*"
verbs:
- '*'
- "*"
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
-2
View File
@@ -6,5 +6,3 @@ metadata:
name: cluster-admin-token
namespace: kube-system
type: kubernetes.io/service-account-token
-2
View File
@@ -10,5 +10,3 @@ metadata:
name: cluster-ekman
namespace: argocd
type: Opaque
+1 -2
View File
@@ -66,7 +66,7 @@ spec:
itemType: string
collectionType: string
string: ""
# All the fields above besides "string" apply to both the array and map type parameter announcements.
# All the fields above besides 'string' apply to both the array and map type parameter announcements.
# - name: array-param
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
# array: [default, items]
@@ -84,4 +84,3 @@ spec:
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
# might have executable files. Set to true only if you trust the CMP plugin authors.
preserveFileMode: false
@@ -422,4 +422,3 @@ spec:
path: ca.crt
optional: true
secretName: argocd-repo-server-tls
-1
View File
@@ -13,4 +13,3 @@ stringData:
name: staging-vcluster
server: https://staging-vcluster.staging-vcluster
type: Opaque
+4 -4
View File
@@ -32,12 +32,12 @@ projects:
additionalAnnotations: {}
description: sys components project
sourceRepos:
- '*'
- "*"
destinations:
- namespace: '*'
- namespace: "*"
server: https://kubernetes.default.svc
clusterResourceWhitelist:
- group: '*'
kind: '*'
- group: "*"
kind: "*"
orphanedResources:
warn: false
+8 -3
View File
@@ -5,6 +5,8 @@ let
globalExcludes = [
"nix/default.nix"
"attic"
"vcluster"
".*vendor"
".*chart/.*"
".*schema.json"
@@ -32,6 +34,7 @@ pre-commit.run {
enable = true;
excludes = [
"vcluster/"
"attic/"
];
args = [
"-x"
@@ -41,15 +44,17 @@ pre-commit.run {
};
yamllint = {
enable = false;
enable = true;
excludes = [
"attic/"
"charts/templates/"
"charts/charts/"
"charts/"
"values/"
"vcluster/"
];
settings = {
strict = true;
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }'';
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
};
};
+42 -21
View File
@@ -3,7 +3,8 @@ groups:
rules:
- alert: etcdMembersDown
annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
description:
'etcd cluster "{{ $labels.job }}": members are down ({{ $value
}}).'
summary: etcd cluster members are down.
expr: |-
@@ -20,17 +21,20 @@ groups:
severity: critical
- alert: etcdInsufficientMembers
annotations:
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
description:
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
}}).'
summary: etcd cluster has insufficient number of members.
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
expr:
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
without (instance) + 1) / 2)
for: 3m
labels:
severity: critical
- alert: etcdNoLeader
annotations:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
description:
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
has no leader.'
summary: etcd cluster has no leader.
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
@@ -39,12 +43,14 @@ groups:
severity: critical
- alert: etcdHighNumberOfLeaderChanges
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
description:
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
within the last 15 minutes. Frequent elections may be a sign of insufficient
resources, high network latency, or disruptions by other components and should
be investigated.'
summary: etcd cluster has high number of leader changes.
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
expr:
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
>= 4
for: 5m
@@ -52,7 +58,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
description:
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
@@ -65,7 +72,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedGRPCRequests
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
description:
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of failed grpc requests.
expr: |-
@@ -78,7 +86,8 @@ groups:
severity: critical
- alert: etcdGRPCRequestsSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
description:
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
}} method.'
summary: etcd grpc requests are slow
@@ -90,7 +99,8 @@ groups:
severity: critical
- alert: etcdMemberCommunicationSlow
annotations:
description: 'etcd cluster "{{ $labels.job }}": member communication with {{
description:
'etcd cluster "{{ $labels.job }}": member communication with {{
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
}}.'
summary: etcd cluster member communication is slow.
@@ -102,7 +112,8 @@ groups:
severity: warning
- alert: etcdHighNumberOfFailedProposals
annotations:
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
description:
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
summary: etcd cluster has high number of proposal failures.
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
@@ -111,7 +122,8 @@ groups:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
description:
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
@@ -122,7 +134,8 @@ groups:
severity: warning
- alert: etcdHighFsyncDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
description:
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile fsync durations are too high.
expr: |-
@@ -133,7 +146,8 @@ groups:
severity: critical
- alert: etcdHighCommitDurations
annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
description:
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
{{ $value }}s on etcd instance {{ $labels.instance }}.'
summary: etcd cluster 99th percentile commit durations are too high.
expr: |-
@@ -144,11 +158,13 @@ groups:
severity: warning
- alert: etcdDatabaseQuotaLowSpace
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
description:
'etcd cluster "{{ $labels.job }}": database size exceeds the defined
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
quota as the writes to etcd will be disabled when it is full.'
summary: etcd cluster database is running full.
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
expr:
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
95
for: 10m
@@ -156,26 +172,31 @@ groups:
severity: critical
- alert: etcdExcessiveDatabaseGrowth
annotations:
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
description:
'etcd cluster "{{ $labels.job }}": Predicting running out of disk
space in the next four hours, based on write observations within the past
four hours on etcd instance {{ $labels.instance }}, please check as it might
be disruptive.'
summary: etcd cluster database growing very fast.
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
expr:
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
description:
'etcd cluster "{{ $labels.job }}": database size in use on instance
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary: etcd database size in use is less than 50% of the actual allocated
summary:
etcd database size in use is less than 50% of the actual allocated
storage.
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
expr:
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
+8 -4
View File
@@ -3,11 +3,13 @@ groups:
rules:
- alert: TargetDown
annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
description:
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
}} targets in {{ $labels.namespace }} namespace are down.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
summary: One or more targets are unreachable.
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
expr:
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
BY (cluster, job, namespace, service)) > 10
for: 10m
labels:
@@ -21,7 +23,8 @@ groups:
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
summary: An alert that should always be firing to certify that Alertmanager
summary:
An alert that should always be firing to certify that Alertmanager
is working properly.
expr: vector(1)
labels:
@@ -37,7 +40,8 @@ groups:
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
summary: Info-level alert inhibition.
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
expr:
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
+40 -21
View File
@@ -3,18 +3,21 @@ groups:
rules:
- alert: KubePodCrashLooping
annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
description:
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
}}) is in waiting state (reason: "CrashLoopBackOff").'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
summary: Pod is crash looping.
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
expr:
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
for: 15m
labels:
severity: warning
- alert: KubePodNotReady
annotations:
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
description:
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
state for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
summary: Pod has been in a non-ready state for more than 15 minutes.
@@ -31,7 +34,8 @@ groups:
severity: warning
- alert: KubeDeploymentGenerationMismatch
annotations:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
description:
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
}} does not match, this indicates that the Deployment has failed but has not
been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
@@ -45,7 +49,8 @@ groups:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
description:
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
@@ -64,7 +69,8 @@ groups:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
description:
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
@@ -76,7 +82,8 @@ groups:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
@@ -95,7 +102,8 @@ groups:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
description:
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
@@ -109,7 +117,8 @@ groups:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
@@ -136,7 +145,8 @@ groups:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
description:
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
@@ -169,19 +179,22 @@ groups:
severity: warning
- alert: KubeContainerWaiting
annotations:
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
description:
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
{{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
expr:
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled.'
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
@@ -193,18 +206,21 @@ groups:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run.'
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
expr:
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
> 0
for: 15m
labels:
severity: warning
- alert: KubeJobNotCompleted
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
description:
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
than {{ "43200" | humanizeDuration }} to complete.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
summary: Job did not complete in time
@@ -216,7 +232,8 @@ groups:
severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
description:
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
@@ -226,7 +243,8 @@ groups:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
description:
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
@@ -249,7 +267,8 @@ groups:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
description:
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
+17 -9
View File
@@ -3,7 +3,8 @@ groups:
rules:
- alert: KubeCPUOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
description:
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
summary: Cluster has overcommitted CPU resource requests.
@@ -16,7 +17,8 @@ groups:
severity: warning
- alert: KubeMemoryOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
description:
Cluster {{ $labels.cluster }} has overcommitted memory resource
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
failure.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
@@ -30,7 +32,8 @@ groups:
severity: warning
- alert: KubeCPUQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
description:
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
summary: Cluster has overcommitted CPU resource requests.
@@ -44,7 +47,8 @@ groups:
severity: warning
- alert: KubeMemoryQuotaOvercommit
annotations:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
description:
Cluster {{ $labels.cluster }} has overcommitted memory resource
requests for Namespaces.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
summary: Cluster has overcommitted memory resource requests.
@@ -58,7 +62,8 @@ groups:
severity: warning
- alert: KubeQuotaAlmostFull
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
summary: Namespace quota is going to be full.
@@ -72,7 +77,8 @@ groups:
severity: info
- alert: KubeQuotaFullyUsed
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
summary: Namespace quota is fully used.
@@ -86,7 +92,8 @@ groups:
severity: info
- alert: KubeQuotaExceeded
annotations:
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
description:
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
}} of its {{ $labels.resource }} quota.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
summary: Namespace quota has exceeded the limits.
@@ -100,9 +107,10 @@ groups:
severity: warning
- alert: CPUThrottlingHigh
annotations:
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
description:
"{{ $value | humanizePercentage }} throttling of CPU in namespace
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
}}.'
}}."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
summary: Processes experience elevated CPU throttling.
expr: |-
+12 -7
View File
@@ -1,10 +1,10 @@
groups:
- name: kubernetes-storage
rules:
- alert: KubePersistentVolumeFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description:
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
@@ -26,7 +26,8 @@ groups:
severity: critical
- alert: KubePersistentVolumeFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description:
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
| humanizePercentage }} is available.
@@ -51,7 +52,8 @@ groups:
severity: warning
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description:
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
@@ -73,7 +75,8 @@ groups:
severity: critical
- alert: KubePersistentVolumeInodesFillingUp
annotations:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description:
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
{{ $value | humanizePercentage }} of its inodes are free.
@@ -98,11 +101,13 @@ groups:
severity: warning
- alert: KubePersistentVolumeErrors
annotations:
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
description:
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
summary: PersistentVolume is having issues with provisioning.
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
expr:
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
> 0
for: 5m
labels:
+55 -28
View File
@@ -3,7 +3,8 @@ groups:
rules:
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
@@ -21,7 +22,8 @@ groups:
severity: warning
- alert: NodeFilesystemSpaceFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
@@ -39,7 +41,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
@@ -55,7 +58,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfSpace
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
space left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
@@ -71,7 +75,8 @@ groups:
severity: critical
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
@@ -89,7 +94,8 @@ groups:
severity: warning
- alert: NodeFilesystemFilesFillingUp
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left and is filling up fast.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
@@ -107,7 +113,8 @@ groups:
severity: critical
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
@@ -123,7 +130,8 @@ groups:
severity: warning
- alert: NodeFilesystemAlmostOutOfFiles
annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description:
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
inodes left.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
@@ -139,38 +147,44 @@ groups:
severity: critical
- alert: NodeNetworkReceiveErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
description:
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
summary: Network interface is reporting many receive errors.
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
expr:
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
> 0.01
for: 1h
labels:
severity: warning
- alert: NodeNetworkTransmitErrs
annotations:
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
description:
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
summary: Network interface is reporting many transmit errors.
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
expr:
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
> 0.01
for: 1h
labels:
severity: warning
- alert: NodeHighNumberConntrackEntriesUsed
annotations:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
summary: Number of conntrack are getting close to the limit.
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
expr:
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
> 0.75
labels:
severity: warning
- alert: NodeTextFileCollectorScrapeError
annotations:
description: Node Exporter text file collector on {{ $labels.instance }} failed
description:
Node Exporter text file collector on {{ $labels.instance }} failed
to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
summary: Node Exporter text file collector failed to scrape.
@@ -179,7 +193,8 @@ groups:
severity: warning
- alert: NodeClockSkewDetected
annotations:
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
description:
Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
Ensure NTP is configured correctly on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
summary: Clock skew detected.
@@ -200,7 +215,8 @@ groups:
severity: warning
- alert: NodeClockNotSynchronising
annotations:
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
description:
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
is configured on this host.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
summary: Clock not synchronising.
@@ -213,12 +229,14 @@ groups:
severity: warning
- alert: NodeRAIDDegraded
annotations:
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
description:
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
in degraded state due to one or more disks failures. Number of spare drives
is insufficient to fix issue automatically.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
summary: RAID Array is degraded.
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
expr:
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
> 0
for: 15m
@@ -226,17 +244,20 @@ groups:
severity: critical
- alert: NodeRAIDDiskFailure
annotations:
description: At least one device in RAID array at {{ $labels.instance }} failed.
description:
At least one device in RAID array at {{ $labels.instance }} failed.
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
summary: Failed device in RAID array.
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
expr:
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
> 0
labels:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at
description:
File descriptors limit at {{ $labels.instance }} is currently at
{{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
@@ -249,7 +270,8 @@ groups:
severity: warning
- alert: NodeFileDescriptorLimit
annotations:
description: File descriptors limit at {{ $labels.instance }} is currently at
description:
File descriptors limit at {{ $labels.instance }} is currently at
{{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
summary: Kernel is predicted to exhaust file descriptors limit soon.
@@ -266,7 +288,8 @@ groups:
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
summary: High CPU usage.
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
expr:
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
mode!="idle"}[2m]))) * 100 > 90
for: 15m
labels:
@@ -301,7 +324,8 @@ groups:
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Host is running out of memory.
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
expr:
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
* 100) > 90
for: 15m
labels:
@@ -313,14 +337,16 @@ groups:
This symptom might indicate disk saturation.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
summary: Disk IO queue is high.
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
expr:
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
> 10
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description: Systemd service {{ $labels.name }} has entered failed state at
description:
Systemd service {{ $labels.name }} has entered failed state at
{{ $labels.instance }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
@@ -330,7 +356,8 @@ groups:
severity: warning
- alert: NodeBondingDegraded
annotations:
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
description:
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
is in degraded state due to one or more slave failures.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
summary: Bonding interface is degraded
+12 -6
View File
@@ -8,14 +8,16 @@ groups:
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host high CPU load (instance {{ $labels.instance }})
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
expr:
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 10m
labels:
severity: critical
- alert: MemoryUtilizationHighWarning
annotations:
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
dashboard:
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
description: Node {{ $labels.instance }} has less than 10% available memory.
summary: Node Memory utilization warning
@@ -25,7 +27,8 @@ groups:
severity: critical
- alert: MemoryUtilizationHighCritical
annotations:
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
dashboard:
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
description: Node {{ $labels.instance }} has less than 5% available memory.
summary: Node Memory utilization critical
@@ -37,7 +40,8 @@ groups:
annotations:
description: Node {{ $labels.node }} has CPU utilization over 90%.
summary: Node has been in not-ready state for longer than 3 minutes
expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
expr:
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
> 0
for: 5m
@@ -50,7 +54,8 @@ groups:
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
expr:
kube_node_status_condition{condition="MemoryPressure",status="true"} ==
1
for: 2m
labels:
@@ -62,7 +67,8 @@ groups:
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
expr:
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
== 1
for: 0m
+8 -4
View File
@@ -3,18 +3,22 @@ groups:
rules:
- alert: VeleroBackupPartialFailures
annotations:
message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
message:
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
failed backups.
expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
expr:
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
> 0.25
for: 15m
labels:
severity: critical
- alert: VeleroBackupFailures
annotations:
message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
message:
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
backups.
expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
expr:
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
> 0.25
for: 15m
labels:
+12 -6
View File
@@ -3,7 +3,8 @@ groups:
rules:
- alert: X509ExporterReadErrors
annotations:
description: Over the last 15 minutes, this x509-certificate-exporter instance
description:
Over the last 15 minutes, this x509-certificate-exporter instance
has experienced errors reading certificate files or querying the Kubernetes
API. This could be caused by a misconfiguration if triggered when the exporter
starts.
@@ -14,7 +15,8 @@ groups:
severity: warning
- alert: CertificateError
annotations:
description: Certificate could not be decoded {{if $labels.secret_name }} in
description:
Certificate could not be decoded {{if $labels.secret_name }} in
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
location "{{ $labels.filepath }}"{{end}}
summary: Certificate cannot be decoded
@@ -24,22 +26,26 @@ groups:
severity: warning
- alert: CertificateRenewal
annotations:
description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
description:
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
summary: Certificate should be renewed
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
expr:
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
for: 15m
labels:
severity: warning
- alert: CertificateExpiration
annotations:
description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
description:
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
summary: Certificate is about to expire
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
expr:
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
for: 15m
labels: