fix(rules/bootstrap): Format yaml
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
# the shebang is ignored, but nice for editors
|
||||
watch_file nix/sources.json
|
||||
watch_file nix/checks.nix
|
||||
|
||||
# Load .env file if it exists
|
||||
dotenv_if_exists
|
||||
|
||||
+1
-2
@@ -8,7 +8,7 @@ stages:
|
||||
release:
|
||||
stage: release
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH =~ /^main/'
|
||||
- if: "$CI_COMMIT_BRANCH =~ /^main/"
|
||||
when: always
|
||||
- when: never
|
||||
script:
|
||||
@@ -43,4 +43,3 @@ rebuild:
|
||||
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
@@ -4,15 +4,15 @@ metadata:
|
||||
name: argocd-cluster-admin
|
||||
rules:
|
||||
- apiGroups:
|
||||
- '*'
|
||||
- "*"
|
||||
resources:
|
||||
- '*'
|
||||
- "*"
|
||||
verbs:
|
||||
- '*'
|
||||
- "*"
|
||||
- nonResourceURLs:
|
||||
- '*'
|
||||
- "*"
|
||||
verbs:
|
||||
- '*'
|
||||
- "*"
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
|
||||
@@ -6,5 +6,3 @@ metadata:
|
||||
name: cluster-admin-token
|
||||
namespace: kube-system
|
||||
type: kubernetes.io/service-account-token
|
||||
|
||||
|
||||
|
||||
@@ -10,5 +10,3 @@ metadata:
|
||||
name: cluster-ekman
|
||||
namespace: argocd
|
||||
type: Opaque
|
||||
|
||||
|
||||
|
||||
@@ -66,7 +66,7 @@ spec:
|
||||
itemType: string
|
||||
collectionType: string
|
||||
string: ""
|
||||
# All the fields above besides "string" apply to both the array and map type parameter announcements.
|
||||
# All the fields above besides 'string' apply to both the array and map type parameter announcements.
|
||||
# - name: array-param
|
||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||
# array: [default, items]
|
||||
@@ -84,4 +84,3 @@ spec:
|
||||
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
||||
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
||||
preserveFileMode: false
|
||||
|
||||
|
||||
@@ -422,4 +422,3 @@ spec:
|
||||
path: ca.crt
|
||||
optional: true
|
||||
secretName: argocd-repo-server-tls
|
||||
|
||||
|
||||
@@ -13,4 +13,3 @@ stringData:
|
||||
name: staging-vcluster
|
||||
server: https://staging-vcluster.staging-vcluster
|
||||
type: Opaque
|
||||
|
||||
|
||||
@@ -32,12 +32,12 @@ projects:
|
||||
additionalAnnotations: {}
|
||||
description: sys components project
|
||||
sourceRepos:
|
||||
- '*'
|
||||
- "*"
|
||||
destinations:
|
||||
- namespace: '*'
|
||||
- namespace: "*"
|
||||
server: https://kubernetes.default.svc
|
||||
clusterResourceWhitelist:
|
||||
- group: '*'
|
||||
kind: '*'
|
||||
- group: "*"
|
||||
kind: "*"
|
||||
orphanedResources:
|
||||
warn: false
|
||||
|
||||
+8
-3
@@ -5,6 +5,8 @@ let
|
||||
|
||||
globalExcludes = [
|
||||
"nix/default.nix"
|
||||
"attic"
|
||||
"vcluster"
|
||||
".*vendor"
|
||||
".*chart/.*"
|
||||
".*schema.json"
|
||||
@@ -32,6 +34,7 @@ pre-commit.run {
|
||||
enable = true;
|
||||
excludes = [
|
||||
"vcluster/"
|
||||
"attic/"
|
||||
];
|
||||
args = [
|
||||
"-x"
|
||||
@@ -41,15 +44,17 @@ pre-commit.run {
|
||||
};
|
||||
|
||||
yamllint = {
|
||||
enable = false;
|
||||
enable = true;
|
||||
excludes = [
|
||||
"attic/"
|
||||
"charts/templates/"
|
||||
"charts/charts/"
|
||||
"charts/"
|
||||
"values/"
|
||||
"vcluster/"
|
||||
];
|
||||
settings = {
|
||||
strict = true;
|
||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }'';
|
||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
+42
-21
@@ -3,7 +3,8 @@ groups:
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
@@ -20,17 +21,20 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
expr:
|
||||
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||
has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
@@ -39,12 +43,14 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||
resources, high network latency, or disruptions by other components and should
|
||||
be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
expr:
|
||||
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||
>= 4
|
||||
for: 5m
|
||||
@@ -52,7 +58,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
@@ -65,7 +72,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
@@ -78,7 +86,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||
}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
@@ -90,7 +99,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
@@ -102,7 +112,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
@@ -111,7 +122,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
@@ -122,7 +134,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
@@ -133,7 +146,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
@@ -144,11 +158,13 @@ groups:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
||||
quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||
expr:
|
||||
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
||||
95
|
||||
for: 10m
|
||||
@@ -156,26 +172,31 @@ groups:
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||
space in the next four hours, based on write observations within the past
|
||||
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
||||
be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||
expr:
|
||||
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||
retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated
|
||||
summary:
|
||||
etcd database size in use is less than 50% of the actual allocated
|
||||
storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
expr:
|
||||
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
||||
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
|
||||
+8
-4
@@ -3,11 +3,13 @@ groups:
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
description:
|
||||
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||
expr:
|
||||
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||
BY (cluster, job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
@@ -21,7 +23,8 @@ groups:
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager
|
||||
summary:
|
||||
An alert that should always be firing to certify that Alertmanager
|
||||
is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
@@ -37,7 +40,8 @@ groups:
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||
expr:
|
||||
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||
labels:
|
||||
severity: none
|
||||
+40
-21
@@ -3,18 +3,21 @@ groups:
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
description:
|
||||
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
expr:
|
||||
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
description:
|
||||
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
@@ -31,7 +34,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
description:
|
||||
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has not
|
||||
been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||
@@ -45,7 +49,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
description:
|
||||
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
@@ -64,7 +69,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
description:
|
||||
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} is not progressing for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||
summary: Deployment rollout is not progressing.
|
||||
@@ -76,7 +82,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||
description:
|
||||
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: StatefulSet has not matched the expected number of replicas.
|
||||
@@ -95,7 +102,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
description:
|
||||
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||
@@ -109,7 +117,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||
description:
|
||||
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||
has not been rolled out.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
@@ -136,7 +145,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||
description:
|
||||
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||
finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
@@ -169,19 +179,22 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||
description:
|
||||
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
expr:
|
||||
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
namespace=~".*"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
description:
|
||||
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
@@ -193,18 +206,21 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
description:
|
||||
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
expr:
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||
description:
|
||||
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||
than {{ "43200" | humanizeDuration }} to complete.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
@@ -216,7 +232,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||
description:
|
||||
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||
Removing failed job after investigation should clear this alert.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
@@ -226,7 +243,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
description:
|
||||
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
@@ -249,7 +267,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
description:
|
||||
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
|
||||
@@ -3,7 +3,8 @@ groups:
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
@@ -16,7 +17,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||
failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||
@@ -30,7 +32,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
@@ -44,7 +47,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
@@ -58,7 +62,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
@@ -72,7 +77,8 @@ groups:
|
||||
severity: info
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
@@ -86,7 +92,8 @@ groups:
|
||||
severity: info
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
@@ -100,9 +107,10 @@ groups:
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
description:
|
||||
"{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||
}}.'
|
||||
}}."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
description:
|
||||
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
@@ -26,7 +26,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
description:
|
||||
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
||||
| humanizePercentage }} is available.
|
||||
@@ -51,7 +52,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
description:
|
||||
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
@@ -73,7 +75,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
description:
|
||||
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
||||
{{ $value | humanizePercentage }} of its inodes are free.
|
||||
@@ -98,11 +101,13 @@ groups:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||
description:
|
||||
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
expr:
|
||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
|
||||
+55
-28
@@ -3,7 +3,8 @@ groups:
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
@@ -21,7 +22,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
@@ -39,7 +41,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
@@ -55,7 +58,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
@@ -71,7 +75,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
@@ -89,7 +94,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
@@ -107,7 +113,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
@@ -123,7 +130,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
@@ -139,38 +147,44 @@ groups:
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
description:
|
||||
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
expr:
|
||||
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
description:
|
||||
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
expr:
|
||||
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
expr:
|
||||
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
> 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector on {{ $labels.instance }} failed
|
||||
description:
|
||||
Node Exporter text file collector on {{ $labels.instance }} failed
|
||||
to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
@@ -179,7 +193,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||
description:
|
||||
Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
@@ -200,7 +215,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||
description:
|
||||
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||
is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
@@ -213,12 +229,14 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||
description:
|
||||
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||
in degraded state due to one or more disks failures. Number of spare drives
|
||||
is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded.
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
expr:
|
||||
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||
> 0
|
||||
for: 15m
|
||||
@@ -226,17 +244,20 @@ groups:
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array at {{ $labels.instance }} failed.
|
||||
description:
|
||||
At least one device in RAID array at {{ $labels.instance }} failed.
|
||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array.
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
expr:
|
||||
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
> 0
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
||||
description:
|
||||
File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
@@ -249,7 +270,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
||||
description:
|
||||
File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
@@ -266,7 +288,8 @@ groups:
|
||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||
summary: High CPU usage.
|
||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
expr:
|
||||
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
mode!="idle"}[2m]))) * 100 > 90
|
||||
for: 15m
|
||||
labels:
|
||||
@@ -301,7 +324,8 @@ groups:
|
||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||
summary: Host is running out of memory.
|
||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
expr:
|
||||
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* 100) > 90
|
||||
for: 15m
|
||||
labels:
|
||||
@@ -313,14 +337,16 @@ groups:
|
||||
This symptom might indicate disk saturation.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||
summary: Disk IO queue is high.
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
expr:
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
> 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeSystemdServiceFailed
|
||||
annotations:
|
||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
||||
description:
|
||||
Systemd service {{ $labels.name }} has entered failed state at
|
||||
{{ $labels.instance }}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||
summary: Systemd service has entered failed state.
|
||||
@@ -330,7 +356,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: NodeBondingDegraded
|
||||
annotations:
|
||||
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||
description:
|
||||
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||
is in degraded state due to one or more slave failures.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||
summary: Bonding interface is degraded
|
||||
|
||||
@@ -8,14 +8,16 @@ groups:
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||
expr:
|
||||
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighWarning
|
||||
annotations:
|
||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
dashboard:
|
||||
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 10% available memory.
|
||||
summary: Node Memory utilization warning
|
||||
@@ -25,7 +27,8 @@ groups:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighCritical
|
||||
annotations:
|
||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
dashboard:
|
||||
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 5% available memory.
|
||||
summary: Node Memory utilization critical
|
||||
@@ -37,7 +40,8 @@ groups:
|
||||
annotations:
|
||||
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
||||
summary: Node has been in not-ready state for longer than 3 minutes
|
||||
expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||
expr:
|
||||
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
||||
> 0
|
||||
for: 5m
|
||||
@@ -50,7 +54,8 @@ groups:
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||
expr:
|
||||
kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||
1
|
||||
for: 2m
|
||||
labels:
|
||||
@@ -62,7 +67,8 @@ groups:
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
expr:
|
||||
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||
== 1
|
||||
for: 0m
|
||||
|
||||
+8
-4
@@ -3,18 +3,22 @@ groups:
|
||||
rules:
|
||||
- alert: VeleroBackupPartialFailures
|
||||
annotations:
|
||||
message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||
message:
|
||||
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||
failed backups.
|
||||
expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
expr:
|
||||
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: VeleroBackupFailures
|
||||
annotations:
|
||||
message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||
message:
|
||||
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||
backups.
|
||||
expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
expr:
|
||||
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
|
||||
@@ -3,7 +3,8 @@ groups:
|
||||
rules:
|
||||
- alert: X509ExporterReadErrors
|
||||
annotations:
|
||||
description: Over the last 15 minutes, this x509-certificate-exporter instance
|
||||
description:
|
||||
Over the last 15 minutes, this x509-certificate-exporter instance
|
||||
has experienced errors reading certificate files or querying the Kubernetes
|
||||
API. This could be caused by a misconfiguration if triggered when the exporter
|
||||
starts.
|
||||
@@ -14,7 +15,8 @@ groups:
|
||||
severity: warning
|
||||
- alert: CertificateError
|
||||
annotations:
|
||||
description: Certificate could not be decoded {{if $labels.secret_name }} in
|
||||
description:
|
||||
Certificate could not be decoded {{if $labels.secret_name }} in
|
||||
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
||||
location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate cannot be decoded
|
||||
@@ -24,22 +26,26 @@ groups:
|
||||
severity: warning
|
||||
- alert: CertificateRenewal
|
||||
annotations:
|
||||
description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||
description:
|
||||
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate should be renewed
|
||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
expr:
|
||||
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateExpiration
|
||||
annotations:
|
||||
description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||
description:
|
||||
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate is about to expire
|
||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
expr:
|
||||
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
||||
for: 15m
|
||||
labels:
|
||||
|
||||
Reference in New Issue
Block a user