fix(rules/bootstrap): Format yaml
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# the shebang is ignored, but nice for editors
|
# the shebang is ignored, but nice for editors
|
||||||
watch_file nix/sources.json
|
watch_file nix/sources.json
|
||||||
|
watch_file nix/checks.nix
|
||||||
|
|
||||||
# Load .env file if it exists
|
# Load .env file if it exists
|
||||||
dotenv_if_exists
|
dotenv_if_exists
|
||||||
|
|||||||
+1
-2
@@ -8,7 +8,7 @@ stages:
|
|||||||
release:
|
release:
|
||||||
stage: release
|
stage: release
|
||||||
rules:
|
rules:
|
||||||
- if: '$CI_COMMIT_BRANCH =~ /^main/'
|
- if: "$CI_COMMIT_BRANCH =~ /^main/"
|
||||||
when: always
|
when: always
|
||||||
- when: never
|
- when: never
|
||||||
script:
|
script:
|
||||||
@@ -43,4 +43,3 @@ rebuild:
|
|||||||
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
@@ -4,15 +4,15 @@ metadata:
|
|||||||
name: argocd-cluster-admin
|
name: argocd-cluster-admin
|
||||||
rules:
|
rules:
|
||||||
- apiGroups:
|
- apiGroups:
|
||||||
- '*'
|
- "*"
|
||||||
resources:
|
resources:
|
||||||
- '*'
|
- "*"
|
||||||
verbs:
|
verbs:
|
||||||
- '*'
|
- "*"
|
||||||
- nonResourceURLs:
|
- nonResourceURLs:
|
||||||
- '*'
|
- "*"
|
||||||
verbs:
|
verbs:
|
||||||
- '*'
|
- "*"
|
||||||
---
|
---
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
kind: ClusterRoleBinding
|
kind: ClusterRoleBinding
|
||||||
|
|||||||
@@ -6,5 +6,3 @@ metadata:
|
|||||||
name: cluster-admin-token
|
name: cluster-admin-token
|
||||||
namespace: kube-system
|
namespace: kube-system
|
||||||
type: kubernetes.io/service-account-token
|
type: kubernetes.io/service-account-token
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,5 +10,3 @@ metadata:
|
|||||||
name: cluster-ekman
|
name: cluster-ekman
|
||||||
namespace: argocd
|
namespace: argocd
|
||||||
type: Opaque
|
type: Opaque
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ spec:
|
|||||||
itemType: string
|
itemType: string
|
||||||
collectionType: string
|
collectionType: string
|
||||||
string: ""
|
string: ""
|
||||||
# All the fields above besides "string" apply to both the array and map type parameter announcements.
|
# All the fields above besides 'string' apply to both the array and map type parameter announcements.
|
||||||
# - name: array-param
|
# - name: array-param
|
||||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||||
# array: [default, items]
|
# array: [default, items]
|
||||||
@@ -84,4 +84,3 @@ spec:
|
|||||||
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
||||||
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
||||||
preserveFileMode: false
|
preserveFileMode: false
|
||||||
|
|
||||||
|
|||||||
@@ -422,4 +422,3 @@ spec:
|
|||||||
path: ca.crt
|
path: ca.crt
|
||||||
optional: true
|
optional: true
|
||||||
secretName: argocd-repo-server-tls
|
secretName: argocd-repo-server-tls
|
||||||
|
|
||||||
|
|||||||
@@ -13,4 +13,3 @@ stringData:
|
|||||||
name: staging-vcluster
|
name: staging-vcluster
|
||||||
server: https://staging-vcluster.staging-vcluster
|
server: https://staging-vcluster.staging-vcluster
|
||||||
type: Opaque
|
type: Opaque
|
||||||
|
|
||||||
|
|||||||
@@ -32,12 +32,12 @@ projects:
|
|||||||
additionalAnnotations: {}
|
additionalAnnotations: {}
|
||||||
description: sys components project
|
description: sys components project
|
||||||
sourceRepos:
|
sourceRepos:
|
||||||
- '*'
|
- "*"
|
||||||
destinations:
|
destinations:
|
||||||
- namespace: '*'
|
- namespace: "*"
|
||||||
server: https://kubernetes.default.svc
|
server: https://kubernetes.default.svc
|
||||||
clusterResourceWhitelist:
|
clusterResourceWhitelist:
|
||||||
- group: '*'
|
- group: "*"
|
||||||
kind: '*'
|
kind: "*"
|
||||||
orphanedResources:
|
orphanedResources:
|
||||||
warn: false
|
warn: false
|
||||||
|
|||||||
+8
-3
@@ -5,6 +5,8 @@ let
|
|||||||
|
|
||||||
globalExcludes = [
|
globalExcludes = [
|
||||||
"nix/default.nix"
|
"nix/default.nix"
|
||||||
|
"attic"
|
||||||
|
"vcluster"
|
||||||
".*vendor"
|
".*vendor"
|
||||||
".*chart/.*"
|
".*chart/.*"
|
||||||
".*schema.json"
|
".*schema.json"
|
||||||
@@ -32,6 +34,7 @@ pre-commit.run {
|
|||||||
enable = true;
|
enable = true;
|
||||||
excludes = [
|
excludes = [
|
||||||
"vcluster/"
|
"vcluster/"
|
||||||
|
"attic/"
|
||||||
];
|
];
|
||||||
args = [
|
args = [
|
||||||
"-x"
|
"-x"
|
||||||
@@ -41,15 +44,17 @@ pre-commit.run {
|
|||||||
};
|
};
|
||||||
|
|
||||||
yamllint = {
|
yamllint = {
|
||||||
enable = false;
|
enable = true;
|
||||||
excludes = [
|
excludes = [
|
||||||
"attic/"
|
"attic/"
|
||||||
"charts/templates/"
|
"charts/templates/"
|
||||||
"charts/charts/"
|
"charts/"
|
||||||
|
"values/"
|
||||||
|
"vcluster/"
|
||||||
];
|
];
|
||||||
settings = {
|
settings = {
|
||||||
strict = true;
|
strict = true;
|
||||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }'';
|
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
+42
-21
@@ -3,7 +3,8 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: etcdMembersDown
|
- alert: etcdMembersDown
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||||
}}).'
|
}}).'
|
||||||
summary: etcd cluster members are down.
|
summary: etcd cluster members are down.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -20,17 +21,20 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdInsufficientMembers
|
- alert: etcdInsufficientMembers
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||||
}}).'
|
}}).'
|
||||||
summary: etcd cluster has insufficient number of members.
|
summary: etcd cluster has insufficient number of members.
|
||||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
expr:
|
||||||
|
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||||
without (instance) + 1) / 2)
|
without (instance) + 1) / 2)
|
||||||
for: 3m
|
for: 3m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdNoLeader
|
- alert: etcdNoLeader
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||||
has no leader.'
|
has no leader.'
|
||||||
summary: etcd cluster has no leader.
|
summary: etcd cluster has no leader.
|
||||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||||
@@ -39,12 +43,14 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdHighNumberOfLeaderChanges
|
- alert: etcdHighNumberOfLeaderChanges
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||||
resources, high network latency, or disruptions by other components and should
|
resources, high network latency, or disruptions by other components and should
|
||||||
be investigated.'
|
be investigated.'
|
||||||
summary: etcd cluster has high number of leader changes.
|
summary: etcd cluster has high number of leader changes.
|
||||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
expr:
|
||||||
|
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||||
>= 4
|
>= 4
|
||||||
for: 5m
|
for: 5m
|
||||||
@@ -52,7 +58,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster has high number of failed grpc requests.
|
summary: etcd cluster has high number of failed grpc requests.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -65,7 +72,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster has high number of failed grpc requests.
|
summary: etcd cluster has high number of failed grpc requests.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -78,7 +86,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdGRPCRequestsSlow
|
- alert: etcdGRPCRequestsSlow
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||||
}} method.'
|
}} method.'
|
||||||
summary: etcd grpc requests are slow
|
summary: etcd grpc requests are slow
|
||||||
@@ -90,7 +99,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdMemberCommunicationSlow
|
- alert: etcdMemberCommunicationSlow
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": member communication with {{
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||||
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||||
}}.'
|
}}.'
|
||||||
summary: etcd cluster member communication is slow.
|
summary: etcd cluster member communication is slow.
|
||||||
@@ -102,7 +112,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighNumberOfFailedProposals
|
- alert: etcdHighNumberOfFailedProposals
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster has high number of proposal failures.
|
summary: etcd cluster has high number of proposal failures.
|
||||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||||
@@ -111,7 +122,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighFsyncDurations
|
- alert: etcdHighFsyncDurations
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -122,7 +134,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighFsyncDurations
|
- alert: etcdHighFsyncDurations
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -133,7 +146,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdHighCommitDurations
|
- alert: etcdHighCommitDurations
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
summary: etcd cluster 99th percentile commit durations are too high.
|
summary: etcd cluster 99th percentile commit durations are too high.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -144,11 +158,13 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdDatabaseQuotaLowSpace
|
- alert: etcdDatabaseQuotaLowSpace
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||||
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
||||||
quota as the writes to etcd will be disabled when it is full.'
|
quota as the writes to etcd will be disabled when it is full.'
|
||||||
summary: etcd cluster database is running full.
|
summary: etcd cluster database is running full.
|
||||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
expr:
|
||||||
|
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||||
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
||||||
95
|
95
|
||||||
for: 10m
|
for: 10m
|
||||||
@@ -156,26 +172,31 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: etcdExcessiveDatabaseGrowth
|
- alert: etcdExcessiveDatabaseGrowth
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||||
space in the next four hours, based on write observations within the past
|
space in the next four hours, based on write observations within the past
|
||||||
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
||||||
be disruptive.'
|
be disruptive.'
|
||||||
summary: etcd cluster database growing very fast.
|
summary: etcd cluster database growing very fast.
|
||||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
expr:
|
||||||
|
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||||
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdDatabaseHighFragmentationRatio
|
- alert: etcdDatabaseHighFragmentationRatio
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||||
retrieve the unused fragmented disk space.'
|
retrieve the unused fragmented disk space.'
|
||||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||||
summary: etcd database size in use is less than 50% of the actual allocated
|
summary:
|
||||||
|
etcd database size in use is less than 50% of the actual allocated
|
||||||
storage.
|
storage.
|
||||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
expr:
|
||||||
|
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
||||||
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||||
for: 10m
|
for: 10m
|
||||||
|
|||||||
+8
-4
@@ -3,11 +3,13 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: TargetDown
|
- alert: TargetDown
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
description:
|
||||||
|
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||||
summary: One or more targets are unreachable.
|
summary: One or more targets are unreachable.
|
||||||
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
expr:
|
||||||
|
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||||
BY (cluster, job, namespace, service)) > 10
|
BY (cluster, job, namespace, service)) > 10
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
@@ -21,7 +23,8 @@ groups:
|
|||||||
mechanisms that send a notification when this alert is not firing. For example the
|
mechanisms that send a notification when this alert is not firing. For example the
|
||||||
"DeadMansSnitch" integration in PagerDuty.
|
"DeadMansSnitch" integration in PagerDuty.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||||
summary: An alert that should always be firing to certify that Alertmanager
|
summary:
|
||||||
|
An alert that should always be firing to certify that Alertmanager
|
||||||
is working properly.
|
is working properly.
|
||||||
expr: vector(1)
|
expr: vector(1)
|
||||||
labels:
|
labels:
|
||||||
@@ -37,7 +40,8 @@ groups:
|
|||||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||||
summary: Info-level alert inhibition.
|
summary: Info-level alert inhibition.
|
||||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
expr:
|
||||||
|
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||||
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||||
labels:
|
labels:
|
||||||
severity: none
|
severity: none
|
||||||
+40
-21
@@ -3,18 +3,21 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: KubePodCrashLooping
|
- alert: KubePodCrashLooping
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
description:
|
||||||
|
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||||
summary: Pod is crash looping.
|
summary: Pod is crash looping.
|
||||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
expr:
|
||||||
|
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePodNotReady
|
- alert: KubePodNotReady
|
||||||
annotations:
|
annotations:
|
||||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
description:
|
||||||
|
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||||
state for longer than 15 minutes.
|
state for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||||
@@ -31,7 +34,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
- alert: KubeDeploymentGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
description:
|
||||||
|
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
}} does not match, this indicates that the Deployment has failed but has not
|
}} does not match, this indicates that the Deployment has failed but has not
|
||||||
been rolled back.
|
been rolled back.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||||
@@ -45,7 +49,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
description:
|
||||||
|
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||||
not matched the expected number of replicas for longer than 15 minutes.
|
not matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||||
summary: Deployment has not matched the expected number of replicas.
|
summary: Deployment has not matched the expected number of replicas.
|
||||||
@@ -64,7 +69,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDeploymentRolloutStuck
|
- alert: KubeDeploymentRolloutStuck
|
||||||
annotations:
|
annotations:
|
||||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
description:
|
||||||
|
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
}} is not progressing for longer than 15 minutes.
|
}} is not progressing for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||||
summary: Deployment rollout is not progressing.
|
summary: Deployment rollout is not progressing.
|
||||||
@@ -76,7 +82,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
- alert: KubeStatefulSetReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
description:
|
||||||
|
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||||
not matched the expected number of replicas for longer than 15 minutes.
|
not matched the expected number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||||
summary: StatefulSet has not matched the expected number of replicas.
|
summary: StatefulSet has not matched the expected number of replicas.
|
||||||
@@ -95,7 +102,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetGenerationMismatch
|
- alert: KubeStatefulSetGenerationMismatch
|
||||||
annotations:
|
annotations:
|
||||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
description:
|
||||||
|
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||||
}} does not match, this indicates that the StatefulSet has failed but has
|
}} does not match, this indicates that the StatefulSet has failed but has
|
||||||
not been rolled back.
|
not been rolled back.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||||
@@ -109,7 +117,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||||
annotations:
|
annotations:
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
description:
|
||||||
|
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||||
has not been rolled out.
|
has not been rolled out.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||||
summary: StatefulSet update has not been rolled out.
|
summary: StatefulSet update has not been rolled out.
|
||||||
@@ -136,7 +145,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetRolloutStuck
|
- alert: KubeDaemonSetRolloutStuck
|
||||||
annotations:
|
annotations:
|
||||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
description:
|
||||||
|
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||||
finished or progressed for at least 15 minutes.
|
finished or progressed for at least 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||||
summary: DaemonSet rollout is stuck.
|
summary: DaemonSet rollout is stuck.
|
||||||
@@ -169,19 +179,22 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeContainerWaiting
|
- alert: KubeContainerWaiting
|
||||||
annotations:
|
annotations:
|
||||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
description:
|
||||||
|
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||||
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||||
summary: Pod container waiting longer than 1 hour
|
summary: Pod container waiting longer than 1 hour
|
||||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
expr:
|
||||||
|
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||||
namespace=~".*"}) > 0
|
namespace=~".*"}) > 0
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetNotScheduled
|
- alert: KubeDaemonSetNotScheduled
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
description:
|
||||||
}} are not scheduled.'
|
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are not scheduled."
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||||
summary: DaemonSet pods are not scheduled.
|
summary: DaemonSet pods are not scheduled.
|
||||||
expr: |-
|
expr: |-
|
||||||
@@ -193,18 +206,21 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeDaemonSetMisScheduled
|
- alert: KubeDaemonSetMisScheduled
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
description:
|
||||||
}} are running where they are not supposed to run.'
|
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are running where they are not supposed to run."
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||||
summary: DaemonSet pods are misscheduled.
|
summary: DaemonSet pods are misscheduled.
|
||||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
expr:
|
||||||
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
> 0
|
> 0
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeJobNotCompleted
|
- alert: KubeJobNotCompleted
|
||||||
annotations:
|
annotations:
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
description:
|
||||||
|
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||||
than {{ "43200" | humanizeDuration }} to complete.
|
than {{ "43200" | humanizeDuration }} to complete.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||||
summary: Job did not complete in time
|
summary: Job did not complete in time
|
||||||
@@ -216,7 +232,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeJobFailed
|
- alert: KubeJobFailed
|
||||||
annotations:
|
annotations:
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
description:
|
||||||
|
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||||
Removing failed job after investigation should clear this alert.
|
Removing failed job after investigation should clear this alert.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||||
summary: Job failed to complete.
|
summary: Job failed to complete.
|
||||||
@@ -226,7 +243,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeHpaReplicasMismatch
|
- alert: KubeHpaReplicasMismatch
|
||||||
annotations:
|
annotations:
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
description:
|
||||||
|
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||||
has not matched the desired number of replicas for longer than 15 minutes.
|
has not matched the desired number of replicas for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||||
summary: HPA has not matched desired number of replicas.
|
summary: HPA has not matched desired number of replicas.
|
||||||
@@ -249,7 +267,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeHpaMaxedOut
|
- alert: KubeHpaMaxedOut
|
||||||
annotations:
|
annotations:
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
description:
|
||||||
|
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||||
has been running at max replicas for longer than 15 minutes.
|
has been running at max replicas for longer than 15 minutes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||||
summary: HPA is running at max replicas
|
summary: HPA is running at max replicas
|
||||||
|
|||||||
@@ -3,7 +3,8 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: KubeCPUOvercommit
|
- alert: KubeCPUOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
description:
|
||||||
|
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||||
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
@@ -16,7 +17,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryOvercommit
|
- alert: KubeMemoryOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
description:
|
||||||
|
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||||
failure.
|
failure.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||||
@@ -30,7 +32,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeCPUQuotaOvercommit
|
- alert: KubeCPUQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
description:
|
||||||
|
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||||
for Namespaces.
|
for Namespaces.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
@@ -44,7 +47,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeMemoryQuotaOvercommit
|
- alert: KubeMemoryQuotaOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
description:
|
||||||
|
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||||
requests for Namespaces.
|
requests for Namespaces.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
@@ -58,7 +62,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeQuotaAlmostFull
|
- alert: KubeQuotaAlmostFull
|
||||||
annotations:
|
annotations:
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
description:
|
||||||
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
}} of its {{ $labels.resource }} quota.
|
}} of its {{ $labels.resource }} quota.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||||
summary: Namespace quota is going to be full.
|
summary: Namespace quota is going to be full.
|
||||||
@@ -72,7 +77,8 @@ groups:
|
|||||||
severity: info
|
severity: info
|
||||||
- alert: KubeQuotaFullyUsed
|
- alert: KubeQuotaFullyUsed
|
||||||
annotations:
|
annotations:
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
description:
|
||||||
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
}} of its {{ $labels.resource }} quota.
|
}} of its {{ $labels.resource }} quota.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||||
summary: Namespace quota is fully used.
|
summary: Namespace quota is fully used.
|
||||||
@@ -86,7 +92,8 @@ groups:
|
|||||||
severity: info
|
severity: info
|
||||||
- alert: KubeQuotaExceeded
|
- alert: KubeQuotaExceeded
|
||||||
annotations:
|
annotations:
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
description:
|
||||||
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
}} of its {{ $labels.resource }} quota.
|
}} of its {{ $labels.resource }} quota.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||||
summary: Namespace quota has exceeded the limits.
|
summary: Namespace quota has exceeded the limits.
|
||||||
@@ -100,9 +107,10 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: CPUThrottlingHigh
|
- alert: CPUThrottlingHigh
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
description:
|
||||||
|
"{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||||
}}.'
|
}}."
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||||
summary: Processes experience elevated CPU throttling.
|
summary: Processes experience elevated CPU throttling.
|
||||||
expr: |-
|
expr: |-
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
|
|
||||||
groups:
|
groups:
|
||||||
- name: kubernetes-storage
|
- name: kubernetes-storage
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description:
|
||||||
|
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||||
@@ -26,7 +26,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description:
|
||||||
|
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
||||||
| humanizePercentage }} is available.
|
| humanizePercentage }} is available.
|
||||||
@@ -51,7 +52,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
- alert: KubePersistentVolumeInodesFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description:
|
||||||
|
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||||
@@ -73,7 +75,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
- alert: KubePersistentVolumeInodesFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description:
|
||||||
|
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
||||||
{{ $value | humanizePercentage }} of its inodes are free.
|
{{ $value | humanizePercentage }} of its inodes are free.
|
||||||
@@ -98,11 +101,13 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubePersistentVolumeErrors
|
- alert: KubePersistentVolumeErrors
|
||||||
annotations:
|
annotations:
|
||||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
description:
|
||||||
|
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||||
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||||
summary: PersistentVolume is having issues with provisioning.
|
summary: PersistentVolume is having issues with provisioning.
|
||||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
expr:
|
||||||
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||||
> 0
|
> 0
|
||||||
for: 5m
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
|
|||||||
+55
-28
@@ -3,7 +3,8 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: NodeFilesystemSpaceFillingUp
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
space left and is filling up.
|
space left and is filling up.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
@@ -21,7 +22,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFilesystemSpaceFillingUp
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
space left and is filling up fast.
|
space left and is filling up fast.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
@@ -39,7 +41,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: NodeFilesystemAlmostOutOfSpace
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
space left.
|
space left.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
@@ -55,7 +58,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFilesystemAlmostOutOfSpace
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
space left.
|
space left.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
@@ -71,7 +75,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: NodeFilesystemFilesFillingUp
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
inodes left and is filling up.
|
inodes left and is filling up.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
@@ -89,7 +94,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFilesystemFilesFillingUp
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
inodes left and is filling up fast.
|
inodes left and is filling up fast.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
@@ -107,7 +113,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: NodeFilesystemAlmostOutOfFiles
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
inodes left.
|
inodes left.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
@@ -123,7 +130,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFilesystemAlmostOutOfFiles
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
inodes left.
|
inodes left.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
@@ -139,38 +147,44 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: NodeNetworkReceiveErrs
|
- alert: NodeNetworkReceiveErrs
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
description:
|
||||||
|
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||||
summary: Network interface is reporting many receive errors.
|
summary: Network interface is reporting many receive errors.
|
||||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
expr:
|
||||||
|
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||||
> 0.01
|
> 0.01
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeNetworkTransmitErrs
|
- alert: NodeNetworkTransmitErrs
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
description:
|
||||||
|
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||||
summary: Network interface is reporting many transmit errors.
|
summary: Network interface is reporting many transmit errors.
|
||||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
expr:
|
||||||
|
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||||
> 0.01
|
> 0.01
|
||||||
for: 1h
|
for: 1h
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeHighNumberConntrackEntriesUsed
|
- alert: NodeHighNumberConntrackEntriesUsed
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||||
summary: Number of conntrack are getting close to the limit.
|
summary: Number of conntrack are getting close to the limit.
|
||||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
expr:
|
||||||
|
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||||
> 0.75
|
> 0.75
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeTextFileCollectorScrapeError
|
- alert: NodeTextFileCollectorScrapeError
|
||||||
annotations:
|
annotations:
|
||||||
description: Node Exporter text file collector on {{ $labels.instance }} failed
|
description:
|
||||||
|
Node Exporter text file collector on {{ $labels.instance }} failed
|
||||||
to scrape.
|
to scrape.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||||
summary: Node Exporter text file collector failed to scrape.
|
summary: Node Exporter text file collector failed to scrape.
|
||||||
@@ -179,7 +193,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeClockSkewDetected
|
- alert: NodeClockSkewDetected
|
||||||
annotations:
|
annotations:
|
||||||
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
description:
|
||||||
|
Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||||
Ensure NTP is configured correctly on this host.
|
Ensure NTP is configured correctly on this host.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||||
summary: Clock skew detected.
|
summary: Clock skew detected.
|
||||||
@@ -200,7 +215,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeClockNotSynchronising
|
- alert: NodeClockNotSynchronising
|
||||||
annotations:
|
annotations:
|
||||||
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
description:
|
||||||
|
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||||
is configured on this host.
|
is configured on this host.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||||
summary: Clock not synchronising.
|
summary: Clock not synchronising.
|
||||||
@@ -213,12 +229,14 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeRAIDDegraded
|
- alert: NodeRAIDDegraded
|
||||||
annotations:
|
annotations:
|
||||||
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
description:
|
||||||
|
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||||
in degraded state due to one or more disks failures. Number of spare drives
|
in degraded state due to one or more disks failures. Number of spare drives
|
||||||
is insufficient to fix issue automatically.
|
is insufficient to fix issue automatically.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||||
summary: RAID Array is degraded.
|
summary: RAID Array is degraded.
|
||||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
expr:
|
||||||
|
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||||
> 0
|
> 0
|
||||||
for: 15m
|
for: 15m
|
||||||
@@ -226,17 +244,20 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: NodeRAIDDiskFailure
|
- alert: NodeRAIDDiskFailure
|
||||||
annotations:
|
annotations:
|
||||||
description: At least one device in RAID array at {{ $labels.instance }} failed.
|
description:
|
||||||
|
At least one device in RAID array at {{ $labels.instance }} failed.
|
||||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||||
summary: Failed device in RAID array.
|
summary: Failed device in RAID array.
|
||||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
expr:
|
||||||
|
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||||
> 0
|
> 0
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFileDescriptorLimit
|
- alert: NodeFileDescriptorLimit
|
||||||
annotations:
|
annotations:
|
||||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
description:
|
||||||
|
File descriptors limit at {{ $labels.instance }} is currently at
|
||||||
{{ printf "%.2f" $value }}%.
|
{{ printf "%.2f" $value }}%.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
@@ -249,7 +270,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeFileDescriptorLimit
|
- alert: NodeFileDescriptorLimit
|
||||||
annotations:
|
annotations:
|
||||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
description:
|
||||||
|
File descriptors limit at {{ $labels.instance }} is currently at
|
||||||
{{ printf "%.2f" $value }}%.
|
{{ printf "%.2f" $value }}%.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
@@ -266,7 +288,8 @@ groups:
|
|||||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||||
summary: High CPU usage.
|
summary: High CPU usage.
|
||||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
expr:
|
||||||
|
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||||
mode!="idle"}[2m]))) * 100 > 90
|
mode!="idle"}[2m]))) * 100 > 90
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
@@ -301,7 +324,8 @@ groups:
|
|||||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||||
summary: Host is running out of memory.
|
summary: Host is running out of memory.
|
||||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
expr:
|
||||||
|
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||||
* 100) > 90
|
* 100) > 90
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
@@ -313,14 +337,16 @@ groups:
|
|||||||
This symptom might indicate disk saturation.
|
This symptom might indicate disk saturation.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||||
summary: Disk IO queue is high.
|
summary: Disk IO queue is high.
|
||||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
expr:
|
||||||
|
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||||
> 10
|
> 10
|
||||||
for: 30m
|
for: 30m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeSystemdServiceFailed
|
- alert: NodeSystemdServiceFailed
|
||||||
annotations:
|
annotations:
|
||||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
description:
|
||||||
|
Systemd service {{ $labels.name }} has entered failed state at
|
||||||
{{ $labels.instance }}
|
{{ $labels.instance }}
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||||
summary: Systemd service has entered failed state.
|
summary: Systemd service has entered failed state.
|
||||||
@@ -330,7 +356,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeBondingDegraded
|
- alert: NodeBondingDegraded
|
||||||
annotations:
|
annotations:
|
||||||
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
description:
|
||||||
|
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||||
is in degraded state due to one or more slave failures.
|
is in degraded state due to one or more slave failures.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||||
summary: Bonding interface is degraded
|
summary: Bonding interface is degraded
|
||||||
|
|||||||
@@ -8,14 +8,16 @@ groups:
|
|||||||
VALUE = {{ $value }}
|
VALUE = {{ $value }}
|
||||||
LABELS = {{ $labels }}
|
LABELS = {{ $labels }}
|
||||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||||
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
expr:
|
||||||
|
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||||
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
for: 10m
|
for: 10m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: MemoryUtilizationHighWarning
|
- alert: MemoryUtilizationHighWarning
|
||||||
annotations:
|
annotations:
|
||||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
dashboard:
|
||||||
|
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||||
description: Node {{ $labels.instance }} has less than 10% available memory.
|
description: Node {{ $labels.instance }} has less than 10% available memory.
|
||||||
summary: Node Memory utilization warning
|
summary: Node Memory utilization warning
|
||||||
@@ -25,7 +27,8 @@ groups:
|
|||||||
severity: critical
|
severity: critical
|
||||||
- alert: MemoryUtilizationHighCritical
|
- alert: MemoryUtilizationHighCritical
|
||||||
annotations:
|
annotations:
|
||||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
dashboard:
|
||||||
|
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||||
description: Node {{ $labels.instance }} has less than 5% available memory.
|
description: Node {{ $labels.instance }} has less than 5% available memory.
|
||||||
summary: Node Memory utilization critical
|
summary: Node Memory utilization critical
|
||||||
@@ -37,7 +40,8 @@ groups:
|
|||||||
annotations:
|
annotations:
|
||||||
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
||||||
summary: Node has been in not-ready state for longer than 3 minutes
|
summary: Node has been in not-ready state for longer than 3 minutes
|
||||||
expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
expr:
|
||||||
|
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||||
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
||||||
> 0
|
> 0
|
||||||
for: 5m
|
for: 5m
|
||||||
@@ -50,7 +54,8 @@ groups:
|
|||||||
VALUE = {{ $value }}
|
VALUE = {{ $value }}
|
||||||
LABELS = {{ $labels }}
|
LABELS = {{ $labels }}
|
||||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||||
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
expr:
|
||||||
|
kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||||
1
|
1
|
||||||
for: 2m
|
for: 2m
|
||||||
labels:
|
labels:
|
||||||
@@ -62,7 +67,8 @@ groups:
|
|||||||
VALUE = {{ $value }}
|
VALUE = {{ $value }}
|
||||||
LABELS = {{ $labels }}
|
LABELS = {{ $labels }}
|
||||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
expr:
|
||||||
|
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||||
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||||
== 1
|
== 1
|
||||||
for: 0m
|
for: 0m
|
||||||
|
|||||||
+8
-4
@@ -3,18 +3,22 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: VeleroBackupPartialFailures
|
- alert: VeleroBackupPartialFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
message:
|
||||||
|
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||||
failed backups.
|
failed backups.
|
||||||
expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
expr:
|
||||||
|
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||||
> 0.25
|
> 0.25
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: critical
|
severity: critical
|
||||||
- alert: VeleroBackupFailures
|
- alert: VeleroBackupFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
message:
|
||||||
|
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||||
backups.
|
backups.
|
||||||
expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
expr:
|
||||||
|
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||||
> 0.25
|
> 0.25
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
|
|||||||
@@ -3,7 +3,8 @@ groups:
|
|||||||
rules:
|
rules:
|
||||||
- alert: X509ExporterReadErrors
|
- alert: X509ExporterReadErrors
|
||||||
annotations:
|
annotations:
|
||||||
description: Over the last 15 minutes, this x509-certificate-exporter instance
|
description:
|
||||||
|
Over the last 15 minutes, this x509-certificate-exporter instance
|
||||||
has experienced errors reading certificate files or querying the Kubernetes
|
has experienced errors reading certificate files or querying the Kubernetes
|
||||||
API. This could be caused by a misconfiguration if triggered when the exporter
|
API. This could be caused by a misconfiguration if triggered when the exporter
|
||||||
starts.
|
starts.
|
||||||
@@ -14,7 +15,8 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: CertificateError
|
- alert: CertificateError
|
||||||
annotations:
|
annotations:
|
||||||
description: Certificate could not be decoded {{if $labels.secret_name }} in
|
description:
|
||||||
|
Certificate could not be decoded {{if $labels.secret_name }} in
|
||||||
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
||||||
location "{{ $labels.filepath }}"{{end}}
|
location "{{ $labels.filepath }}"{{end}}
|
||||||
summary: Certificate cannot be decoded
|
summary: Certificate cannot be decoded
|
||||||
@@ -24,22 +26,26 @@ groups:
|
|||||||
severity: warning
|
severity: warning
|
||||||
- alert: CertificateRenewal
|
- alert: CertificateRenewal
|
||||||
annotations:
|
annotations:
|
||||||
description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
description:
|
||||||
|
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||||
summary: Certificate should be renewed
|
summary: Certificate should be renewed
|
||||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
expr:
|
||||||
|
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: CertificateExpiration
|
- alert: CertificateExpiration
|
||||||
annotations:
|
annotations:
|
||||||
description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
description:
|
||||||
|
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||||
summary: Certificate is about to expire
|
summary: Certificate is about to expire
|
||||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
expr:
|
||||||
|
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
||||||
for: 15m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
|
|||||||
Reference in New Issue
Block a user