fix(rules/bootstrap): Format yaml

This commit is contained in:
2025-12-29 13:23:04 +01:00
parent f81a4b2732
commit 957526a6bc
22 changed files with 2142 additions and 2044 deletions
+1
View File
@@ -1,6 +1,7 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# the shebang is ignored, but nice for editors # the shebang is ignored, but nice for editors
watch_file nix/sources.json watch_file nix/sources.json
watch_file nix/checks.nix
# Load .env file if it exists # Load .env file if it exists
dotenv_if_exists dotenv_if_exists
+4 -5
View File
@@ -1,6 +1,6 @@
image: image:
name: alpine/helm:latest name: alpine/helm:latest
entrypoint: [ "/bin/bash", "-c" ] entrypoint: ["/bin/bash", "-c"]
stages: stages:
- release - release
@@ -8,9 +8,9 @@ stages:
release: release:
stage: release stage: release
rules: rules:
- if: '$CI_COMMIT_BRANCH =~ /^main/' - if: "$CI_COMMIT_BRANCH =~ /^main/"
when: always when: always
- when: never - when: never
script: script:
- | - |
cd $CI_PROJECT_DIR cd $CI_PROJECT_DIR
@@ -43,4 +43,3 @@ rebuild:
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts" "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
fi fi
done done
+13 -13
View File
@@ -3,16 +3,16 @@ kind: ClusterRole
metadata: metadata:
name: argocd-cluster-admin name: argocd-cluster-admin
rules: rules:
- apiGroups: - apiGroups:
- '*' - "*"
resources: resources:
- '*' - "*"
verbs: verbs:
- '*' - "*"
- nonResourceURLs: - nonResourceURLs:
- '*' - "*"
verbs: verbs:
- '*' - "*"
--- ---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
@@ -23,9 +23,9 @@ roleRef:
kind: ClusterRole kind: ClusterRole
name: argocd-cluster-admin name: argocd-cluster-admin
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: argocd-cluster-admin name: argocd-cluster-admin
namespace: kube-system namespace: kube-system
--- ---
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
-2
View File
@@ -6,5 +6,3 @@ metadata:
name: cluster-admin-token name: cluster-admin-token
namespace: kube-system namespace: kube-system
type: kubernetes.io/service-account-token type: kubernetes.io/service-account-token
-2
View File
@@ -10,5 +10,3 @@ metadata:
name: cluster-ekman name: cluster-ekman
namespace: argocd namespace: argocd
type: Opaque type: Opaque
+24 -25
View File
@@ -9,7 +9,7 @@ spec:
init: init:
# Init always happens immediately before generate, but its output is not treated as manifests. # Init always happens immediately before generate, but its output is not treated as manifests.
# This is a good place to, for example, download chart dependencies. # This is a good place to, for example, download chart dependencies.
command: [ /bin/sh ] command: [/bin/sh]
args: args:
- /plugin/init.sh - /plugin/init.sh
# The generate command runs in the Application source directory each time manifests are generated. Standard output # The generate command runs in the Application source directory each time manifests are generated. Standard output
@@ -17,7 +17,7 @@ spec:
# To write log messages from the command, write them to stderr; they will always be displayed. # To write log messages from the command, write them to stderr; they will always be displayed.
# Error output will be sent to the UI, so avoid printing sensitive information (such as secrets). # Error output will be sent to the UI, so avoid printing sensitive information (such as secrets).
generate: generate:
command: [ /bin/sh ] command: [/bin/sh]
args: args:
- /plugin/generate.sh - /plugin/generate.sh
@@ -27,15 +27,15 @@ spec:
# Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the # Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the
# first (in that order) is evaluated. # first (in that order) is evaluated.
# discover: # discover:
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source # fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
# directory. If there is a match, this plugin may be used for the Application. # directory. If there is a match, this plugin may be used for the Application.
# fileName: "./subdir/s*.yaml" # fileName: "./subdir/s*.yaml"
# find: # find:
# This does the same thing as fileName, but it supports double-star (nested directory) glob patterns. # This does the same thing as fileName, but it supports double-star (nested directory) glob patterns.
# glob: "**/Chart.yaml" # glob: "**/Chart.yaml"
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_ # The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
# produce non-empty output to standard out. # produce non-empty output to standard out.
# command: [sh, -c, find . -name env.yaml] # command: [sh, -c, find . -name env.yaml]
# The parameters config describes what parameters the UI should display for an Application. It is up to the user to # The parameters config describes what parameters the UI should display for an Application. It is up to the user to
# actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_ # actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_
# inform the "Parameters" tab in the App Details page of the UI. # inform the "Parameters" tab in the App Details page of the UI.
@@ -66,22 +66,21 @@ spec:
itemType: string itemType: string
collectionType: string collectionType: string
string: "" string: ""
# All the fields above besides "string" apply to both the array and map type parameter announcements. # All the fields above besides 'string' apply to both the array and map type parameter announcements.
# - name: array-param # - name: array-param
# # This field communicates the parameter's default value to the UI. Setting this field is optional. # # This field communicates the parameter's default value to the UI. Setting this field is optional.
# array: [default, items] # array: [default, items]
# collectionType: array # collectionType: array
# - name: map-param # - name: map-param
# # This field communicates the parameter's default value to the UI. Setting this field is optional. # # This field communicates the parameter's default value to the UI. Setting this field is optional.
# map: # map:
# some: value # some: value
# collectionType: map # collectionType: map
# dynamic: # dynamic:
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the # The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
# static parameter announcements list. # static parameter announcements list.
# command: [ /bin/sh, /plugin/get-values.sh ] # command: [ /bin/sh, /plugin/get-values.sh ]
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository # If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
# might have executable files. Set to true only if you trust the CMP plugin authors. # might have executable files. Set to true only if you trust the CMP plugin authors.
preserveFileMode: false preserveFileMode: false
+417 -417
View File
@@ -45,432 +45,432 @@ spec:
affinity: affinity:
podAntiAffinity: podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm: - podAffinityTerm:
labelSelector: labelSelector:
matchLabels: matchLabels:
app.kubernetes.io/name: argocd-repo-server app.kubernetes.io/name: argocd-repo-server
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
weight: 100 weight: 100
automountServiceAccountToken: true automountServiceAccountToken: true
containers: containers:
- args: - args:
- /usr/local/bin/argocd-repo-server - /usr/local/bin/argocd-repo-server
- --port=8081 - --port=8081
- --metrics-port=8084 - --metrics-port=8084
env: env:
- name: ARGOCD_REPO_SERVER_NAME - name: ARGOCD_REPO_SERVER_NAME
value: argocd-repo-server value: argocd-repo-server
- name: ARGOCD_RECONCILIATION_TIMEOUT - name: ARGOCD_RECONCILIATION_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: timeout.reconciliation key: timeout.reconciliation
name: argocd-cm name: argocd-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGFORMAT - name: ARGOCD_REPO_SERVER_LOGFORMAT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.format key: reposerver.log.format
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGLEVEL - name: ARGOCD_REPO_SERVER_LOGLEVEL
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.level key: reposerver.log.level
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.parallelism.limit key: reposerver.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.listen.address key: reposerver.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.metrics.listen.address key: reposerver.metrics.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_TLS - name: ARGOCD_REPO_SERVER_DISABLE_TLS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.tls key: reposerver.disable.tls
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MIN_VERSION - name: ARGOCD_TLS_MIN_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.minversion key: reposerver.tls.minversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MAX_VERSION - name: ARGOCD_TLS_MAX_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.maxversion key: reposerver.tls.maxversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_CIPHERS - name: ARGOCD_TLS_CIPHERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.ciphers key: reposerver.tls.ciphers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_CACHE_EXPIRATION - name: ARGOCD_REPO_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.repo.cache.expiration key: reposerver.repo.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_SERVER - name: REDIS_SERVER
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.server key: redis.server
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_COMPRESSION - name: REDIS_COMPRESSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.compression key: redis.compression
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDISDB - name: REDISDB
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.db key: redis.db
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_USERNAME - name: REDIS_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-username key: redis-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_PASSWORD - name: REDIS_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: auth key: auth
name: argocd-redis name: argocd-redis
- name: REDIS_SENTINEL_USERNAME - name: REDIS_SENTINEL_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-sentinel-username key: redis-sentinel-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_SENTINEL_PASSWORD - name: REDIS_SENTINEL_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-sentinel-password key: redis-sentinel-password
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION - name: ARGOCD_DEFAULT_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.default.cache.expiration key: reposerver.default.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.address key: otlp.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE - name: ARGOCD_REPO_SERVER_OTLP_INSECURE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.insecure key: otlp.insecure
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS - name: ARGOCD_REPO_SERVER_OTLP_HEADERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.headers key: otlp.headers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.max.combined.directory.manifests.size key: reposerver.max.combined.directory.manifests.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.plugin.tar.exclusions key: reposerver.plugin.tar.exclusions
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.allow.oob.symlinks key: reposerver.allow.oob.symlinks
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.tar.size key: reposerver.streamed.manifest.max.tar.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.extracted.size key: reposerver.streamed.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.helm.manifest.max.extracted.size key: reposerver.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.helm.manifest.max.extracted.size key: reposerver.disable.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_MODULES_ENABLED - name: ARGOCD_GIT_MODULES_ENABLED
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.enable.git.submodule key: reposerver.enable.git.submodule
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.lsremote.parallelism.limit key: reposerver.git.lsremote.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_REQUEST_TIMEOUT - name: ARGOCD_GIT_REQUEST_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.request.timeout key: reposerver.git.request.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT - name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.revision.cache.lock.timeout key: reposerver.revision.cache.lock.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES - name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.include.hidden.directories key: reposerver.include.hidden.directories
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: HELM_CACHE_HOME - name: HELM_CACHE_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_CONFIG_HOME - name: HELM_CONFIG_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_DATA_HOME - name: HELM_DATA_HOME
value: /helm-working-dir value: /helm-working-dir
image: quay.io/argoproj/argocd:v2.12.3 image: quay.io/argoproj/argocd:v2.12.3
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
livenessProbe: livenessProbe:
failureThreshold: 3 failureThreshold: 3
httpGet: httpGet:
path: /healthz?full=true path: /healthz?full=true
port: metrics port: metrics
scheme: HTTP scheme: HTTP
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
successThreshold: 1 successThreshold: 1
timeoutSeconds: 1 timeoutSeconds: 1
name: repo-server
ports:
- containerPort: 8081
name: repo-server name: repo-server
protocol: TCP ports:
- containerPort: 8084 - containerPort: 8081
name: metrics name: repo-server
protocol: TCP protocol: TCP
readinessProbe: - containerPort: 8084
failureThreshold: 3 name: metrics
httpGet: protocol: TCP
path: /healthz readinessProbe:
port: metrics failureThreshold: 3
scheme: HTTP httpGet:
initialDelaySeconds: 10 path: /healthz
periodSeconds: 10 port: metrics
successThreshold: 1 scheme: HTTP
timeoutSeconds: 1 initialDelaySeconds: 10
securityContext: periodSeconds: 10
allowPrivilegeEscalation: false successThreshold: 1
capabilities: timeoutSeconds: 1
drop: securityContext:
- ALL allowPrivilegeEscalation: false
readOnlyRootFilesystem: true capabilities:
runAsNonRoot: true drop:
seccompProfile: - ALL
type: RuntimeDefault readOnlyRootFilesystem: true
terminationMessagePath: /dev/termination-log runAsNonRoot: true
terminationMessagePolicy: File seccompProfile:
volumeMounts: type: RuntimeDefault
- mountPath: /app/config/ssh terminationMessagePath: /dev/termination-log
name: ssh-known-hosts terminationMessagePolicy: File
- mountPath: /app/config/tls volumeMounts:
name: tls-certs - mountPath: /app/config/ssh
- mountPath: /app/config/gpg/source name: ssh-known-hosts
name: gpg-keys - mountPath: /app/config/tls
- mountPath: /app/config/gpg/keys name: tls-certs
name: gpg-keyring - mountPath: /app/config/gpg/source
- mountPath: /app/config/reposerver/tls name: gpg-keys
name: argocd-repo-server-tls - mountPath: /app/config/gpg/keys
- mountPath: /helm-working-dir name: gpg-keyring
name: helm-working-dir - mountPath: /app/config/reposerver/tls
- mountPath: /home/argocd/cmp-server/plugins name: argocd-repo-server-tls
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: tmp - mountPath: /home/argocd/cmp-server/plugins
- command: name: plugins
- /var/run/argocd/argocd-cmp-server - mountPath: /tmp
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest name: tmp
imagePullPolicy: Always - command:
name: kustomize-helm-with-rewrite - /var/run/argocd/argocd-cmp-server
securityContext: image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
runAsNonRoot: true imagePullPolicy: Always
runAsUser: 999 name: kustomize-helm-with-rewrite
terminationMessagePath: /dev/termination-log securityContext:
terminationMessagePolicy: File runAsNonRoot: true
volumeMounts: runAsUser: 999
- mountPath: /var/run/argocd terminationMessagePath: /dev/termination-log
name: var-files terminationMessagePolicy: File
- mountPath: /home/argocd/cmp-server/plugins volumeMounts:
name: plugins - mountPath: /var/run/argocd
- mountPath: /tmp name: var-files
name: cmp-tmp - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
- command: name: cmp-tmp
- /var/run/argocd/argocd-cmp-server - mountPath: /helm-working-dir
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest name: helm-working-dir
imagePullPolicy: Always - command:
name: helm-kustomize-cmp - /var/run/argocd/argocd-cmp-server
securityContext: image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
runAsNonRoot: true imagePullPolicy: Always
runAsUser: 999 name: helm-kustomize-cmp
terminationMessagePath: /dev/termination-log securityContext:
terminationMessagePolicy: File runAsNonRoot: true
volumeMounts: runAsUser: 999
- mountPath: /var/run/argocd terminationMessagePath: /dev/termination-log
name: var-files terminationMessagePolicy: File
- mountPath: /home/argocd/cmp-server/plugins volumeMounts:
name: plugins - mountPath: /var/run/argocd
- mountPath: /tmp name: var-files
name: cmp-tmp - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
- command: name: cmp-tmp
- /var/run/argocd/argocd-cmp-server - mountPath: /helm-working-dir
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest name: helm-working-dir
imagePullPolicy: Always - command:
name: helmfile-cmp - /var/run/argocd/argocd-cmp-server
securityContext: image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
runAsNonRoot: true imagePullPolicy: Always
runAsUser: 999 name: helmfile-cmp
terminationMessagePath: /dev/termination-log securityContext:
terminationMessagePolicy: File runAsNonRoot: true
volumeMounts: runAsUser: 999
- mountPath: /var/run/argocd terminationMessagePath: /dev/termination-log
name: var-files terminationMessagePolicy: File
- mountPath: /home/argocd/cmp-server/plugins volumeMounts:
name: plugins - mountPath: /var/run/argocd
- mountPath: /tmp name: var-files
name: cmp-tmp - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
name: cmp-tmp
- mountPath: /helm-working-dir
name: helm-working-dir
dnsPolicy: ClusterFirst dnsPolicy: ClusterFirst
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
initContainers: initContainers:
- command: - command:
- /bin/cp - /bin/cp
- -n - -n
- /usr/local/bin/argocd - /usr/local/bin/argocd
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: quay.io/argoproj/argocd:v2.12.3 image: quay.io/argoproj/argocd:v2.12.3
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
name: copyutil name: copyutil
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- command: - command:
- /bin/sh - /bin/sh
- /plugin/init-helm-repos.sh - /plugin/init-helm-repos.sh
env: env:
- name: OCEANBOX_HELM_ACCESS_TOKEN - name: OCEANBOX_HELM_ACCESS_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: token key: token
name: oceanbox-helm name: oceanbox-helm
optional: false optional: false
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
imagePullPolicy: Always imagePullPolicy: Always
name: init-helm-repos name: init-helm-repos
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
runAsUser: 999 runAsUser: 999
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
restartPolicy: Always restartPolicy: Always
schedulerName: default-scheduler schedulerName: default-scheduler
serviceAccount: argocd-repo-server serviceAccount: argocd-repo-server
serviceAccountName: argocd-repo-server serviceAccountName: argocd-repo-server
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
volumes: volumes:
- name: cmp-tmp - name: cmp-tmp
- name: helm-working-dir - name: helm-working-dir
- name: plugins - name: plugins
- name: var-files - name: var-files
- name: tmp - name: tmp
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-ssh-known-hosts-cm name: argocd-ssh-known-hosts-cm
name: ssh-known-hosts name: ssh-known-hosts
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-tls-certs-cm name: argocd-tls-certs-cm
name: tls-certs name: tls-certs
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-gpg-keys-cm name: argocd-gpg-keys-cm
name: gpg-keys name: gpg-keys
- name: gpg-keyring - name: gpg-keyring
- name: argocd-repo-server-tls - name: argocd-repo-server-tls
secret: secret:
defaultMode: 420 defaultMode: 420
items: items:
- key: tls.crt - key: tls.crt
path: tls.crt path: tls.crt
- key: tls.key - key: tls.key
path: tls.key path: tls.key
- key: ca.crt - key: ca.crt
path: ca.crt path: ca.crt
optional: true optional: true
secretName: argocd-repo-server-tls secretName: argocd-repo-server-tls
@@ -4,24 +4,24 @@ spec:
template: template:
spec: spec:
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
containers: containers:
- command: - command:
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
imagePullPolicy: Always imagePullPolicy: Always
name: helmfile-cmp name: helmfile-cmp
securityContext: securityContext:
runAsNonRoot: true runAsNonRoot: true
runAsUser: 999 runAsUser: 999
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- mountPath: /home/argocd/cmp-server/plugins - mountPath: /home/argocd/cmp-server/plugins
name: plugins name: plugins
- mountPath: /tmp - mountPath: /tmp
name: tmp name: tmp
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
+1 -1
View File
@@ -4,7 +4,7 @@ metadata:
name: helmfile-cmp name: helmfile-cmp
spec: spec:
generate: generate:
command: [ "/bin/sh" ] command: ["/bin/sh"]
args: args:
- /plugin/generate.sh - /plugin/generate.sh
lockRepo: false lockRepo: false
@@ -44,341 +44,341 @@ spec:
affinity: affinity:
podAntiAffinity: podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm: - podAffinityTerm:
labelSelector: labelSelector:
matchLabels: matchLabels:
app.kubernetes.io/name: argocd-repo-server app.kubernetes.io/name: argocd-repo-server
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
weight: 100 weight: 100
containers: containers:
- args: - args:
- /usr/local/bin/argocd-repo-server - /usr/local/bin/argocd-repo-server
- --port=8081 - --port=8081
- --metrics-port=8084 - --metrics-port=8084
env: env:
- name: ARGOCD_REPO_SERVER_NAME - name: ARGOCD_REPO_SERVER_NAME
value: argocd-repo-server value: argocd-repo-server
- name: ARGOCD_RECONCILIATION_TIMEOUT - name: ARGOCD_RECONCILIATION_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: timeout.reconciliation key: timeout.reconciliation
name: argocd-cm name: argocd-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGFORMAT - name: ARGOCD_REPO_SERVER_LOGFORMAT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.format key: reposerver.log.format
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGLEVEL - name: ARGOCD_REPO_SERVER_LOGLEVEL
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.level key: reposerver.log.level
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.parallelism.limit key: reposerver.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.listen.address key: reposerver.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.metrics.listen.address key: reposerver.metrics.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_TLS - name: ARGOCD_REPO_SERVER_DISABLE_TLS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.tls key: reposerver.disable.tls
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MIN_VERSION - name: ARGOCD_TLS_MIN_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.minversion key: reposerver.tls.minversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MAX_VERSION - name: ARGOCD_TLS_MAX_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.maxversion key: reposerver.tls.maxversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_CIPHERS - name: ARGOCD_TLS_CIPHERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.ciphers key: reposerver.tls.ciphers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_CACHE_EXPIRATION - name: ARGOCD_REPO_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.repo.cache.expiration key: reposerver.repo.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_SERVER - name: REDIS_SERVER
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.server key: redis.server
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_COMPRESSION - name: REDIS_COMPRESSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.compression key: redis.compression
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDISDB - name: REDISDB
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.db key: redis.db
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_USERNAME - name: REDIS_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-username key: redis-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_PASSWORD - name: REDIS_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-password key: redis-password
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION - name: ARGOCD_DEFAULT_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.default.cache.expiration key: reposerver.default.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.address key: otlp.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE - name: ARGOCD_REPO_SERVER_OTLP_INSECURE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.insecure key: otlp.insecure
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS - name: ARGOCD_REPO_SERVER_OTLP_HEADERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.headers key: otlp.headers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.max.combined.directory.manifests.size key: reposerver.max.combined.directory.manifests.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.plugin.tar.exclusions key: reposerver.plugin.tar.exclusions
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.allow.oob.symlinks key: reposerver.allow.oob.symlinks
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.tar.size key: reposerver.streamed.manifest.max.tar.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.extracted.size key: reposerver.streamed.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.helm.manifest.max.extracted.size key: reposerver.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.helm.manifest.max.extracted.size key: reposerver.disable.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_MODULES_ENABLED - name: ARGOCD_GIT_MODULES_ENABLED
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.enable.git.submodule key: reposerver.enable.git.submodule
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.lsremote.parallelism.limit key: reposerver.git.lsremote.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_REQUEST_TIMEOUT - name: ARGOCD_GIT_REQUEST_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.request.timeout key: reposerver.git.request.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: HELM_CACHE_HOME - name: HELM_CACHE_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_CONFIG_HOME - name: HELM_CONFIG_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_DATA_HOME - name: HELM_DATA_HOME
value: /helm-working-dir value: /helm-working-dir
image: quay.io/argoproj/argocd:v2.10.4 image: quay.io/argoproj/argocd:v2.10.4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
livenessProbe: livenessProbe:
failureThreshold: 3 failureThreshold: 3
httpGet: httpGet:
path: /healthz?full=true path: /healthz?full=true
port: metrics port: metrics
scheme: HTTP scheme: HTTP
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
successThreshold: 1 successThreshold: 1
timeoutSeconds: 1 timeoutSeconds: 1
name: repo-server
ports:
- containerPort: 8081
name: repo-server name: repo-server
protocol: TCP ports:
- containerPort: 8084 - containerPort: 8081
name: metrics name: repo-server
protocol: TCP protocol: TCP
readinessProbe: - containerPort: 8084
failureThreshold: 3 name: metrics
httpGet: protocol: TCP
path: /healthz readinessProbe:
port: metrics failureThreshold: 3
scheme: HTTP httpGet:
initialDelaySeconds: 10 path: /healthz
periodSeconds: 10 port: metrics
successThreshold: 1 scheme: HTTP
timeoutSeconds: 1 initialDelaySeconds: 10
resources: {} periodSeconds: 10
securityContext: successThreshold: 1
allowPrivilegeEscalation: false timeoutSeconds: 1
capabilities: resources: {}
drop: securityContext:
- ALL allowPrivilegeEscalation: false
readOnlyRootFilesystem: true capabilities:
runAsNonRoot: true drop:
seccompProfile: - ALL
type: RuntimeDefault readOnlyRootFilesystem: true
terminationMessagePath: /dev/termination-log runAsNonRoot: true
terminationMessagePolicy: File seccompProfile:
volumeMounts: type: RuntimeDefault
- mountPath: /app/config/ssh terminationMessagePath: /dev/termination-log
name: ssh-known-hosts terminationMessagePolicy: File
- mountPath: /app/config/tls volumeMounts:
name: tls-certs - mountPath: /app/config/ssh
- mountPath: /app/config/gpg/source name: ssh-known-hosts
name: gpg-keys - mountPath: /app/config/tls
- mountPath: /app/config/gpg/keys name: tls-certs
name: gpg-keyring - mountPath: /app/config/gpg/source
- mountPath: /app/config/reposerver/tls name: gpg-keys
name: argocd-repo-server-tls - mountPath: /app/config/gpg/keys
- mountPath: /helm-working-dir name: gpg-keyring
name: helm-working-dir - mountPath: /app/config/reposerver/tls
- mountPath: /home/argocd/cmp-server/plugins name: argocd-repo-server-tls
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: tmp - mountPath: /home/argocd/cmp-server/plugins
- command: name: plugins
- /var/run/argocd/argocd-cmp-server - mountPath: /tmp
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest name: tmp
imagePullPolicy: Always - command:
name: kustomize-helm-with-rewrite - /var/run/argocd/argocd-cmp-server
resources: {} image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
securityContext: imagePullPolicy: Always
runAsNonRoot: true name: kustomize-helm-with-rewrite
runAsUser: 999 resources: {}
terminationMessagePath: /dev/termination-log securityContext:
terminationMessagePolicy: File runAsNonRoot: true
volumeMounts: runAsUser: 999
- mountPath: /var/run/argocd terminationMessagePath: /dev/termination-log
name: var-files terminationMessagePolicy: File
- mountPath: /home/argocd/cmp-server/plugins volumeMounts:
name: plugins - mountPath: /var/run/argocd
- mountPath: /tmp name: var-files
name: cmp-tmp - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
name: cmp-tmp
- mountPath: /helm-working-dir
name: helm-working-dir
dnsPolicy: ClusterFirst dnsPolicy: ClusterFirst
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
initContainers: initContainers:
- command: - command:
- /bin/cp - /bin/cp
- -n - -n
- /usr/local/bin/argocd - /usr/local/bin/argocd
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: quay.io/argoproj/argocd:v2.10.4 image: quay.io/argoproj/argocd:v2.10.4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
name: copyutil name: copyutil
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- command: - command:
- /bin/sh - /bin/sh
- /plugin/init-helm-repos.sh - /plugin/init-helm-repos.sh
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
imagePullPolicy: Always imagePullPolicy: Always
name: init-helm-repos name: init-helm-repos
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsUser: 999 runAsUser: 999
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
env: env:
- name: OCEANBOX_HELM_ACCESS_TOKEN - name: OCEANBOX_HELM_ACCESS_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: token key: token
name: oceanbox-helm name: oceanbox-helm
optional: false optional: false
volumeMounts: volumeMounts:
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
restartPolicy: Always restartPolicy: Always
schedulerName: default-scheduler schedulerName: default-scheduler
securityContext: {} securityContext: {}
@@ -386,40 +386,39 @@ spec:
serviceAccountName: argocd-repo-server serviceAccountName: argocd-repo-server
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
volumes: volumes:
- emptyDir: {} - emptyDir: {}
name: cmp-tmp name: cmp-tmp
- emptyDir: {} - emptyDir: {}
name: helm-working-dir name: helm-working-dir
- emptyDir: {} - emptyDir: {}
name: plugins name: plugins
- emptyDir: {} - emptyDir: {}
name: var-files name: var-files
- emptyDir: {} - emptyDir: {}
name: tmp name: tmp
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-ssh-known-hosts-cm name: argocd-ssh-known-hosts-cm
name: ssh-known-hosts name: ssh-known-hosts
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-tls-certs-cm name: argocd-tls-certs-cm
name: tls-certs name: tls-certs
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-gpg-keys-cm name: argocd-gpg-keys-cm
name: gpg-keys name: gpg-keys
- emptyDir: {} - emptyDir: {}
name: gpg-keyring name: gpg-keyring
- name: argocd-repo-server-tls - name: argocd-repo-server-tls
secret: secret:
defaultMode: 420 defaultMode: 420
items: items:
- key: tls.crt - key: tls.crt
path: tls.crt path: tls.crt
- key: tls.key - key: tls.key
path: tls.key path: tls.key
- key: ca.crt - key: ca.crt
path: ca.crt path: ca.crt
optional: true optional: true
secretName: argocd-repo-server-tls secretName: argocd-repo-server-tls
-1
View File
@@ -13,4 +13,3 @@ stringData:
name: staging-vcluster name: staging-vcluster
server: https://staging-vcluster.staging-vcluster server: https://staging-vcluster.staging-vcluster
type: Opaque type: Opaque
+11 -11
View File
@@ -19,12 +19,12 @@ applications:
plugin: plugin:
name: helmfile-cmp name: helmfile-cmp
env: env:
- name: CLUSTER_NAME - name: CLUSTER_NAME
value: replaceme value: replaceme
- name: HELMFILE_ENVIRONMENT - name: HELMFILE_ENVIRONMENT
value: default value: default
- name: HELMFILE_FILE_PATH - name: HELMFILE_FILE_PATH
value: system.yaml.gotmpl value: system.yaml.gotmpl
projects: projects:
sys: sys:
namespace: argocd namespace: argocd
@@ -32,12 +32,12 @@ projects:
additionalAnnotations: {} additionalAnnotations: {}
description: sys components project description: sys components project
sourceRepos: sourceRepos:
- '*' - "*"
destinations: destinations:
- namespace: '*' - namespace: "*"
server: https://kubernetes.default.svc server: https://kubernetes.default.svc
clusterResourceWhitelist: clusterResourceWhitelist:
- group: '*' - group: "*"
kind: '*' kind: "*"
orphanedResources: orphanedResources:
warn: false warn: false
+8 -3
View File
@@ -5,6 +5,8 @@ let
globalExcludes = [ globalExcludes = [
"nix/default.nix" "nix/default.nix"
"attic"
"vcluster"
".*vendor" ".*vendor"
".*chart/.*" ".*chart/.*"
".*schema.json" ".*schema.json"
@@ -32,6 +34,7 @@ pre-commit.run {
enable = true; enable = true;
excludes = [ excludes = [
"vcluster/" "vcluster/"
"attic/"
]; ];
args = [ args = [
"-x" "-x"
@@ -41,15 +44,17 @@ pre-commit.run {
}; };
yamllint = { yamllint = {
enable = false; enable = true;
excludes = [ excludes = [
"attic/" "attic/"
"charts/templates/" "charts/templates/"
"charts/charts/" "charts/"
"values/"
"vcluster/"
]; ];
settings = { settings = {
strict = true; strict = true;
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }''; configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
}; };
}; };
+203 -182
View File
@@ -1,183 +1,204 @@
groups: groups:
- name: etcd - name: etcd
rules: rules:
- alert: etcdMembersDown - alert: etcdMembersDown
annotations: annotations:
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value description:
}}).' 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
summary: etcd cluster members are down. }}).'
expr: |- summary: etcd cluster members are down.
max without (endpoint) ( expr: |-
sum without (instance) (up{job=~".*etcd.*"} == bool 0) max without (endpoint) (
or sum without (instance) (up{job=~".*etcd.*"} == bool 0)
count without (To) ( or
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 count without (To) (
) sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
) )
> 0 )
for: 10m > 0
labels: for: 10m
severity: critical labels:
- alert: etcdInsufficientMembers severity: critical
annotations: - alert: etcdInsufficientMembers
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value annotations:
}}).' description:
summary: etcd cluster has insufficient number of members. 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) }}).'
without (instance) + 1) / 2) summary: etcd cluster has insufficient number of members.
for: 3m expr:
labels: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
severity: critical without (instance) + 1) / 2)
- alert: etcdNoLeader for: 3m
annotations: labels:
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} severity: critical
has no leader.' - alert: etcdNoLeader
summary: etcd cluster has no leader. annotations:
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 description:
for: 1m 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
labels: has no leader.'
severity: critical summary: etcd cluster has no leader.
- alert: etcdHighNumberOfLeaderChanges expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
annotations: for: 1m
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes labels:
within the last 15 minutes. Frequent elections may be a sign of insufficient severity: critical
resources, high network latency, or disruptions by other components and should - alert: etcdHighNumberOfLeaderChanges
be investigated.' annotations:
summary: etcd cluster has high number of leader changes. description:
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) within the last 15 minutes. Frequent elections may be a sign of insufficient
>= 4 resources, high network latency, or disruptions by other components and should
for: 5m be investigated.'
labels: summary: etcd cluster has high number of leader changes.
severity: warning expr:
- alert: etcdHighNumberOfFailedGRPCRequests increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
annotations: or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for >= 4
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' for: 5m
summary: etcd cluster has high number of failed grpc requests. labels:
expr: |- severity: warning
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - alert: etcdHighNumberOfFailedGRPCRequests
/ annotations:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) description:
> 1 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
for: 10m {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
labels: summary: etcd cluster has high number of failed grpc requests.
severity: warning expr: |-
- alert: etcdHighNumberOfFailedGRPCRequests 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
annotations: /
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' > 1
summary: etcd cluster has high number of failed grpc requests. for: 10m
expr: |- labels:
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) severity: warning
/ - alert: etcdHighNumberOfFailedGRPCRequests
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) annotations:
> 5 description:
for: 5m 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
labels: {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
severity: critical summary: etcd cluster has high number of failed grpc requests.
- alert: etcdGRPCRequestsSlow expr: |-
annotations: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests /
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
}} method.' > 5
summary: etcd grpc requests are slow for: 5m
expr: |- labels:
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) severity: critical
> 0.15 - alert: etcdGRPCRequestsSlow
for: 10m annotations:
labels: description:
severity: critical 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
- alert: etcdMemberCommunicationSlow is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
annotations: }} method.'
description: 'etcd cluster "{{ $labels.job }}": member communication with {{ summary: etcd grpc requests are slow
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance expr: |-
}}.' histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
summary: etcd cluster member communication is slow. > 0.15
expr: |- for: 10m
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) labels:
> 0.15 severity: critical
for: 10m - alert: etcdMemberCommunicationSlow
labels: annotations:
severity: warning description:
- alert: etcdHighNumberOfFailedProposals 'etcd cluster "{{ $labels.job }}": member communication with {{
annotations: $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures }}.'
within the last 30 minutes on etcd instance {{ $labels.instance }}.' summary: etcd cluster member communication is slow.
summary: etcd cluster has high number of proposal failures. expr: |-
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
for: 15m > 0.15
labels: for: 10m
severity: warning labels:
- alert: etcdHighFsyncDurations severity: warning
annotations: - alert: etcdHighNumberOfFailedProposals
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations annotations:
are {{ $value }}s on etcd instance {{ $labels.instance }}.' description:
summary: etcd cluster 99th percentile fsync durations are too high. 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
expr: |- within the last 30 minutes on etcd instance {{ $labels.instance }}.'
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) summary: etcd cluster has high number of proposal failures.
> 0.5 expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
for: 10m for: 15m
labels: labels:
severity: warning severity: warning
- alert: etcdHighFsyncDurations - alert: etcdHighFsyncDurations
annotations: annotations:
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations description:
are {{ $value }}s on etcd instance {{ $labels.instance }}.' 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
summary: etcd cluster 99th percentile fsync durations are too high. are {{ $value }}s on etcd instance {{ $labels.instance }}.'
expr: |- summary: etcd cluster 99th percentile fsync durations are too high.
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) expr: |-
> 1 histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
for: 10m > 0.5
labels: for: 10m
severity: critical labels:
- alert: etcdHighCommitDurations severity: warning
annotations: - alert: etcdHighFsyncDurations
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations annotations:
{{ $value }}s on etcd instance {{ $labels.instance }}.' description:
summary: etcd cluster 99th percentile commit durations are too high. 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
expr: |- are {{ $value }}s on etcd instance {{ $labels.instance }}.'
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) summary: etcd cluster 99th percentile fsync durations are too high.
> 0.25 expr: |-
for: 10m histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
labels: > 1
severity: warning for: 10m
- alert: etcdDatabaseQuotaLowSpace labels:
annotations: severity: critical
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined - alert: etcdHighCommitDurations
quota on etcd instance {{ $labels.instance }}, please defrag or increase the annotations:
quota as the writes to etcd will be disabled when it is full.' description:
summary: etcd cluster database is running full. 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / {{ $value }}s on etcd instance {{ $labels.instance }}.'
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > summary: etcd cluster 99th percentile commit durations are too high.
95 expr: |-
for: 10m histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
labels: > 0.25
severity: critical for: 10m
- alert: etcdExcessiveDatabaseGrowth labels:
annotations: severity: warning
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk - alert: etcdDatabaseQuotaLowSpace
space in the next four hours, based on write observations within the past annotations:
four hours on etcd instance {{ $labels.instance }}, please check as it might description:
be disruptive.' 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
summary: etcd cluster database growing very fast. quota on etcd instance {{ $labels.instance }}, please defrag or increase the
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) quota as the writes to etcd will be disabled when it is full.'
> etcd_server_quota_backend_bytes{job=~".*etcd.*"} summary: etcd cluster database is running full.
for: 10m expr:
labels: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
severity: warning last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
- alert: etcdDatabaseHighFragmentationRatio 95
annotations: for: 10m
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance labels:
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual severity: critical
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to - alert: etcdExcessiveDatabaseGrowth
retrieve the unused fragmented disk space.' annotations:
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation description:
summary: etcd database size in use is less than 50% of the actual allocated 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
storage. space in the next four hours, based on write observations within the past
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) four hours on etcd instance {{ $labels.instance }}, please check as it might
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 be disruptive.'
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 summary: etcd cluster database growing very fast.
for: 10m expr:
labels: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
severity: warning > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description:
'etcd cluster "{{ $labels.job }}": database size in use on instance
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary:
etcd database size in use is less than 50% of the actual allocated
storage.
expr:
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning
+46 -42
View File
@@ -1,43 +1,47 @@
groups: groups:
- name: general.rules - name: general.rules
rules: rules:
- alert: TargetDown - alert: TargetDown
annotations: annotations:
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service description:
}} targets in {{ $labels.namespace }} namespace are down.' '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown }} targets in {{ $labels.namespace }} namespace are down.'
summary: One or more targets are unreachable. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) summary: One or more targets are unreachable.
BY (cluster, job, namespace, service)) > 10 expr:
for: 10m 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
labels: BY (cluster, job, namespace, service)) > 10
severity: warning for: 10m
- alert: Watchdog labels:
annotations: severity: warning
description: | - alert: Watchdog
This is an alert meant to ensure that the entire alerting pipeline is functional. annotations:
This alert is always firing, therefore it should always be firing in Alertmanager description: |
and always fire against a receiver. There are integrations with various notification This is an alert meant to ensure that the entire alerting pipeline is functional.
mechanisms that send a notification when this alert is not firing. For example the This alert is always firing, therefore it should always be firing in Alertmanager
"DeadMansSnitch" integration in PagerDuty. and always fire against a receiver. There are integrations with various notification
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog mechanisms that send a notification when this alert is not firing. For example the
summary: An alert that should always be firing to certify that Alertmanager "DeadMansSnitch" integration in PagerDuty.
is working properly. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
expr: vector(1) summary:
labels: An alert that should always be firing to certify that Alertmanager
severity: none is working properly.
- alert: InfoInhibitor expr: vector(1)
annotations: labels:
description: | severity: none
This is an alert that is used to inhibit info alerts. - alert: InfoInhibitor
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with annotations:
other alerts. description: |
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a This is an alert that is used to inhibit info alerts.
severity of 'warning' or 'critical' starts firing on the same namespace. By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". other alerts.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
summary: Info-level alert inhibition. severity of 'warning' or 'critical' starts firing on the same namespace.
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
labels: summary: Info-level alert inhibition.
severity: none expr:
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
+277 -258
View File
@@ -1,262 +1,281 @@
groups: groups:
- name: kubernetes-apps - name: kubernetes-apps
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container description:
}}) is in waiting state (reason: "CrashLoopBackOff").' 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping }}) is in waiting state (reason: "CrashLoopBackOff").'
summary: Pod is crash looping. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", summary: Pod is crash looping.
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 expr:
for: 15m max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
labels: job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
severity: warning for: 15m
- alert: KubePodNotReady labels:
annotations: severity: warning
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready - alert: KubePodNotReady
state for longer than 15 minutes. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready description:
summary: Pod has been in a non-ready state for more than 15 minutes. Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
expr: |- state for longer than 15 minutes.
sum by (namespace, pod, cluster) ( runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
max by (namespace, pod, cluster) ( summary: Pod has been in a non-ready state for more than 15 minutes.
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} expr: |-
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( sum by (namespace, pod, cluster) (
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) max by (namespace, pod, cluster) (
) kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
) > 0 ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
for: 15m 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
labels: )
severity: warning ) > 0
- alert: KubeDeploymentGenerationMismatch for: 15m
annotations: labels:
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment severity: warning
}} does not match, this indicates that the Deployment has failed but has not - alert: KubeDeploymentGenerationMismatch
been rolled back. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch description:
summary: Deployment generation mismatch due to possible roll-back Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
expr: |- }} does not match, this indicates that the Deployment has failed but has not
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} been rolled back.
!= runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} summary: Deployment generation mismatch due to possible roll-back
for: 15m expr: |-
labels: kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
max by (namespace, statefulset) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!= !=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
) for: 15m
) and ( labels:
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) severity: warning
== - alert: KubeDeploymentReplicasMismatch
0 annotations:
) description:
for: 15m Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
labels: not matched the expected number of replicas for longer than 15 minutes.
severity: warning runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
- alert: KubeDaemonSetRolloutStuck summary: Deployment has not matched the expected number of replicas.
annotations: expr: |-
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not (
finished or progressed for at least 15 minutes. kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck >
summary: DaemonSet rollout is stuck. kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
expr: |- ) and (
( changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
( ==
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} 0
!= )
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description:
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description:
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
max by (namespace, statefulset) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description:
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description:
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
{{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr:
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or ( -
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr:
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
!= > 0
0 for: 15m
) or ( labels:
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} severity: warning
!= - alert: KubeJobNotCompleted
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} annotations:
) or ( description:
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
!= than {{ "43200" | humanizeDuration }} to complete.
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
) summary: Job did not complete in time
) and ( expr: |-
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
== and
0 kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
) labels:
for: 15m severity: warning
labels: - alert: KubeJobFailed
severity: warning annotations:
- alert: KubeContainerWaiting description:
annotations: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container Removing failed job after investigation should clear this alert.
{{ $labels.container}} has been in waiting state for longer than 1 hour. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting summary: Job failed to complete.
summary: Pod container waiting longer than 1 hour expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", for: 15m
namespace=~".*"}) > 0 labels:
for: 1h severity: warning
labels: - alert: KubeHpaReplicasMismatch
severity: warning annotations:
- alert: KubeDaemonSetNotScheduled description:
annotations: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset has not matched the desired number of replicas for longer than 15 minutes.
}} are not scheduled.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled summary: HPA has not matched desired number of replicas.
summary: DaemonSet pods are not scheduled. expr: |-
expr: |- (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} !=
- kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 and
for: 10m (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
labels: >
severity: warning kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
- alert: KubeDaemonSetMisScheduled and
annotations: (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset <
}} are running where they are not supposed to run.' kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled and
summary: DaemonSet pods are misscheduled. changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} for: 15m
> 0 labels:
for: 15m severity: warning
labels: - alert: KubeHpaMaxedOut
severity: warning annotations:
- alert: KubeJobNotCompleted description:
annotations: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more has been running at max replicas for longer than 15 minutes.
than {{ "43200" | humanizeDuration }} to complete. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted summary: HPA is running at max replicas
summary: Job did not complete in time expr: |-
expr: |- kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} ==
and kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 for: 15m
labels: labels:
severity: warning severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
+122 -114
View File
@@ -1,115 +1,123 @@
groups: groups:
- name: kubernetes-resources - name: kubernetes-resources
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests description:
for Pods by {{ $value }} CPU shares and cannot tolerate node failure. Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
summary: Cluster has overcommitted CPU resource requests. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
expr: |- summary: Cluster has overcommitted CPU resource requests.
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 expr: |-
and sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 and
for: 10m (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
labels: for: 10m
severity: warning labels:
- alert: KubeMemoryOvercommit severity: warning
annotations: - alert: KubeMemoryOvercommit
description: Cluster {{ $labels.cluster }} has overcommitted memory resource annotations:
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node description:
failure. Cluster {{ $labels.cluster }} has overcommitted memory resource
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
summary: Cluster has overcommitted memory resource requests. failure.
expr: |- runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 summary: Cluster has overcommitted memory resource requests.
and expr: |-
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
for: 10m and
labels: (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
severity: warning for: 10m
- alert: KubeCPUQuotaOvercommit labels:
annotations: severity: warning
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests - alert: KubeCPUQuotaOvercommit
for Namespaces. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit description:
summary: Cluster has overcommitted CPU resource requests. Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
expr: |- for Namespaces.
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
/ summary: Cluster has overcommitted CPU resource requests.
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) expr: |-
> 1.5 sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
for: 5m /
labels: sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
severity: warning > 1.5
- alert: KubeMemoryQuotaOvercommit for: 5m
annotations: labels:
description: Cluster {{ $labels.cluster }} has overcommitted memory resource severity: warning
requests for Namespaces. - alert: KubeMemoryQuotaOvercommit
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit annotations:
summary: Cluster has overcommitted memory resource requests. description:
expr: |- Cluster {{ $labels.cluster }} has overcommitted memory resource
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) requests for Namespaces.
/ runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) summary: Cluster has overcommitted memory resource requests.
> 1.5 expr: |-
for: 5m sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
labels: /
severity: warning sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
- alert: KubeQuotaAlmostFull > 1.5
annotations: for: 5m
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage labels:
}} of its {{ $labels.resource }} quota. severity: warning
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull - alert: KubeQuotaAlmostFull
summary: Namespace quota is going to be full. annotations:
expr: |- description:
kube_resourcequota{job="kube-state-metrics", type="used"} Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
/ ignoring(instance, job, type) }} of its {{ $labels.resource }} quota.
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
> 0.9 < 1 summary: Namespace quota is going to be full.
for: 15m expr: |-
labels: kube_resourcequota{job="kube-state-metrics", type="used"}
severity: info / ignoring(instance, job, type)
- alert: KubeQuotaFullyUsed (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
annotations: > 0.9 < 1
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage for: 15m
}} of its {{ $labels.resource }} quota. labels:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused severity: info
summary: Namespace quota is fully used. - alert: KubeQuotaFullyUsed
expr: |- annotations:
kube_resourcequota{job="kube-state-metrics", type="used"} description:
/ ignoring(instance, job, type) Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) }} of its {{ $labels.resource }} quota.
== 1 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
for: 15m summary: Namespace quota is fully used.
labels: expr: |-
severity: info kube_resourcequota{job="kube-state-metrics", type="used"}
- alert: KubeQuotaExceeded / ignoring(instance, job, type)
annotations: (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage == 1
}} of its {{ $labels.resource }} quota. for: 15m
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded labels:
summary: Namespace quota has exceeded the limits. severity: info
expr: |- - alert: KubeQuotaExceeded
kube_resourcequota{job="kube-state-metrics", type="used"} annotations:
/ ignoring(instance, job, type) description:
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
> 1 }} of its {{ $labels.resource }} quota.
for: 15m runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
labels: summary: Namespace quota has exceeded the limits.
severity: warning expr: |-
- alert: CPUThrottlingHigh kube_resourcequota{job="kube-state-metrics", type="used"}
annotations: / ignoring(instance, job, type)
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod > 1
}}.' for: 15m
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh labels:
summary: Processes experience elevated CPU throttling. severity: warning
expr: |- - alert: CPUThrottlingHigh
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) annotations:
/ description:
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) "{{ $value | humanizePercentage }} throttling of CPU in namespace
> ( 25 / 100 ) {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
for: 15m }}."
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
severity: info summary: Processes experience elevated CPU throttling.
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
> ( 25 / 100 )
for: 15m
labels:
severity: info
+113 -108
View File
@@ -1,109 +1,114 @@
groups: groups:
- name: kubernetes-storage - name: kubernetes-storage
rules: rules:
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim description:
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
summary: PersistentVolume is filling up. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
expr: |- summary: PersistentVolume is filling up.
( expr: |-
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} (
/ kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} /
) < 0.03 kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
and ) < 0.03
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 and
unless on (cluster, namespace, persistentvolumeclaim) kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on (cluster, namespace, persistentvolumeclaim)
unless on (cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 unless on (cluster, namespace, persistentvolumeclaim)
for: 1m kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
labels: for: 1m
severity: critical labels:
- alert: KubePersistentVolumeFillingUp severity: critical
annotations: - alert: KubePersistentVolumeFillingUp
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim annotations:
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster description:
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
| humanizePercentage }} is available. }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
summary: PersistentVolume is filling up. | humanizePercentage }} is available.
expr: |- runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
( summary: PersistentVolume is filling up.
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} expr: |-
/ (
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
) < 0.15 /
and kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 ) < 0.15
and and
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (cluster, namespace, persistentvolumeclaim) and
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
unless on (cluster, namespace, persistentvolumeclaim) unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
for: 1h unless on (cluster, namespace, persistentvolumeclaim)
labels: kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
severity: warning for: 1h
- alert: KubePersistentVolumeInodesFillingUp labels:
annotations: severity: warning
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - alert: KubePersistentVolumeInodesFillingUp
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster annotations:
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. description:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
summary: PersistentVolumeInodes are filling up. }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
expr: |- {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
( runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} summary: PersistentVolumeInodes are filling up.
/ expr: |-
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} (
) < 0.03 kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
and /
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
unless on (cluster, namespace, persistentvolumeclaim) ) < 0.03
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 and
unless on (cluster, namespace, persistentvolumeclaim) kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 unless on (cluster, namespace, persistentvolumeclaim)
for: 1m kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
labels: unless on (cluster, namespace, persistentvolumeclaim)
severity: critical kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
- alert: KubePersistentVolumeInodesFillingUp for: 1m
annotations: labels:
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim severity: critical
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster - alert: KubePersistentVolumeInodesFillingUp
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently annotations:
{{ $value | humanizePercentage }} of its inodes are free. description:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
summary: PersistentVolumeInodes are filling up. }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
expr: |- {{ . }} {{- end }} is expected to run out of inodes within four days. Currently
( {{ $value | humanizePercentage }} of its inodes are free.
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
/ summary: PersistentVolumeInodes are filling up.
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} expr: |-
) < 0.15 (
and kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 /
and kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 ) < 0.15
unless on (cluster, namespace, persistentvolumeclaim) and
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
unless on (cluster, namespace, persistentvolumeclaim) and
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
for: 1h unless on (cluster, namespace, persistentvolumeclaim)
labels: kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
severity: warning unless on (cluster, namespace, persistentvolumeclaim)
- alert: KubePersistentVolumeErrors kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
annotations: for: 1h
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster labels:
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. severity: warning
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors - alert: KubePersistentVolumeErrors
summary: PersistentVolume is having issues with provisioning. annotations:
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} description:
> 0 The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
for: 5m -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
severity: critical summary: PersistentVolume is having issues with provisioning.
expr:
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
> 0
for: 5m
labels:
severity: critical
+366 -339
View File
@@ -1,340 +1,367 @@
groups: groups:
- name: node-exporter - name: node-exporter
rules: rules:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint description:
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
space left and is filling up. }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup space left and is filling up.
summary: Filesystem is predicted to run out of space within the next 24 hours. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
expr: |- summary: Filesystem is predicted to run out of space within the next 24 hours.
( expr: |-
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 (
and node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 and
and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 and
) node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
for: 1h )
labels: for: 1h
severity: warning labels:
- alert: NodeFilesystemSpaceFillingUp severity: warning
annotations: - alert: NodeFilesystemSpaceFillingUp
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint annotations:
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available description:
space left and is filling up fast. Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
summary: Filesystem is predicted to run out of space within the next 4 hours. space left and is filling up fast.
expr: |- runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
( summary: Filesystem is predicted to run out of space within the next 4 hours.
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 expr: |-
and (
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
and and
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
) and
for: 1h node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
labels: )
severity: critical for: 1h
- alert: NodeFilesystemAlmostOutOfSpace labels:
annotations: severity: critical
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - alert: NodeFilesystemAlmostOutOfSpace
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available annotations:
space left. description:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
summary: Filesystem has less than 5% space left. }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
expr: |- space left.
( runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 summary: Filesystem has less than 5% space left.
and expr: |-
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 (
) node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
for: 30m and
labels: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
severity: warning )
- alert: NodeFilesystemAlmostOutOfSpace for: 30m
annotations: labels:
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint severity: warning
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - alert: NodeFilesystemAlmostOutOfSpace
space left. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace description:
summary: Filesystem has less than 3% space left. Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
expr: |- }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
( space left.
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
and summary: Filesystem has less than 3% space left.
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 expr: |-
) (
for: 30m node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
labels: and
severity: critical node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
- alert: NodeFilesystemFilesFillingUp )
annotations: for: 30m
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint labels:
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available severity: critical
inodes left and is filling up. - alert: NodeFilesystemFilesFillingUp
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup annotations:
summary: Filesystem is predicted to run out of inodes within the next 24 hours. description:
expr: |- Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
( }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 inodes left and is filling up.
and runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 summary: Filesystem is predicted to run out of inodes within the next 24 hours.
and expr: |-
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 (
) node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
for: 1h and
labels: predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
severity: warning and
- alert: NodeFilesystemFilesFillingUp node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
annotations: )
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint for: 1h
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available labels:
inodes left and is filling up fast. severity: warning
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup - alert: NodeFilesystemFilesFillingUp
summary: Filesystem is predicted to run out of inodes within the next 4 hours. annotations:
expr: |- description:
( Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
and inodes left and is filling up fast.
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
and summary: Filesystem is predicted to run out of inodes within the next 4 hours.
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 expr: |-
) (
for: 1h node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
labels: and
severity: critical predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
- alert: NodeFilesystemAlmostOutOfFiles and
annotations: node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint )
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available for: 1h
inodes left. labels:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles severity: critical
summary: Filesystem has less than 5% inodes left. - alert: NodeFilesystemAlmostOutOfFiles
expr: |- annotations:
( description:
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
and }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 inodes left.
) runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
for: 1h summary: Filesystem has less than 5% inodes left.
labels: expr: |-
severity: warning (
- alert: NodeFilesystemAlmostOutOfFiles node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
annotations: and
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available )
inodes left. for: 1h
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles labels:
summary: Filesystem has less than 3% inodes left. severity: warning
expr: |- - alert: NodeFilesystemAlmostOutOfFiles
( annotations:
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 description:
and Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
) inodes left.
for: 1h runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
labels: summary: Filesystem has less than 3% inodes left.
severity: critical expr: |-
- alert: NodeNetworkReceiveErrs (
annotations: node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered and
{{ printf "%.0f" $value }} receive errors in the last two minutes.' node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs )
summary: Network interface is reporting many receive errors. for: 1h
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) labels:
> 0.01 severity: critical
for: 1h - alert: NodeNetworkReceiveErrs
labels: annotations:
severity: warning description:
- alert: NodeNetworkTransmitErrs '{{ $labels.instance }} interface {{ $labels.device }} has encountered
annotations: {{ printf "%.0f" $value }} receive errors in the last two minutes.'
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
{{ printf "%.0f" $value }} transmit errors in the last two minutes.' summary: Network interface is reporting many receive errors.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs expr:
summary: Network interface is reporting many transmit errors. rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) > 0.01
> 0.01 for: 1h
for: 1h labels:
labels: severity: warning
severity: warning - alert: NodeNetworkTransmitErrs
- alert: NodeHighNumberConntrackEntriesUsed annotations:
annotations: description:
description: '{{ $value | humanizePercentage }} of conntrack entries are used.' '{{ $labels.instance }} interface {{ $labels.device }} has encountered
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
summary: Number of conntrack are getting close to the limit. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) summary: Network interface is reporting many transmit errors.
> 0.75 expr:
labels: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
severity: warning > 0.01
- alert: NodeTextFileCollectorScrapeError for: 1h
annotations: labels:
description: Node Exporter text file collector on {{ $labels.instance }} failed severity: warning
to scrape. - alert: NodeHighNumberConntrackEntriesUsed
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror annotations:
summary: Node Exporter text file collector failed to scrape. description: "{{ $value | humanizePercentage }} of conntrack entries are used."
expr: node_textfile_scrape_error{job="node-exporter"} == 1 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
labels: summary: Number of conntrack are getting close to the limit.
severity: warning expr:
- alert: NodeClockSkewDetected (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
annotations: > 0.75
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. labels:
Ensure NTP is configured correctly on this host. severity: warning
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected - alert: NodeTextFileCollectorScrapeError
summary: Clock skew detected. annotations:
expr: |- description:
( Node Exporter text file collector on {{ $labels.instance }} failed
node_timex_offset_seconds{job="node-exporter"} > 0.05 to scrape.
and runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 summary: Node Exporter text file collector failed to scrape.
) expr: node_textfile_scrape_error{job="node-exporter"} == 1
or labels:
( severity: warning
node_timex_offset_seconds{job="node-exporter"} < -0.05 - alert: NodeClockSkewDetected
and annotations:
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 description:
) Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
for: 10m Ensure NTP is configured correctly on this host.
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
severity: warning summary: Clock skew detected.
- alert: NodeClockNotSynchronising expr: |-
annotations: (
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP node_timex_offset_seconds{job="node-exporter"} > 0.05
is configured on this host. and
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
summary: Clock not synchronising. )
expr: |- or
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 (
and node_timex_offset_seconds{job="node-exporter"} < -0.05
node_timex_maxerror_seconds{job="node-exporter"} >= 16 and
for: 10m deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
labels: )
severity: warning for: 10m
- alert: NodeRAIDDegraded labels:
annotations: severity: warning
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is - alert: NodeClockNotSynchronising
in degraded state due to one or more disks failures. Number of spare drives annotations:
is insufficient to fix issue automatically. description:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
summary: RAID Array is degraded. is configured on this host.
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) summary: Clock not synchronising.
> 0 expr: |-
for: 15m min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
labels: and
severity: critical node_timex_maxerror_seconds{job="node-exporter"} >= 16
- alert: NodeRAIDDiskFailure for: 10m
annotations: labels:
description: At least one device in RAID array at {{ $labels.instance }} failed. severity: warning
Array '{{ $labels.device }}' needs attention and possibly a disk swap. - alert: NodeRAIDDegraded
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure annotations:
summary: Failed device in RAID array. description:
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
> 0 in degraded state due to one or more disks failures. Number of spare drives
labels: is insufficient to fix issue automatically.
severity: warning runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
- alert: NodeFileDescriptorLimit summary: RAID Array is degraded.
annotations: expr:
description: File descriptors limit at {{ $labels.instance }} is currently at node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
{{ printf "%.2f" $value }}%. - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit > 0
summary: Kernel is predicted to exhaust file descriptors limit soon. for: 15m
expr: |- labels:
( severity: critical
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 - alert: NodeRAIDDiskFailure
) annotations:
for: 15m description:
labels: At least one device in RAID array at {{ $labels.instance }} failed.
severity: warning Array '{{ $labels.device }}' needs attention and possibly a disk swap.
- alert: NodeFileDescriptorLimit runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
annotations: summary: Failed device in RAID array.
description: File descriptors limit at {{ $labels.instance }} is currently at expr:
{{ printf "%.2f" $value }}%. node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit > 0
summary: Kernel is predicted to exhaust file descriptors limit soon. labels:
expr: |- severity: warning
( - alert: NodeFileDescriptorLimit
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 annotations:
) description:
for: 15m File descriptors limit at {{ $labels.instance }} is currently at
labels: {{ printf "%.2f" $value }}%.
severity: critical runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
- alert: NodeCPUHighUsage summary: Kernel is predicted to exhaust file descriptors limit soon.
annotations: expr: |-
description: | (
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage )
summary: High CPU usage. for: 15m
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", labels:
mode!="idle"}[2m]))) * 100 > 90 severity: warning
for: 15m - alert: NodeFileDescriptorLimit
labels: annotations:
severity: info description:
- alert: NodeSystemSaturation File descriptors limit at {{ $labels.instance }} is currently at
annotations: {{ printf "%.2f" $value }}%.
description: | runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. summary: Kernel is predicted to exhaust file descriptors limit soon.
This might indicate this instance resources saturation and can cause it becoming unresponsive. expr: |-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation (
summary: System saturated, load per core is very high. node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
expr: |- )
node_load1{job="node-exporter"} for: 15m
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 labels:
for: 15m severity: critical
labels: - alert: NodeCPUHighUsage
severity: warning annotations:
- alert: NodeMemoryMajorPagesFaults description: |
annotations: CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
description: | runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. summary: High CPU usage.
Please check that there is enough memory available at this instance. expr:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
summary: Memory major page faults are occurring at very high rate. mode!="idle"}[2m]))) * 100 > 90
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 for: 15m
for: 15m labels:
labels: severity: info
severity: warning - alert: NodeSystemSaturation
- alert: NodeMemoryHighUtilization annotations:
annotations: description: |
description: | System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. This might indicate this instance resources saturation and can cause it becoming unresponsive.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
summary: Host is running out of memory. summary: System saturated, load per core is very high.
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} expr: |-
* 100) > 90 node_load1{job="node-exporter"}
for: 15m / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
labels: for: 15m
severity: warning labels:
- alert: NodeDiskIOSaturation severity: warning
annotations: - alert: NodeMemoryMajorPagesFaults
description: | annotations:
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. description: |
This symptom might indicate disk saturation. Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation Please check that there is enough memory available at this instance.
summary: Disk IO queue is high. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) summary: Memory major page faults are occurring at very high rate.
> 10 expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
for: 30m for: 15m
labels: labels:
severity: warning severity: warning
- alert: NodeSystemdServiceFailed - alert: NodeMemoryHighUtilization
annotations: annotations:
description: Systemd service {{ $labels.name }} has entered failed state at description: |
{{ $labels.instance }} Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: Systemd service has entered failed state. summary: Host is running out of memory.
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 expr:
for: 5m 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
labels: * 100) > 90
severity: warning for: 15m
- alert: NodeBondingDegraded labels:
annotations: severity: warning
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} - alert: NodeDiskIOSaturation
is in degraded state due to one or more slave failures. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded description: |
summary: Bonding interface is degraded Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
expr: (node_bonding_slaves - node_bonding_active) != 0 This symptom might indicate disk saturation.
for: 5m runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
labels: summary: Disk IO queue is high.
severity: warning expr:
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
> 10
for: 30m
labels:
severity: warning
# Warns when node-exporter reports a systemd unit in the "failed" state
# (metric value 1) continuously for 5 minutes.
- alert: NodeSystemdServiceFailed
  annotations:
    # Folded scalar: renders as one line, no trailing newline.
    description: >-
      Systemd service {{ $labels.name }} has entered failed state
      at {{ $labels.instance }}
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
    summary: Systemd service has entered failed state.
  expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
  for: 5m
  labels:
    severity: warning
# Warns when a bonding interface has fewer active slaves than configured
# slaves (the difference is non-zero) continuously for 5 minutes.
- alert: NodeBondingDegraded
  annotations:
    # Folded scalar: renders as one line, no trailing newline.
    description: >-
      Bonding interface {{ $labels.master }} on {{ $labels.instance }} is in
      degraded state due to one or more slave failures.
    runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
    summary: Bonding interface is degraded
  expr: (node_bonding_slaves - node_bonding_active) != 0
  for: 5m
  labels:
    severity: warning
+75 -69
View File
@@ -1,70 +1,76 @@
# Alerting rules for node-level CPU, memory and readiness, plus container
# OOM kills. Multi-line expressions use folded scalars (>-) so each one is
# handed to Prometheus as a single line.
groups:
  - name: node-resource-utilization.rules
    rules:
      # Non-idle CPU fraction per instance above 0.9 for 10 minutes. The
      # multiplication with node_uname_info only attaches the nodename label.
      - alert: HostHighCpuLoad
        annotations:
          description: |-
            CPU load is > 90%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Host high CPU load (instance {{ $labels.instance }})
        expr: >-
          (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
          > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 10m
        labels:
          severity: critical

      # Less than 10% of total memory available for 5 minutes.
      - alert: MemoryUtilizationHighWarning
        annotations:
          # Grafana Explore deep link (URL-encoded JSON). The previous value
          # contained the malformed sequence `POD%5{ $labels.instance }}`: a
          # truncated percent-escape and a single-brace template action that
          # Prometheus would emit literally.
          # NOTE(review): the `instance=` matcher was reconstructed from the
          # surviving template expression - confirm it is the intended label.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 10% available memory.
          summary: Node Memory utilization warning
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          # NOTE(review): name and summary say "warning" but the severity is
          # critical, same as the 5% alert below - confirm this is intended.
          severity: critical

      # Less than 5% of total memory available for 1 minute.
      - alert: MemoryUtilizationHighCritical
        annotations:
          # Same deep link as above, with the same repair applied.
          # NOTE(review): the `instance=` matcher was reconstructed - confirm.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 5% available memory.
          summary: Node Memory utilization critical
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 1m
        labels:
          severity: critical

      # Fires when a node's Ready=true condition never rose above 0 within the
      # 3 minute window, or when the Ready=true series is absent altogether.
      - alert: NodeNotReady
        annotations:
          # Previously a copy-pasted "has CPU utilization over 90%" text that
          # did not match the expression or the summary.
          description: Node {{ $labels.node }} is not reporting a Ready status.
          summary: Node has been in not-ready state for longer than 3 minutes
        expr: >-
          (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
          <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
          > 0
        for: 5m
        labels:
          severity: critical

      # Node reports the MemoryPressure condition as true for 2 minutes.
      - alert: KubernetesNodeMemoryPressure
        annotations:
          description: |-
            Node {{ $labels.node }} has MemoryPressure condition
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical

      # Restart counter grew by at least 1 over 10 minutes and the container's
      # last termination reason was OOMKilled throughout that window.
      - alert: KubernetesContainerOomKiller
        annotations:
          description: |-
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
        expr: >-
          (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
          offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
          == 1
        # 0m: fire on the first evaluation that matches.
        for: 0m
        labels:
          severity: warning
+24 -20
View File
@@ -1,21 +1,25 @@
# Velero backup alerts. Both rules compare a failure counter against the
# attempt counter per schedule and fire when the ratio stays above 25% for
# 15 minutes. Series with an empty schedule label are excluded.
groups:
  - name: velero
    rules:
      # Share of partially failed backups per schedule.
      - alert: VeleroBackupPartialFailures
        annotations:
          # Spelling fixed: "partialy" -> "partially".
          message: >-
            Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partially
            failed backups.
        expr: >-
          velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
          > 0.25
        for: 15m
        labels:
          severity: critical

      # Share of fully failed backups per schedule.
      - alert: VeleroBackupFailures
        annotations:
          message: >-
            Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
            backups.
        expr: >-
          velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
          > 0.25
        for: 15m
        labels:
          severity: critical
+51 -45
View File
@@ -1,46 +1,52 @@
# Alerts driven by x509-certificate-exporter metrics: exporter read errors,
# undecodable certificates, and certificates approaching their notAfter date.
# Long annotation texts and expressions are written as folded scalars (>-),
# which yield the same single-line strings as the wrapped plain scalars did.
groups:
  - name: x509-certificate-exporter.rules
    rules:
      # The read-error counter increased during the last 15 minutes.
      - alert: X509ExporterReadErrors
        annotations:
          description: >-
            Over the last 15 minutes, this x509-certificate-exporter instance has
            experienced errors reading certificate files or querying the Kubernetes API.
            This could be caused by a misconfiguration if triggered when the exporter starts.
          summary: Increasing read errors for x509-certificate-exporter
        expr: delta(x509_read_errors[15m]) > 0
        for: 5m
        labels:
          severity: warning

      # The exporter flagged a certificate it could not decode.
      - alert: CertificateError
        annotations:
          description: >-
            Certificate could not be decoded {{if $labels.secret_name }} in Kubernetes secret
            "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at location
            "{{ $labels.filepath }}"{{end}}
          summary: Certificate cannot be decoded
        expr: x509_cert_error > 0
        for: 15m
        labels:
          severity: warning

      # Fewer than 28 days (86400 s each) remain until notAfter.
      - alert: CertificateRenewal
        annotations:
          description: >-
            Certificate for "{{ $labels.subject_CN }}" should be renewed
            {{if $labels.secret_name }}in Kubernetes secret
            "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at location
            "{{ $labels.filepath }}"{{end}}
          summary: Certificate should be renewed
        expr: >-
          ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
          issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
        for: 15m
        labels:
          severity: warning

      # Fewer than 14 days remain until notAfter; same selector as above.
      - alert: CertificateExpiration
        annotations:
          description: >-
            Certificate for "{{ $labels.subject_CN }}" is about to expire
            {{if $labels.secret_name }}in Kubernetes secret
            "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at location
            "{{ $labels.filepath }}"{{end}}
          summary: Certificate is about to expire
        expr: >-
          ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
          issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
        for: 15m
        labels:
          severity: critical