fix(rules/bootstrap): Format yaml
This commit is contained in:
@@ -1,6 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
# the shebang is ignored, but nice for editors
|
||||
watch_file nix/sources.json
|
||||
watch_file nix/checks.nix
|
||||
|
||||
# Load .env file if it exists
|
||||
dotenv_if_exists
|
||||
|
||||
+4
-5
@@ -1,6 +1,6 @@
|
||||
image:
|
||||
name: alpine/helm:latest
|
||||
entrypoint: [ "/bin/bash", "-c" ]
|
||||
entrypoint: ["/bin/bash", "-c"]
|
||||
|
||||
stages:
|
||||
- release
|
||||
@@ -8,9 +8,9 @@ stages:
|
||||
release:
|
||||
stage: release
|
||||
rules:
|
||||
- if: '$CI_COMMIT_BRANCH =~ /^main/'
|
||||
when: always
|
||||
- when: never
|
||||
- if: "$CI_COMMIT_BRANCH =~ /^main/"
|
||||
when: always
|
||||
- when: never
|
||||
script:
|
||||
- |
|
||||
cd $CI_PROJECT_DIR
|
||||
@@ -43,4 +43,3 @@ rebuild:
|
||||
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
||||
fi
|
||||
done
|
||||
|
||||
|
||||
@@ -3,16 +3,16 @@ kind: ClusterRole
|
||||
metadata:
|
||||
name: argocd-cluster-admin
|
||||
rules:
|
||||
- apiGroups:
|
||||
- '*'
|
||||
resources:
|
||||
- '*'
|
||||
verbs:
|
||||
- '*'
|
||||
- nonResourceURLs:
|
||||
- '*'
|
||||
verbs:
|
||||
- '*'
|
||||
- apiGroups:
|
||||
- "*"
|
||||
resources:
|
||||
- "*"
|
||||
verbs:
|
||||
- "*"
|
||||
- nonResourceURLs:
|
||||
- "*"
|
||||
verbs:
|
||||
- "*"
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
@@ -23,9 +23,9 @@ roleRef:
|
||||
kind: ClusterRole
|
||||
name: argocd-cluster-admin
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: argocd-cluster-admin
|
||||
namespace: kube-system
|
||||
- kind: ServiceAccount
|
||||
name: argocd-cluster-admin
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
|
||||
@@ -6,5 +6,3 @@ metadata:
|
||||
name: cluster-admin-token
|
||||
namespace: kube-system
|
||||
type: kubernetes.io/service-account-token
|
||||
|
||||
|
||||
|
||||
@@ -10,5 +10,3 @@ metadata:
|
||||
name: cluster-ekman
|
||||
namespace: argocd
|
||||
type: Opaque
|
||||
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ spec:
|
||||
init:
|
||||
# Init always happens immediately before generate, but its output is not treated as manifests.
|
||||
# This is a good place to, for example, download chart dependencies.
|
||||
command: [ /bin/sh ]
|
||||
command: [/bin/sh]
|
||||
args:
|
||||
- /plugin/init.sh
|
||||
# The generate command runs in the Application source directory each time manifests are generated. Standard output
|
||||
@@ -17,7 +17,7 @@ spec:
|
||||
# To write log messages from the command, write them to stderr, it will always be displayed.
|
||||
# Error output will be sent to the UI, so avoid printing sensitive information (such as secrets).
|
||||
generate:
|
||||
command: [ /bin/sh ]
|
||||
command: [/bin/sh]
|
||||
args:
|
||||
- /plugin/generate.sh
|
||||
|
||||
@@ -27,15 +27,15 @@ spec:
|
||||
# Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the
|
||||
# first (in that order) is evaluated.
|
||||
# discover:
|
||||
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
|
||||
# directory. If there is a match, this plugin may be used for the Application.
|
||||
# fileName: "./subdir/s*.yaml"
|
||||
# find:
|
||||
# This does the same thing as fileName, but it supports double-start (nested directory) glob patterns.
|
||||
# glob: "**/Chart.yaml"
|
||||
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
|
||||
# produce non-empty output to standard out.
|
||||
# command: [sh, -c, find . -name env.yaml]
|
||||
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
|
||||
# directory. If there is a match, this plugin may be used for the Application.
|
||||
# fileName: "./subdir/s*.yaml"
|
||||
# find:
|
||||
# This does the same thing as fileName, but it supports double-start (nested directory) glob patterns.
|
||||
# glob: "**/Chart.yaml"
|
||||
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
|
||||
# produce non-empty output to standard out.
|
||||
# command: [sh, -c, find . -name env.yaml]
|
||||
# The parameters config describes what parameters the UI should display for an Application. It is up to the user to
|
||||
# actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_
|
||||
# inform the "Parameters" tab in the App Details page of the UI.
|
||||
@@ -66,22 +66,21 @@ spec:
|
||||
itemType: string
|
||||
collectionType: string
|
||||
string: ""
|
||||
# All the fields above besides "string" apply to both the array and map type parameter announcements.
|
||||
# - name: array-param
|
||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||
# array: [default, items]
|
||||
# collectionType: array
|
||||
# - name: map-param
|
||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||
# map:
|
||||
# some: value
|
||||
# collectionType: map
|
||||
# All the fields above besides 'string' apply to both the array and map type parameter announcements.
|
||||
# - name: array-param
|
||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||
# array: [default, items]
|
||||
# collectionType: array
|
||||
# - name: map-param
|
||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||
# map:
|
||||
# some: value
|
||||
# collectionType: map
|
||||
# dynamic:
|
||||
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
|
||||
# static parameter announcements list.
|
||||
# command: [ /bin/sh, /plugin/get-values.sh ]
|
||||
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
|
||||
# static parameter announcements list.
|
||||
# command: [ /bin/sh, /plugin/get-values.sh ]
|
||||
|
||||
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
||||
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
||||
preserveFileMode: false
|
||||
|
||||
|
||||
@@ -45,432 +45,432 @@ spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: argocd-repo-server
|
||||
topologyKey: kubernetes.io/hostname
|
||||
weight: 100
|
||||
- podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: argocd-repo-server
|
||||
topologyKey: kubernetes.io/hostname
|
||||
weight: 100
|
||||
automountServiceAccountToken: true
|
||||
containers:
|
||||
- args:
|
||||
- /usr/local/bin/argocd-repo-server
|
||||
- --port=8081
|
||||
- --metrics-port=8084
|
||||
env:
|
||||
- name: ARGOCD_REPO_SERVER_NAME
|
||||
value: argocd-repo-server
|
||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: timeout.reconciliation
|
||||
name: argocd-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.format
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.level
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.metrics.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.tls
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MIN_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.minversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MAX_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.maxversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_CIPHERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.ciphers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.repo.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_SERVER
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.server
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_COMPRESSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.compression
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDISDB
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.db
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: auth
|
||||
name: argocd-redis
|
||||
- name: REDIS_SENTINEL_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-sentinel-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_SENTINEL_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-sentinel-password
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.default.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.insecure
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.headers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.max.combined.directory.manifests.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.plugin.tar.exclusions
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.allow.oob.symlinks
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.tar.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.enable.git.submodule
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.lsremote.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.request.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.revision.cache.lock.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.include.hidden.directories
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: HELM_CACHE_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_CONFIG_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_DATA_HOME
|
||||
value: /helm-working-dir
|
||||
image: quay.io/argoproj/argocd:v2.12.3
|
||||
imagePullPolicy: IfNotPresent
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz?full=true
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: repo-server
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
- args:
|
||||
- /usr/local/bin/argocd-repo-server
|
||||
- --port=8081
|
||||
- --metrics-port=8084
|
||||
env:
|
||||
- name: ARGOCD_REPO_SERVER_NAME
|
||||
value: argocd-repo-server
|
||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: timeout.reconciliation
|
||||
name: argocd-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.format
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.level
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.metrics.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.tls
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MIN_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.minversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MAX_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.maxversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_CIPHERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.ciphers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.repo.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_SERVER
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.server
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_COMPRESSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.compression
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDISDB
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.db
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: auth
|
||||
name: argocd-redis
|
||||
- name: REDIS_SENTINEL_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-sentinel-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_SENTINEL_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-sentinel-password
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.default.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.insecure
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.headers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.max.combined.directory.manifests.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.plugin.tar.exclusions
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.allow.oob.symlinks
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.tar.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.enable.git.submodule
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.lsremote.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.request.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.revision.cache.lock.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.include.hidden.directories
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: HELM_CACHE_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_CONFIG_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_DATA_HOME
|
||||
value: /helm-working-dir
|
||||
image: quay.io/argoproj/argocd:v2.12.3
|
||||
imagePullPolicy: IfNotPresent
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz?full=true
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: repo-server
|
||||
protocol: TCP
|
||||
- containerPort: 8084
|
||||
name: metrics
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /app/config/ssh
|
||||
name: ssh-known-hosts
|
||||
- mountPath: /app/config/tls
|
||||
name: tls-certs
|
||||
- mountPath: /app/config/gpg/source
|
||||
name: gpg-keys
|
||||
- mountPath: /app/config/gpg/keys
|
||||
name: gpg-keyring
|
||||
- mountPath: /app/config/reposerver/tls
|
||||
name: argocd-repo-server-tls
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: kustomize-helm-with-rewrite
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helm-kustomize-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helmfile-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
name: repo-server
|
||||
protocol: TCP
|
||||
- containerPort: 8084
|
||||
name: metrics
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /app/config/ssh
|
||||
name: ssh-known-hosts
|
||||
- mountPath: /app/config/tls
|
||||
name: tls-certs
|
||||
- mountPath: /app/config/gpg/source
|
||||
name: gpg-keys
|
||||
- mountPath: /app/config/gpg/keys
|
||||
name: gpg-keyring
|
||||
- mountPath: /app/config/reposerver/tls
|
||||
name: argocd-repo-server-tls
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: kustomize-helm-with-rewrite
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helm-kustomize-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helmfile-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
dnsPolicy: ClusterFirst
|
||||
imagePullSecrets:
|
||||
- name: gitlab-pull-secret
|
||||
- name: gitlab-pull-secret
|
||||
initContainers:
|
||||
- command:
|
||||
- /bin/cp
|
||||
- -n
|
||||
- /usr/local/bin/argocd
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: quay.io/argoproj/argocd:v2.12.3
|
||||
imagePullPolicy: IfNotPresent
|
||||
name: copyutil
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- command:
|
||||
- /bin/sh
|
||||
- /plugin/init-helm-repos.sh
|
||||
env:
|
||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: token
|
||||
name: oceanbox-helm
|
||||
optional: false
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: init-helm-repos
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /bin/cp
|
||||
- -n
|
||||
- /usr/local/bin/argocd
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: quay.io/argoproj/argocd:v2.12.3
|
||||
imagePullPolicy: IfNotPresent
|
||||
name: copyutil
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- command:
|
||||
- /bin/sh
|
||||
- /plugin/init-helm-repos.sh
|
||||
env:
|
||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: token
|
||||
name: oceanbox-helm
|
||||
optional: false
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: init-helm-repos
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
serviceAccount: argocd-repo-server
|
||||
serviceAccountName: argocd-repo-server
|
||||
terminationGracePeriodSeconds: 30
|
||||
volumes:
|
||||
- name: cmp-tmp
|
||||
- name: helm-working-dir
|
||||
- name: plugins
|
||||
- name: var-files
|
||||
- name: tmp
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-ssh-known-hosts-cm
|
||||
name: ssh-known-hosts
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-tls-certs-cm
|
||||
name: tls-certs
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-gpg-keys-cm
|
||||
name: gpg-keys
|
||||
- name: gpg-keyring
|
||||
- name: argocd-repo-server-tls
|
||||
secret:
|
||||
defaultMode: 420
|
||||
items:
|
||||
- key: tls.crt
|
||||
path: tls.crt
|
||||
- key: tls.key
|
||||
path: tls.key
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
optional: true
|
||||
secretName: argocd-repo-server-tls
|
||||
- name: cmp-tmp
|
||||
- name: helm-working-dir
|
||||
- name: plugins
|
||||
- name: var-files
|
||||
- name: tmp
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-ssh-known-hosts-cm
|
||||
name: ssh-known-hosts
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-tls-certs-cm
|
||||
name: tls-certs
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-gpg-keys-cm
|
||||
name: gpg-keys
|
||||
- name: gpg-keyring
|
||||
- name: argocd-repo-server-tls
|
||||
secret:
|
||||
defaultMode: 420
|
||||
items:
|
||||
- key: tls.crt
|
||||
path: tls.crt
|
||||
- key: tls.key
|
||||
path: tls.key
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
optional: true
|
||||
secretName: argocd-repo-server-tls
|
||||
|
||||
@@ -4,24 +4,24 @@ spec:
|
||||
template:
|
||||
spec:
|
||||
imagePullSecrets:
|
||||
- name: gitlab-pull-secret
|
||||
- name: gitlab-pull-secret
|
||||
containers:
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helmfile-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||
imagePullPolicy: Always
|
||||
name: helmfile-cmp
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
|
||||
@@ -4,7 +4,7 @@ metadata:
|
||||
name: helmfile-cmp
|
||||
spec:
|
||||
generate:
|
||||
command: [ "/bin/sh" ]
|
||||
command: ["/bin/sh"]
|
||||
args:
|
||||
- /plugin/generate.sh
|
||||
lockRepo: false
|
||||
|
||||
@@ -44,341 +44,341 @@ spec:
|
||||
affinity:
|
||||
podAntiAffinity:
|
||||
preferredDuringSchedulingIgnoredDuringExecution:
|
||||
- podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: argocd-repo-server
|
||||
topologyKey: kubernetes.io/hostname
|
||||
weight: 100
|
||||
- podAffinityTerm:
|
||||
labelSelector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: argocd-repo-server
|
||||
topologyKey: kubernetes.io/hostname
|
||||
weight: 100
|
||||
containers:
|
||||
- args:
|
||||
- /usr/local/bin/argocd-repo-server
|
||||
- --port=8081
|
||||
- --metrics-port=8084
|
||||
env:
|
||||
- name: ARGOCD_REPO_SERVER_NAME
|
||||
value: argocd-repo-server
|
||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: timeout.reconciliation
|
||||
name: argocd-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.format
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.level
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.metrics.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.tls
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MIN_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.minversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MAX_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.maxversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_CIPHERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.ciphers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.repo.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_SERVER
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.server
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_COMPRESSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.compression
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDISDB
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.db
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-password
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.default.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.insecure
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.headers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.max.combined.directory.manifests.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.plugin.tar.exclusions
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.allow.oob.symlinks
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.tar.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.enable.git.submodule
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.lsremote.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.request.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: HELM_CACHE_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_CONFIG_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_DATA_HOME
|
||||
value: /helm-working-dir
|
||||
image: quay.io/argoproj/argocd:v2.10.4
|
||||
imagePullPolicy: IfNotPresent
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz?full=true
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: repo-server
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
- args:
|
||||
- /usr/local/bin/argocd-repo-server
|
||||
- --port=8081
|
||||
- --metrics-port=8084
|
||||
env:
|
||||
- name: ARGOCD_REPO_SERVER_NAME
|
||||
value: argocd-repo-server
|
||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: timeout.reconciliation
|
||||
name: argocd-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.format
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.log.level
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.metrics.listen.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.tls
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MIN_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.minversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_MAX_VERSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.maxversion
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_TLS_CIPHERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.tls.ciphers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.repo.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_SERVER
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.server
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_COMPRESSION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.compression
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDISDB
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: redis.db
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: REDIS_USERNAME
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-username
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: REDIS_PASSWORD
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: redis-password
|
||||
name: argocd-redis
|
||||
optional: true
|
||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.default.cache.expiration
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.address
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.insecure
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: otlp.headers
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.max.combined.directory.manifests.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.plugin.tar.exclusions
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.allow.oob.symlinks
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.tar.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.streamed.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.enable.git.submodule
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.lsremote.parallelism.limit
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||
valueFrom:
|
||||
configMapKeyRef:
|
||||
key: reposerver.git.request.timeout
|
||||
name: argocd-cmd-params-cm
|
||||
optional: true
|
||||
- name: HELM_CACHE_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_CONFIG_HOME
|
||||
value: /helm-working-dir
|
||||
- name: HELM_DATA_HOME
|
||||
value: /helm-working-dir
|
||||
image: quay.io/argoproj/argocd:v2.10.4
|
||||
imagePullPolicy: IfNotPresent
|
||||
livenessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz?full=true
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
name: repo-server
|
||||
protocol: TCP
|
||||
- containerPort: 8084
|
||||
name: metrics
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /app/config/ssh
|
||||
name: ssh-known-hosts
|
||||
- mountPath: /app/config/tls
|
||||
name: tls-certs
|
||||
- mountPath: /app/config/gpg/source
|
||||
name: gpg-keys
|
||||
- mountPath: /app/config/gpg/keys
|
||||
name: gpg-keyring
|
||||
- mountPath: /app/config/reposerver/tls
|
||||
name: argocd-repo-server-tls
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: kustomize-helm-with-rewrite
|
||||
resources: {}
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
ports:
|
||||
- containerPort: 8081
|
||||
name: repo-server
|
||||
protocol: TCP
|
||||
- containerPort: 8084
|
||||
name: metrics
|
||||
protocol: TCP
|
||||
readinessProbe:
|
||||
failureThreshold: 3
|
||||
httpGet:
|
||||
path: /healthz
|
||||
port: metrics
|
||||
scheme: HTTP
|
||||
initialDelaySeconds: 10
|
||||
periodSeconds: 10
|
||||
successThreshold: 1
|
||||
timeoutSeconds: 1
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /app/config/ssh
|
||||
name: ssh-known-hosts
|
||||
- mountPath: /app/config/tls
|
||||
name: tls-certs
|
||||
- mountPath: /app/config/gpg/source
|
||||
name: gpg-keys
|
||||
- mountPath: /app/config/gpg/keys
|
||||
name: gpg-keyring
|
||||
- mountPath: /app/config/reposerver/tls
|
||||
name: argocd-repo-server-tls
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: tmp
|
||||
- command:
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: kustomize-helm-with-rewrite
|
||||
resources: {}
|
||||
securityContext:
|
||||
runAsNonRoot: true
|
||||
runAsUser: 999
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- mountPath: /home/argocd/cmp-server/plugins
|
||||
name: plugins
|
||||
- mountPath: /tmp
|
||||
name: cmp-tmp
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
dnsPolicy: ClusterFirst
|
||||
imagePullSecrets:
|
||||
- name: gitlab-pull-secret
|
||||
- name: gitlab-pull-secret
|
||||
initContainers:
|
||||
- command:
|
||||
- /bin/cp
|
||||
- -n
|
||||
- /usr/local/bin/argocd
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: quay.io/argoproj/argocd:v2.10.4
|
||||
imagePullPolicy: IfNotPresent
|
||||
name: copyutil
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- command:
|
||||
- /bin/sh
|
||||
- /plugin/init-helm-repos.sh
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: init-helm-repos
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsUser: 999
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
env:
|
||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: token
|
||||
name: oceanbox-helm
|
||||
optional: false
|
||||
volumeMounts:
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
- command:
|
||||
- /bin/cp
|
||||
- -n
|
||||
- /usr/local/bin/argocd
|
||||
- /var/run/argocd/argocd-cmp-server
|
||||
image: quay.io/argoproj/argocd:v2.10.4
|
||||
imagePullPolicy: IfNotPresent
|
||||
name: copyutil
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
volumeMounts:
|
||||
- mountPath: /var/run/argocd
|
||||
name: var-files
|
||||
- command:
|
||||
- /bin/sh
|
||||
- /plugin/init-helm-repos.sh
|
||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||
imagePullPolicy: Always
|
||||
name: init-helm-repos
|
||||
resources: {}
|
||||
securityContext:
|
||||
allowPrivilegeEscalation: false
|
||||
capabilities:
|
||||
drop:
|
||||
- ALL
|
||||
readOnlyRootFilesystem: true
|
||||
runAsUser: 999
|
||||
runAsNonRoot: true
|
||||
seccompProfile:
|
||||
type: RuntimeDefault
|
||||
terminationMessagePath: /dev/termination-log
|
||||
terminationMessagePolicy: File
|
||||
env:
|
||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||
valueFrom:
|
||||
secretKeyRef:
|
||||
key: token
|
||||
name: oceanbox-helm
|
||||
optional: false
|
||||
volumeMounts:
|
||||
- mountPath: /helm-working-dir
|
||||
name: helm-working-dir
|
||||
restartPolicy: Always
|
||||
schedulerName: default-scheduler
|
||||
securityContext: {}
|
||||
@@ -386,40 +386,39 @@ spec:
|
||||
serviceAccountName: argocd-repo-server
|
||||
terminationGracePeriodSeconds: 30
|
||||
volumes:
|
||||
- emptyDir: {}
|
||||
name: cmp-tmp
|
||||
- emptyDir: {}
|
||||
name: helm-working-dir
|
||||
- emptyDir: {}
|
||||
name: plugins
|
||||
- emptyDir: {}
|
||||
name: var-files
|
||||
- emptyDir: {}
|
||||
name: tmp
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-ssh-known-hosts-cm
|
||||
name: ssh-known-hosts
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-tls-certs-cm
|
||||
name: tls-certs
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-gpg-keys-cm
|
||||
name: gpg-keys
|
||||
- emptyDir: {}
|
||||
name: gpg-keyring
|
||||
- name: argocd-repo-server-tls
|
||||
secret:
|
||||
defaultMode: 420
|
||||
items:
|
||||
- key: tls.crt
|
||||
path: tls.crt
|
||||
- key: tls.key
|
||||
path: tls.key
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
optional: true
|
||||
secretName: argocd-repo-server-tls
|
||||
|
||||
- emptyDir: {}
|
||||
name: cmp-tmp
|
||||
- emptyDir: {}
|
||||
name: helm-working-dir
|
||||
- emptyDir: {}
|
||||
name: plugins
|
||||
- emptyDir: {}
|
||||
name: var-files
|
||||
- emptyDir: {}
|
||||
name: tmp
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-ssh-known-hosts-cm
|
||||
name: ssh-known-hosts
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-tls-certs-cm
|
||||
name: tls-certs
|
||||
- configMap:
|
||||
defaultMode: 420
|
||||
name: argocd-gpg-keys-cm
|
||||
name: gpg-keys
|
||||
- emptyDir: {}
|
||||
name: gpg-keyring
|
||||
- name: argocd-repo-server-tls
|
||||
secret:
|
||||
defaultMode: 420
|
||||
items:
|
||||
- key: tls.crt
|
||||
path: tls.crt
|
||||
- key: tls.key
|
||||
path: tls.key
|
||||
- key: ca.crt
|
||||
path: ca.crt
|
||||
optional: true
|
||||
secretName: argocd-repo-server-tls
|
||||
|
||||
@@ -13,4 +13,3 @@ stringData:
|
||||
name: staging-vcluster
|
||||
server: https://staging-vcluster.staging-vcluster
|
||||
type: Opaque
|
||||
|
||||
|
||||
+11
-11
@@ -19,12 +19,12 @@ applications:
|
||||
plugin:
|
||||
name: helmfile-cmp
|
||||
env:
|
||||
- name: CLUSTER_NAME
|
||||
value: replaceme
|
||||
- name: HELMFILE_ENVIRONMENT
|
||||
value: default
|
||||
- name: HELMFILE_FILE_PATH
|
||||
value: system.yaml.gotmpl
|
||||
- name: CLUSTER_NAME
|
||||
value: replaceme
|
||||
- name: HELMFILE_ENVIRONMENT
|
||||
value: default
|
||||
- name: HELMFILE_FILE_PATH
|
||||
value: system.yaml.gotmpl
|
||||
projects:
|
||||
sys:
|
||||
namespace: argocd
|
||||
@@ -32,12 +32,12 @@ projects:
|
||||
additionalAnnotations: {}
|
||||
description: sys components project
|
||||
sourceRepos:
|
||||
- '*'
|
||||
- "*"
|
||||
destinations:
|
||||
- namespace: '*'
|
||||
server: https://kubernetes.default.svc
|
||||
- namespace: "*"
|
||||
server: https://kubernetes.default.svc
|
||||
clusterResourceWhitelist:
|
||||
- group: '*'
|
||||
kind: '*'
|
||||
- group: "*"
|
||||
kind: "*"
|
||||
orphanedResources:
|
||||
warn: false
|
||||
|
||||
+8
-3
@@ -5,6 +5,8 @@ let
|
||||
|
||||
globalExcludes = [
|
||||
"nix/default.nix"
|
||||
"attic"
|
||||
"vcluster"
|
||||
".*vendor"
|
||||
".*chart/.*"
|
||||
".*schema.json"
|
||||
@@ -32,6 +34,7 @@ pre-commit.run {
|
||||
enable = true;
|
||||
excludes = [
|
||||
"vcluster/"
|
||||
"attic/"
|
||||
];
|
||||
args = [
|
||||
"-x"
|
||||
@@ -41,15 +44,17 @@ pre-commit.run {
|
||||
};
|
||||
|
||||
yamllint = {
|
||||
enable = false;
|
||||
enable = true;
|
||||
excludes = [
|
||||
"attic/"
|
||||
"charts/templates/"
|
||||
"charts/charts/"
|
||||
"charts/"
|
||||
"values/"
|
||||
"vcluster/"
|
||||
];
|
||||
settings = {
|
||||
strict = true;
|
||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }'';
|
||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
+203
-182
@@ -1,183 +1,204 @@
|
||||
groups:
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||
has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||
resources, high network latency, or disruptions by other components and should
|
||||
be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||
>= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||
}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
||||
quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
||||
95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||
space in the next four hours, based on write observations within the past
|
||||
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
||||
be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||
retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary: etcd database size in use is less than 50% of the actual allocated
|
||||
storage.
|
||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
||||
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: etcd
|
||||
rules:
|
||||
- alert: etcdMembersDown
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster members are down.
|
||||
expr: |-
|
||||
max without (endpoint) (
|
||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||
or
|
||||
count without (To) (
|
||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||
)
|
||||
)
|
||||
> 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdInsufficientMembers
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||
}}).'
|
||||
summary: etcd cluster has insufficient number of members.
|
||||
expr:
|
||||
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||
without (instance) + 1) / 2)
|
||||
for: 3m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdNoLeader
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||
has no leader.'
|
||||
summary: etcd cluster has no leader.
|
||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighNumberOfLeaderChanges
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||
resources, high network latency, or disruptions by other components and should
|
||||
be investigated.'
|
||||
summary: etcd cluster has high number of leader changes.
|
||||
expr:
|
||||
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||
>= 4
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of failed grpc requests.
|
||||
expr: |-
|
||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||
/
|
||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||
> 5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdGRPCRequestsSlow
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||
}} method.'
|
||||
summary: etcd grpc requests are slow
|
||||
expr: |-
|
||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdMemberCommunicationSlow
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||
}}.'
|
||||
summary: etcd cluster member communication is slow.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.15
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighNumberOfFailedProposals
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster has high number of proposal failures.
|
||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.5
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdHighFsyncDurations
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 1
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdHighCommitDurations
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||
summary: etcd cluster 99th percentile commit durations are too high.
|
||||
expr: |-
|
||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||
> 0.25
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseQuotaLowSpace
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
||||
quota as the writes to etcd will be disabled when it is full.'
|
||||
summary: etcd cluster database is running full.
|
||||
expr:
|
||||
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
||||
95
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: etcdExcessiveDatabaseGrowth
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||
space in the next four hours, based on write observations within the past
|
||||
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
||||
be disruptive.'
|
||||
summary: etcd cluster database growing very fast.
|
||||
expr:
|
||||
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: etcdDatabaseHighFragmentationRatio
|
||||
annotations:
|
||||
description:
|
||||
'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||
retrieve the unused fragmented disk space.'
|
||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||
summary:
|
||||
etcd database size in use is less than 50% of the actual allocated
|
||||
storage.
|
||||
expr:
|
||||
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
||||
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
+46
-42
@@ -1,43 +1,47 @@
|
||||
groups:
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||
BY (cluster, job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary: An alert that should always be firing to certify that Alertmanager
|
||||
is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert that is used to inhibit info alerts.
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
other alerts.
|
||||
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
||||
severity of 'warning' or 'critical' starts firing on the same namespace.
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||
labels:
|
||||
severity: none
|
||||
- name: general.rules
|
||||
rules:
|
||||
- alert: TargetDown
|
||||
annotations:
|
||||
description:
|
||||
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||
summary: One or more targets are unreachable.
|
||||
expr:
|
||||
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||
BY (cluster, job, namespace, service)) > 10
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: Watchdog
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||
and always fire against a receiver. There are integrations with various notification
|
||||
mechanisms that send a notification when this alert is not firing. For example the
|
||||
"DeadMansSnitch" integration in PagerDuty.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||
summary:
|
||||
An alert that should always be firing to certify that Alertmanager
|
||||
is working properly.
|
||||
expr: vector(1)
|
||||
labels:
|
||||
severity: none
|
||||
- alert: InfoInhibitor
|
||||
annotations:
|
||||
description: |
|
||||
This is an alert that is used to inhibit info alerts.
|
||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||
other alerts.
|
||||
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
||||
severity of 'warning' or 'critical' starts firing on the same namespace.
|
||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||
summary: Info-level alert inhibition.
|
||||
expr:
|
||||
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||
labels:
|
||||
severity: none
|
||||
|
||||
+277
-258
@@ -1,262 +1,281 @@
|
||||
groups:
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
||||
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
||||
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has not
|
||||
been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} is not progressing for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||
summary: Deployment rollout is not progressing.
|
||||
expr: |-
|
||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: StatefulSet has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||
has not been rolled out.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max by (namespace, statefulset) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
- name: kubernetes-apps
|
||||
rules:
|
||||
- alert: KubePodCrashLooping
|
||||
annotations:
|
||||
description:
|
||||
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||
summary: Pod is crash looping.
|
||||
expr:
|
||||
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePodNotReady
|
||||
annotations:
|
||||
description:
|
||||
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||
state for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||
expr: |-
|
||||
sum by (namespace, pod, cluster) (
|
||||
max by (namespace, pod, cluster) (
|
||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
||||
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
||||
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||
)
|
||||
) > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentGenerationMismatch
|
||||
annotations:
|
||||
description:
|
||||
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} does not match, this indicates that the Deployment has failed but has not
|
||||
been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||
summary: Deployment generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||
finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentReplicasMismatch
|
||||
annotations:
|
||||
description:
|
||||
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||
summary: Deployment has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDeploymentRolloutStuck
|
||||
annotations:
|
||||
description:
|
||||
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||
}} is not progressing for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||
summary: Deployment rollout is not progressing.
|
||||
expr: |-
|
||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
||||
!= 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetReplicasMismatch
|
||||
annotations:
|
||||
description:
|
||||
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||
not matched the expected number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||
summary: StatefulSet has not matched the expected number of replicas.
|
||||
expr: |-
|
||||
(
|
||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetGenerationMismatch
|
||||
annotations:
|
||||
description:
|
||||
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||
}} does not match, this indicates that the StatefulSet has failed but has
|
||||
not been rolled back.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||
summary: StatefulSet generation mismatch due to possible roll-back
|
||||
expr: |-
|
||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||
annotations:
|
||||
description:
|
||||
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||
has not been rolled out.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||
summary: StatefulSet update has not been rolled out.
|
||||
expr: |-
|
||||
(
|
||||
max by (namespace, statefulset) (
|
||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
unless
|
||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
*
|
||||
(
|
||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetRolloutStuck
|
||||
annotations:
|
||||
description:
|
||||
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||
finished or progressed for at least 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||
summary: DaemonSet rollout is stuck.
|
||||
expr: |-
|
||||
(
|
||||
(
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description:
|
||||
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr:
|
||||
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
namespace=~".*"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description:
|
||||
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description:
|
||||
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr:
|
||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
0
|
||||
) or (
|
||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
) or (
|
||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
)
|
||||
) and (
|
||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||
==
|
||||
0
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeContainerWaiting
|
||||
annotations:
|
||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||
summary: Pod container waiting longer than 1 hour
|
||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||
namespace=~".*"}) > 0
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetNotScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are not scheduled.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||
summary: DaemonSet pods are not scheduled.
|
||||
expr: |-
|
||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
-
|
||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeDaemonSetMisScheduled
|
||||
annotations:
|
||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||
}} are running where they are not supposed to run.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||
summary: DaemonSet pods are misscheduled.
|
||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||
than {{ "43200" | humanizeDuration }} to complete.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||
Removing failed job after investigation should clear this alert.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobNotCompleted
|
||||
annotations:
|
||||
description:
|
||||
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||
than {{ "43200" | humanizeDuration }} to complete.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||
summary: Job did not complete in time
|
||||
expr: |-
|
||||
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
||||
and
|
||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeJobFailed
|
||||
annotations:
|
||||
description:
|
||||
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||
Removing failed job after investigation should clear this alert.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||
summary: Job failed to complete.
|
||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaReplicasMismatch
|
||||
annotations:
|
||||
description:
|
||||
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has not matched the desired number of replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||
summary: HPA has not matched desired number of replicas.
|
||||
expr: |-
|
||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
!=
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
>
|
||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
<
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||
and
|
||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeHpaMaxedOut
|
||||
annotations:
|
||||
description:
|
||||
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||
has been running at max replicas for longer than 15 minutes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||
summary: HPA is running at max replicas
|
||||
expr: |-
|
||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
==
|
||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
+122
-114
@@ -1,115 +1,123 @@
|
||||
groups:
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||
failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||
}}.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- name: kubernetes-resources
|
||||
rules:
|
||||
- alert: KubeCPUOvercommit
|
||||
annotations:
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryOvercommit
|
||||
annotations:
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||
failure.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
and
|
||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeCPUQuotaOvercommit
|
||||
annotations:
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||
for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||
summary: Cluster has overcommitted CPU resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeMemoryQuotaOvercommit
|
||||
annotations:
|
||||
description:
|
||||
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||
requests for Namespaces.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||
summary: Cluster has overcommitted memory resource requests.
|
||||
expr: |-
|
||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||
/
|
||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||
> 1.5
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubeQuotaAlmostFull
|
||||
annotations:
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||
summary: Namespace quota is going to be full.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 0.9 < 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: KubeQuotaFullyUsed
|
||||
annotations:
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||
summary: Namespace quota is fully used.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
== 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: KubeQuotaExceeded
|
||||
annotations:
|
||||
description:
|
||||
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||
}} of its {{ $labels.resource }} quota.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||
summary: Namespace quota has exceeded the limits.
|
||||
expr: |-
|
||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||
/ ignoring(instance, job, type)
|
||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||
> 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CPUThrottlingHigh
|
||||
annotations:
|
||||
description:
|
||||
"{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||
}}."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||
summary: Processes experience elevated CPU throttling.
|
||||
expr: |-
|
||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
|
||||
/
|
||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
|
||||
> ( 25 / 100 )
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
|
||||
+113
-108
@@ -1,109 +1,114 @@
|
||||
|
||||
groups:
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
||||
| humanizePercentage }} is available.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
||||
{{ $value | humanizePercentage }} of its inodes are free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: kubernetes-storage
|
||||
rules:
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description:
|
||||
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
||||
| humanizePercentage }} is available.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||
summary: PersistentVolume is filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description:
|
||||
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.03
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubePersistentVolumeInodesFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
||||
{{ $value | humanizePercentage }} of its inodes are free.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||
summary: PersistentVolumeInodes are filling up.
|
||||
expr: |-
|
||||
(
|
||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
/
|
||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||
) < 0.15
|
||||
and
|
||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||
and
|
||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||
unless on (cluster, namespace, persistentvolumeclaim)
|
||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: KubePersistentVolumeErrors
|
||||
annotations:
|
||||
description:
|
||||
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||
summary: PersistentVolume is having issues with provisioning.
|
||||
expr:
|
||||
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
+366
-339
@@ -1,340 +1,367 @@
|
||||
groups:
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
> 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description: Node Exporter text file collector on {{ $labels.instance }} failed
|
||||
to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||
is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||
in degraded state due to one or more disks failures. Number of spare drives
|
||||
is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded.
|
||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description: At least one device in RAID array at {{ $labels.instance }} failed.
|
||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array.
|
||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
> 0
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeCPUHighUsage
|
||||
annotations:
|
||||
description: |
|
||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||
summary: High CPU usage.
|
||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
mode!="idle"}[2m]))) * 100 > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: NodeSystemSaturation
|
||||
annotations:
|
||||
description: |
|
||||
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
||||
summary: System saturated, load per core is very high.
|
||||
expr: |-
|
||||
node_load1{job="node-exporter"}
|
||||
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeMemoryMajorPagesFaults
|
||||
annotations:
|
||||
description: |
|
||||
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
Please check that there is enough memory available at this instance.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
||||
summary: Memory major page faults are occurring at very high rate.
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeMemoryHighUtilization
|
||||
annotations:
|
||||
description: |
|
||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||
summary: Host is running out of memory.
|
||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* 100) > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeDiskIOSaturation
|
||||
annotations:
|
||||
description: |
|
||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This symptom might indicate disk saturation.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||
summary: Disk IO queue is high.
|
||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
> 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeSystemdServiceFailed
|
||||
annotations:
|
||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
||||
{{ $labels.instance }}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||
summary: Systemd service has entered failed state.
|
||||
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeBondingDegraded
|
||||
annotations:
|
||||
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||
is in degraded state due to one or more slave failures.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||
summary: Bonding interface is degraded
|
||||
expr: (node_bonding_slaves - node_bonding_active) != 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: node-exporter
|
||||
rules:
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemSpaceFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||
and
|
||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 5% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfSpace
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
space left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||
summary: Filesystem has less than 3% space left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemFilesFillingUp
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left and is filling up fast.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||
and
|
||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 5% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFilesystemAlmostOutOfFiles
|
||||
annotations:
|
||||
description:
|
||||
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||
inodes left.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||
summary: Filesystem has less than 3% inodes left.
|
||||
expr: |-
|
||||
(
|
||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||
and
|
||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||
)
|
||||
for: 1h
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNetworkReceiveErrs
|
||||
annotations:
|
||||
description:
|
||||
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||
summary: Network interface is reporting many receive errors.
|
||||
expr:
|
||||
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeNetworkTransmitErrs
|
||||
annotations:
|
||||
description:
|
||||
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||
summary: Network interface is reporting many transmit errors.
|
||||
expr:
|
||||
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||
> 0.01
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeHighNumberConntrackEntriesUsed
|
||||
annotations:
|
||||
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||
summary: Number of conntrack are getting close to the limit.
|
||||
expr:
|
||||
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||
> 0.75
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeTextFileCollectorScrapeError
|
||||
annotations:
|
||||
description:
|
||||
Node Exporter text file collector on {{ $labels.instance }} failed
|
||||
to scrape.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||
summary: Node Exporter text file collector failed to scrape.
|
||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockSkewDetected
|
||||
annotations:
|
||||
description:
|
||||
Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||
Ensure NTP is configured correctly on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||
summary: Clock skew detected.
|
||||
expr: |-
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||
)
|
||||
or
|
||||
(
|
||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||
and
|
||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||
)
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeClockNotSynchronising
|
||||
annotations:
|
||||
description:
|
||||
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||
is configured on this host.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||
summary: Clock not synchronising.
|
||||
expr: |-
|
||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||
and
|
||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeRAIDDegraded
|
||||
annotations:
|
||||
description:
|
||||
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||
in degraded state due to one or more disks failures. Number of spare drives
|
||||
is insufficient to fix issue automatically.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||
summary: RAID Array is degraded.
|
||||
expr:
|
||||
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||
> 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeRAIDDiskFailure
|
||||
annotations:
|
||||
description:
|
||||
At least one device in RAID array at {{ $labels.instance }} failed.
|
||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||
summary: Failed device in RAID array.
|
||||
expr:
|
||||
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||
> 0
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description:
|
||||
File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeFileDescriptorLimit
|
||||
annotations:
|
||||
description:
|
||||
File descriptors limit at {{ $labels.instance }} is currently at
|
||||
{{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||
expr: |-
|
||||
(
|
||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||
)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeCPUHighUsage
|
||||
annotations:
|
||||
description: |
|
||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||
summary: High CPU usage.
|
||||
expr:
|
||||
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||
mode!="idle"}[2m]))) * 100 > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: info
|
||||
- alert: NodeSystemSaturation
|
||||
annotations:
|
||||
description: |
|
||||
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
||||
summary: System saturated, load per core is very high.
|
||||
expr: |-
|
||||
node_load1{job="node-exporter"}
|
||||
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeMemoryMajorPagesFaults
|
||||
annotations:
|
||||
description: |
|
||||
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
Please check that there is enough memory available at this instance.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
||||
summary: Memory major page faults are occurring at very high rate.
|
||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeMemoryHighUtilization
|
||||
annotations:
|
||||
description: |
|
||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||
summary: Host is running out of memory.
|
||||
expr:
|
||||
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||
* 100) > 90
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeDiskIOSaturation
|
||||
annotations:
|
||||
description: |
|
||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||
This symptom might indicate disk saturation.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||
summary: Disk IO queue is high.
|
||||
expr:
|
||||
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||
> 10
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeSystemdServiceFailed
|
||||
annotations:
|
||||
description:
|
||||
Systemd service {{ $labels.name }} has entered failed state at
|
||||
{{ $labels.instance }}
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||
summary: Systemd service has entered failed state.
|
||||
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: NodeBondingDegraded
|
||||
annotations:
|
||||
description:
|
||||
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||
is in degraded state due to one or more slave failures.
|
||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||
summary: Bonding interface is degraded
|
||||
expr: (node_bonding_slaves - node_bonding_active) != 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
@@ -1,70 +1,76 @@
|
||||
groups:
|
||||
- name: node-resource-utilization.rules
|
||||
rules:
|
||||
- alert: HostHighCpuLoad
|
||||
annotations:
|
||||
description: |-
|
||||
CPU load is > 90%
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighWarning
|
||||
annotations:
|
||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 10% available memory.
|
||||
summary: Node Memory utilization warning
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighCritical
|
||||
annotations:
|
||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 5% available memory.
|
||||
summary: Node Memory utilization critical
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNotReady
|
||||
annotations:
|
||||
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
||||
summary: Node has been in not-ready state for longer than 3 minutes
|
||||
expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
annotations:
|
||||
description: |-
|
||||
Node {{ $labels.node }} has MemoryPressure condition
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||
1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubernetesContainerOomKiller
|
||||
annotations:
|
||||
description: |-
|
||||
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||
== 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
- name: node-resource-utilization.rules
|
||||
rules:
|
||||
- alert: HostHighCpuLoad
|
||||
annotations:
|
||||
description: |-
|
||||
CPU load is > 90%
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||
expr:
|
||||
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 10m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighWarning
|
||||
annotations:
|
||||
dashboard:
|
||||
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 10% available memory.
|
||||
summary: Node Memory utilization warning
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: MemoryUtilizationHighCritical
|
||||
annotations:
|
||||
dashboard:
|
||||
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
description: Node {{ $labels.instance }} has less than 5% available memory.
|
||||
summary: Node Memory utilization critical
|
||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: NodeNotReady
|
||||
annotations:
|
||||
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
||||
summary: Node has been in not-ready state for longer than 3 minutes
|
||||
expr:
|
||||
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubernetesNodeMemoryPressure
|
||||
annotations:
|
||||
description: |-
|
||||
Node {{ $labels.node }} has MemoryPressure condition
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||
expr:
|
||||
kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||
1
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: KubernetesContainerOomKiller
|
||||
annotations:
|
||||
description: |-
|
||||
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||
expr:
|
||||
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||
== 1
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
|
||||
+24
-20
@@ -1,21 +1,25 @@
|
||||
groups:
|
||||
- name: velero
|
||||
rules:
|
||||
- alert: VeleroBackupPartialFailures
|
||||
annotations:
|
||||
message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||
failed backups.
|
||||
expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: VeleroBackupFailures
|
||||
annotations:
|
||||
message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||
backups.
|
||||
expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: velero
|
||||
rules:
|
||||
- alert: VeleroBackupPartialFailures
|
||||
annotations:
|
||||
message:
|
||||
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||
failed backups.
|
||||
expr:
|
||||
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- alert: VeleroBackupFailures
|
||||
annotations:
|
||||
message:
|
||||
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||
backups.
|
||||
expr:
|
||||
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||
> 0.25
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
+51
-45
@@ -1,46 +1,52 @@
|
||||
groups:
|
||||
- name: x509-certificate-exporter.rules
|
||||
rules:
|
||||
- alert: X509ExporterReadErrors
|
||||
annotations:
|
||||
description: Over the last 15 minutes, this x509-certificate-exporter instance
|
||||
has experienced errors reading certificate files or querying the Kubernetes
|
||||
API. This could be caused by a misconfiguration if triggered when the exporter
|
||||
starts.
|
||||
summary: Increasing read errors for x509-certificate-exporter
|
||||
expr: delta(x509_read_errors[15m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateError
|
||||
annotations:
|
||||
description: Certificate could not be decoded {{if $labels.secret_name }} in
|
||||
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
||||
location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate cannot be decoded
|
||||
expr: x509_cert_error > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateRenewal
|
||||
annotations:
|
||||
description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate should be renewed
|
||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateExpiration
|
||||
annotations:
|
||||
description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate is about to expire
|
||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
- name: x509-certificate-exporter.rules
|
||||
rules:
|
||||
- alert: X509ExporterReadErrors
|
||||
annotations:
|
||||
description:
|
||||
Over the last 15 minutes, this x509-certificate-exporter instance
|
||||
has experienced errors reading certificate files or querying the Kubernetes
|
||||
API. This could be caused by a misconfiguration if triggered when the exporter
|
||||
starts.
|
||||
summary: Increasing read errors for x509-certificate-exporter
|
||||
expr: delta(x509_read_errors[15m]) > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateError
|
||||
annotations:
|
||||
description:
|
||||
Certificate could not be decoded {{if $labels.secret_name }} in
|
||||
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
||||
location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate cannot be decoded
|
||||
expr: x509_cert_error > 0
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateRenewal
|
||||
annotations:
|
||||
description:
|
||||
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate should be renewed
|
||||
expr:
|
||||
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: CertificateExpiration
|
||||
annotations:
|
||||
description:
|
||||
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||
summary: Certificate is about to expire
|
||||
expr:
|
||||
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
|
||||
Reference in New Issue
Block a user