fix(rules/bootstrap): Format yaml
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
#!/usr/bin/env bash
|
#!/usr/bin/env bash
|
||||||
# the shebang is ignored, but nice for editors
|
# the shebang is ignored, but nice for editors
|
||||||
watch_file nix/sources.json
|
watch_file nix/sources.json
|
||||||
|
watch_file nix/checks.nix
|
||||||
|
|
||||||
# Load .env file if it exists
|
# Load .env file if it exists
|
||||||
dotenv_if_exists
|
dotenv_if_exists
|
||||||
|
|||||||
+4
-5
@@ -1,6 +1,6 @@
|
|||||||
image:
|
image:
|
||||||
name: alpine/helm:latest
|
name: alpine/helm:latest
|
||||||
entrypoint: [ "/bin/bash", "-c" ]
|
entrypoint: ["/bin/bash", "-c"]
|
||||||
|
|
||||||
stages:
|
stages:
|
||||||
- release
|
- release
|
||||||
@@ -8,9 +8,9 @@ stages:
|
|||||||
release:
|
release:
|
||||||
stage: release
|
stage: release
|
||||||
rules:
|
rules:
|
||||||
- if: '$CI_COMMIT_BRANCH =~ /^main/'
|
- if: "$CI_COMMIT_BRANCH =~ /^main/"
|
||||||
when: always
|
when: always
|
||||||
- when: never
|
- when: never
|
||||||
script:
|
script:
|
||||||
- |
|
- |
|
||||||
cd $CI_PROJECT_DIR
|
cd $CI_PROJECT_DIR
|
||||||
@@ -43,4 +43,3 @@ rebuild:
|
|||||||
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|||||||
@@ -3,16 +3,16 @@ kind: ClusterRole
|
|||||||
metadata:
|
metadata:
|
||||||
name: argocd-cluster-admin
|
name: argocd-cluster-admin
|
||||||
rules:
|
rules:
|
||||||
- apiGroups:
|
- apiGroups:
|
||||||
- '*'
|
- "*"
|
||||||
resources:
|
resources:
|
||||||
- '*'
|
- "*"
|
||||||
verbs:
|
verbs:
|
||||||
- '*'
|
- "*"
|
||||||
- nonResourceURLs:
|
- nonResourceURLs:
|
||||||
- '*'
|
- "*"
|
||||||
verbs:
|
verbs:
|
||||||
- '*'
|
- "*"
|
||||||
---
|
---
|
||||||
apiVersion: rbac.authorization.k8s.io/v1
|
apiVersion: rbac.authorization.k8s.io/v1
|
||||||
kind: ClusterRoleBinding
|
kind: ClusterRoleBinding
|
||||||
@@ -23,9 +23,9 @@ roleRef:
|
|||||||
kind: ClusterRole
|
kind: ClusterRole
|
||||||
name: argocd-cluster-admin
|
name: argocd-cluster-admin
|
||||||
subjects:
|
subjects:
|
||||||
- kind: ServiceAccount
|
- kind: ServiceAccount
|
||||||
name: argocd-cluster-admin
|
name: argocd-cluster-admin
|
||||||
namespace: kube-system
|
namespace: kube-system
|
||||||
---
|
---
|
||||||
apiVersion: v1
|
apiVersion: v1
|
||||||
kind: ServiceAccount
|
kind: ServiceAccount
|
||||||
|
|||||||
@@ -6,5 +6,3 @@ metadata:
|
|||||||
name: cluster-admin-token
|
name: cluster-admin-token
|
||||||
namespace: kube-system
|
namespace: kube-system
|
||||||
type: kubernetes.io/service-account-token
|
type: kubernetes.io/service-account-token
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -10,5 +10,3 @@ metadata:
|
|||||||
name: cluster-ekman
|
name: cluster-ekman
|
||||||
namespace: argocd
|
namespace: argocd
|
||||||
type: Opaque
|
type: Opaque
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ spec:
|
|||||||
init:
|
init:
|
||||||
# Init always happens immediately before generate, but its output is not treated as manifests.
|
# Init always happens immediately before generate, but its output is not treated as manifests.
|
||||||
# This is a good place to, for example, download chart dependencies.
|
# This is a good place to, for example, download chart dependencies.
|
||||||
command: [ /bin/sh ]
|
command: [/bin/sh]
|
||||||
args:
|
args:
|
||||||
- /plugin/init.sh
|
- /plugin/init.sh
|
||||||
# The generate command runs in the Application source directory each time manifests are generated. Standard output
|
# The generate command runs in the Application source directory each time manifests are generated. Standard output
|
||||||
@@ -17,7 +17,7 @@ spec:
|
|||||||
# To write log messages from the command, write them to stderr; they will always be displayed.
|
# To write log messages from the command, write them to stderr; they will always be displayed.
|
||||||
# Error output will be sent to the UI, so avoid printing sensitive information (such as secrets).
|
# Error output will be sent to the UI, so avoid printing sensitive information (such as secrets).
|
||||||
generate:
|
generate:
|
||||||
command: [ /bin/sh ]
|
command: [/bin/sh]
|
||||||
args:
|
args:
|
||||||
- /plugin/generate.sh
|
- /plugin/generate.sh
|
||||||
|
|
||||||
@@ -27,15 +27,15 @@ spec:
|
|||||||
# Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the
|
# Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the
|
||||||
# first (in that order) is evaluated.
|
# first (in that order) is evaluated.
|
||||||
# discover:
|
# discover:
|
||||||
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
|
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
|
||||||
# directory. If there is a match, this plugin may be used for the Application.
|
# directory. If there is a match, this plugin may be used for the Application.
|
||||||
# fileName: "./subdir/s*.yaml"
|
# fileName: "./subdir/s*.yaml"
|
||||||
# find:
|
# find:
|
||||||
# This does the same thing as fileName, but it supports double-star (nested directory) glob patterns.
|
# This does the same thing as fileName, but it supports double-star (nested directory) glob patterns.
|
||||||
# glob: "**/Chart.yaml"
|
# glob: "**/Chart.yaml"
|
||||||
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
|
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
|
||||||
# produce non-empty output to standard out.
|
# produce non-empty output to standard out.
|
||||||
# command: [sh, -c, find . -name env.yaml]
|
# command: [sh, -c, find . -name env.yaml]
|
||||||
# The parameters config describes what parameters the UI should display for an Application. It is up to the user to
|
# The parameters config describes what parameters the UI should display for an Application. It is up to the user to
|
||||||
# actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_
|
# actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_
|
||||||
# inform the "Parameters" tab in the App Details page of the UI.
|
# inform the "Parameters" tab in the App Details page of the UI.
|
||||||
@@ -66,22 +66,21 @@ spec:
|
|||||||
itemType: string
|
itemType: string
|
||||||
collectionType: string
|
collectionType: string
|
||||||
string: ""
|
string: ""
|
||||||
# All the fields above besides "string" apply to both the array and map type parameter announcements.
|
# All the fields above besides 'string' apply to both the array and map type parameter announcements.
|
||||||
# - name: array-param
|
# - name: array-param
|
||||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||||
# array: [default, items]
|
# array: [default, items]
|
||||||
# collectionType: array
|
# collectionType: array
|
||||||
# - name: map-param
|
# - name: map-param
|
||||||
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
# # This field communicates the parameter's default value to the UI. Setting this field is optional.
|
||||||
# map:
|
# map:
|
||||||
# some: value
|
# some: value
|
||||||
# collectionType: map
|
# collectionType: map
|
||||||
# dynamic:
|
# dynamic:
|
||||||
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
|
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
|
||||||
# static parameter announcements list.
|
# static parameter announcements list.
|
||||||
# command: [ /bin/sh, /plugin/get-values.sh ]
|
# command: [ /bin/sh, /plugin/get-values.sh ]
|
||||||
|
|
||||||
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
|
||||||
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
# might have executable files. Set to true only if you trust the CMP plugin authors.
|
||||||
preserveFileMode: false
|
preserveFileMode: false
|
||||||
|
|
||||||
|
|||||||
@@ -45,432 +45,432 @@ spec:
|
|||||||
affinity:
|
affinity:
|
||||||
podAntiAffinity:
|
podAntiAffinity:
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
- podAffinityTerm:
|
- podAffinityTerm:
|
||||||
labelSelector:
|
labelSelector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/name: argocd-repo-server
|
app.kubernetes.io/name: argocd-repo-server
|
||||||
topologyKey: kubernetes.io/hostname
|
topologyKey: kubernetes.io/hostname
|
||||||
weight: 100
|
weight: 100
|
||||||
automountServiceAccountToken: true
|
automountServiceAccountToken: true
|
||||||
containers:
|
containers:
|
||||||
- args:
|
- args:
|
||||||
- /usr/local/bin/argocd-repo-server
|
- /usr/local/bin/argocd-repo-server
|
||||||
- --port=8081
|
- --port=8081
|
||||||
- --metrics-port=8084
|
- --metrics-port=8084
|
||||||
env:
|
env:
|
||||||
- name: ARGOCD_REPO_SERVER_NAME
|
- name: ARGOCD_REPO_SERVER_NAME
|
||||||
value: argocd-repo-server
|
value: argocd-repo-server
|
||||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: timeout.reconciliation
|
key: timeout.reconciliation
|
||||||
name: argocd-cm
|
name: argocd-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.log.format
|
key: reposerver.log.format
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.log.level
|
key: reposerver.log.level
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.parallelism.limit
|
key: reposerver.parallelism.limit
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.listen.address
|
key: reposerver.listen.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.metrics.listen.address
|
key: reposerver.metrics.listen.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.disable.tls
|
key: reposerver.disable.tls
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_MIN_VERSION
|
- name: ARGOCD_TLS_MIN_VERSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.minversion
|
key: reposerver.tls.minversion
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_MAX_VERSION
|
- name: ARGOCD_TLS_MAX_VERSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.maxversion
|
key: reposerver.tls.maxversion
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_CIPHERS
|
- name: ARGOCD_TLS_CIPHERS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.ciphers
|
key: reposerver.tls.ciphers
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.repo.cache.expiration
|
key: reposerver.repo.cache.expiration
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_SERVER
|
- name: REDIS_SERVER
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.server
|
key: redis.server
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_COMPRESSION
|
- name: REDIS_COMPRESSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.compression
|
key: redis.compression
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDISDB
|
- name: REDISDB
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.db
|
key: redis.db
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_USERNAME
|
- name: REDIS_USERNAME
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: redis-username
|
key: redis-username
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_PASSWORD
|
- name: REDIS_PASSWORD
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: auth
|
key: auth
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
- name: REDIS_SENTINEL_USERNAME
|
- name: REDIS_SENTINEL_USERNAME
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: redis-sentinel-username
|
key: redis-sentinel-username
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_SENTINEL_PASSWORD
|
- name: REDIS_SENTINEL_PASSWORD
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: redis-sentinel-password
|
key: redis-sentinel-password
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.default.cache.expiration
|
key: reposerver.default.cache.expiration
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.address
|
key: otlp.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.insecure
|
key: otlp.insecure
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.headers
|
key: otlp.headers
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.max.combined.directory.manifests.size
|
key: reposerver.max.combined.directory.manifests.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.plugin.tar.exclusions
|
key: reposerver.plugin.tar.exclusions
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.allow.oob.symlinks
|
key: reposerver.allow.oob.symlinks
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.streamed.manifest.max.tar.size
|
key: reposerver.streamed.manifest.max.tar.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.streamed.manifest.max.extracted.size
|
key: reposerver.streamed.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.helm.manifest.max.extracted.size
|
key: reposerver.helm.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.enable.git.submodule
|
key: reposerver.enable.git.submodule
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.git.lsremote.parallelism.limit
|
key: reposerver.git.lsremote.parallelism.limit
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.git.request.timeout
|
key: reposerver.git.request.timeout
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
|
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.revision.cache.lock.timeout
|
key: reposerver.revision.cache.lock.timeout
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
|
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.include.hidden.directories
|
key: reposerver.include.hidden.directories
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: HELM_CACHE_HOME
|
- name: HELM_CACHE_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
- name: HELM_CONFIG_HOME
|
- name: HELM_CONFIG_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
- name: HELM_DATA_HOME
|
- name: HELM_DATA_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
image: quay.io/argoproj/argocd:v2.12.3
|
image: quay.io/argoproj/argocd:v2.12.3
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
failureThreshold: 3
|
failureThreshold: 3
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /healthz?full=true
|
path: /healthz?full=true
|
||||||
port: metrics
|
port: metrics
|
||||||
scheme: HTTP
|
scheme: HTTP
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
successThreshold: 1
|
successThreshold: 1
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
name: repo-server
|
|
||||||
ports:
|
|
||||||
- containerPort: 8081
|
|
||||||
name: repo-server
|
name: repo-server
|
||||||
protocol: TCP
|
ports:
|
||||||
- containerPort: 8084
|
- containerPort: 8081
|
||||||
name: metrics
|
name: repo-server
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
readinessProbe:
|
- containerPort: 8084
|
||||||
failureThreshold: 3
|
name: metrics
|
||||||
httpGet:
|
protocol: TCP
|
||||||
path: /healthz
|
readinessProbe:
|
||||||
port: metrics
|
failureThreshold: 3
|
||||||
scheme: HTTP
|
httpGet:
|
||||||
initialDelaySeconds: 10
|
path: /healthz
|
||||||
periodSeconds: 10
|
port: metrics
|
||||||
successThreshold: 1
|
scheme: HTTP
|
||||||
timeoutSeconds: 1
|
initialDelaySeconds: 10
|
||||||
securityContext:
|
periodSeconds: 10
|
||||||
allowPrivilegeEscalation: false
|
successThreshold: 1
|
||||||
capabilities:
|
timeoutSeconds: 1
|
||||||
drop:
|
securityContext:
|
||||||
- ALL
|
allowPrivilegeEscalation: false
|
||||||
readOnlyRootFilesystem: true
|
capabilities:
|
||||||
runAsNonRoot: true
|
drop:
|
||||||
seccompProfile:
|
- ALL
|
||||||
type: RuntimeDefault
|
readOnlyRootFilesystem: true
|
||||||
terminationMessagePath: /dev/termination-log
|
runAsNonRoot: true
|
||||||
terminationMessagePolicy: File
|
seccompProfile:
|
||||||
volumeMounts:
|
type: RuntimeDefault
|
||||||
- mountPath: /app/config/ssh
|
terminationMessagePath: /dev/termination-log
|
||||||
name: ssh-known-hosts
|
terminationMessagePolicy: File
|
||||||
- mountPath: /app/config/tls
|
volumeMounts:
|
||||||
name: tls-certs
|
- mountPath: /app/config/ssh
|
||||||
- mountPath: /app/config/gpg/source
|
name: ssh-known-hosts
|
||||||
name: gpg-keys
|
- mountPath: /app/config/tls
|
||||||
- mountPath: /app/config/gpg/keys
|
name: tls-certs
|
||||||
name: gpg-keyring
|
- mountPath: /app/config/gpg/source
|
||||||
- mountPath: /app/config/reposerver/tls
|
name: gpg-keys
|
||||||
name: argocd-repo-server-tls
|
- mountPath: /app/config/gpg/keys
|
||||||
- mountPath: /helm-working-dir
|
name: gpg-keyring
|
||||||
name: helm-working-dir
|
- mountPath: /app/config/reposerver/tls
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
name: argocd-repo-server-tls
|
||||||
name: plugins
|
- mountPath: /helm-working-dir
|
||||||
- mountPath: /tmp
|
name: helm-working-dir
|
||||||
name: tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- command:
|
name: plugins
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- mountPath: /tmp
|
||||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
name: tmp
|
||||||
imagePullPolicy: Always
|
- command:
|
||||||
name: kustomize-helm-with-rewrite
|
- /var/run/argocd/argocd-cmp-server
|
||||||
securityContext:
|
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||||
runAsNonRoot: true
|
imagePullPolicy: Always
|
||||||
runAsUser: 999
|
name: kustomize-helm-with-rewrite
|
||||||
terminationMessagePath: /dev/termination-log
|
securityContext:
|
||||||
terminationMessagePolicy: File
|
runAsNonRoot: true
|
||||||
volumeMounts:
|
runAsUser: 999
|
||||||
- mountPath: /var/run/argocd
|
terminationMessagePath: /dev/termination-log
|
||||||
name: var-files
|
terminationMessagePolicy: File
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
volumeMounts:
|
||||||
name: plugins
|
- mountPath: /var/run/argocd
|
||||||
- mountPath: /tmp
|
name: var-files
|
||||||
name: cmp-tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- mountPath: /helm-working-dir
|
name: plugins
|
||||||
name: helm-working-dir
|
- mountPath: /tmp
|
||||||
- command:
|
name: cmp-tmp
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- mountPath: /helm-working-dir
|
||||||
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
|
name: helm-working-dir
|
||||||
imagePullPolicy: Always
|
- command:
|
||||||
name: helm-kustomize-cmp
|
- /var/run/argocd/argocd-cmp-server
|
||||||
securityContext:
|
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
|
||||||
runAsNonRoot: true
|
imagePullPolicy: Always
|
||||||
runAsUser: 999
|
name: helm-kustomize-cmp
|
||||||
terminationMessagePath: /dev/termination-log
|
securityContext:
|
||||||
terminationMessagePolicy: File
|
runAsNonRoot: true
|
||||||
volumeMounts:
|
runAsUser: 999
|
||||||
- mountPath: /var/run/argocd
|
terminationMessagePath: /dev/termination-log
|
||||||
name: var-files
|
terminationMessagePolicy: File
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
volumeMounts:
|
||||||
name: plugins
|
- mountPath: /var/run/argocd
|
||||||
- mountPath: /tmp
|
name: var-files
|
||||||
name: cmp-tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- mountPath: /helm-working-dir
|
name: plugins
|
||||||
name: helm-working-dir
|
- mountPath: /tmp
|
||||||
- command:
|
name: cmp-tmp
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- mountPath: /helm-working-dir
|
||||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
name: helm-working-dir
|
||||||
imagePullPolicy: Always
|
- command:
|
||||||
name: helmfile-cmp
|
- /var/run/argocd/argocd-cmp-server
|
||||||
securityContext:
|
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||||
runAsNonRoot: true
|
imagePullPolicy: Always
|
||||||
runAsUser: 999
|
name: helmfile-cmp
|
||||||
terminationMessagePath: /dev/termination-log
|
securityContext:
|
||||||
terminationMessagePolicy: File
|
runAsNonRoot: true
|
||||||
volumeMounts:
|
runAsUser: 999
|
||||||
- mountPath: /var/run/argocd
|
terminationMessagePath: /dev/termination-log
|
||||||
name: var-files
|
terminationMessagePolicy: File
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
volumeMounts:
|
||||||
name: plugins
|
- mountPath: /var/run/argocd
|
||||||
- mountPath: /tmp
|
name: var-files
|
||||||
name: cmp-tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- mountPath: /helm-working-dir
|
name: plugins
|
||||||
name: helm-working-dir
|
- mountPath: /tmp
|
||||||
|
name: cmp-tmp
|
||||||
|
- mountPath: /helm-working-dir
|
||||||
|
name: helm-working-dir
|
||||||
dnsPolicy: ClusterFirst
|
dnsPolicy: ClusterFirst
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
- name: gitlab-pull-secret
|
- name: gitlab-pull-secret
|
||||||
initContainers:
|
initContainers:
|
||||||
- command:
|
- command:
|
||||||
- /bin/cp
|
- /bin/cp
|
||||||
- -n
|
- -n
|
||||||
- /usr/local/bin/argocd
|
- /usr/local/bin/argocd
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- /var/run/argocd/argocd-cmp-server
|
||||||
image: quay.io/argoproj/argocd:v2.12.3
|
image: quay.io/argoproj/argocd:v2.12.3
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
name: copyutil
|
name: copyutil
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
capabilities:
|
capabilities:
|
||||||
drop:
|
drop:
|
||||||
- ALL
|
- ALL
|
||||||
readOnlyRootFilesystem: true
|
readOnlyRootFilesystem: true
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
seccompProfile:
|
seccompProfile:
|
||||||
type: RuntimeDefault
|
type: RuntimeDefault
|
||||||
terminationMessagePath: /dev/termination-log
|
terminationMessagePath: /dev/termination-log
|
||||||
terminationMessagePolicy: File
|
terminationMessagePolicy: File
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /var/run/argocd
|
- mountPath: /var/run/argocd
|
||||||
name: var-files
|
name: var-files
|
||||||
- command:
|
- command:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
- /plugin/init-helm-repos.sh
|
- /plugin/init-helm-repos.sh
|
||||||
env:
|
env:
|
||||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: token
|
key: token
|
||||||
name: oceanbox-helm
|
name: oceanbox-helm
|
||||||
optional: false
|
optional: false
|
||||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
name: init-helm-repos
|
name: init-helm-repos
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
capabilities:
|
capabilities:
|
||||||
drop:
|
drop:
|
||||||
- ALL
|
- ALL
|
||||||
readOnlyRootFilesystem: true
|
readOnlyRootFilesystem: true
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
runAsUser: 999
|
runAsUser: 999
|
||||||
seccompProfile:
|
seccompProfile:
|
||||||
type: RuntimeDefault
|
type: RuntimeDefault
|
||||||
terminationMessagePath: /dev/termination-log
|
terminationMessagePath: /dev/termination-log
|
||||||
terminationMessagePolicy: File
|
terminationMessagePolicy: File
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /helm-working-dir
|
- mountPath: /helm-working-dir
|
||||||
name: helm-working-dir
|
name: helm-working-dir
|
||||||
restartPolicy: Always
|
restartPolicy: Always
|
||||||
schedulerName: default-scheduler
|
schedulerName: default-scheduler
|
||||||
serviceAccount: argocd-repo-server
|
serviceAccount: argocd-repo-server
|
||||||
serviceAccountName: argocd-repo-server
|
serviceAccountName: argocd-repo-server
|
||||||
terminationGracePeriodSeconds: 30
|
terminationGracePeriodSeconds: 30
|
||||||
volumes:
|
volumes:
|
||||||
- name: cmp-tmp
|
- name: cmp-tmp
|
||||||
- name: helm-working-dir
|
- name: helm-working-dir
|
||||||
- name: plugins
|
- name: plugins
|
||||||
- name: var-files
|
- name: var-files
|
||||||
- name: tmp
|
- name: tmp
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-ssh-known-hosts-cm
|
name: argocd-ssh-known-hosts-cm
|
||||||
name: ssh-known-hosts
|
name: ssh-known-hosts
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-tls-certs-cm
|
name: argocd-tls-certs-cm
|
||||||
name: tls-certs
|
name: tls-certs
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-gpg-keys-cm
|
name: argocd-gpg-keys-cm
|
||||||
name: gpg-keys
|
name: gpg-keys
|
||||||
- name: gpg-keyring
|
- name: gpg-keyring
|
||||||
- name: argocd-repo-server-tls
|
- name: argocd-repo-server-tls
|
||||||
secret:
|
secret:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
items:
|
items:
|
||||||
- key: tls.crt
|
- key: tls.crt
|
||||||
path: tls.crt
|
path: tls.crt
|
||||||
- key: tls.key
|
- key: tls.key
|
||||||
path: tls.key
|
path: tls.key
|
||||||
- key: ca.crt
|
- key: ca.crt
|
||||||
path: ca.crt
|
path: ca.crt
|
||||||
optional: true
|
optional: true
|
||||||
secretName: argocd-repo-server-tls
|
secretName: argocd-repo-server-tls
|
||||||
|
|||||||
@@ -4,24 +4,24 @@ spec:
|
|||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
- name: gitlab-pull-secret
|
- name: gitlab-pull-secret
|
||||||
containers:
|
containers:
|
||||||
- command:
|
- command:
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- /var/run/argocd/argocd-cmp-server
|
||||||
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
name: helmfile-cmp
|
name: helmfile-cmp
|
||||||
securityContext:
|
securityContext:
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
runAsUser: 999
|
runAsUser: 999
|
||||||
terminationMessagePath: /dev/termination-log
|
terminationMessagePath: /dev/termination-log
|
||||||
terminationMessagePolicy: File
|
terminationMessagePolicy: File
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /var/run/argocd
|
- mountPath: /var/run/argocd
|
||||||
name: var-files
|
name: var-files
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
name: plugins
|
name: plugins
|
||||||
- mountPath: /tmp
|
- mountPath: /tmp
|
||||||
name: tmp
|
name: tmp
|
||||||
- mountPath: /helm-working-dir
|
- mountPath: /helm-working-dir
|
||||||
name: helm-working-dir
|
name: helm-working-dir
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ metadata:
|
|||||||
name: helmfile-cmp
|
name: helmfile-cmp
|
||||||
spec:
|
spec:
|
||||||
generate:
|
generate:
|
||||||
command: [ "/bin/sh" ]
|
command: ["/bin/sh"]
|
||||||
args:
|
args:
|
||||||
- /plugin/generate.sh
|
- /plugin/generate.sh
|
||||||
lockRepo: false
|
lockRepo: false
|
||||||
|
|||||||
@@ -44,341 +44,341 @@ spec:
|
|||||||
affinity:
|
affinity:
|
||||||
podAntiAffinity:
|
podAntiAffinity:
|
||||||
preferredDuringSchedulingIgnoredDuringExecution:
|
preferredDuringSchedulingIgnoredDuringExecution:
|
||||||
- podAffinityTerm:
|
- podAffinityTerm:
|
||||||
labelSelector:
|
labelSelector:
|
||||||
matchLabels:
|
matchLabels:
|
||||||
app.kubernetes.io/name: argocd-repo-server
|
app.kubernetes.io/name: argocd-repo-server
|
||||||
topologyKey: kubernetes.io/hostname
|
topologyKey: kubernetes.io/hostname
|
||||||
weight: 100
|
weight: 100
|
||||||
containers:
|
containers:
|
||||||
- args:
|
- args:
|
||||||
- /usr/local/bin/argocd-repo-server
|
- /usr/local/bin/argocd-repo-server
|
||||||
- --port=8081
|
- --port=8081
|
||||||
- --metrics-port=8084
|
- --metrics-port=8084
|
||||||
env:
|
env:
|
||||||
- name: ARGOCD_REPO_SERVER_NAME
|
- name: ARGOCD_REPO_SERVER_NAME
|
||||||
value: argocd-repo-server
|
value: argocd-repo-server
|
||||||
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
- name: ARGOCD_RECONCILIATION_TIMEOUT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: timeout.reconciliation
|
key: timeout.reconciliation
|
||||||
name: argocd-cm
|
name: argocd-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
- name: ARGOCD_REPO_SERVER_LOGFORMAT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.log.format
|
key: reposerver.log.format
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
- name: ARGOCD_REPO_SERVER_LOGLEVEL
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.log.level
|
key: reposerver.log.level
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.parallelism.limit
|
key: reposerver.parallelism.limit
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.listen.address
|
key: reposerver.listen.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.metrics.listen.address
|
key: reposerver.metrics.listen.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
- name: ARGOCD_REPO_SERVER_DISABLE_TLS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.disable.tls
|
key: reposerver.disable.tls
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_MIN_VERSION
|
- name: ARGOCD_TLS_MIN_VERSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.minversion
|
key: reposerver.tls.minversion
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_MAX_VERSION
|
- name: ARGOCD_TLS_MAX_VERSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.maxversion
|
key: reposerver.tls.maxversion
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_TLS_CIPHERS
|
- name: ARGOCD_TLS_CIPHERS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.tls.ciphers
|
key: reposerver.tls.ciphers
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
- name: ARGOCD_REPO_CACHE_EXPIRATION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.repo.cache.expiration
|
key: reposerver.repo.cache.expiration
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_SERVER
|
- name: REDIS_SERVER
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.server
|
key: redis.server
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_COMPRESSION
|
- name: REDIS_COMPRESSION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.compression
|
key: redis.compression
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDISDB
|
- name: REDISDB
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: redis.db
|
key: redis.db
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_USERNAME
|
- name: REDIS_USERNAME
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: redis-username
|
key: redis-username
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
optional: true
|
optional: true
|
||||||
- name: REDIS_PASSWORD
|
- name: REDIS_PASSWORD
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: redis-password
|
key: redis-password
|
||||||
name: argocd-redis
|
name: argocd-redis
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.default.cache.expiration
|
key: reposerver.default.cache.expiration
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.address
|
key: otlp.address
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.insecure
|
key: otlp.insecure
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: otlp.headers
|
key: otlp.headers
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.max.combined.directory.manifests.size
|
key: reposerver.max.combined.directory.manifests.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.plugin.tar.exclusions
|
key: reposerver.plugin.tar.exclusions
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.allow.oob.symlinks
|
key: reposerver.allow.oob.symlinks
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.streamed.manifest.max.tar.size
|
key: reposerver.streamed.manifest.max.tar.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.streamed.manifest.max.extracted.size
|
key: reposerver.streamed.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.helm.manifest.max.extracted.size
|
key: reposerver.helm.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.disable.helm.manifest.max.extracted.size
|
key: reposerver.disable.helm.manifest.max.extracted.size
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_MODULES_ENABLED
|
- name: ARGOCD_GIT_MODULES_ENABLED
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.enable.git.submodule
|
key: reposerver.enable.git.submodule
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.git.lsremote.parallelism.limit
|
key: reposerver.git.lsremote.parallelism.limit
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
- name: ARGOCD_GIT_REQUEST_TIMEOUT
|
||||||
valueFrom:
|
valueFrom:
|
||||||
configMapKeyRef:
|
configMapKeyRef:
|
||||||
key: reposerver.git.request.timeout
|
key: reposerver.git.request.timeout
|
||||||
name: argocd-cmd-params-cm
|
name: argocd-cmd-params-cm
|
||||||
optional: true
|
optional: true
|
||||||
- name: HELM_CACHE_HOME
|
- name: HELM_CACHE_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
- name: HELM_CONFIG_HOME
|
- name: HELM_CONFIG_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
- name: HELM_DATA_HOME
|
- name: HELM_DATA_HOME
|
||||||
value: /helm-working-dir
|
value: /helm-working-dir
|
||||||
image: quay.io/argoproj/argocd:v2.10.4
|
image: quay.io/argoproj/argocd:v2.10.4
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
livenessProbe:
|
livenessProbe:
|
||||||
failureThreshold: 3
|
failureThreshold: 3
|
||||||
httpGet:
|
httpGet:
|
||||||
path: /healthz?full=true
|
path: /healthz?full=true
|
||||||
port: metrics
|
port: metrics
|
||||||
scheme: HTTP
|
scheme: HTTP
|
||||||
initialDelaySeconds: 10
|
initialDelaySeconds: 10
|
||||||
periodSeconds: 10
|
periodSeconds: 10
|
||||||
successThreshold: 1
|
successThreshold: 1
|
||||||
timeoutSeconds: 1
|
timeoutSeconds: 1
|
||||||
name: repo-server
|
|
||||||
ports:
|
|
||||||
- containerPort: 8081
|
|
||||||
name: repo-server
|
name: repo-server
|
||||||
protocol: TCP
|
ports:
|
||||||
- containerPort: 8084
|
- containerPort: 8081
|
||||||
name: metrics
|
name: repo-server
|
||||||
protocol: TCP
|
protocol: TCP
|
||||||
readinessProbe:
|
- containerPort: 8084
|
||||||
failureThreshold: 3
|
name: metrics
|
||||||
httpGet:
|
protocol: TCP
|
||||||
path: /healthz
|
readinessProbe:
|
||||||
port: metrics
|
failureThreshold: 3
|
||||||
scheme: HTTP
|
httpGet:
|
||||||
initialDelaySeconds: 10
|
path: /healthz
|
||||||
periodSeconds: 10
|
port: metrics
|
||||||
successThreshold: 1
|
scheme: HTTP
|
||||||
timeoutSeconds: 1
|
initialDelaySeconds: 10
|
||||||
resources: {}
|
periodSeconds: 10
|
||||||
securityContext:
|
successThreshold: 1
|
||||||
allowPrivilegeEscalation: false
|
timeoutSeconds: 1
|
||||||
capabilities:
|
resources: {}
|
||||||
drop:
|
securityContext:
|
||||||
- ALL
|
allowPrivilegeEscalation: false
|
||||||
readOnlyRootFilesystem: true
|
capabilities:
|
||||||
runAsNonRoot: true
|
drop:
|
||||||
seccompProfile:
|
- ALL
|
||||||
type: RuntimeDefault
|
readOnlyRootFilesystem: true
|
||||||
terminationMessagePath: /dev/termination-log
|
runAsNonRoot: true
|
||||||
terminationMessagePolicy: File
|
seccompProfile:
|
||||||
volumeMounts:
|
type: RuntimeDefault
|
||||||
- mountPath: /app/config/ssh
|
terminationMessagePath: /dev/termination-log
|
||||||
name: ssh-known-hosts
|
terminationMessagePolicy: File
|
||||||
- mountPath: /app/config/tls
|
volumeMounts:
|
||||||
name: tls-certs
|
- mountPath: /app/config/ssh
|
||||||
- mountPath: /app/config/gpg/source
|
name: ssh-known-hosts
|
||||||
name: gpg-keys
|
- mountPath: /app/config/tls
|
||||||
- mountPath: /app/config/gpg/keys
|
name: tls-certs
|
||||||
name: gpg-keyring
|
- mountPath: /app/config/gpg/source
|
||||||
- mountPath: /app/config/reposerver/tls
|
name: gpg-keys
|
||||||
name: argocd-repo-server-tls
|
- mountPath: /app/config/gpg/keys
|
||||||
- mountPath: /helm-working-dir
|
name: gpg-keyring
|
||||||
name: helm-working-dir
|
- mountPath: /app/config/reposerver/tls
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
name: argocd-repo-server-tls
|
||||||
name: plugins
|
- mountPath: /helm-working-dir
|
||||||
- mountPath: /tmp
|
name: helm-working-dir
|
||||||
name: tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- command:
|
name: plugins
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- mountPath: /tmp
|
||||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
name: tmp
|
||||||
imagePullPolicy: Always
|
- command:
|
||||||
name: kustomize-helm-with-rewrite
|
- /var/run/argocd/argocd-cmp-server
|
||||||
resources: {}
|
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||||
securityContext:
|
imagePullPolicy: Always
|
||||||
runAsNonRoot: true
|
name: kustomize-helm-with-rewrite
|
||||||
runAsUser: 999
|
resources: {}
|
||||||
terminationMessagePath: /dev/termination-log
|
securityContext:
|
||||||
terminationMessagePolicy: File
|
runAsNonRoot: true
|
||||||
volumeMounts:
|
runAsUser: 999
|
||||||
- mountPath: /var/run/argocd
|
terminationMessagePath: /dev/termination-log
|
||||||
name: var-files
|
terminationMessagePolicy: File
|
||||||
- mountPath: /home/argocd/cmp-server/plugins
|
volumeMounts:
|
||||||
name: plugins
|
- mountPath: /var/run/argocd
|
||||||
- mountPath: /tmp
|
name: var-files
|
||||||
name: cmp-tmp
|
- mountPath: /home/argocd/cmp-server/plugins
|
||||||
- mountPath: /helm-working-dir
|
name: plugins
|
||||||
name: helm-working-dir
|
- mountPath: /tmp
|
||||||
|
name: cmp-tmp
|
||||||
|
- mountPath: /helm-working-dir
|
||||||
|
name: helm-working-dir
|
||||||
dnsPolicy: ClusterFirst
|
dnsPolicy: ClusterFirst
|
||||||
imagePullSecrets:
|
imagePullSecrets:
|
||||||
- name: gitlab-pull-secret
|
- name: gitlab-pull-secret
|
||||||
initContainers:
|
initContainers:
|
||||||
- command:
|
- command:
|
||||||
- /bin/cp
|
- /bin/cp
|
||||||
- -n
|
- -n
|
||||||
- /usr/local/bin/argocd
|
- /usr/local/bin/argocd
|
||||||
- /var/run/argocd/argocd-cmp-server
|
- /var/run/argocd/argocd-cmp-server
|
||||||
image: quay.io/argoproj/argocd:v2.10.4
|
image: quay.io/argoproj/argocd:v2.10.4
|
||||||
imagePullPolicy: IfNotPresent
|
imagePullPolicy: IfNotPresent
|
||||||
name: copyutil
|
name: copyutil
|
||||||
resources: {}
|
resources: {}
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
capabilities:
|
capabilities:
|
||||||
drop:
|
drop:
|
||||||
- ALL
|
- ALL
|
||||||
readOnlyRootFilesystem: true
|
readOnlyRootFilesystem: true
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
seccompProfile:
|
seccompProfile:
|
||||||
type: RuntimeDefault
|
type: RuntimeDefault
|
||||||
terminationMessagePath: /dev/termination-log
|
terminationMessagePath: /dev/termination-log
|
||||||
terminationMessagePolicy: File
|
terminationMessagePolicy: File
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /var/run/argocd
|
- mountPath: /var/run/argocd
|
||||||
name: var-files
|
name: var-files
|
||||||
- command:
|
- command:
|
||||||
- /bin/sh
|
- /bin/sh
|
||||||
- /plugin/init-helm-repos.sh
|
- /plugin/init-helm-repos.sh
|
||||||
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
|
||||||
imagePullPolicy: Always
|
imagePullPolicy: Always
|
||||||
name: init-helm-repos
|
name: init-helm-repos
|
||||||
resources: {}
|
resources: {}
|
||||||
securityContext:
|
securityContext:
|
||||||
allowPrivilegeEscalation: false
|
allowPrivilegeEscalation: false
|
||||||
capabilities:
|
capabilities:
|
||||||
drop:
|
drop:
|
||||||
- ALL
|
- ALL
|
||||||
readOnlyRootFilesystem: true
|
readOnlyRootFilesystem: true
|
||||||
runAsUser: 999
|
runAsUser: 999
|
||||||
runAsNonRoot: true
|
runAsNonRoot: true
|
||||||
seccompProfile:
|
seccompProfile:
|
||||||
type: RuntimeDefault
|
type: RuntimeDefault
|
||||||
terminationMessagePath: /dev/termination-log
|
terminationMessagePath: /dev/termination-log
|
||||||
terminationMessagePolicy: File
|
terminationMessagePolicy: File
|
||||||
env:
|
env:
|
||||||
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
- name: OCEANBOX_HELM_ACCESS_TOKEN
|
||||||
valueFrom:
|
valueFrom:
|
||||||
secretKeyRef:
|
secretKeyRef:
|
||||||
key: token
|
key: token
|
||||||
name: oceanbox-helm
|
name: oceanbox-helm
|
||||||
optional: false
|
optional: false
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- mountPath: /helm-working-dir
|
- mountPath: /helm-working-dir
|
||||||
name: helm-working-dir
|
name: helm-working-dir
|
||||||
restartPolicy: Always
|
restartPolicy: Always
|
||||||
schedulerName: default-scheduler
|
schedulerName: default-scheduler
|
||||||
securityContext: {}
|
securityContext: {}
|
||||||
@@ -386,40 +386,39 @@ spec:
|
|||||||
serviceAccountName: argocd-repo-server
|
serviceAccountName: argocd-repo-server
|
||||||
terminationGracePeriodSeconds: 30
|
terminationGracePeriodSeconds: 30
|
||||||
volumes:
|
volumes:
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: cmp-tmp
|
name: cmp-tmp
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: helm-working-dir
|
name: helm-working-dir
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: plugins
|
name: plugins
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: var-files
|
name: var-files
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: tmp
|
name: tmp
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-ssh-known-hosts-cm
|
name: argocd-ssh-known-hosts-cm
|
||||||
name: ssh-known-hosts
|
name: ssh-known-hosts
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-tls-certs-cm
|
name: argocd-tls-certs-cm
|
||||||
name: tls-certs
|
name: tls-certs
|
||||||
- configMap:
|
- configMap:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
name: argocd-gpg-keys-cm
|
name: argocd-gpg-keys-cm
|
||||||
name: gpg-keys
|
name: gpg-keys
|
||||||
- emptyDir: {}
|
- emptyDir: {}
|
||||||
name: gpg-keyring
|
name: gpg-keyring
|
||||||
- name: argocd-repo-server-tls
|
- name: argocd-repo-server-tls
|
||||||
secret:
|
secret:
|
||||||
defaultMode: 420
|
defaultMode: 420
|
||||||
items:
|
items:
|
||||||
- key: tls.crt
|
- key: tls.crt
|
||||||
path: tls.crt
|
path: tls.crt
|
||||||
- key: tls.key
|
- key: tls.key
|
||||||
path: tls.key
|
path: tls.key
|
||||||
- key: ca.crt
|
- key: ca.crt
|
||||||
path: ca.crt
|
path: ca.crt
|
||||||
optional: true
|
optional: true
|
||||||
secretName: argocd-repo-server-tls
|
secretName: argocd-repo-server-tls
|
||||||
|
|
||||||
|
|||||||
@@ -13,4 +13,3 @@ stringData:
|
|||||||
name: staging-vcluster
|
name: staging-vcluster
|
||||||
server: https://staging-vcluster.staging-vcluster
|
server: https://staging-vcluster.staging-vcluster
|
||||||
type: Opaque
|
type: Opaque
|
||||||
|
|
||||||
|
|||||||
+11
-11
@@ -19,12 +19,12 @@ applications:
|
|||||||
plugin:
|
plugin:
|
||||||
name: helmfile-cmp
|
name: helmfile-cmp
|
||||||
env:
|
env:
|
||||||
- name: CLUSTER_NAME
|
- name: CLUSTER_NAME
|
||||||
value: replaceme
|
value: replaceme
|
||||||
- name: HELMFILE_ENVIRONMENT
|
- name: HELMFILE_ENVIRONMENT
|
||||||
value: default
|
value: default
|
||||||
- name: HELMFILE_FILE_PATH
|
- name: HELMFILE_FILE_PATH
|
||||||
value: system.yaml.gotmpl
|
value: system.yaml.gotmpl
|
||||||
projects:
|
projects:
|
||||||
sys:
|
sys:
|
||||||
namespace: argocd
|
namespace: argocd
|
||||||
@@ -32,12 +32,12 @@ projects:
|
|||||||
additionalAnnotations: {}
|
additionalAnnotations: {}
|
||||||
description: sys components project
|
description: sys components project
|
||||||
sourceRepos:
|
sourceRepos:
|
||||||
- '*'
|
- "*"
|
||||||
destinations:
|
destinations:
|
||||||
- namespace: '*'
|
- namespace: "*"
|
||||||
server: https://kubernetes.default.svc
|
server: https://kubernetes.default.svc
|
||||||
clusterResourceWhitelist:
|
clusterResourceWhitelist:
|
||||||
- group: '*'
|
- group: "*"
|
||||||
kind: '*'
|
kind: "*"
|
||||||
orphanedResources:
|
orphanedResources:
|
||||||
warn: false
|
warn: false
|
||||||
|
|||||||
+8
-3
@@ -5,6 +5,8 @@ let
|
|||||||
|
|
||||||
globalExcludes = [
|
globalExcludes = [
|
||||||
"nix/default.nix"
|
"nix/default.nix"
|
||||||
|
"attic"
|
||||||
|
"vcluster"
|
||||||
".*vendor"
|
".*vendor"
|
||||||
".*chart/.*"
|
".*chart/.*"
|
||||||
".*schema.json"
|
".*schema.json"
|
||||||
@@ -32,6 +34,7 @@ pre-commit.run {
|
|||||||
enable = true;
|
enable = true;
|
||||||
excludes = [
|
excludes = [
|
||||||
"vcluster/"
|
"vcluster/"
|
||||||
|
"attic/"
|
||||||
];
|
];
|
||||||
args = [
|
args = [
|
||||||
"-x"
|
"-x"
|
||||||
@@ -41,15 +44,17 @@ pre-commit.run {
|
|||||||
};
|
};
|
||||||
|
|
||||||
yamllint = {
|
yamllint = {
|
||||||
enable = false;
|
enable = true;
|
||||||
excludes = [
|
excludes = [
|
||||||
"attic/"
|
"attic/"
|
||||||
"charts/templates/"
|
"charts/templates/"
|
||||||
"charts/charts/"
|
"charts/"
|
||||||
|
"values/"
|
||||||
|
"vcluster/"
|
||||||
];
|
];
|
||||||
settings = {
|
settings = {
|
||||||
strict = true;
|
strict = true;
|
||||||
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }'';
|
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
+203
-182
@@ -1,183 +1,204 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: etcd
|
- name: etcd
|
||||||
rules:
|
rules:
|
||||||
- alert: etcdMembersDown
|
- alert: etcdMembersDown
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
description:
|
||||||
}}).'
|
'etcd cluster "{{ $labels.job }}": members are down ({{ $value
|
||||||
summary: etcd cluster members are down.
|
}}).'
|
||||||
expr: |-
|
summary: etcd cluster members are down.
|
||||||
max without (endpoint) (
|
expr: |-
|
||||||
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
max without (endpoint) (
|
||||||
or
|
sum without (instance) (up{job=~".*etcd.*"} == bool 0)
|
||||||
count without (To) (
|
or
|
||||||
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
count without (To) (
|
||||||
)
|
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
|
||||||
)
|
)
|
||||||
> 0
|
)
|
||||||
for: 10m
|
> 0
|
||||||
labels:
|
for: 10m
|
||||||
severity: critical
|
labels:
|
||||||
- alert: etcdInsufficientMembers
|
severity: critical
|
||||||
annotations:
|
- alert: etcdInsufficientMembers
|
||||||
description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
annotations:
|
||||||
}}).'
|
description:
|
||||||
summary: etcd cluster has insufficient number of members.
|
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
|
||||||
expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
}}).'
|
||||||
without (instance) + 1) / 2)
|
summary: etcd cluster has insufficient number of members.
|
||||||
for: 3m
|
expr:
|
||||||
labels:
|
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
|
||||||
severity: critical
|
without (instance) + 1) / 2)
|
||||||
- alert: etcdNoLeader
|
for: 3m
|
||||||
annotations:
|
labels:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
severity: critical
|
||||||
has no leader.'
|
- alert: etcdNoLeader
|
||||||
summary: etcd cluster has no leader.
|
annotations:
|
||||||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
description:
|
||||||
for: 1m
|
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
|
||||||
labels:
|
has no leader.'
|
||||||
severity: critical
|
summary: etcd cluster has no leader.
|
||||||
- alert: etcdHighNumberOfLeaderChanges
|
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
|
||||||
annotations:
|
for: 1m
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
labels:
|
||||||
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
severity: critical
|
||||||
resources, high network latency, or disruptions by other components and should
|
- alert: etcdHighNumberOfLeaderChanges
|
||||||
be investigated.'
|
annotations:
|
||||||
summary: etcd cluster has high number of leader changes.
|
description:
|
||||||
expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
|
||||||
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
within the last 15 minutes. Frequent elections may be a sign of insufficient
|
||||||
>= 4
|
resources, high network latency, or disruptions by other components and should
|
||||||
for: 5m
|
be investigated.'
|
||||||
labels:
|
summary: etcd cluster has high number of leader changes.
|
||||||
severity: warning
|
expr:
|
||||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
|
||||||
annotations:
|
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
>= 4
|
||||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
for: 5m
|
||||||
summary: etcd cluster has high number of failed grpc requests.
|
labels:
|
||||||
expr: |-
|
severity: warning
|
||||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
/
|
annotations:
|
||||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
description:
|
||||||
> 1
|
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||||
for: 10m
|
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||||
labels:
|
summary: etcd cluster has high number of failed grpc requests.
|
||||||
severity: warning
|
expr: |-
|
||||||
- alert: etcdHighNumberOfFailedGRPCRequests
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||||
annotations:
|
/
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||||
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
> 1
|
||||||
summary: etcd cluster has high number of failed grpc requests.
|
for: 10m
|
||||||
expr: |-
|
labels:
|
||||||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
severity: warning
|
||||||
/
|
- alert: etcdHighNumberOfFailedGRPCRequests
|
||||||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
annotations:
|
||||||
> 5
|
description:
|
||||||
for: 5m
|
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
|
||||||
labels:
|
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
|
||||||
severity: critical
|
summary: etcd cluster has high number of failed grpc requests.
|
||||||
- alert: etcdGRPCRequestsSlow
|
expr: |-
|
||||||
annotations:
|
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
/
|
||||||
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
|
||||||
}} method.'
|
> 5
|
||||||
summary: etcd grpc requests are slow
|
for: 5m
|
||||||
expr: |-
|
labels:
|
||||||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
severity: critical
|
||||||
> 0.15
|
- alert: etcdGRPCRequestsSlow
|
||||||
for: 10m
|
annotations:
|
||||||
labels:
|
description:
|
||||||
severity: critical
|
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
|
||||||
- alert: etcdMemberCommunicationSlow
|
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
|
||||||
annotations:
|
}} method.'
|
||||||
description: 'etcd cluster "{{ $labels.job }}": member communication with {{
|
summary: etcd grpc requests are slow
|
||||||
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
expr: |-
|
||||||
}}.'
|
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
|
||||||
summary: etcd cluster member communication is slow.
|
> 0.15
|
||||||
expr: |-
|
for: 10m
|
||||||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
labels:
|
||||||
> 0.15
|
severity: critical
|
||||||
for: 10m
|
- alert: etcdMemberCommunicationSlow
|
||||||
labels:
|
annotations:
|
||||||
severity: warning
|
description:
|
||||||
- alert: etcdHighNumberOfFailedProposals
|
'etcd cluster "{{ $labels.job }}": member communication with {{
|
||||||
annotations:
|
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
|
||||||
description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
}}.'
|
||||||
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
summary: etcd cluster member communication is slow.
|
||||||
summary: etcd cluster has high number of proposal failures.
|
expr: |-
|
||||||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
for: 15m
|
> 0.15
|
||||||
labels:
|
for: 10m
|
||||||
severity: warning
|
labels:
|
||||||
- alert: etcdHighFsyncDurations
|
severity: warning
|
||||||
annotations:
|
- alert: etcdHighNumberOfFailedProposals
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
annotations:
|
||||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
description:
|
||||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
|
||||||
expr: |-
|
within the last 30 minutes on etcd instance {{ $labels.instance }}.'
|
||||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
summary: etcd cluster has high number of proposal failures.
|
||||||
> 0.5
|
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
|
||||||
for: 10m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: etcdHighFsyncDurations
|
- alert: etcdHighFsyncDurations
|
||||||
annotations:
|
annotations:
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
description:
|
||||||
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||||
summary: etcd cluster 99th percentile fsync durations are too high.
|
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
expr: |-
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
expr: |-
|
||||||
> 1
|
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
for: 10m
|
> 0.5
|
||||||
labels:
|
for: 10m
|
||||||
severity: critical
|
labels:
|
||||||
- alert: etcdHighCommitDurations
|
severity: warning
|
||||||
annotations:
|
- alert: etcdHighFsyncDurations
|
||||||
description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
annotations:
|
||||||
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
description:
|
||||||
summary: etcd cluster 99th percentile commit durations are too high.
|
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
|
||||||
expr: |-
|
are {{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
summary: etcd cluster 99th percentile fsync durations are too high.
|
||||||
> 0.25
|
expr: |-
|
||||||
for: 10m
|
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
labels:
|
> 1
|
||||||
severity: warning
|
for: 10m
|
||||||
- alert: etcdDatabaseQuotaLowSpace
|
labels:
|
||||||
annotations:
|
severity: critical
|
||||||
description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
- alert: etcdHighCommitDurations
|
||||||
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
annotations:
|
||||||
quota as the writes to etcd will be disabled when it is full.'
|
description:
|
||||||
summary: etcd cluster database is running full.
|
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
|
||||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
{{ $value }}s on etcd instance {{ $labels.instance }}.'
|
||||||
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
summary: etcd cluster 99th percentile commit durations are too high.
|
||||||
95
|
expr: |-
|
||||||
for: 10m
|
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
|
||||||
labels:
|
> 0.25
|
||||||
severity: critical
|
for: 10m
|
||||||
- alert: etcdExcessiveDatabaseGrowth
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
- alert: etcdDatabaseQuotaLowSpace
|
||||||
space in the next four hours, based on write observations within the past
|
annotations:
|
||||||
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
description:
|
||||||
be disruptive.'
|
'etcd cluster "{{ $labels.job }}": database size exceeds the defined
|
||||||
summary: etcd cluster database growing very fast.
|
quota on etcd instance {{ $labels.instance }}, please defrag or increase the
|
||||||
expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
quota as the writes to etcd will be disabled when it is full.'
|
||||||
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
summary: etcd cluster database is running full.
|
||||||
for: 10m
|
expr:
|
||||||
labels:
|
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
|
||||||
severity: warning
|
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
|
||||||
- alert: etcdDatabaseHighFragmentationRatio
|
95
|
||||||
annotations:
|
for: 10m
|
||||||
description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
|
labels:
|
||||||
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
severity: critical
|
||||||
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
- alert: etcdExcessiveDatabaseGrowth
|
||||||
retrieve the unused fragmented disk space.'
|
annotations:
|
||||||
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
description:
|
||||||
summary: etcd database size in use is less than 50% of the actual allocated
|
'etcd cluster "{{ $labels.job }}": Predicting running out of disk
|
||||||
storage.
|
space in the next four hours, based on write observations within the past
|
||||||
expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
four hours on etcd instance {{ $labels.instance }}, please check as it might
|
||||||
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
be disruptive.'
|
||||||
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
summary: etcd cluster database growing very fast.
|
||||||
for: 10m
|
expr:
|
||||||
labels:
|
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
|
||||||
severity: warning
|
> etcd_server_quota_backend_bytes{job=~".*etcd.*"}
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: etcdDatabaseHighFragmentationRatio
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
'etcd cluster "{{ $labels.job }}": database size in use on instance
|
||||||
|
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
|
||||||
|
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
|
||||||
|
retrieve the unused fragmented disk space.'
|
||||||
|
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
|
||||||
|
summary:
|
||||||
|
etcd database size in use is less than 50% of the actual allocated
|
||||||
|
storage.
|
||||||
|
expr:
|
||||||
|
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
|
||||||
|
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
|
||||||
|
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|||||||
+46
-42
@@ -1,43 +1,47 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: general.rules
|
- name: general.rules
|
||||||
rules:
|
rules:
|
||||||
- alert: TargetDown
|
- alert: TargetDown
|
||||||
annotations:
|
annotations:
|
||||||
description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
description:
|
||||||
}} targets in {{ $labels.namespace }} namespace are down.'
|
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
}} targets in {{ $labels.namespace }} namespace are down.'
|
||||||
summary: One or more targets are unreachable.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
|
||||||
expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
summary: One or more targets are unreachable.
|
||||||
BY (cluster, job, namespace, service)) > 10
|
expr:
|
||||||
for: 10m
|
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
|
||||||
labels:
|
BY (cluster, job, namespace, service)) > 10
|
||||||
severity: warning
|
for: 10m
|
||||||
- alert: Watchdog
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: |
|
- alert: Watchdog
|
||||||
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
annotations:
|
||||||
This alert is always firing, therefore it should always be firing in Alertmanager
|
description: |
|
||||||
and always fire against a receiver. There are integrations with various notification
|
This is an alert meant to ensure that the entire alerting pipeline is functional.
|
||||||
mechanisms that send a notification when this alert is not firing. For example the
|
This alert is always firing, therefore it should always be firing in Alertmanager
|
||||||
"DeadMansSnitch" integration in PagerDuty.
|
and always fire against a receiver. There are integrations with various notification
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
mechanisms that send a notification when this alert is not firing. For example the
|
||||||
summary: An alert that should always be firing to certify that Alertmanager
|
"DeadMansSnitch" integration in PagerDuty.
|
||||||
is working properly.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
|
||||||
expr: vector(1)
|
summary:
|
||||||
labels:
|
An alert that should always be firing to certify that Alertmanager
|
||||||
severity: none
|
is working properly.
|
||||||
- alert: InfoInhibitor
|
expr: vector(1)
|
||||||
annotations:
|
labels:
|
||||||
description: |
|
severity: none
|
||||||
This is an alert that is used to inhibit info alerts.
|
- alert: InfoInhibitor
|
||||||
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
annotations:
|
||||||
other alerts.
|
description: |
|
||||||
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
This is an alert that is used to inhibit info alerts.
|
||||||
severity of 'warning' or 'critical' starts firing on the same namespace.
|
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
|
||||||
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
other alerts.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
|
||||||
summary: Info-level alert inhibition.
|
severity of 'warning' or 'critical' starts firing on the same namespace.
|
||||||
expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
|
||||||
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
|
||||||
labels:
|
summary: Info-level alert inhibition.
|
||||||
severity: none
|
expr:
|
||||||
|
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
|
||||||
|
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
|
||||||
|
labels:
|
||||||
|
severity: none
|
||||||
|
|||||||
+277
-258
@@ -1,262 +1,281 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: kubernetes-apps
|
- name: kubernetes-apps
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePodCrashLooping
|
- alert: KubePodCrashLooping
|
||||||
annotations:
|
annotations:
|
||||||
description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
description:
|
||||||
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
}}) is in waiting state (reason: "CrashLoopBackOff").'
|
||||||
summary: Pod is crash looping.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
|
||||||
expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
summary: Pod is crash looping.
|
||||||
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
expr:
|
||||||
for: 15m
|
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
|
||||||
labels:
|
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
|
||||||
severity: warning
|
for: 15m
|
||||||
- alert: KubePodNotReady
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
- alert: KubePodNotReady
|
||||||
state for longer than 15 minutes.
|
annotations:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
description:
|
||||||
summary: Pod has been in a non-ready state for more than 15 minutes.
|
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
|
||||||
expr: |-
|
state for longer than 15 minutes.
|
||||||
sum by (namespace, pod, cluster) (
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
|
||||||
max by (namespace, pod, cluster) (
|
summary: Pod has been in a non-ready state for more than 15 minutes.
|
||||||
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
expr: |-
|
||||||
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
sum by (namespace, pod, cluster) (
|
||||||
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
max by (namespace, pod, cluster) (
|
||||||
)
|
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
|
||||||
) > 0
|
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
|
||||||
for: 15m
|
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
|
||||||
labels:
|
)
|
||||||
severity: warning
|
) > 0
|
||||||
- alert: KubeDeploymentGenerationMismatch
|
for: 15m
|
||||||
annotations:
|
labels:
|
||||||
description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
severity: warning
|
||||||
}} does not match, this indicates that the Deployment has failed but has not
|
- alert: KubeDeploymentGenerationMismatch
|
||||||
been rolled back.
|
annotations:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
description:
|
||||||
summary: Deployment generation mismatch due to possible roll-back
|
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
expr: |-
|
}} does not match, this indicates that the Deployment has failed but has not
|
||||||
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
been rolled back.
|
||||||
!=
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
|
||||||
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
summary: Deployment generation mismatch due to possible roll-back
|
||||||
for: 15m
|
expr: |-
|
||||||
labels:
|
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||||
severity: warning
|
|
||||||
- alert: KubeDeploymentReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
|
||||||
not matched the expected number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
|
||||||
summary: Deployment has not matched the expected number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
>
|
|
||||||
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) and (
|
|
||||||
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeDeploymentRolloutStuck
|
|
||||||
annotations:
|
|
||||||
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
|
||||||
}} is not progressing for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
|
||||||
summary: Deployment rollout is not progressing.
|
|
||||||
expr: |-
|
|
||||||
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!= 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
|
||||||
not matched the expected number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
|
||||||
summary: StatefulSet has not matched the expected number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
) and (
|
|
||||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
|
||||||
==
|
|
||||||
0
|
|
||||||
)
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetGenerationMismatch
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
|
||||||
}} does not match, this indicates that the StatefulSet has failed but has
|
|
||||||
not been rolled back.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
|
||||||
summary: StatefulSet generation mismatch due to possible roll-back
|
|
||||||
expr: |-
|
|
||||||
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeStatefulSetUpdateNotRolledOut
|
|
||||||
annotations:
|
|
||||||
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
|
||||||
has not been rolled out.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
|
||||||
summary: StatefulSet update has not been rolled out.
|
|
||||||
expr: |-
|
|
||||||
(
|
|
||||||
max by (namespace, statefulset) (
|
|
||||||
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
unless
|
|
||||||
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
)
|
|
||||||
*
|
|
||||||
(
|
|
||||||
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
!=
|
||||||
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||||
)
|
for: 15m
|
||||||
) and (
|
labels:
|
||||||
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
severity: warning
|
||||||
==
|
- alert: KubeDeploymentReplicasMismatch
|
||||||
0
|
annotations:
|
||||||
)
|
description:
|
||||||
for: 15m
|
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
|
||||||
labels:
|
not matched the expected number of replicas for longer than 15 minutes.
|
||||||
severity: warning
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
|
||||||
- alert: KubeDaemonSetRolloutStuck
|
summary: Deployment has not matched the expected number of replicas.
|
||||||
annotations:
|
expr: |-
|
||||||
description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
(
|
||||||
finished or progressed for at least 15 minutes.
|
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
>
|
||||||
summary: DaemonSet rollout is stuck.
|
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
|
||||||
expr: |-
|
) and (
|
||||||
(
|
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||||
(
|
==
|
||||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
0
|
||||||
!=
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeDeploymentRolloutStuck
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
|
||||||
|
}} is not progressing for longer than 15 minutes.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
|
||||||
|
summary: Deployment rollout is not progressing.
|
||||||
|
expr: |-
|
||||||
|
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!= 0
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeStatefulSetReplicasMismatch
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
|
||||||
|
not matched the expected number of replicas for longer than 15 minutes.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
|
||||||
|
summary: StatefulSet has not matched the expected number of replicas.
|
||||||
|
expr: |-
|
||||||
|
(
|
||||||
|
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
) and (
|
||||||
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
|
||||||
|
==
|
||||||
|
0
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeStatefulSetGenerationMismatch
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
|
||||||
|
}} does not match, this indicates that the StatefulSet has failed but has
|
||||||
|
not been rolled back.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
|
||||||
|
summary: StatefulSet generation mismatch due to possible roll-back
|
||||||
|
expr: |-
|
||||||
|
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeStatefulSetUpdateNotRolledOut
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
|
||||||
|
has not been rolled out.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
|
||||||
|
summary: StatefulSet update has not been rolled out.
|
||||||
|
expr: |-
|
||||||
|
(
|
||||||
|
max by (namespace, statefulset) (
|
||||||
|
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
unless
|
||||||
|
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
)
|
||||||
|
*
|
||||||
|
(
|
||||||
|
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
)
|
||||||
|
) and (
|
||||||
|
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||||
|
==
|
||||||
|
0
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeDaemonSetRolloutStuck
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
|
||||||
|
finished or progressed for at least 15 minutes.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
|
||||||
|
summary: DaemonSet rollout is stuck.
|
||||||
|
expr: |-
|
||||||
|
(
|
||||||
|
(
|
||||||
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
) or (
|
||||||
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
0
|
||||||
|
) or (
|
||||||
|
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
) or (
|
||||||
|
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
!=
|
||||||
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
|
)
|
||||||
|
) and (
|
||||||
|
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
||||||
|
==
|
||||||
|
0
|
||||||
|
)
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeContainerWaiting
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
||||||
|
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
||||||
|
summary: Pod container waiting longer than 1 hour
|
||||||
|
expr:
|
||||||
|
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
||||||
|
namespace=~".*"}) > 0
|
||||||
|
for: 1h
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeDaemonSetNotScheduled
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are not scheduled."
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
||||||
|
summary: DaemonSet pods are not scheduled.
|
||||||
|
expr: |-
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
) or (
|
-
|
||||||
|
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
||||||
|
for: 10m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: KubeDaemonSetMisScheduled
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
||||||
|
}} are running where they are not supposed to run."
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
||||||
|
summary: DaemonSet pods are misscheduled.
|
||||||
|
expr:
|
||||||
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
||||||
!=
|
> 0
|
||||||
0
|
for: 15m
|
||||||
) or (
|
labels:
|
||||||
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
severity: warning
|
||||||
!=
|
- alert: KubeJobNotCompleted
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
annotations:
|
||||||
) or (
|
description:
|
||||||
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
|
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
||||||
!=
|
than {{ "43200" | humanizeDuration }} to complete.
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
||||||
)
|
summary: Job did not complete in time
|
||||||
) and (
|
expr: |-
|
||||||
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
|
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
||||||
==
|
and
|
||||||
0
|
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
||||||
)
|
labels:
|
||||||
for: 15m
|
severity: warning
|
||||||
labels:
|
- alert: KubeJobFailed
|
||||||
severity: warning
|
annotations:
|
||||||
- alert: KubeContainerWaiting
|
description:
|
||||||
annotations:
|
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
||||||
description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
|
Removing failed job after investigation should clear this alert.
|
||||||
{{ $labels.container}} has been in waiting state for longer than 1 hour.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
|
summary: Job failed to complete.
|
||||||
summary: Pod container waiting longer than 1 hour
|
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
||||||
expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
|
for: 15m
|
||||||
namespace=~".*"}) > 0
|
labels:
|
||||||
for: 1h
|
severity: warning
|
||||||
labels:
|
- alert: KubeHpaReplicasMismatch
|
||||||
severity: warning
|
annotations:
|
||||||
- alert: KubeDaemonSetNotScheduled
|
description:
|
||||||
annotations:
|
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
has not matched the desired number of replicas for longer than 15 minutes.
|
||||||
}} are not scheduled.'
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
|
summary: HPA has not matched desired number of replicas.
|
||||||
summary: DaemonSet pods are not scheduled.
|
expr: |-
|
||||||
expr: |-
|
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
|
!=
|
||||||
-
|
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||||
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
|
and
|
||||||
for: 10m
|
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
labels:
|
>
|
||||||
severity: warning
|
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||||
- alert: KubeDaemonSetMisScheduled
|
and
|
||||||
annotations:
|
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
|
<
|
||||||
}} are running where they are not supposed to run.'
|
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
|
and
|
||||||
summary: DaemonSet pods are misscheduled.
|
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
||||||
expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
|
for: 15m
|
||||||
> 0
|
labels:
|
||||||
for: 15m
|
severity: warning
|
||||||
labels:
|
- alert: KubeHpaMaxedOut
|
||||||
severity: warning
|
annotations:
|
||||||
- alert: KubeJobNotCompleted
|
description:
|
||||||
annotations:
|
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
|
has been running at max replicas for longer than 15 minutes.
|
||||||
than {{ "43200" | humanizeDuration }} to complete.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
|
summary: HPA is running at max replicas
|
||||||
summary: Job did not complete in time
|
expr: |-
|
||||||
expr: |-
|
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
|
==
|
||||||
and
|
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
||||||
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: KubeJobFailed
|
|
||||||
annotations:
|
|
||||||
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
|
|
||||||
Removing failed job after investigation should clear this alert.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
|
|
||||||
summary: Job failed to complete.
|
|
||||||
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeHpaReplicasMismatch
|
|
||||||
annotations:
|
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
|
||||||
has not matched the desired number of replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
|
|
||||||
summary: HPA has not matched desired number of replicas.
|
|
||||||
expr: |-
|
|
||||||
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
!=
|
|
||||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
>
|
|
||||||
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
<
|
|
||||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
|
|
||||||
and
|
|
||||||
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
- alert: KubeHpaMaxedOut
|
|
||||||
annotations:
|
|
||||||
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
|
|
||||||
has been running at max replicas for longer than 15 minutes.
|
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
|
|
||||||
summary: HPA is running at max replicas
|
|
||||||
expr: |-
|
|
||||||
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
==
|
|
||||||
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
|
|
||||||
for: 15m
|
|
||||||
labels:
|
|
||||||
severity: warning
|
|
||||||
|
|||||||
+122
-114
@@ -1,115 +1,123 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: kubernetes-resources
|
- name: kubernetes-resources
|
||||||
rules:
|
rules:
|
||||||
- alert: KubeCPUOvercommit
|
- alert: KubeCPUOvercommit
|
||||||
annotations:
|
annotations:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
description:
|
||||||
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
|
||||||
expr: |-
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
expr: |-
|
||||||
and
|
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||||
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
and
|
||||||
for: 10m
|
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
|
||||||
labels:
|
for: 10m
|
||||||
severity: warning
|
labels:
|
||||||
- alert: KubeMemoryOvercommit
|
severity: warning
|
||||||
annotations:
|
- alert: KubeMemoryOvercommit
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
annotations:
|
||||||
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
description:
|
||||||
failure.
|
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
failure.
|
||||||
expr: |-
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
|
||||||
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
and
|
expr: |-
|
||||||
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||||
for: 10m
|
and
|
||||||
labels:
|
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
|
||||||
severity: warning
|
for: 10m
|
||||||
- alert: KubeCPUQuotaOvercommit
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
- alert: KubeCPUQuotaOvercommit
|
||||||
for Namespaces.
|
annotations:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
description:
|
||||||
summary: Cluster has overcommitted CPU resource requests.
|
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
|
||||||
expr: |-
|
for Namespaces.
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
|
||||||
/
|
summary: Cluster has overcommitted CPU resource requests.
|
||||||
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
expr: |-
|
||||||
> 1.5
|
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
|
||||||
for: 5m
|
/
|
||||||
labels:
|
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
|
||||||
severity: warning
|
> 1.5
|
||||||
- alert: KubeMemoryQuotaOvercommit
|
for: 5m
|
||||||
annotations:
|
labels:
|
||||||
description: Cluster {{ $labels.cluster }} has overcommitted memory resource
|
severity: warning
|
||||||
requests for Namespaces.
|
- alert: KubeMemoryQuotaOvercommit
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
annotations:
|
||||||
summary: Cluster has overcommitted memory resource requests.
|
description:
|
||||||
expr: |-
|
Cluster {{ $labels.cluster }} has overcommitted memory resource
|
||||||
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
requests for Namespaces.
|
||||||
/
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
|
||||||
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
summary: Cluster has overcommitted memory resource requests.
|
||||||
> 1.5
|
expr: |-
|
||||||
for: 5m
|
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
|
||||||
labels:
|
/
|
||||||
severity: warning
|
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
|
||||||
- alert: KubeQuotaAlmostFull
|
> 1.5
|
||||||
annotations:
|
for: 5m
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
labels:
|
||||||
}} of its {{ $labels.resource }} quota.
|
severity: warning
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
- alert: KubeQuotaAlmostFull
|
||||||
summary: Namespace quota is going to be full.
|
annotations:
|
||||||
expr: |-
|
description:
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
/ ignoring(instance, job, type)
|
}} of its {{ $labels.resource }} quota.
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
|
||||||
> 0.9 < 1
|
summary: Namespace quota is going to be full.
|
||||||
for: 15m
|
expr: |-
|
||||||
labels:
|
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||||
severity: info
|
/ ignoring(instance, job, type)
|
||||||
- alert: KubeQuotaFullyUsed
|
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||||
annotations:
|
> 0.9 < 1
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
for: 15m
|
||||||
}} of its {{ $labels.resource }} quota.
|
labels:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
severity: info
|
||||||
summary: Namespace quota is fully used.
|
- alert: KubeQuotaFullyUsed
|
||||||
expr: |-
|
annotations:
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
description:
|
||||||
/ ignoring(instance, job, type)
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
}} of its {{ $labels.resource }} quota.
|
||||||
== 1
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
|
||||||
for: 15m
|
summary: Namespace quota is fully used.
|
||||||
labels:
|
expr: |-
|
||||||
severity: info
|
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||||
- alert: KubeQuotaExceeded
|
/ ignoring(instance, job, type)
|
||||||
annotations:
|
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||||
description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
== 1
|
||||||
}} of its {{ $labels.resource }} quota.
|
for: 15m
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
labels:
|
||||||
summary: Namespace quota has exceeded the limits.
|
severity: info
|
||||||
expr: |-
|
- alert: KubeQuotaExceeded
|
||||||
kube_resourcequota{job="kube-state-metrics", type="used"}
|
annotations:
|
||||||
/ ignoring(instance, job, type)
|
description:
|
||||||
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
|
||||||
> 1
|
}} of its {{ $labels.resource }} quota.
|
||||||
for: 15m
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
|
||||||
labels:
|
summary: Namespace quota has exceeded the limits.
|
||||||
severity: warning
|
expr: |-
|
||||||
- alert: CPUThrottlingHigh
|
kube_resourcequota{job="kube-state-metrics", type="used"}
|
||||||
annotations:
|
/ ignoring(instance, job, type)
|
||||||
description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
|
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
|
||||||
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
> 1
|
||||||
}}.'
|
for: 15m
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
labels:
|
||||||
summary: Processes experience elevated CPU throttling.
|
severity: warning
|
||||||
expr: |-
|
- alert: CPUThrottlingHigh
|
||||||
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
|
annotations:
|
||||||
/
|
description:
|
||||||
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
|
"{{ $value | humanizePercentage }} throttling of CPU in namespace
|
||||||
> ( 25 / 100 )
|
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
|
||||||
for: 15m
|
}}."
|
||||||
labels:
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
|
||||||
severity: info
|
summary: Processes experience elevated CPU throttling.
|
||||||
|
expr: |-
|
||||||
|
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
|
||||||
|
/
|
||||||
|
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
|
||||||
|
> ( 25 / 100 )
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: info
|
||||||
|
|||||||
+113
-108
@@ -1,109 +1,114 @@
|
|||||||
|
|
||||||
groups:
|
groups:
|
||||||
- name: kubernetes-storage
|
- name: kubernetes-storage
|
||||||
rules:
|
rules:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
- alert: KubePersistentVolumeFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
description:
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
|
||||||
summary: PersistentVolume is filling up.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||||
expr: |-
|
summary: PersistentVolume is filling up.
|
||||||
(
|
expr: |-
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
(
|
||||||
/
|
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
/
|
||||||
) < 0.03
|
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
and
|
) < 0.03
|
||||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
and
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
for: 1m
|
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||||
labels:
|
for: 1m
|
||||||
severity: critical
|
labels:
|
||||||
- alert: KubePersistentVolumeFillingUp
|
severity: critical
|
||||||
annotations:
|
- alert: KubePersistentVolumeFillingUp
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
annotations:
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
description:
|
||||||
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
| humanizePercentage }} is available.
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
|
||||||
summary: PersistentVolume is filling up.
|
| humanizePercentage }} is available.
|
||||||
expr: |-
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
|
||||||
(
|
summary: PersistentVolume is filling up.
|
||||||
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
expr: |-
|
||||||
/
|
(
|
||||||
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
) < 0.15
|
/
|
||||||
and
|
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
) < 0.15
|
||||||
and
|
and
|
||||||
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
and
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||||
for: 1h
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
labels:
|
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||||
severity: warning
|
for: 1h
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
- alert: KubePersistentVolumeInodesFillingUp
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
annotations:
|
||||||
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
description:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
summary: PersistentVolumeInodes are filling up.
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
expr: |-
|
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
|
||||||
(
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
summary: PersistentVolumeInodes are filling up.
|
||||||
/
|
expr: |-
|
||||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
(
|
||||||
) < 0.03
|
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
and
|
/
|
||||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
) < 0.03
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
and
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
for: 1m
|
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||||
labels:
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
severity: critical
|
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||||
- alert: KubePersistentVolumeInodesFillingUp
|
for: 1m
|
||||||
annotations:
|
labels:
|
||||||
description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
severity: critical
|
||||||
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
- alert: KubePersistentVolumeInodesFillingUp
|
||||||
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
annotations:
|
||||||
{{ $value | humanizePercentage }} of its inodes are free.
|
description:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
|
||||||
summary: PersistentVolumeInodes are filling up.
|
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
|
||||||
expr: |-
|
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently
|
||||||
(
|
{{ $value | humanizePercentage }} of its inodes are free.
|
||||||
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
|
||||||
/
|
summary: PersistentVolumeInodes are filling up.
|
||||||
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
expr: |-
|
||||||
) < 0.15
|
(
|
||||||
and
|
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
/
|
||||||
and
|
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
|
||||||
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
) < 0.15
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
and
|
||||||
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
|
||||||
unless on (cluster, namespace, persistentvolumeclaim)
|
and
|
||||||
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
|
||||||
for: 1h
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
labels:
|
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
|
||||||
severity: warning
|
unless on (cluster, namespace, persistentvolumeclaim)
|
||||||
- alert: KubePersistentVolumeErrors
|
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
|
||||||
annotations:
|
for: 1h
|
||||||
description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
labels:
|
||||||
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
severity: warning
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
- alert: KubePersistentVolumeErrors
|
||||||
summary: PersistentVolume is having issues with provisioning.
|
annotations:
|
||||||
expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
description:
|
||||||
> 0
|
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
|
||||||
for: 5m
|
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
|
||||||
labels:
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
|
||||||
severity: critical
|
summary: PersistentVolume is having issues with provisioning.
|
||||||
|
expr:
|
||||||
|
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
|
||||||
|
> 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
|||||||
+366
-339
@@ -1,340 +1,367 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: node-exporter
|
- name: node-exporter
|
||||||
rules:
|
rules:
|
||||||
- alert: NodeFilesystemSpaceFillingUp
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
annotations:
|
annotations:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
description:
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
space left and is filling up.
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
space left and is filling up.
|
||||||
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
expr: |-
|
summary: Filesystem is predicted to run out of space within the next 24 hours.
|
||||||
(
|
expr: |-
|
||||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
(
|
||||||
and
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
|
||||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
and
|
||||||
and
|
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
and
|
||||||
)
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
for: 1h
|
)
|
||||||
labels:
|
for: 1h
|
||||||
severity: warning
|
labels:
|
||||||
- alert: NodeFilesystemSpaceFillingUp
|
severity: warning
|
||||||
annotations:
|
- alert: NodeFilesystemSpaceFillingUp
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
annotations:
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
description:
|
||||||
space left and is filling up fast.
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
space left and is filling up fast.
|
||||||
expr: |-
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
|
||||||
(
|
summary: Filesystem is predicted to run out of space within the next 4 hours.
|
||||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
expr: |-
|
||||||
and
|
(
|
||||||
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
|
||||||
and
|
and
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||||
)
|
and
|
||||||
for: 1h
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
labels:
|
)
|
||||||
severity: critical
|
for: 1h
|
||||||
- alert: NodeFilesystemAlmostOutOfSpace
|
labels:
|
||||||
annotations:
|
severity: critical
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
annotations:
|
||||||
space left.
|
description:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
summary: Filesystem has less than 5% space left.
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
expr: |-
|
space left.
|
||||||
(
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
summary: Filesystem has less than 5% space left.
|
||||||
and
|
expr: |-
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
(
|
||||||
)
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||||
for: 30m
|
and
|
||||||
labels:
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
severity: warning
|
)
|
||||||
- alert: NodeFilesystemAlmostOutOfSpace
|
for: 30m
|
||||||
annotations:
|
labels:
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
severity: warning
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
- alert: NodeFilesystemAlmostOutOfSpace
|
||||||
space left.
|
annotations:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
description:
|
||||||
summary: Filesystem has less than 3% space left.
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
expr: |-
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
(
|
space left.
|
||||||
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
|
||||||
and
|
summary: Filesystem has less than 3% space left.
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
expr: |-
|
||||||
)
|
(
|
||||||
for: 30m
|
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||||
labels:
|
and
|
||||||
severity: critical
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
- alert: NodeFilesystemFilesFillingUp
|
)
|
||||||
annotations:
|
for: 30m
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
labels:
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
severity: critical
|
||||||
inodes left and is filling up.
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
annotations:
|
||||||
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
description:
|
||||||
expr: |-
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
(
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
inodes left and is filling up.
|
||||||
and
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
summary: Filesystem is predicted to run out of inodes within the next 24 hours.
|
||||||
and
|
expr: |-
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
(
|
||||||
)
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
|
||||||
for: 1h
|
and
|
||||||
labels:
|
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
|
||||||
severity: warning
|
and
|
||||||
- alert: NodeFilesystemFilesFillingUp
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
annotations:
|
)
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
for: 1h
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
labels:
|
||||||
inodes left and is filling up fast.
|
severity: warning
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
- alert: NodeFilesystemFilesFillingUp
|
||||||
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
annotations:
|
||||||
expr: |-
|
description:
|
||||||
(
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
and
|
inodes left and is filling up fast.
|
||||||
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
|
||||||
and
|
summary: Filesystem is predicted to run out of inodes within the next 4 hours.
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
expr: |-
|
||||||
)
|
(
|
||||||
for: 1h
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
|
||||||
labels:
|
and
|
||||||
severity: critical
|
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
|
||||||
- alert: NodeFilesystemAlmostOutOfFiles
|
and
|
||||||
annotations:
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
)
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
for: 1h
|
||||||
inodes left.
|
labels:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
severity: critical
|
||||||
summary: Filesystem has less than 5% inodes left.
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
expr: |-
|
annotations:
|
||||||
(
|
description:
|
||||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
and
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
inodes left.
|
||||||
)
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
for: 1h
|
summary: Filesystem has less than 5% inodes left.
|
||||||
labels:
|
expr: |-
|
||||||
severity: warning
|
(
|
||||||
- alert: NodeFilesystemAlmostOutOfFiles
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
|
||||||
annotations:
|
and
|
||||||
description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
)
|
||||||
inodes left.
|
for: 1h
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
labels:
|
||||||
summary: Filesystem has less than 3% inodes left.
|
severity: warning
|
||||||
expr: |-
|
- alert: NodeFilesystemAlmostOutOfFiles
|
||||||
(
|
annotations:
|
||||||
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
description:
|
||||||
and
|
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
|
||||||
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
|
||||||
)
|
inodes left.
|
||||||
for: 1h
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
|
||||||
labels:
|
summary: Filesystem has less than 3% inodes left.
|
||||||
severity: critical
|
expr: |-
|
||||||
- alert: NodeNetworkReceiveErrs
|
(
|
||||||
annotations:
|
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
|
||||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
and
|
||||||
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
)
|
||||||
summary: Network interface is reporting many receive errors.
|
for: 1h
|
||||||
expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
labels:
|
||||||
> 0.01
|
severity: critical
|
||||||
for: 1h
|
- alert: NodeNetworkReceiveErrs
|
||||||
labels:
|
annotations:
|
||||||
severity: warning
|
description:
|
||||||
- alert: NodeNetworkTransmitErrs
|
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||||
annotations:
|
{{ printf "%.0f" $value }} receive errors in the last two minutes.'
|
||||||
description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
|
||||||
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
summary: Network interface is reporting many receive errors.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
expr:
|
||||||
summary: Network interface is reporting many transmit errors.
|
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
|
||||||
expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
> 0.01
|
||||||
> 0.01
|
for: 1h
|
||||||
for: 1h
|
labels:
|
||||||
labels:
|
severity: warning
|
||||||
severity: warning
|
- alert: NodeNetworkTransmitErrs
|
||||||
- alert: NodeHighNumberConntrackEntriesUsed
|
annotations:
|
||||||
annotations:
|
description:
|
||||||
description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
|
'{{ $labels.instance }} interface {{ $labels.device }} has encountered
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
{{ printf "%.0f" $value }} transmit errors in the last two minutes.'
|
||||||
summary: Number of conntrack are getting close to the limit.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
|
||||||
expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
summary: Network interface is reporting many transmit errors.
|
||||||
> 0.75
|
expr:
|
||||||
labels:
|
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
|
||||||
severity: warning
|
> 0.01
|
||||||
- alert: NodeTextFileCollectorScrapeError
|
for: 1h
|
||||||
annotations:
|
labels:
|
||||||
description: Node Exporter text file collector on {{ $labels.instance }} failed
|
severity: warning
|
||||||
to scrape.
|
- alert: NodeHighNumberConntrackEntriesUsed
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
annotations:
|
||||||
summary: Node Exporter text file collector failed to scrape.
|
description: "{{ $value | humanizePercentage }} of conntrack entries are used."
|
||||||
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
|
||||||
labels:
|
summary: Number of conntrack are getting close to the limit.
|
||||||
severity: warning
|
expr:
|
||||||
- alert: NodeClockSkewDetected
|
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
|
||||||
annotations:
|
> 0.75
|
||||||
description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
labels:
|
||||||
Ensure NTP is configured correctly on this host.
|
severity: warning
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
- alert: NodeTextFileCollectorScrapeError
|
||||||
summary: Clock skew detected.
|
annotations:
|
||||||
expr: |-
|
description:
|
||||||
(
|
Node Exporter text file collector on {{ $labels.instance }} failed
|
||||||
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
to scrape.
|
||||||
and
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
|
||||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
summary: Node Exporter text file collector failed to scrape.
|
||||||
)
|
expr: node_textfile_scrape_error{job="node-exporter"} == 1
|
||||||
or
|
labels:
|
||||||
(
|
severity: warning
|
||||||
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
- alert: NodeClockSkewDetected
|
||||||
and
|
annotations:
|
||||||
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
description:
|
||||||
)
|
Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
|
||||||
for: 10m
|
Ensure NTP is configured correctly on this host.
|
||||||
labels:
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
|
||||||
severity: warning
|
summary: Clock skew detected.
|
||||||
- alert: NodeClockNotSynchronising
|
expr: |-
|
||||||
annotations:
|
(
|
||||||
description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
node_timex_offset_seconds{job="node-exporter"} > 0.05
|
||||||
is configured on this host.
|
and
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
|
||||||
summary: Clock not synchronising.
|
)
|
||||||
expr: |-
|
or
|
||||||
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
(
|
||||||
and
|
node_timex_offset_seconds{job="node-exporter"} < -0.05
|
||||||
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
and
|
||||||
for: 10m
|
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
|
||||||
labels:
|
)
|
||||||
severity: warning
|
for: 10m
|
||||||
- alert: NodeRAIDDegraded
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
- alert: NodeClockNotSynchronising
|
||||||
in degraded state due to one or more disks failures. Number of spare drives
|
annotations:
|
||||||
is insufficient to fix issue automatically.
|
description:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
|
||||||
summary: RAID Array is degraded.
|
is configured on this host.
|
||||||
expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
|
||||||
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
summary: Clock not synchronising.
|
||||||
> 0
|
expr: |-
|
||||||
for: 15m
|
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
|
||||||
labels:
|
and
|
||||||
severity: critical
|
node_timex_maxerror_seconds{job="node-exporter"} >= 16
|
||||||
- alert: NodeRAIDDiskFailure
|
for: 10m
|
||||||
annotations:
|
labels:
|
||||||
description: At least one device in RAID array at {{ $labels.instance }} failed.
|
severity: warning
|
||||||
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
- alert: NodeRAIDDegraded
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
annotations:
|
||||||
summary: Failed device in RAID array.
|
description:
|
||||||
expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
|
||||||
> 0
|
in degraded state due to one or more disks failures. Number of spare drives
|
||||||
labels:
|
is insufficient to fix issue automatically.
|
||||||
severity: warning
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
|
||||||
- alert: NodeFileDescriptorLimit
|
summary: RAID Array is degraded.
|
||||||
annotations:
|
expr:
|
||||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||||
{{ printf "%.2f" $value }}%.
|
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
> 0
|
||||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
for: 15m
|
||||||
expr: |-
|
labels:
|
||||||
(
|
severity: critical
|
||||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
- alert: NodeRAIDDiskFailure
|
||||||
)
|
annotations:
|
||||||
for: 15m
|
description:
|
||||||
labels:
|
At least one device in RAID array at {{ $labels.instance }} failed.
|
||||||
severity: warning
|
Array '{{ $labels.device }}' needs attention and possibly a disk swap.
|
||||||
- alert: NodeFileDescriptorLimit
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
|
||||||
annotations:
|
summary: Failed device in RAID array.
|
||||||
description: File descriptors limit at {{ $labels.instance }} is currently at
|
expr:
|
||||||
{{ printf "%.2f" $value }}%.
|
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
> 0
|
||||||
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
labels:
|
||||||
expr: |-
|
severity: warning
|
||||||
(
|
- alert: NodeFileDescriptorLimit
|
||||||
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
annotations:
|
||||||
)
|
description:
|
||||||
for: 15m
|
File descriptors limit at {{ $labels.instance }} is currently at
|
||||||
labels:
|
{{ printf "%.2f" $value }}%.
|
||||||
severity: critical
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
- alert: NodeCPUHighUsage
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
annotations:
|
expr: |-
|
||||||
description: |
|
(
|
||||||
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
)
|
||||||
summary: High CPU usage.
|
for: 15m
|
||||||
expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
labels:
|
||||||
mode!="idle"}[2m]))) * 100 > 90
|
severity: warning
|
||||||
for: 15m
|
- alert: NodeFileDescriptorLimit
|
||||||
labels:
|
annotations:
|
||||||
severity: info
|
description:
|
||||||
- alert: NodeSystemSaturation
|
File descriptors limit at {{ $labels.instance }} is currently at
|
||||||
annotations:
|
{{ printf "%.2f" $value }}%.
|
||||||
description: |
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
|
||||||
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
summary: Kernel is predicted to exhaust file descriptors limit soon.
|
||||||
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
expr: |-
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
(
|
||||||
summary: System saturated, load per core is very high.
|
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
|
||||||
expr: |-
|
)
|
||||||
node_load1{job="node-exporter"}
|
for: 15m
|
||||||
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
labels:
|
||||||
for: 15m
|
severity: critical
|
||||||
labels:
|
- alert: NodeCPUHighUsage
|
||||||
severity: warning
|
annotations:
|
||||||
- alert: NodeMemoryMajorPagesFaults
|
description: |
|
||||||
annotations:
|
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||||
description: |
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
|
||||||
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
summary: High CPU usage.
|
||||||
Please check that there is enough memory available at this instance.
|
expr:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
|
||||||
summary: Memory major page faults are occurring at very high rate.
|
mode!="idle"}[2m]))) * 100 > 90
|
||||||
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
for: 15m
|
||||||
for: 15m
|
labels:
|
||||||
labels:
|
severity: info
|
||||||
severity: warning
|
- alert: NodeSystemSaturation
|
||||||
- alert: NodeMemoryHighUtilization
|
annotations:
|
||||||
annotations:
|
description: |
|
||||||
description: |
|
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||||
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
|
||||||
summary: Host is running out of memory.
|
summary: System saturated, load per core is very high.
|
||||||
expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
expr: |-
|
||||||
* 100) > 90
|
node_load1{job="node-exporter"}
|
||||||
for: 15m
|
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
|
||||||
labels:
|
for: 15m
|
||||||
severity: warning
|
labels:
|
||||||
- alert: NodeDiskIOSaturation
|
severity: warning
|
||||||
annotations:
|
- alert: NodeMemoryMajorPagesFaults
|
||||||
description: |
|
annotations:
|
||||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
|
description: |
|
||||||
This symptom might indicate disk saturation.
|
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
Please check that there is enough memory available at this instance.
|
||||||
summary: Disk IO queue is high.
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
|
||||||
expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
summary: Memory major page faults are occurring at very high rate.
|
||||||
> 10
|
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
|
||||||
for: 30m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: NodeSystemdServiceFailed
|
- alert: NodeMemoryHighUtilization
|
||||||
annotations:
|
annotations:
|
||||||
description: Systemd service {{ $labels.name }} has entered failed state at
|
description: |
|
||||||
{{ $labels.instance }}
|
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
|
||||||
summary: Systemd service has entered failed state.
|
summary: Host is running out of memory.
|
||||||
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
expr:
|
||||||
for: 5m
|
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
|
||||||
labels:
|
* 100) > 90
|
||||||
severity: warning
|
for: 15m
|
||||||
- alert: NodeBondingDegraded
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
- alert: NodeDiskIOSaturation
|
||||||
is in degraded state due to one or more slave failures.
|
annotations:
|
||||||
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
description: |
|
||||||
summary: Bonding interface is degraded
|
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
|
||||||
expr: (node_bonding_slaves - node_bonding_active) != 0
|
This symptom might indicate disk saturation.
|
||||||
for: 5m
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
|
||||||
labels:
|
summary: Disk IO queue is high.
|
||||||
severity: warning
|
expr:
|
||||||
|
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
|
||||||
|
> 10
|
||||||
|
for: 30m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeSystemdServiceFailed
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
Systemd service {{ $labels.name }} has entered failed state at
|
||||||
|
{{ $labels.instance }}
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
|
||||||
|
summary: Systemd service has entered failed state.
|
||||||
|
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
- alert: NodeBondingDegraded
|
||||||
|
annotations:
|
||||||
|
description:
|
||||||
|
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
|
||||||
|
is in degraded state due to one or more slave failures.
|
||||||
|
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
|
||||||
|
summary: Bonding interface is degraded
|
||||||
|
expr: (node_bonding_slaves - node_bonding_active) != 0
|
||||||
|
for: 5m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|||||||
@@ -1,70 +1,76 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: node-resource-utilization.rules
|
- name: node-resource-utilization.rules
|
||||||
rules:
|
rules:
|
||||||
- alert: HostHighCpuLoad
|
- alert: HostHighCpuLoad
|
||||||
annotations:
|
annotations:
|
||||||
description: |-
|
description: |-
|
||||||
CPU load is > 90%
|
CPU load is > 90%
|
||||||
VALUE = {{ $value }}
|
VALUE = {{ $value }}
|
||||||
LABELS = {{ $labels }}
|
LABELS = {{ $labels }}
|
||||||
summary: Host high CPU load (instance {{ $labels.instance }})
|
summary: Host high CPU load (instance {{ $labels.instance }})
|
||||||
expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
expr:
|
||||||
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
|
||||||
for: 10m
|
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||||
labels:
|
for: 10m
|
||||||
severity: critical
|
labels:
|
||||||
- alert: MemoryUtilizationHighWarning
|
severity: critical
|
||||||
annotations:
|
- alert: MemoryUtilizationHighWarning
|
||||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
annotations:
|
||||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
dashboard:
|
||||||
description: Node {{ $labels.instance }} has less than 10% available memory.
|
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||||
summary: Node Memory utilization warning
|
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
description: Node {{ $labels.instance }} has less than 10% available memory.
|
||||||
for: 5m
|
summary: Node Memory utilization warning
|
||||||
labels:
|
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
|
||||||
severity: critical
|
for: 5m
|
||||||
- alert: MemoryUtilizationHighCritical
|
labels:
|
||||||
annotations:
|
severity: critical
|
||||||
dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
- alert: MemoryUtilizationHighCritical
|
||||||
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
annotations:
|
||||||
description: Node {{ $labels.instance }} has less than 5% available memory.
|
dashboard:
|
||||||
summary: Node Memory utilization critical
|
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
|
||||||
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||||
for: 1m
|
description: Node {{ $labels.instance }} has less than 5% available memory.
|
||||||
labels:
|
summary: Node Memory utilization critical
|
||||||
severity: critical
|
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
|
||||||
- alert: NodeNotReady
|
for: 1m
|
||||||
annotations:
|
labels:
|
||||||
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
severity: critical
|
||||||
summary: Node has been in not-ready state for longer than 3 minutes
|
- alert: NodeNotReady
|
||||||
expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
annotations:
|
||||||
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
description: Node {{ $labels.node }} has CPU utilization over 90%.
|
||||||
> 0
|
summary: Node has been in not-ready state for longer than 3 minutes
|
||||||
for: 5m
|
expr:
|
||||||
labels:
|
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
|
||||||
severity: critical
|
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
|
||||||
- alert: KubernetesNodeMemoryPressure
|
> 0
|
||||||
annotations:
|
for: 5m
|
||||||
description: |-
|
labels:
|
||||||
Node {{ $labels.node }} has MemoryPressure condition
|
severity: critical
|
||||||
VALUE = {{ $value }}
|
- alert: KubernetesNodeMemoryPressure
|
||||||
LABELS = {{ $labels }}
|
annotations:
|
||||||
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
description: |-
|
||||||
expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
Node {{ $labels.node }} has MemoryPressure condition
|
||||||
1
|
VALUE = {{ $value }}
|
||||||
for: 2m
|
LABELS = {{ $labels }}
|
||||||
labels:
|
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
|
||||||
severity: critical
|
expr:
|
||||||
- alert: KubernetesContainerOomKiller
|
kube_node_status_condition{condition="MemoryPressure",status="true"} ==
|
||||||
annotations:
|
1
|
||||||
description: |-
|
for: 2m
|
||||||
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
labels:
|
||||||
VALUE = {{ $value }}
|
severity: critical
|
||||||
LABELS = {{ $labels }}
|
- alert: KubernetesContainerOomKiller
|
||||||
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
annotations:
|
||||||
expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
description: |-
|
||||||
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
|
||||||
== 1
|
VALUE = {{ $value }}
|
||||||
for: 0m
|
LABELS = {{ $labels }}
|
||||||
labels:
|
summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
|
||||||
severity: warning
|
expr:
|
||||||
|
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
|
||||||
|
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
|
||||||
|
== 1
|
||||||
|
for: 0m
|
||||||
|
labels:
|
||||||
|
severity: warning
|
||||||
|
|||||||
+24
-20
@@ -1,21 +1,25 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: velero
|
- name: velero
|
||||||
rules:
|
rules:
|
||||||
- alert: VeleroBackupPartialFailures
|
- alert: VeleroBackupPartialFailures
|
||||||
annotations:
|
annotations:
|
||||||
message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
message:
|
||||||
failed backups.
|
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
|
||||||
expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
failed backups.
|
||||||
> 0.25
|
expr:
|
||||||
for: 15m
|
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||||
labels:
|
> 0.25
|
||||||
severity: critical
|
for: 15m
|
||||||
- alert: VeleroBackupFailures
|
labels:
|
||||||
annotations:
|
severity: critical
|
||||||
message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
- alert: VeleroBackupFailures
|
||||||
backups.
|
annotations:
|
||||||
expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
message:
|
||||||
> 0.25
|
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
|
||||||
for: 15m
|
backups.
|
||||||
labels:
|
expr:
|
||||||
severity: critical
|
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
|
||||||
|
> 0.25
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
|||||||
+51
-45
@@ -1,46 +1,52 @@
|
|||||||
groups:
|
groups:
|
||||||
- name: x509-certificate-exporter.rules
|
- name: x509-certificate-exporter.rules
|
||||||
rules:
|
rules:
|
||||||
- alert: X509ExporterReadErrors
|
- alert: X509ExporterReadErrors
|
||||||
annotations:
|
annotations:
|
||||||
description: Over the last 15 minutes, this x509-certificate-exporter instance
|
description:
|
||||||
has experienced errors reading certificate files or querying the Kubernetes
|
Over the last 15 minutes, this x509-certificate-exporter instance
|
||||||
API. This could be caused by a misconfiguration if triggered when the exporter
|
has experienced errors reading certificate files or querying the Kubernetes
|
||||||
starts.
|
API. This could be caused by a misconfiguration if triggered when the exporter
|
||||||
summary: Increasing read errors for x509-certificate-exporter
|
starts.
|
||||||
expr: delta(x509_read_errors[15m]) > 0
|
summary: Increasing read errors for x509-certificate-exporter
|
||||||
for: 5m
|
expr: delta(x509_read_errors[15m]) > 0
|
||||||
labels:
|
for: 5m
|
||||||
severity: warning
|
labels:
|
||||||
- alert: CertificateError
|
severity: warning
|
||||||
annotations:
|
- alert: CertificateError
|
||||||
description: Certificate could not be decoded {{if $labels.secret_name }} in
|
annotations:
|
||||||
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
description:
|
||||||
location "{{ $labels.filepath }}"{{end}}
|
Certificate could not be decoded {{if $labels.secret_name }} in
|
||||||
summary: Certificate cannot be decoded
|
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
|
||||||
expr: x509_cert_error > 0
|
location "{{ $labels.filepath }}"{{end}}
|
||||||
for: 15m
|
summary: Certificate cannot be decoded
|
||||||
labels:
|
expr: x509_cert_error > 0
|
||||||
severity: warning
|
for: 15m
|
||||||
- alert: CertificateRenewal
|
labels:
|
||||||
annotations:
|
severity: warning
|
||||||
description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
- alert: CertificateRenewal
|
||||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
annotations:
|
||||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
description:
|
||||||
summary: Certificate should be renewed
|
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
|
||||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||||
for: 15m
|
summary: Certificate should be renewed
|
||||||
labels:
|
expr:
|
||||||
severity: warning
|
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||||
- alert: CertificateExpiration
|
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
|
||||||
annotations:
|
for: 15m
|
||||||
description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
labels:
|
||||||
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
severity: warning
|
||||||
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
- alert: CertificateExpiration
|
||||||
summary: Certificate is about to expire
|
annotations:
|
||||||
expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
description:
|
||||||
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
|
||||||
for: 15m
|
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
|
||||||
labels:
|
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
|
||||||
severity: critical
|
summary: Certificate is about to expire
|
||||||
|
expr:
|
||||||
|
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
|
||||||
|
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
|
||||||
|
for: 15m
|
||||||
|
labels:
|
||||||
|
severity: critical
|
||||||
|
|||||||
Reference in New Issue
Block a user