diff --git a/.envrc b/.envrc index 654f0a8f..a65a6462 100644 --- a/.envrc +++ b/.envrc @@ -1,6 +1,7 @@ #!/usr/bin/env bash # the shebang is ignored, but nice for editors watch_file nix/sources.json +watch_file nix/checks.nix # Load .env file if it exists dotenv_if_exists diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 3c42dfeb..c72e6401 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,6 +1,6 @@ image: name: alpine/helm:latest - entrypoint: [ "/bin/bash", "-c" ] + entrypoint: ["/bin/bash", "-c"] stages: - release @@ -8,9 +8,9 @@ stages: release: stage: release rules: - - if: '$CI_COMMIT_BRANCH =~ /^main/' - when: always - - when: never + - if: "$CI_COMMIT_BRANCH =~ /^main/" + when: always + - when: never script: - | cd $CI_PROJECT_DIR @@ -43,4 +43,3 @@ rebuild: "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts" fi done - diff --git a/bootstrap/argocd-cluster-admin.yaml b/bootstrap/argocd-cluster-admin.yaml index f98d0092..10d61961 100644 --- a/bootstrap/argocd-cluster-admin.yaml +++ b/bootstrap/argocd-cluster-admin.yaml @@ -3,16 +3,16 @@ kind: ClusterRole metadata: name: argocd-cluster-admin rules: -- apiGroups: - - '*' - resources: - - '*' - verbs: - - '*' -- nonResourceURLs: - - '*' - verbs: - - '*' + - apiGroups: + - "*" + resources: + - "*" + verbs: + - "*" + - nonResourceURLs: + - "*" + verbs: + - "*" --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding @@ -23,9 +23,9 @@ roleRef: kind: ClusterRole name: argocd-cluster-admin subjects: -- kind: ServiceAccount - name: argocd-cluster-admin - namespace: kube-system + - kind: ServiceAccount + name: argocd-cluster-admin + namespace: kube-system --- apiVersion: v1 kind: ServiceAccount diff --git a/bootstrap/cluster-admin-token.yaml b/bootstrap/cluster-admin-token.yaml index 34fae31b..b09150ed 100644 --- a/bootstrap/cluster-admin-token.yaml +++ b/bootstrap/cluster-admin-token.yaml @@ -6,5 +6,3 @@ metadata: name: cluster-admin-token namespace: kube-system type: kubernetes.io/service-account-token - - diff --git a/bootstrap/cluster-ekman.yaml b/bootstrap/cluster-ekman.yaml index b5d7eaf3..b15a09c9 100644 --- a/bootstrap/cluster-ekman.yaml +++ b/bootstrap/cluster-ekman.yaml @@ -10,5 +10,3 @@ metadata: name: cluster-ekman namespace: argocd type: Opaque - - diff --git a/bootstrap/helm-kustomize-cmp/plugin.yaml b/bootstrap/helm-kustomize-cmp/plugin.yaml index fdbacbd5..4243e835 100644 --- a/bootstrap/helm-kustomize-cmp/plugin.yaml +++ b/bootstrap/helm-kustomize-cmp/plugin.yaml @@ -9,7 +9,7 @@ spec: init: # Init always happens immediately before generate, but its output is not treated as manifests. # This is a good place to, for example, download chart dependencies. - command: [ /bin/sh ] + command: [/bin/sh] args: - /plugin/init.sh # The generate command runs in the Application source directory each time manifests are generated. Standard output @@ -17,7 +17,7 @@ spec: # To write log messages from the command, write them to stderr, it will always be displayed. # Error output will be sent to the UI, so avoid printing sensitive information (such as secrets). generate: - command: [ /bin/sh ] + command: [/bin/sh] args: - /plugin/generate.sh @@ -27,15 +27,15 @@ spec: # Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the # first (in that order) is evaluated. # discover: - # fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source - # directory. If there is a match, this plugin may be used for the Application. - # fileName: "./subdir/s*.yaml" - # find: - # This does the same thing as fileName, but it supports double-start (nested directory) glob patterns. - # glob: "**/Chart.yaml" - # The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_ - # produce non-empty output to standard out. - # command: [sh, -c, find . -name env.yaml] + # fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source + # directory. If there is a match, this plugin may be used for the Application. + # fileName: "./subdir/s*.yaml" + # find: + # This does the same thing as fileName, but it supports double-start (nested directory) glob patterns. + # glob: "**/Chart.yaml" + # The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_ + # produce non-empty output to standard out. + # command: [sh, -c, find . -name env.yaml] # The parameters config describes what parameters the UI should display for an Application. It is up to the user to # actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_ # inform the "Parameters" tab in the App Details page of the UI. @@ -66,22 +66,21 @@ spec: itemType: string collectionType: string string: "" - # All the fields above besides "string" apply to both the array and map type parameter announcements. - # - name: array-param - # # This field communicates the parameter's default value to the UI. Setting this field is optional. - # array: [default, items] - # collectionType: array - # - name: map-param - # # This field communicates the parameter's default value to the UI. Setting this field is optional. - # map: - # some: value - # collectionType: map + # All the fields above besides 'string' apply to both the array and map type parameter announcements. + # - name: array-param + # # This field communicates the parameter's default value to the UI. Setting this field is optional. + # array: [default, items] + # collectionType: array + # - name: map-param + # # This field communicates the parameter's default value to the UI. Setting this field is optional. + # map: + # some: value + # collectionType: map # dynamic: - # The command is run in an Application's source directory. Standard output must be JSON matching the schema of the - # static parameter announcements list. - # command: [ /bin/sh, /plugin/get-values.sh ] + # The command is run in an Application's source directory. Standard output must be JSON matching the schema of the + # static parameter announcements list. + # command: [ /bin/sh, /plugin/get-values.sh ] # If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository # might have executable files. Set to true only if you trust the CMP plugin authors. preserveFileMode: false - diff --git a/bootstrap/helmfile-cmp/argo-repo-server-old.yaml b/bootstrap/helmfile-cmp/argo-repo-server-old.yaml index fb061f0e..6724f9e7 100644 --- a/bootstrap/helmfile-cmp/argo-repo-server-old.yaml +++ b/bootstrap/helmfile-cmp/argo-repo-server-old.yaml @@ -45,432 +45,432 @@ spec: affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: argocd-repo-server - topologyKey: kubernetes.io/hostname - weight: 100 + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: argocd-repo-server + topologyKey: kubernetes.io/hostname + weight: 100 automountServiceAccountToken: true containers: - - args: - - /usr/local/bin/argocd-repo-server - - --port=8081 - - --metrics-port=8084 - env: - - name: ARGOCD_REPO_SERVER_NAME - value: argocd-repo-server - - name: ARGOCD_RECONCILIATION_TIMEOUT - valueFrom: - configMapKeyRef: - key: timeout.reconciliation - name: argocd-cm - optional: true - - name: ARGOCD_REPO_SERVER_LOGFORMAT - valueFrom: - configMapKeyRef: - key: reposerver.log.format - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LOGLEVEL - valueFrom: - configMapKeyRef: - key: reposerver.log.level - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - valueFrom: - configMapKeyRef: - key: reposerver.parallelism.limit - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - valueFrom: - configMapKeyRef: - key: reposerver.listen.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - valueFrom: - configMapKeyRef: - key: reposerver.metrics.listen.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_DISABLE_TLS - valueFrom: - configMapKeyRef: - key: reposerver.disable.tls - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_MIN_VERSION - valueFrom: - configMapKeyRef: - key: reposerver.tls.minversion - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_MAX_VERSION - valueFrom: - configMapKeyRef: - key: reposerver.tls.maxversion - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_CIPHERS - valueFrom: - configMapKeyRef: - key: reposerver.tls.ciphers - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_CACHE_EXPIRATION - valueFrom: - configMapKeyRef: - key: reposerver.repo.cache.expiration - name: argocd-cmd-params-cm - optional: true - - name: REDIS_SERVER - valueFrom: - configMapKeyRef: - key: redis.server - name: argocd-cmd-params-cm - optional: true - - name: REDIS_COMPRESSION - valueFrom: - configMapKeyRef: - key: redis.compression - name: argocd-cmd-params-cm - optional: true - - name: REDISDB - valueFrom: - configMapKeyRef: - key: redis.db - name: argocd-cmd-params-cm - optional: true - - name: REDIS_USERNAME - valueFrom: - secretKeyRef: - key: redis-username - name: argocd-redis - optional: true - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - key: auth - name: argocd-redis - - name: REDIS_SENTINEL_USERNAME - valueFrom: - secretKeyRef: - key: redis-sentinel-username - name: argocd-redis - optional: true - - name: REDIS_SENTINEL_PASSWORD - valueFrom: - secretKeyRef: - key: redis-sentinel-password - name: argocd-redis - optional: true - - name: ARGOCD_DEFAULT_CACHE_EXPIRATION - valueFrom: - configMapKeyRef: - key: reposerver.default.cache.expiration - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - valueFrom: - configMapKeyRef: - key: otlp.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_INSECURE - valueFrom: - configMapKeyRef: - key: otlp.insecure - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_HEADERS - valueFrom: - configMapKeyRef: - key: otlp.headers - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.max.combined.directory.manifests.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - valueFrom: - configMapKeyRef: - key: reposerver.plugin.tar.exclusions - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - valueFrom: - configMapKeyRef: - key: reposerver.allow.oob.symlinks - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.streamed.manifest.max.tar.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.streamed.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.helm.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.disable.helm.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_MODULES_ENABLED - valueFrom: - configMapKeyRef: - key: reposerver.enable.git.submodule - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - valueFrom: - configMapKeyRef: - key: reposerver.git.lsremote.parallelism.limit - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_REQUEST_TIMEOUT - valueFrom: - configMapKeyRef: - key: reposerver.git.request.timeout - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT - valueFrom: - configMapKeyRef: - key: reposerver.revision.cache.lock.timeout - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES - valueFrom: - configMapKeyRef: - key: reposerver.include.hidden.directories - name: argocd-cmd-params-cm - optional: true - - name: HELM_CACHE_HOME - value: /helm-working-dir - - name: HELM_CONFIG_HOME - value: /helm-working-dir - - name: HELM_DATA_HOME - value: /helm-working-dir - image: quay.io/argoproj/argocd:v2.12.3 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: /healthz?full=true - port: metrics - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - name: repo-server - ports: - - containerPort: 8081 + - args: + - /usr/local/bin/argocd-repo-server + - --port=8081 + - --metrics-port=8084 + env: + - name: ARGOCD_REPO_SERVER_NAME + value: argocd-repo-server + - name: ARGOCD_RECONCILIATION_TIMEOUT + valueFrom: + configMapKeyRef: + key: timeout.reconciliation + name: argocd-cm + optional: true + - name: ARGOCD_REPO_SERVER_LOGFORMAT + valueFrom: + configMapKeyRef: + key: reposerver.log.format + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LOGLEVEL + valueFrom: + configMapKeyRef: + key: reposerver.log.level + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT + valueFrom: + configMapKeyRef: + key: reposerver.parallelism.limit + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS + valueFrom: + configMapKeyRef: + key: reposerver.listen.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS + valueFrom: + configMapKeyRef: + key: reposerver.metrics.listen.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_DISABLE_TLS + valueFrom: + configMapKeyRef: + key: reposerver.disable.tls + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_MIN_VERSION + valueFrom: + configMapKeyRef: + key: reposerver.tls.minversion + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_MAX_VERSION + valueFrom: + configMapKeyRef: + key: reposerver.tls.maxversion + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_CIPHERS + valueFrom: + configMapKeyRef: + key: reposerver.tls.ciphers + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_CACHE_EXPIRATION + valueFrom: + configMapKeyRef: + key: reposerver.repo.cache.expiration + name: argocd-cmd-params-cm + optional: true + - name: REDIS_SERVER + valueFrom: + configMapKeyRef: + key: redis.server + name: argocd-cmd-params-cm + optional: true + - name: REDIS_COMPRESSION + valueFrom: + configMapKeyRef: + key: redis.compression + name: argocd-cmd-params-cm + optional: true + - name: REDISDB + valueFrom: + configMapKeyRef: + key: redis.db + name: argocd-cmd-params-cm + optional: true + - name: REDIS_USERNAME + valueFrom: + secretKeyRef: + key: redis-username + name: argocd-redis + optional: true + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + key: auth + name: argocd-redis + - name: REDIS_SENTINEL_USERNAME + valueFrom: + secretKeyRef: + key: redis-sentinel-username + name: argocd-redis + optional: true + - name: REDIS_SENTINEL_PASSWORD + valueFrom: + secretKeyRef: + key: redis-sentinel-password + name: argocd-redis + optional: true + - name: ARGOCD_DEFAULT_CACHE_EXPIRATION + valueFrom: + configMapKeyRef: + key: reposerver.default.cache.expiration + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS + valueFrom: + configMapKeyRef: + key: otlp.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_INSECURE + valueFrom: + configMapKeyRef: + key: otlp.insecure + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_HEADERS + valueFrom: + configMapKeyRef: + key: otlp.headers + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.max.combined.directory.manifests.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS + valueFrom: + configMapKeyRef: + key: reposerver.plugin.tar.exclusions + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS + valueFrom: + configMapKeyRef: + key: reposerver.allow.oob.symlinks + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.streamed.manifest.max.tar.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.streamed.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.helm.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.disable.helm.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_MODULES_ENABLED + valueFrom: + configMapKeyRef: + key: reposerver.enable.git.submodule + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT + valueFrom: + configMapKeyRef: + key: reposerver.git.lsremote.parallelism.limit + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_REQUEST_TIMEOUT + valueFrom: + configMapKeyRef: + key: reposerver.git.request.timeout + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT + valueFrom: + configMapKeyRef: + key: reposerver.revision.cache.lock.timeout + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES + valueFrom: + configMapKeyRef: + key: reposerver.include.hidden.directories + name: argocd-cmd-params-cm + optional: true + - name: HELM_CACHE_HOME + value: /helm-working-dir + - name: HELM_CONFIG_HOME + value: /helm-working-dir + - name: HELM_DATA_HOME + value: /helm-working-dir + image: quay.io/argoproj/argocd:v2.12.3 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz?full=true + port: metrics + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 name: repo-server - protocol: TCP - - containerPort: 8084 - name: metrics - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /app/config/ssh - name: ssh-known-hosts - - mountPath: /app/config/tls - name: tls-certs - - mountPath: /app/config/gpg/source - name: gpg-keys - - mountPath: /app/config/gpg/keys - name: gpg-keyring - - mountPath: /app/config/reposerver/tls - name: argocd-repo-server-tls - - mountPath: /helm-working-dir - name: helm-working-dir - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: tmp - - command: - - /var/run/argocd/argocd-cmp-server - image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest - imagePullPolicy: Always - name: kustomize-helm-with-rewrite - securityContext: - runAsNonRoot: true - runAsUser: 999 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: cmp-tmp - - mountPath: /helm-working-dir - name: helm-working-dir - - command: - - /var/run/argocd/argocd-cmp-server - image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest - imagePullPolicy: Always - name: helm-kustomize-cmp - securityContext: - runAsNonRoot: true - runAsUser: 999 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: cmp-tmp - - mountPath: /helm-working-dir - name: helm-working-dir - - command: - - /var/run/argocd/argocd-cmp-server - image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest - imagePullPolicy: Always - name: helmfile-cmp - securityContext: - runAsNonRoot: true - runAsUser: 999 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: cmp-tmp - - mountPath: /helm-working-dir - name: helm-working-dir + ports: + - containerPort: 8081 + name: repo-server + protocol: TCP + - containerPort: 8084 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: metrics + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /app/config/ssh + name: ssh-known-hosts + - mountPath: /app/config/tls + name: tls-certs + - mountPath: /app/config/gpg/source + name: gpg-keys + - mountPath: /app/config/gpg/keys + name: gpg-keyring + - mountPath: /app/config/reposerver/tls + name: argocd-repo-server-tls + - mountPath: /helm-working-dir + name: helm-working-dir + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: tmp + - command: + - /var/run/argocd/argocd-cmp-server + image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest + imagePullPolicy: Always + name: kustomize-helm-with-rewrite + securityContext: + runAsNonRoot: true + runAsUser: 999 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: cmp-tmp + - mountPath: /helm-working-dir + name: helm-working-dir + - command: + - /var/run/argocd/argocd-cmp-server + image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest + imagePullPolicy: Always + name: helm-kustomize-cmp + securityContext: + runAsNonRoot: true + runAsUser: 999 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: cmp-tmp + - mountPath: /helm-working-dir + name: helm-working-dir + - command: + - /var/run/argocd/argocd-cmp-server + image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest + imagePullPolicy: Always + name: helmfile-cmp + securityContext: + runAsNonRoot: true + runAsUser: 999 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: cmp-tmp + - mountPath: /helm-working-dir + name: helm-working-dir dnsPolicy: ClusterFirst imagePullSecrets: - - name: gitlab-pull-secret + - name: gitlab-pull-secret initContainers: - - command: - - /bin/cp - - -n - - /usr/local/bin/argocd - - /var/run/argocd/argocd-cmp-server - image: quay.io/argoproj/argocd:v2.12.3 - imagePullPolicy: IfNotPresent - name: copyutil - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - command: - - /bin/sh - - /plugin/init-helm-repos.sh - env: - - name: OCEANBOX_HELM_ACCESS_TOKEN - valueFrom: - secretKeyRef: - key: token - name: oceanbox-helm - optional: false - image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest - imagePullPolicy: Always - name: init-helm-repos - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - runAsUser: 999 - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /helm-working-dir - name: helm-working-dir + - command: + - /bin/cp + - -n + - /usr/local/bin/argocd + - /var/run/argocd/argocd-cmp-server + image: quay.io/argoproj/argocd:v2.12.3 + imagePullPolicy: IfNotPresent + name: copyutil + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - command: + - /bin/sh + - /plugin/init-helm-repos.sh + env: + - name: OCEANBOX_HELM_ACCESS_TOKEN + valueFrom: + secretKeyRef: + key: token + name: oceanbox-helm + optional: false + image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest + imagePullPolicy: Always + name: init-helm-repos + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + runAsUser: 999 + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /helm-working-dir + name: helm-working-dir restartPolicy: Always schedulerName: default-scheduler serviceAccount: argocd-repo-server serviceAccountName: argocd-repo-server terminationGracePeriodSeconds: 30 volumes: - - name: cmp-tmp - - name: helm-working-dir - - name: plugins - - name: var-files - - name: tmp - - configMap: - defaultMode: 420 - name: argocd-ssh-known-hosts-cm - name: ssh-known-hosts - - configMap: - defaultMode: 420 - name: argocd-tls-certs-cm - name: tls-certs - - configMap: - defaultMode: 420 - name: argocd-gpg-keys-cm - name: gpg-keys - - name: gpg-keyring - - name: argocd-repo-server-tls - secret: - defaultMode: 420 - items: - - key: tls.crt - path: tls.crt - - key: tls.key - path: tls.key - - key: ca.crt - path: ca.crt - optional: true - secretName: argocd-repo-server-tls + - name: cmp-tmp + - name: helm-working-dir + - name: plugins + - name: var-files + - name: tmp + - configMap: + defaultMode: 420 + name: argocd-ssh-known-hosts-cm + name: ssh-known-hosts + - configMap: + defaultMode: 420 + name: argocd-tls-certs-cm + name: tls-certs + - configMap: + defaultMode: 420 + name: argocd-gpg-keys-cm + name: gpg-keys + - name: gpg-keyring + - name: argocd-repo-server-tls + secret: + defaultMode: 420 + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + - key: ca.crt + path: ca.crt + optional: true + secretName: argocd-repo-server-tls diff --git a/bootstrap/helmfile-cmp/argo-repo-server-patch.yaml b/bootstrap/helmfile-cmp/argo-repo-server-patch.yaml index 2521459b..0d2c3642 100644 --- a/bootstrap/helmfile-cmp/argo-repo-server-patch.yaml +++ b/bootstrap/helmfile-cmp/argo-repo-server-patch.yaml @@ -4,24 +4,24 @@ spec: template: spec: imagePullSecrets: - - name: gitlab-pull-secret + - name: gitlab-pull-secret containers: - - command: - - /var/run/argocd/argocd-cmp-server - image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest - imagePullPolicy: Always - name: helmfile-cmp - securityContext: - runAsNonRoot: true - runAsUser: 999 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: tmp - - mountPath: /helm-working-dir - name: helm-working-dir + - command: + - /var/run/argocd/argocd-cmp-server + image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest + imagePullPolicy: Always + name: helmfile-cmp + securityContext: + runAsNonRoot: true + runAsUser: 999 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: tmp + - mountPath: /helm-working-dir + name: helm-working-dir diff --git a/bootstrap/helmfile-cmp/plugin.yaml b/bootstrap/helmfile-cmp/plugin.yaml index debd24b8..f2ac5e44 100644 --- a/bootstrap/helmfile-cmp/plugin.yaml +++ b/bootstrap/helmfile-cmp/plugin.yaml @@ -4,7 +4,7 @@ metadata: name: helmfile-cmp spec: generate: - command: [ "/bin/sh" ] + command: ["/bin/sh"] args: - /plugin/generate.sh lockRepo: false diff --git a/bootstrap/kustomize-helm-with-rewrite/argo-repo-server.yaml b/bootstrap/kustomize-helm-with-rewrite/argo-repo-server.yaml index 325afb26..e20166b2 100644 --- a/bootstrap/kustomize-helm-with-rewrite/argo-repo-server.yaml +++ b/bootstrap/kustomize-helm-with-rewrite/argo-repo-server.yaml @@ -44,341 +44,341 @@ spec: affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: - - podAffinityTerm: - labelSelector: - matchLabels: - app.kubernetes.io/name: argocd-repo-server - topologyKey: kubernetes.io/hostname - weight: 100 + - podAffinityTerm: + labelSelector: + matchLabels: + app.kubernetes.io/name: argocd-repo-server + topologyKey: kubernetes.io/hostname + weight: 100 containers: - - args: - - /usr/local/bin/argocd-repo-server - - --port=8081 - - --metrics-port=8084 - env: - - name: ARGOCD_REPO_SERVER_NAME - value: argocd-repo-server - - name: ARGOCD_RECONCILIATION_TIMEOUT - valueFrom: - configMapKeyRef: - key: timeout.reconciliation - name: argocd-cm - optional: true - - name: ARGOCD_REPO_SERVER_LOGFORMAT - valueFrom: - configMapKeyRef: - key: reposerver.log.format - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LOGLEVEL - valueFrom: - configMapKeyRef: - key: reposerver.log.level - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - valueFrom: - configMapKeyRef: - key: reposerver.parallelism.limit - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - valueFrom: - configMapKeyRef: - key: reposerver.listen.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - valueFrom: - configMapKeyRef: - key: reposerver.metrics.listen.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_DISABLE_TLS - valueFrom: - configMapKeyRef: - key: reposerver.disable.tls - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_MIN_VERSION - valueFrom: - configMapKeyRef: - key: reposerver.tls.minversion - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_MAX_VERSION - valueFrom: - configMapKeyRef: - key: reposerver.tls.maxversion - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_TLS_CIPHERS - valueFrom: - configMapKeyRef: - key: reposerver.tls.ciphers - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_CACHE_EXPIRATION - valueFrom: - configMapKeyRef: - key: reposerver.repo.cache.expiration - name: argocd-cmd-params-cm - optional: true - - name: REDIS_SERVER - valueFrom: - configMapKeyRef: - key: redis.server - name: argocd-cmd-params-cm - optional: true - - name: REDIS_COMPRESSION - valueFrom: - configMapKeyRef: - key: redis.compression - name: argocd-cmd-params-cm - optional: true - - name: REDISDB - valueFrom: - configMapKeyRef: - key: redis.db - name: argocd-cmd-params-cm - optional: true - - name: REDIS_USERNAME - valueFrom: - secretKeyRef: - key: redis-username - name: argocd-redis - optional: true - - name: REDIS_PASSWORD - valueFrom: - secretKeyRef: - key: redis-password - name: argocd-redis - optional: true - - name: ARGOCD_DEFAULT_CACHE_EXPIRATION - valueFrom: - configMapKeyRef: - key: reposerver.default.cache.expiration - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - valueFrom: - configMapKeyRef: - key: otlp.address - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_INSECURE - valueFrom: - configMapKeyRef: - key: otlp.insecure - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_OTLP_HEADERS - valueFrom: - configMapKeyRef: - key: otlp.headers - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.max.combined.directory.manifests.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - valueFrom: - configMapKeyRef: - key: reposerver.plugin.tar.exclusions - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - valueFrom: - configMapKeyRef: - key: reposerver.allow.oob.symlinks - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.streamed.manifest.max.tar.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.streamed.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.helm.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - valueFrom: - configMapKeyRef: - key: reposerver.disable.helm.manifest.max.extracted.size - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_MODULES_ENABLED - valueFrom: - configMapKeyRef: - key: reposerver.enable.git.submodule - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - valueFrom: - configMapKeyRef: - key: reposerver.git.lsremote.parallelism.limit - name: argocd-cmd-params-cm - optional: true - - name: ARGOCD_GIT_REQUEST_TIMEOUT - valueFrom: - configMapKeyRef: - key: reposerver.git.request.timeout - name: argocd-cmd-params-cm - optional: true - - name: HELM_CACHE_HOME - value: /helm-working-dir - - name: HELM_CONFIG_HOME - value: /helm-working-dir - - name: HELM_DATA_HOME - value: /helm-working-dir - image: quay.io/argoproj/argocd:v2.10.4 - imagePullPolicy: IfNotPresent - livenessProbe: - failureThreshold: 3 - httpGet: - path: /healthz?full=true - port: metrics - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - name: repo-server - ports: - - containerPort: 8081 + - args: + - /usr/local/bin/argocd-repo-server + - --port=8081 + - --metrics-port=8084 + env: + - name: ARGOCD_REPO_SERVER_NAME + value: argocd-repo-server + - name: ARGOCD_RECONCILIATION_TIMEOUT + valueFrom: + configMapKeyRef: + key: timeout.reconciliation + name: argocd-cm + optional: true + - name: ARGOCD_REPO_SERVER_LOGFORMAT + valueFrom: + configMapKeyRef: + key: reposerver.log.format + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LOGLEVEL + valueFrom: + configMapKeyRef: + key: reposerver.log.level + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT + valueFrom: + configMapKeyRef: + key: reposerver.parallelism.limit + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS + valueFrom: + configMapKeyRef: + key: reposerver.listen.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS + valueFrom: + configMapKeyRef: + key: reposerver.metrics.listen.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_DISABLE_TLS + valueFrom: + configMapKeyRef: + key: reposerver.disable.tls + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_MIN_VERSION + valueFrom: + configMapKeyRef: + key: reposerver.tls.minversion + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_MAX_VERSION + valueFrom: + configMapKeyRef: + key: reposerver.tls.maxversion + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_TLS_CIPHERS + valueFrom: + configMapKeyRef: + key: reposerver.tls.ciphers + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_CACHE_EXPIRATION + valueFrom: + configMapKeyRef: + key: reposerver.repo.cache.expiration + name: argocd-cmd-params-cm + optional: true + - name: REDIS_SERVER + valueFrom: + configMapKeyRef: + key: redis.server + name: argocd-cmd-params-cm + optional: true + - name: REDIS_COMPRESSION + valueFrom: + configMapKeyRef: + key: redis.compression + name: argocd-cmd-params-cm + optional: true + - name: REDISDB + valueFrom: + configMapKeyRef: + key: redis.db + name: argocd-cmd-params-cm + optional: true + - name: REDIS_USERNAME + valueFrom: + secretKeyRef: + key: redis-username + name: argocd-redis + optional: true + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + key: redis-password + name: argocd-redis + optional: true + - name: ARGOCD_DEFAULT_CACHE_EXPIRATION + valueFrom: + configMapKeyRef: + key: reposerver.default.cache.expiration + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS + valueFrom: + configMapKeyRef: + key: otlp.address + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_INSECURE + valueFrom: + configMapKeyRef: + key: otlp.insecure + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_OTLP_HEADERS + valueFrom: + configMapKeyRef: + key: otlp.headers + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.max.combined.directory.manifests.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS + valueFrom: + configMapKeyRef: + key: reposerver.plugin.tar.exclusions + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS + valueFrom: + configMapKeyRef: + key: reposerver.allow.oob.symlinks + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.streamed.manifest.max.tar.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.streamed.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.helm.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE + valueFrom: + configMapKeyRef: + key: reposerver.disable.helm.manifest.max.extracted.size + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_MODULES_ENABLED + valueFrom: + configMapKeyRef: + key: reposerver.enable.git.submodule + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT + valueFrom: + configMapKeyRef: + key: reposerver.git.lsremote.parallelism.limit + name: argocd-cmd-params-cm + optional: true + - name: ARGOCD_GIT_REQUEST_TIMEOUT + valueFrom: + configMapKeyRef: + key: reposerver.git.request.timeout + name: argocd-cmd-params-cm + optional: true + - name: HELM_CACHE_HOME + value: /helm-working-dir + - name: HELM_CONFIG_HOME + value: /helm-working-dir + - name: HELM_DATA_HOME + value: /helm-working-dir + image: quay.io/argoproj/argocd:v2.10.4 + imagePullPolicy: IfNotPresent + livenessProbe: + failureThreshold: 3 + httpGet: + path: /healthz?full=true + port: metrics + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 name: repo-server - protocol: TCP - - containerPort: 8084 - name: metrics - protocol: TCP - readinessProbe: - failureThreshold: 3 - httpGet: - path: /healthz - port: metrics - scheme: HTTP - initialDelaySeconds: 10 - periodSeconds: 10 - successThreshold: 1 - timeoutSeconds: 1 - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /app/config/ssh - name: ssh-known-hosts - - mountPath: /app/config/tls - name: tls-certs - - mountPath: /app/config/gpg/source - name: gpg-keys - - mountPath: /app/config/gpg/keys - name: gpg-keyring - - mountPath: /app/config/reposerver/tls - name: argocd-repo-server-tls - - mountPath: /helm-working-dir - name: helm-working-dir - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: tmp - - command: - - /var/run/argocd/argocd-cmp-server - image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest - imagePullPolicy: Always - name: kustomize-helm-with-rewrite - resources: {} - securityContext: - runAsNonRoot: true - runAsUser: 999 - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - mountPath: /home/argocd/cmp-server/plugins - name: plugins - - mountPath: /tmp - name: cmp-tmp - - mountPath: /helm-working-dir - name: helm-working-dir + ports: + - containerPort: 8081 + name: repo-server + protocol: TCP + - containerPort: 8084 + name: metrics + protocol: TCP + readinessProbe: + failureThreshold: 3 + httpGet: + path: /healthz + port: metrics + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 1 + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /app/config/ssh + name: ssh-known-hosts + - mountPath: /app/config/tls + name: tls-certs + - mountPath: /app/config/gpg/source + name: gpg-keys + - mountPath: /app/config/gpg/keys + name: gpg-keyring + - mountPath: /app/config/reposerver/tls + name: argocd-repo-server-tls + - mountPath: /helm-working-dir + name: helm-working-dir + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: tmp + - command: + - /var/run/argocd/argocd-cmp-server + image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest + imagePullPolicy: Always + name: kustomize-helm-with-rewrite + resources: {} + securityContext: + runAsNonRoot: true + runAsUser: 999 + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - mountPath: /home/argocd/cmp-server/plugins + name: plugins + - mountPath: /tmp + name: cmp-tmp + - mountPath: /helm-working-dir + name: helm-working-dir dnsPolicy: ClusterFirst imagePullSecrets: - - name: gitlab-pull-secret + - name: gitlab-pull-secret initContainers: - - command: - - /bin/cp - - -n - - /usr/local/bin/argocd - - /var/run/argocd/argocd-cmp-server - image: quay.io/argoproj/argocd:v2.10.4 - imagePullPolicy: IfNotPresent - name: copyutil - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - volumeMounts: - - mountPath: /var/run/argocd - name: var-files - - command: - - /bin/sh - - /plugin/init-helm-repos.sh - image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest - imagePullPolicy: Always - name: init-helm-repos - resources: {} - securityContext: - allowPrivilegeEscalation: false - capabilities: - drop: - - ALL - readOnlyRootFilesystem: true - runAsUser: 999 - runAsNonRoot: true - seccompProfile: - type: RuntimeDefault - terminationMessagePath: /dev/termination-log - terminationMessagePolicy: File - env: - - name: OCEANBOX_HELM_ACCESS_TOKEN - valueFrom: - secretKeyRef: - key: token - name: oceanbox-helm - optional: false - volumeMounts: - - mountPath: /helm-working-dir - name: helm-working-dir + - command: + - /bin/cp + - -n + - /usr/local/bin/argocd + - /var/run/argocd/argocd-cmp-server + image: quay.io/argoproj/argocd:v2.10.4 + imagePullPolicy: IfNotPresent + name: copyutil + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/argocd + name: var-files + - command: + - /bin/sh + - /plugin/init-helm-repos.sh + image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest + imagePullPolicy: Always + name: init-helm-repos + resources: {} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + readOnlyRootFilesystem: true + runAsUser: 999 + runAsNonRoot: true + seccompProfile: + type: RuntimeDefault + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + env: + - name: OCEANBOX_HELM_ACCESS_TOKEN + valueFrom: + secretKeyRef: + key: token + name: oceanbox-helm + optional: false + volumeMounts: + - mountPath: /helm-working-dir + name: helm-working-dir restartPolicy: Always schedulerName: default-scheduler securityContext: {} @@ -386,40 +386,39 @@ spec: serviceAccountName: argocd-repo-server terminationGracePeriodSeconds: 30 volumes: - - emptyDir: {} - name: cmp-tmp - - emptyDir: {} - name: helm-working-dir - - emptyDir: {} - name: plugins - - emptyDir: {} - name: var-files - - emptyDir: {} - name: tmp - - configMap: - defaultMode: 420 - name: argocd-ssh-known-hosts-cm - name: ssh-known-hosts - - configMap: - defaultMode: 420 - name: argocd-tls-certs-cm - name: tls-certs - - configMap: - defaultMode: 420 - name: argocd-gpg-keys-cm - name: gpg-keys - - emptyDir: {} - name: gpg-keyring - - name: argocd-repo-server-tls - secret: - defaultMode: 420 - items: - - key: tls.crt - path: tls.crt - - key: tls.key - path: tls.key - - key: ca.crt - path: ca.crt - optional: true - secretName: argocd-repo-server-tls - + - emptyDir: {} + name: cmp-tmp + - emptyDir: {} + name: helm-working-dir + - emptyDir: {} + name: plugins + - emptyDir: {} + name: var-files + - emptyDir: {} + name: tmp + - configMap: + defaultMode: 420 + name: argocd-ssh-known-hosts-cm + name: ssh-known-hosts + - configMap: + defaultMode: 420 + name: argocd-tls-certs-cm + name: tls-certs + - configMap: + defaultMode: 420 + name: argocd-gpg-keys-cm + name: gpg-keys + - emptyDir: {} + name: gpg-keyring + - name: argocd-repo-server-tls + secret: + defaultMode: 420 + items: + - key: tls.crt + path: tls.crt + - key: tls.key + path: tls.key + - key: ca.crt + path: ca.crt + optional: true + secretName: argocd-repo-server-tls diff --git a/bootstrap/staging-vcluster.yaml b/bootstrap/staging-vcluster.yaml index 0c7fc569..d5b8f3ce 100644 --- a/bootstrap/staging-vcluster.yaml +++ b/bootstrap/staging-vcluster.yaml @@ -13,4 +13,3 @@ stringData: name: staging-vcluster server: https://staging-vcluster.staging-vcluster type: Opaque - diff --git a/bootstrap/values.yaml b/bootstrap/values.yaml index 25a94746..6d4a70e1 100644 --- a/bootstrap/values.yaml +++ b/bootstrap/values.yaml @@ -19,12 +19,12 @@ applications: plugin: name: helmfile-cmp env: - - name: CLUSTER_NAME - value: replaceme - - name: HELMFILE_ENVIRONMENT - value: default - - name: HELMFILE_FILE_PATH - value: system.yaml.gotmpl + - name: CLUSTER_NAME + value: replaceme + - name: HELMFILE_ENVIRONMENT + value: default + - name: HELMFILE_FILE_PATH + value: system.yaml.gotmpl projects: sys: namespace: argocd @@ -32,12 +32,12 @@ projects: additionalAnnotations: {} description: sys components project sourceRepos: - - '*' + - "*" destinations: - - namespace: '*' - server: https://kubernetes.default.svc + - namespace: "*" + server: https://kubernetes.default.svc clusterResourceWhitelist: - - group: '*' - kind: '*' + - group: "*" + kind: "*" orphanedResources: warn: false diff --git a/nix/checks.nix b/nix/checks.nix index 9a3b2383..91b910f5 100644 --- a/nix/checks.nix +++ b/nix/checks.nix @@ -5,6 +5,8 @@ let globalExcludes = [ "nix/default.nix" + "attic" + "vcluster" ".*vendor" ".*chart/.*" ".*schema.json" @@ -32,6 +34,7 @@ pre-commit.run { enable = true; excludes = [ "vcluster/" + "attic/" ]; args = [ "-x" @@ -41,15 +44,17 @@ pre-commit.run { }; yamllint = { - enable = false; + enable = true; excludes = [ "attic/" "charts/templates/" - "charts/charts/" + "charts/" + "values/" + "vcluster/" ]; settings = { strict = true; - configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 165} } }''; + configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }''; }; }; diff --git a/rules/etcd.yaml b/rules/etcd.yaml index 3cf28cb0..340fb8e1 100644 --- a/rules/etcd.yaml +++ b/rules/etcd.yaml @@ -1,183 +1,204 @@ groups: -- name: etcd - rules: - - alert: etcdMembersDown - annotations: - description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value - }}).' - summary: etcd cluster members are down. - expr: |- - max without (endpoint) ( - sum without (instance) (up{job=~".*etcd.*"} == bool 0) - or - count without (To) ( - sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 - ) - ) - > 0 - for: 10m - labels: - severity: critical - - alert: etcdInsufficientMembers - annotations: - description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value - }}).' - summary: etcd cluster has insufficient number of members. - expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) - without (instance) + 1) / 2) - for: 3m - labels: - severity: critical - - alert: etcdNoLeader - annotations: - description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} - has no leader.' - summary: etcd cluster has no leader. - expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - for: 1m - labels: - severity: critical - - alert: etcdHighNumberOfLeaderChanges - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes - within the last 15 minutes. Frequent elections may be a sign of insufficient - resources, high network latency, or disruptions by other components and should - be investigated.' - summary: etcd cluster has high number of leader changes. - expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) - or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) - >= 4 - for: 5m - labels: - severity: warning - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for - {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of failed grpc requests. - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) - > 1 - for: 10m - labels: - severity: warning - - alert: etcdHighNumberOfFailedGRPCRequests - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for - {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of failed grpc requests. - expr: |- - 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - / - sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) - > 5 - for: 5m - labels: - severity: critical - - alert: etcdGRPCRequestsSlow - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests - is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method - }} method.' - summary: etcd grpc requests are slow - expr: |- - histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) - > 0.15 - for: 10m - labels: - severity: critical - - alert: etcdMemberCommunicationSlow - annotations: - description: 'etcd cluster "{{ $labels.job }}": member communication with {{ - $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance - }}.' - summary: etcd cluster member communication is slow. - expr: |- - histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.15 - for: 10m - labels: - severity: warning - - alert: etcdHighNumberOfFailedProposals - annotations: - description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures - within the last 30 minutes on etcd instance {{ $labels.instance }}.' - summary: etcd cluster has high number of proposal failures. - expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 - for: 15m - labels: - severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations - are {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile fsync durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.5 - for: 10m - labels: - severity: warning - - alert: etcdHighFsyncDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations - are {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile fsync durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 1 - for: 10m - labels: - severity: critical - - alert: etcdHighCommitDurations - annotations: - description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations - {{ $value }}s on etcd instance {{ $labels.instance }}.' - summary: etcd cluster 99th percentile commit durations are too high. - expr: |- - histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) - > 0.25 - for: 10m - labels: - severity: warning - - alert: etcdDatabaseQuotaLowSpace - annotations: - description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined - quota on etcd instance {{ $labels.instance }}, please defrag or increase the - quota as the writes to etcd will be disabled when it is full.' - summary: etcd cluster database is running full. - expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / - last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > - 95 - for: 10m - labels: - severity: critical - - alert: etcdExcessiveDatabaseGrowth - annotations: - description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk - space in the next four hours, based on write observations within the past - four hours on etcd instance {{ $labels.instance }}, please check as it might - be disruptive.' - summary: etcd cluster database growing very fast. - expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) - > etcd_server_quota_backend_bytes{job=~".*etcd.*"} - for: 10m - labels: - severity: warning - - alert: etcdDatabaseHighFragmentationRatio - annotations: - description: 'etcd cluster "{{ $labels.job }}": database size in use on instance - {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual - allocated disk space, please run defragmentation (e.g. etcdctl defrag) to - retrieve the unused fragmented disk space.' - runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation - summary: etcd database size in use is less than 50% of the actual allocated - storage. - expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) - / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 - and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 - for: 10m - labels: - severity: warning \ No newline at end of file + - name: etcd + rules: + - alert: etcdMembersDown + annotations: + description: + 'etcd cluster "{{ $labels.job }}": members are down ({{ $value + }}).' + summary: etcd cluster members are down. + expr: |- + max without (endpoint) ( + sum without (instance) (up{job=~".*etcd.*"} == bool 0) + or + count without (To) ( + sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 + ) + ) + > 0 + for: 10m + labels: + severity: critical + - alert: etcdInsufficientMembers + annotations: + description: + 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value + }}).' + summary: etcd cluster has insufficient number of members. + expr: + sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) + without (instance) + 1) / 2) + for: 3m + labels: + severity: critical + - alert: etcdNoLeader + annotations: + description: + 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} + has no leader.' + summary: etcd cluster has no leader. + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + - alert: etcdHighNumberOfLeaderChanges + annotations: + description: + 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes + within the last 15 minutes. Frequent elections may be a sign of insufficient + resources, high network latency, or disruptions by other components and should + be investigated.' + summary: etcd cluster has high number of leader changes. + expr: + increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) + or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) + >= 4 + for: 5m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: + 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for + {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of failed grpc requests. + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 1 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: + 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for + {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of failed grpc requests. + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 5 + for: 5m + labels: + severity: critical + - alert: etcdGRPCRequestsSlow + annotations: + description: + 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests + is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method + }} method.' + summary: etcd grpc requests are slow + expr: |- + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) + > 0.15 + for: 10m + labels: + severity: critical + - alert: etcdMemberCommunicationSlow + annotations: + description: + 'etcd cluster "{{ $labels.job }}": member communication with {{ + $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance + }}.' + summary: etcd cluster member communication is slow. + expr: |- + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.15 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedProposals + annotations: + description: + 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures + within the last 30 minutes on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of proposal failures. + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: + 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations + are {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile fsync durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.5 + for: 10m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: + 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations + are {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile fsync durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 1 + for: 10m + labels: + severity: critical + - alert: etcdHighCommitDurations + annotations: + description: + 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations + {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile commit durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.25 + for: 10m + labels: + severity: warning + - alert: etcdDatabaseQuotaLowSpace + annotations: + description: + 'etcd cluster "{{ $labels.job }}": database size exceeds the defined + quota on etcd instance {{ $labels.instance }}, please defrag or increase the + quota as the writes to etcd will be disabled when it is full.' + summary: etcd cluster database is running full. + expr: + (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / + last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > + 95 + for: 10m + labels: + severity: critical + - alert: etcdExcessiveDatabaseGrowth + annotations: + description: + 'etcd cluster "{{ $labels.job }}": Predicting running out of disk + space in the next four hours, based on write observations within the past + four hours on etcd instance {{ $labels.instance }}, please check as it might + be disruptive.' + summary: etcd cluster database growing very fast. + expr: + predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) + > etcd_server_quota_backend_bytes{job=~".*etcd.*"} + for: 10m + labels: + severity: warning + - alert: etcdDatabaseHighFragmentationRatio + annotations: + description: + 'etcd cluster "{{ $labels.job }}": database size in use on instance + {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual + allocated disk space, please run defragmentation (e.g. etcdctl defrag) to + retrieve the unused fragmented disk space.' + runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation + summary: + etcd database size in use is less than 50% of the actual allocated + storage. + expr: + (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) + / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 + and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 + for: 10m + labels: + severity: warning diff --git a/rules/general.yaml b/rules/general.yaml index 1c692e6f..239c48ba 100644 --- a/rules/general.yaml +++ b/rules/general.yaml @@ -1,43 +1,47 @@ groups: -- name: general.rules - rules: - - alert: TargetDown - annotations: - description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service - }} targets in {{ $labels.namespace }} namespace are down.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown - summary: One or more targets are unreachable. - expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) - BY (cluster, job, namespace, service)) > 10 - for: 10m - labels: - severity: warning - - alert: Watchdog - annotations: - description: | - This is an alert meant to ensure that the entire alerting pipeline is functional. - This alert is always firing, therefore it should always be firing in Alertmanager - and always fire against a receiver. There are integrations with various notification - mechanisms that send a notification when this alert is not firing. For example the - "DeadMansSnitch" integration in PagerDuty. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog - summary: An alert that should always be firing to certify that Alertmanager - is working properly. - expr: vector(1) - labels: - severity: none - - alert: InfoInhibitor - annotations: - description: | - This is an alert that is used to inhibit info alerts. - By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with - other alerts. - This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a - severity of 'warning' or 'critical' starts firing on the same namespace. - This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor - summary: Info-level alert inhibition. - expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != - "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 - labels: - severity: none \ No newline at end of file + - name: general.rules + rules: + - alert: TargetDown + annotations: + description: + '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service + }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: + 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) + BY (cluster, job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: + An alert that should always be firing to certify that Alertmanager + is working properly. + expr: vector(1) + labels: + severity: none + - alert: InfoInhibitor + annotations: + description: | + This is an alert that is used to inhibit info alerts. + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + other alerts. + This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a + severity of 'warning' or 'critical' starts firing on the same namespace. + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor + summary: Info-level alert inhibition. + expr: + ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != + "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: none diff --git a/rules/kubernetes-apps.yaml b/rules/kubernetes-apps.yaml index 896f9b7e..0ecd83b1 100644 --- a/rules/kubernetes-apps.yaml +++ b/rules/kubernetes-apps.yaml @@ -1,262 +1,281 @@ groups: -- name: kubernetes-apps - rules: - - alert: KubePodCrashLooping - annotations: - description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container - }}) is in waiting state (reason: "CrashLoopBackOff").' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping - summary: Pod is crash looping. - expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", - job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 - for: 15m - labels: - severity: warning - - alert: KubePodNotReady - annotations: - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready - state for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready - summary: Pod has been in a non-ready state for more than 15 minutes. - expr: |- - sum by (namespace, pod, cluster) ( - max by (namespace, pod, cluster) ( - kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} - ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( - 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) - ) - ) > 0 - for: 15m - labels: - severity: warning - - alert: KubeDeploymentGenerationMismatch - annotations: - description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment - }} does not match, this indicates that the Deployment has failed but has not - been rolled back. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch - summary: Deployment generation mismatch due to possible roll-back - expr: |- - kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} - != - kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning - - alert: KubeDeploymentReplicasMismatch - annotations: - description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has - not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch - summary: Deployment has not matched the expected number of replicas. - expr: |- - ( - kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} - > - kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} - ) and ( - changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDeploymentRolloutStuck - annotations: - description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment - }} is not progressing for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck - summary: Deployment rollout is not progressing. - expr: |- - kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} - != 0 - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetReplicasMismatch - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has - not matched the expected number of replicas for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch - summary: StatefulSet has not matched the expected number of replicas. - expr: |- - ( - kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} - != - kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"} - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetGenerationMismatch - annotations: - description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset - }} does not match, this indicates that the StatefulSet has failed but has - not been rolled back. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch - summary: StatefulSet generation mismatch due to possible roll-back - expr: |- - kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} - != - kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning - - alert: KubeStatefulSetUpdateNotRolledOut - annotations: - description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update - has not been rolled out. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout - summary: StatefulSet update has not been rolled out. - expr: |- - ( - max by (namespace, statefulset) ( - kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} - unless - kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} - ) - * - ( - kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} + - name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: + 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: + max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", + job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: + Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: |- + sum by (namespace, pod, cluster) ( + max by (namespace, pod, cluster) ( + kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} + ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( + 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: + Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has not + been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} != - kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} - ) - ) and ( - changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeDaemonSetRolloutStuck - annotations: - description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not - finished or progressed for at least 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck - summary: DaemonSet rollout is stuck. - expr: |- - ( - ( - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} - != + kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: + Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: |- + ( + kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDeploymentRolloutStuck + annotations: + description: + Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment + }} is not progressing for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck + summary: Deployment rollout is not progressing. + expr: |- + kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} + != 0 + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: + StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: StatefulSet has not matched the expected number of replicas. + expr: |- + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: + StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: |- + kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: + StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: |- + ( + max by (namespace, statefulset) ( + kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: + DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not + finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: |- + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: + pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container + {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: + sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", + namespace=~".*"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: + "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: |- kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) or ( + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: + "{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} - != - 0 - ) or ( - kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) or ( - kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} - != - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - ) - ) and ( - changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) - == - 0 - ) - for: 15m - labels: - severity: warning - - alert: KubeContainerWaiting - annotations: - description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container - {{ $labels.container}} has been in waiting state for longer than 1 hour. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting - summary: Pod container waiting longer than 1 hour - expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", - namespace=~".*"}) > 0 - for: 1h - labels: - severity: warning - - alert: KubeDaemonSetNotScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset - }} are not scheduled.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled - summary: DaemonSet pods are not scheduled. - expr: |- - kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} - - - kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 - for: 10m - labels: - severity: warning - - alert: KubeDaemonSetMisScheduled - annotations: - description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset - }} are running where they are not supposed to run.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled - summary: DaemonSet pods are misscheduled. - expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} - > 0 - for: 15m - labels: - severity: warning - - alert: KubeJobNotCompleted - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more - than {{ "43200" | humanizeDuration }} to complete. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted - summary: Job did not complete in time - expr: |- - time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} - and - kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 - labels: - severity: warning - - alert: KubeJobFailed - annotations: - description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. - Removing failed job after investigation should clear this alert. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed - summary: Job failed to complete. - expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaReplicasMismatch - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} - has not matched the desired number of replicas for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch - summary: HPA has not matched desired number of replicas. - expr: |- - (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} - != - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - > - kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) - and - (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - < - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) - and - changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 - for: 15m - labels: - severity: warning - - alert: KubeHpaMaxedOut - annotations: - description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} - has been running at max replicas for longer than 15 minutes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout - summary: HPA is running at max replicas - expr: |- - kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} - == - kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} - for: 15m - labels: - severity: warning + > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: + Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than {{ "43200" | humanizeDuration }} to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: |- + time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} + and + kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: + Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. + Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: + HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} + has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. + expr: |- + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: + HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} + has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: |- + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning diff --git a/rules/kubernetes-resources.yaml b/rules/kubernetes-resources.yaml index 048cfbfe..a375b4d8 100644 --- a/rules/kubernetes-resources.yaml +++ b/rules/kubernetes-resources.yaml @@ -1,115 +1,123 @@ groups: -- name: kubernetes-resources - rules: - - alert: KubeCPUOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests - for Pods by {{ $value }} CPU shares and cannot tolerate node failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeMemoryOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource - requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node - failure. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - and - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 - for: 10m - labels: - severity: warning - - alert: KubeCPUQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests - for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit - summary: Cluster has overcommitted CPU resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeMemoryQuotaOvercommit - annotations: - description: Cluster {{ $labels.cluster }} has overcommitted memory resource - requests for Namespaces. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit - summary: Cluster has overcommitted memory resource requests. - expr: |- - sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) - / - sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - > 1.5 - for: 5m - labels: - severity: warning - - alert: KubeQuotaAlmostFull - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull - summary: Namespace quota is going to be full. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 0.9 < 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaFullyUsed - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused - summary: Namespace quota is fully used. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - == 1 - for: 15m - labels: - severity: info - - alert: KubeQuotaExceeded - annotations: - description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage - }} of its {{ $labels.resource }} quota. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded - summary: Namespace quota has exceeded the limits. - expr: |- - kube_resourcequota{job="kube-state-metrics", type="used"} - / ignoring(instance, job, type) - (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - > 1 - for: 15m - labels: - severity: warning - - alert: CPUThrottlingHigh - annotations: - description: '{{ $value | humanizePercentage }} throttling of CPU in namespace - {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod - }}.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh - summary: Processes experience elevated CPU throttling. - expr: |- - sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) - / - sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) - > ( 25 / 100 ) - for: 15m - labels: - severity: info \ No newline at end of file + - name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: + Cluster {{ $labels.cluster }} has overcommitted CPU resource requests + for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: + Cluster {{ $labels.cluster }} has overcommitted memory resource + requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node + failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: |- + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: + Cluster {{ $labels.cluster }} has overcommitted CPU resource requests + for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: + Cluster {{ $labels.cluster }} has overcommitted memory resource + requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: + Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: + Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: + Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: + "{{ $value | humanizePercentage }} throttling of CPU in namespace + {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod + }}." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: |- + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info diff --git a/rules/kubernetes-storage.yaml b/rules/kubernetes-storage.yaml index 24f06ca3..b274bee7 100644 --- a/rules/kubernetes-storage.yaml +++ b/rules/kubernetes-storage.yaml @@ -1,109 +1,114 @@ - groups: -- name: kubernetes-storage - rules: - - alert: KubePersistentVolumeFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster - {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.03 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster - {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value - | humanizePercentage }} is available. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup - summary: PersistentVolume is filling up. - expr: |- - ( - kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.15 - and - kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - and - predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeInodesFillingUp - annotations: - description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster - {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup - summary: PersistentVolumeInodes are filling up. - expr: |- - ( - kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.03 - and - kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1m - labels: - severity: critical - - alert: KubePersistentVolumeInodesFillingUp - annotations: - description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim - }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster - {{ . }} {{- end }} is expected to run out of inodes within four days. Currently - {{ $value | humanizePercentage }} of its inodes are free. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup - summary: PersistentVolumeInodes are filling up. - expr: |- - ( - kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} - / - kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} - ) < 0.15 - and - kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 - and - predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 - unless on (cluster, namespace, persistentvolumeclaim) - kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - for: 1h - labels: - severity: warning - - alert: KubePersistentVolumeErrors - annotations: - description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster - -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors - summary: PersistentVolume is having issues with provisioning. - expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} - > 0 - for: 5m - labels: - severity: critical \ No newline at end of file + - name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: + The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: + Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value + | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: + The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: + Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is expected to run out of inodes within four days. Currently + {{ $value | humanizePercentage }} of its inodes are free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: + The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster + -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: + kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} + > 0 + for: 5m + labels: + severity: critical diff --git a/rules/node-exporter.yaml b/rules/node-exporter.yaml index 2a24cb4d..2d1bb3a6 100644 --- a/rules/node-exporter.yaml +++ b/rules/node-exporter.yaml @@ -1,340 +1,367 @@ groups: -- name: node-exporter - rules: - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - space left and is filling up. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 24 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemSpaceFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - space left and is filling up fast. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup - summary: Filesystem is predicted to run out of space within the next 4 hours. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 - and - predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - space left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace - summary: Filesystem has less than 5% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 30m - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfSpace - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - space left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace - summary: Filesystem has less than 3% space left. - expr: |- - ( - node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 30m - labels: - severity: critical - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - inodes left and is filling up. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 24 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemFilesFillingUp - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - inodes left and is filling up fast. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup - summary: Filesystem is predicted to run out of inodes within the next 4 hours. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 - and - predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - inodes left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 5% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: warning - - alert: NodeFilesystemAlmostOutOfFiles - annotations: - description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint - }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available - inodes left. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles - summary: Filesystem has less than 3% inodes left. - expr: |- - ( - node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 - and - node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - ) - for: 1h - labels: - severity: critical - - alert: NodeNetworkReceiveErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} receive errors in the last two minutes.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs - summary: Network interface is reporting many receive errors. - expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) - > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeNetworkTransmitErrs - annotations: - description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered - {{ printf "%.0f" $value }} transmit errors in the last two minutes.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs - summary: Network interface is reporting many transmit errors. - expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) - > 0.01 - for: 1h - labels: - severity: warning - - alert: NodeHighNumberConntrackEntriesUsed - annotations: - description: '{{ $value | humanizePercentage }} of conntrack entries are used.' - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused - summary: Number of conntrack are getting close to the limit. - expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) - > 0.75 - labels: - severity: warning - - alert: NodeTextFileCollectorScrapeError - annotations: - description: Node Exporter text file collector on {{ $labels.instance }} failed - to scrape. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror - summary: Node Exporter text file collector failed to scrape. - expr: node_textfile_scrape_error{job="node-exporter"} == 1 - labels: - severity: warning - - alert: NodeClockSkewDetected - annotations: - description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. - Ensure NTP is configured correctly on this host. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected - summary: Clock skew detected. - expr: |- - ( - node_timex_offset_seconds{job="node-exporter"} > 0.05 - and - deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 - ) - or - ( - node_timex_offset_seconds{job="node-exporter"} < -0.05 - and - deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 - ) - for: 10m - labels: - severity: warning - - alert: NodeClockNotSynchronising - annotations: - description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP - is configured on this host. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising - summary: Clock not synchronising. - expr: |- - min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 - and - node_timex_maxerror_seconds{job="node-exporter"} >= 16 - for: 10m - labels: - severity: warning - - alert: NodeRAIDDegraded - annotations: - description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is - in degraded state due to one or more disks failures. Number of spare drives - is insufficient to fix issue automatically. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded - summary: RAID Array is degraded. - expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) - > 0 - for: 15m - labels: - severity: critical - - alert: NodeRAIDDiskFailure - annotations: - description: At least one device in RAID array at {{ $labels.instance }} failed. - Array '{{ $labels.device }}' needs attention and possibly a disk swap. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure - summary: Failed device in RAID array. - expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} - > 0 - labels: - severity: warning - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{ $labels.instance }} is currently at - {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. - expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 - ) - for: 15m - labels: - severity: warning - - alert: NodeFileDescriptorLimit - annotations: - description: File descriptors limit at {{ $labels.instance }} is currently at - {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit - summary: Kernel is predicted to exhaust file descriptors limit soon. - expr: |- - ( - node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 - ) - for: 15m - labels: - severity: critical - - alert: NodeCPUHighUsage - annotations: - description: | - CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage - summary: High CPU usage. - expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", - mode!="idle"}[2m]))) * 100 > 90 - for: 15m - labels: - severity: info - - alert: NodeSystemSaturation - annotations: - description: | - System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. - This might indicate this instance resources saturation and can cause it becoming unresponsive. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation - summary: System saturated, load per core is very high. - expr: |- - node_load1{job="node-exporter"} - / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 - for: 15m - labels: - severity: warning - - alert: NodeMemoryMajorPagesFaults - annotations: - description: | - Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. - Please check that there is enough memory available at this instance. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults - summary: Memory major page faults are occurring at very high rate. - expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 - for: 15m - labels: - severity: warning - - alert: NodeMemoryHighUtilization - annotations: - description: | - Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization - summary: Host is running out of memory. - expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} - * 100) > 90 - for: 15m - labels: - severity: warning - - alert: NodeDiskIOSaturation - annotations: - description: | - Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. - This symptom might indicate disk saturation. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation - summary: Disk IO queue is high. - expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) - > 10 - for: 30m - labels: - severity: warning - - alert: NodeSystemdServiceFailed - annotations: - description: Systemd service {{ $labels.name }} has entered failed state at - {{ $labels.instance }} - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed - summary: Systemd service has entered failed state. - expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 - for: 5m - labels: - severity: warning - - alert: NodeBondingDegraded - annotations: - description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} - is in degraded state due to one or more slave failures. - runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded - summary: Bonding interface is degraded - expr: (node_bonding_slaves - node_bonding_active) != 0 - for: 5m - labels: - severity: warning + - name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: + Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: + '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: + rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) + > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: + '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: + rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) + > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: "{{ $value | humanizePercentage }} of conntrack entries are used." + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: + (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) + > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: + Node Exporter text file collector on {{ $labels.instance }} failed + to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: + Clock at {{ $labels.instance }} is out of sync by more than 0.05s. + Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: |- + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: + Clock at {{ $labels.instance }} is not synchronising. Ensure NTP + is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: |- + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: + RAID array '{{ $labels.device }}' at {{ $labels.instance }} is + in degraded state due to one or more disks failures. Number of spare drives + is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded. + expr: + node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} + - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) + > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: + At least one device in RAID array at {{ $labels.instance }} failed. + Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array. + expr: + node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} + > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: + File descriptors limit at {{ $labels.instance }} is currently at + {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: + File descriptors limit at {{ $labels.instance }} is currently at + {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - alert: NodeCPUHighUsage + annotations: + description: | + CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage + summary: High CPU usage. + expr: + sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", + mode!="idle"}[2m]))) * 100 > 90 + for: 15m + labels: + severity: info + - alert: NodeSystemSaturation + annotations: + description: | + System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation + summary: System saturated, load per core is very high. + expr: |- + node_load1{job="node-exporter"} + / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 + for: 15m + labels: + severity: warning + - alert: NodeMemoryMajorPagesFaults + annotations: + description: | + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Please check that there is enough memory available at this instance. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults + summary: Memory major page faults are occurring at very high rate. + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 + for: 15m + labels: + severity: warning + - alert: NodeMemoryHighUtilization + annotations: + description: | + Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization + summary: Host is running out of memory. + expr: + 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} + * 100) > 90 + for: 15m + labels: + severity: warning + - alert: NodeDiskIOSaturation + annotations: + description: | + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. + This symptom might indicate disk saturation. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation + summary: Disk IO queue is high. + expr: + rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + > 10 + for: 30m + labels: + severity: warning + - alert: NodeSystemdServiceFailed + annotations: + description: + Systemd service {{ $labels.name }} has entered failed state at + {{ $labels.instance }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed + summary: Systemd service has entered failed state. + expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 5m + labels: + severity: warning + - alert: NodeBondingDegraded + annotations: + description: + Bonding interface {{ $labels.master }} on {{ $labels.instance }} + is in degraded state due to one or more slave failures. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded + summary: Bonding interface is degraded + expr: (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning diff --git a/rules/node-resource-utilization.yaml b/rules/node-resource-utilization.yaml index 7a5fd536..8620e211 100644 --- a/rules/node-resource-utilization.yaml +++ b/rules/node-resource-utilization.yaml @@ -1,70 +1,76 @@ groups: -- name: node-resource-utilization.rules - rules: - - alert: HostHighCpuLoad - annotations: - description: |- - CPU load is > 90% - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: Host high CPU load (instance {{ $labels.instance }}) - expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) - > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} - for: 10m - labels: - severity: critical - - alert: MemoryUtilizationHighWarning - annotations: - dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ - $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D - description: Node {{ $labels.instance }} has less than 10% available memory. - summary: Node Memory utilization warning - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 - for: 5m - labels: - severity: critical - - alert: MemoryUtilizationHighCritical - annotations: - dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ - $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D - description: Node {{ $labels.instance }} has less than 5% available memory. - summary: Node Memory utilization critical - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5 - for: 1m - labels: - severity: critical - - alert: NodeNotReady - annotations: - description: Node {{ $labels.node }} has CPU utilization over 90%. - summary: Node has been in not-ready state for longer than 3 minutes - expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) - <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"})) - > 0 - for: 5m - labels: - severity: critical - - alert: KubernetesNodeMemoryPressure - annotations: - description: |- - Node {{ $labels.node }} has MemoryPressure condition - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) - expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == - 1 - for: 2m - labels: - severity: critical - - alert: KubernetesContainerOomKiller - annotations: - description: |- - Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes. - VALUE = {{ $value }} - LABELS = {{ $labels }} - summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) - expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total - offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) - == 1 - for: 0m - labels: - severity: warning + - name: node-resource-utilization.rules + rules: + - alert: HostHighCpuLoad + annotations: + description: |- + CPU load is > 90% + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Host high CPU load (instance {{ $labels.instance }}) + expr: + (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) + > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: critical + - alert: MemoryUtilizationHighWarning + annotations: + dashboard: + https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ + $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D + description: Node {{ $labels.instance }} has less than 10% available memory. + summary: Node Memory utilization warning + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: critical + - alert: MemoryUtilizationHighCritical + annotations: + dashboard: + https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ + $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D + description: Node {{ $labels.instance }} has less than 5% available memory. + summary: Node Memory utilization critical + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5 + for: 1m + labels: + severity: critical + - alert: NodeNotReady + annotations: + description: Node {{ $labels.node }} has CPU utilization over 90%. + summary: Node has been in not-ready state for longer than 3 minutes + expr: + (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) + <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"})) + > 0 + for: 5m + labels: + severity: critical + - alert: KubernetesNodeMemoryPressure + annotations: + description: |- + Node {{ $labels.node }} has MemoryPressure condition + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) + expr: + kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + for: 2m + labels: + severity: critical + - alert: KubernetesContainerOomKiller + annotations: + description: |- + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes. + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) + expr: + (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total + offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) + == 1 + for: 0m + labels: + severity: warning diff --git a/rules/velero.yaml b/rules/velero.yaml index 1c8eb91c..af46a8c2 100644 --- a/rules/velero.yaml +++ b/rules/velero.yaml @@ -1,21 +1,25 @@ groups: -- name: velero - rules: - - alert: VeleroBackupPartialFailures - annotations: - message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy - failed backups. - expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} - > 0.25 - for: 15m - labels: - severity: critical - - alert: VeleroBackupFailures - annotations: - message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed - backups. - expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} - > 0.25 - for: 15m - labels: - severity: critical + - name: velero + rules: + - alert: VeleroBackupPartialFailures + annotations: + message: + Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy + failed backups. + expr: + velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} + > 0.25 + for: 15m + labels: + severity: critical + - alert: VeleroBackupFailures + annotations: + message: + Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed + backups. + expr: + velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} + > 0.25 + for: 15m + labels: + severity: critical diff --git a/rules/x509-exporter.yaml b/rules/x509-exporter.yaml index 90cb7064..b2ac3b1d 100644 --- a/rules/x509-exporter.yaml +++ b/rules/x509-exporter.yaml @@ -1,46 +1,52 @@ groups: -- name: x509-certificate-exporter.rules - rules: - - alert: X509ExporterReadErrors - annotations: - description: Over the last 15 minutes, this x509-certificate-exporter instance - has experienced errors reading certificate files or querying the Kubernetes - API. This could be caused by a misconfiguration if triggered when the exporter - starts. - summary: Increasing read errors for x509-certificate-exporter - expr: delta(x509_read_errors[15m]) > 0 - for: 5m - labels: - severity: warning - - alert: CertificateError - annotations: - description: Certificate could not be decoded {{if $labels.secret_name }} in - Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at - location "{{ $labels.filepath }}"{{end}} - summary: Certificate cannot be decoded - expr: x509_cert_error > 0 - for: 15m - labels: - severity: warning - - alert: CertificateRenewal - annotations: - description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if - $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ - $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} - summary: Certificate should be renewed - expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", - issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28 - for: 15m - labels: - severity: warning - - alert: CertificateExpiration - annotations: - description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if - $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ - $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} - summary: Certificate is about to expire - expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", - issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14 - for: 15m - labels: - severity: critical + - name: x509-certificate-exporter.rules + rules: + - alert: X509ExporterReadErrors + annotations: + description: + Over the last 15 minutes, this x509-certificate-exporter instance + has experienced errors reading certificate files or querying the Kubernetes + API. This could be caused by a misconfiguration if triggered when the exporter + starts. + summary: Increasing read errors for x509-certificate-exporter + expr: delta(x509_read_errors[15m]) > 0 + for: 5m + labels: + severity: warning + - alert: CertificateError + annotations: + description: + Certificate could not be decoded {{if $labels.secret_name }} in + Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at + location "{{ $labels.filepath }}"{{end}} + summary: Certificate cannot be decoded + expr: x509_cert_error > 0 + for: 15m + labels: + severity: warning + - alert: CertificateRenewal + annotations: + description: + Certificate for "{{ $labels.subject_CN }}" should be renewed {{if + $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ + $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} + summary: Certificate should be renewed + expr: + ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", + issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28 + for: 15m + labels: + severity: warning + - alert: CertificateExpiration + annotations: + description: + Certificate for "{{ $labels.subject_CN }}" is about to expire {{if + $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ + $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} + summary: Certificate is about to expire + expr: + ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", + issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14 + for: 15m + labels: + severity: critical