Compare commits

..

1 Commits

Author SHA1 Message Date
juselius 6e9b1c8f29 feat: add stub sonatype-nexus helmfile 2025-12-16 20:11:20 +01:00
143 changed files with 2480 additions and 3348 deletions
-1
View File
@@ -1,7 +1,6 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# the shebang is ignored, but nice for editors # the shebang is ignored, but nice for editors
watch_file nix/sources.json watch_file nix/sources.json
watch_file nix/checks.nix
# Load .env file if it exists # Load .env file if it exists
dotenv_if_exists dotenv_if_exists
-1
View File
@@ -1,7 +1,6 @@
*.tgz *.tgz
_*/ _*/
.direnv/ .direnv/
.env
.pre-commit-config.yaml .pre-commit-config.yaml
_*.yaml _*.yaml
backup/ backup/
+42 -50
View File
@@ -1,54 +1,46 @@
# yaml-language-server: $schema=https://gitlab.com/gitlab-org/gitlab/-/raw/master/app/assets/javascripts/editor/schema/ci.json image:
default: name: alpine/helm:latest
tags: entrypoint: [ "/bin/bash", "-c" ]
- nix
include: stages:
- project: oceanbox/gitlab-ci - release
ref: v4.5
file: template/Base.gitlab-ci.yml
# stages:
# - release
# image: release:
# name: alpine/helm:latest stage: release
# entrypoint: ["/bin/bash", "-c"] rules:
- if: '$CI_COMMIT_BRANCH =~ /^main/'
when: always
- when: never
script:
- |
cd $CI_PROJECT_DIR
for i in $(git show --pretty="" --name-only | grep '^charts/.*/Chart.yaml' | cut -d/ -f2); do
pack=$(helm package ./charts/$i | sed 's/Success.*: \(.*\)/\1/')
if [ ! -z $pack ]; then
chart=$(basename $pack)
curl --request POST \
--user gitlab-ci-token:$CI_JOB_TOKEN \
--form "chart=@${chart}" \
"${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
fi
done
# release: rebuild:
# stage: release stage: release
# rules: rules:
# - if: "$CI_COMMIT_BRANCH =~ /^main/" - when: manual
# when: always allow_failure: true
# - when: never script:
# script: - |
# - | cd $CI_PROJECT_DIR
# cd $CI_PROJECT_DIR for i in $(find ./charts -maxdepth 2 -name Chart.yaml | cut -d/ -f3); do
# for i in $(git show --pretty="" --name-only | grep '^charts/.*/Chart.yaml' | cut -d/ -f2); do pack=$(helm package ./charts/$i | sed 's/Success.*: \(.*\)/\1/')
# pack=$(helm package ./charts/$i | sed 's/Success.*: \(.*\)/\1/') if [ ! -z $pack ]; then
# if [ ! -z $pack ]; then chart=$(basename $pack)
# chart=$(basename $pack) curl --request POST \
# curl --request POST \ --user gitlab-ci-token:$CI_JOB_TOKEN \
# --user gitlab-ci-token:$CI_JOB_TOKEN \ --form "chart=@${chart}" \
# --form "chart=@${chart}" \ "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
# "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts" fi
# fi done
# done
# rebuild:
# stage: release
# rules:
# - when: manual
# allow_failure: true
# script:
# - |
# cd $CI_PROJECT_DIR
# for i in $(find ./charts -maxdepth 2 -name Chart.yaml | cut -d/ -f3); do
# pack=$(helm package ./charts/$i | sed 's/Success.*: \(.*\)/\1/')
# if [ ! -z $pack ]; then
# chart=$(basename $pack)
# curl --request POST \
# --user gitlab-ci-token:$CI_JOB_TOKEN \
# --form "chart=@${chart}" \
# "${CI_API_V4_URL}/projects/${CI_PROJECT_ID}/packages/helm/api/stable/charts"
# fi
# done
+22 -29
View File
@@ -6,46 +6,39 @@ let
values = lib.apps.appValues { values = lib.apps.appValues {
inherit env; inherit env;
base = ../values/atlantis; base = ../values/atlantis;
extraValues = { }; extraValues = {};
}; };
kustomize = kustomize = r:
r:
if r.kind == "Deployment" then if r.kind == "Deployment" then
lib.attrsets.recursiveUpdate r { lib.attrsets.recursiveUpdate r {
spec.template.spec.containers = builtins.map ( spec.template.spec.containers =
x: builtins.map (x:
x x // {
// {
livenessProbe.httpGet.path = "/healthz"; livenessProbe.httpGet.path = "/healthz";
readinessProble.httpGet.path = "/healthz"; readinessProble.httpGet.path = "/healthz";
env = x.env ++ [ env = x.env ++ [ { name = "INERNAL_PORT"; value = 8000; } ];
{ }) r.spec.template.spec.containers;
name = "INERNAL_PORT";
value = 8000;
}
];
}
) r.spec.template.spec.containers;
} }
else if r.kind == "Service" then else if r.kind == "Service" then
{ } {}
else else r;
r;
in in
{ {
options.apps.atlantis = lib.apps.appOptions { options.apps.atlantis = lib.apps.appOptions {
revision = lib.mkOption { revision = lib.mkOption {
type = lib.types.str; type = lib.types.str;
default = "main"; default = "main";
description = "Revision"; description = "Revision";
}; };
hostname = lib.mkOption { hostname = lib.mkOption {
type = lib.types.str; type = lib.types.str;
default = if env == "prod" then "maps.oceanbox.io" else "atlantis.beta.oceanbox.io"; default = if env == "prod"
description = "Revision"; then "maps.oceanbox.io"
}; else "atlantis.beta.oceanbox.io";
description = "Revision";
};
}; };
config = lib.apps.appConfig cfg "${env}-atlantis" { config = lib.apps.appConfig cfg "${env}-atlantis" {
+24 -22
View File
@@ -6,32 +6,34 @@ let
values = lib.apps.appValues { values = lib.apps.appValues {
inherit env; inherit env;
base = ../values/openfga; base = ../values/openfga;
extraValues = { }; extraValues = {};
}; };
kustomize = kustomize = r:
r: if r.kind == "Job" then lib.attrsets.recursiveUpdate r { spec.backoffLimit = 2; } else r; if r.kind == "Job" then
lib.attrsets.recursiveUpdate r { spec.backoffLimit = 2; }
else r;
in in
{ {
options.apps.openfga = lib.apps.appOptions { }; options.apps.openfga = lib.apps.appOptions {};
config = lib.apps.appConfig cfg "${env}-openfga" { config = lib.apps.appConfig cfg "${env}-openfga" {
helm.releases."${env}-openfga" = { helm.releases."${env}-openfga" = {
inherit values; inherit values;
chart = lib.helm.downloadHelmChart { chart = lib.helm.downloadHelmChart {
repo = "https://openfga.github.io/helm-charts"; repo = "https://openfga.github.io/helm-charts";
chart = "openfga"; chart = "openfga";
version = "0.2.12"; version = "0.2.12";
chartHash = "sha256-7yLcw9/oNPvCePrtTJwKAG88t0Ym5Dl/S83Gz+gQdDU="; chartHash = "sha256-7yLcw9/oNPvCePrtTJwKAG88t0Ym5Dl/S83Gz+gQdDU=";
}; };
transformer = rs: builtins.map (x: kustomize x) rs; transformer = rs: builtins.map (x: kustomize x) rs;
}; };
annotations = { }; annotations = {};
resources = { resources = {
services.poop.spec = { services.poop.spec = {
};
};
}; };
}; }
};
}
+3 -3
View File
@@ -46,19 +46,19 @@ spec:
{{ end }} {{ end }}
cleanupController: cleanupController:
resources: resources:
limits: limits:
memory: {{ .Values.kyverno.resources.cleanupController.memory }} memory: {{ .Values.kyverno.resources.cleanupController.memory }}
requests: requests:
memory: {{ .Values.kyverno.resources.cleanupController.memory }} memory: {{ .Values.kyverno.resources.cleanupController.memory }}
reportsController: reportsController:
resources: resources:
limits: limits:
memory: {{ .Values.kyverno.resources.reportsController.memory }} memory: {{ .Values.kyverno.resources.reportsController.memory }}
requests: requests:
memory: {{ .Values.kyverno.resources.reportsController.memory }} memory: {{ .Values.kyverno.resources.reportsController.memory }}
backgroundController: backgroundController:
resources: resources:
limits: limits:
memory: {{ .Values.kyverno.resources.backgroundController.memory }} memory: {{ .Values.kyverno.resources.backgroundController.memory }}
requests: requests:
memory: {{ .Values.kyverno.resources.backgroundController.memory }} memory: {{ .Values.kyverno.resources.backgroundController.memory }}
+4 -4
View File
@@ -27,17 +27,17 @@ spec:
scheme: {{ .Values.linkerd.secretScheme }} scheme: {{ .Values.linkerd.secretScheme }}
{{- if .Values.linkerd.identityIssuerPEM }} {{- if .Values.linkerd.identityIssuerPEM }}
tls: tls:
crtPEM: {{- .Values.linkerd.identityIssuerPEM | toYaml | indent 14 }} crtPEM: {{- .Values.linkerd.identityIssuerPEM | toYaml | indent 14 }}
{{- end }} {{- end }}
policyValidator: policyValidator:
externalSecret: true externalSecret: true
caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }} caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }}
proxyInjector: proxyInjector:
externalSecret: true externalSecret: true
caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }} caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }}
profileValidator: profileValidator:
externalSecret: true externalSecret: true
caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }} caBundle: {{- .Values.linkerd.webhookPEM | toYaml | indent 9 }}
project: sys project: sys
syncPolicy: syncPolicy:
+1 -1
View File
@@ -16,7 +16,7 @@ spec:
helm: helm:
values: | values: |
containerPort: 10250 containerPort: 10250
resources: resources:
requests: requests:
cpu: 100m cpu: 100m
memory: 200Mi memory: 200Mi
+1 -1
View File
@@ -53,7 +53,7 @@ spec:
endpoint: "tempo.tempo.svc:4317" endpoint: "tempo.tempo.svc:4317"
tls: tls:
insecure: true insecure: true
## ##
otlphttp/metrics: otlphttp/metrics:
endpoint: http://prom-prometheus.prometheus:9090/api/v1/otlp endpoint: http://prom-prometheus.prometheus:9090/api/v1/otlp
tls: tls:
@@ -12,8 +12,8 @@ metadata:
policies.kyverno.io/minversion: 1.7.0 policies.kyverno.io/minversion: 1.7.0
kyverno.io/kubernetes-version: "1.23" kyverno.io/kubernetes-version: "1.23"
policies.kyverno.io/description: >- policies.kyverno.io/description: >-
Customers should not have full admin permissions on their own namespaces. Customers should not have full admin permissions on their own namespaces.
This policy will generate a RoleBinding, binding their group_id to This policy will generate a RoleBinding, binding their group_id to
the Cluster-Admin clusterrole. This will still only apply to the namespace as the Cluster-Admin clusterrole. This will still only apply to the namespace as
the resource is a rolebinding, not clusterrolebinding. the resource is a rolebinding, not clusterrolebinding.
This policy should not trigger on any namespaces with label component=sys This policy should not trigger on any namespaces with label component=sys
@@ -24,7 +24,7 @@ spec:
grafana_folder: Prometheus-stack grafana_folder: Prometheus-stack
targets: targets:
- apiVersion: v1 - apiVersion: v1
kind: ConfigMap kind: ConfigMap
name: "{{`{{ request.object.metadata.name }}`}}" name: "{{`{{ request.object.metadata.name }}`}}"
name: generate-dashboard-folder-annotation name: generate-dashboard-folder-annotation
skipBackgroundRequests: true skipBackgroundRequests: true
+1 -1
View File
@@ -13,7 +13,7 @@ metadata:
is time consuming and error prone. This policy will copy a is time consuming and error prone. This policy will copy a
Secret called `regcred` which exists in the `default` Namespace to Secret called `regcred` which exists in the `default` Namespace to
new Namespaces when they are created. It will also push updates to new Namespaces when they are created. It will also push updates to
the copied Secrets should the source Secret be changed. the copied Secrets should the source Secret be changed.
spec: spec:
rules: rules:
- name: sync-image-pull-secret - name: sync-image-pull-secret
@@ -9,12 +9,12 @@ metadata:
policies.kyverno.io/severity: medium policies.kyverno.io/severity: medium
policies.kyverno.io/subject: Ingress policies.kyverno.io/subject: Ingress
policies.kyverno.io/description: >- policies.kyverno.io/description: >-
Ingresses with the label "internal=true" should be whitelisted. Ingresses with the label "internal=true" should be whitelisted.
If no whitelist exists, add the default values, otherwise append If no whitelist exists, add the default values, otherwise append
whitelist to the already existing ones whitelist to the already existing ones
spec: spec:
mutateExistingOnPolicyUpdate: false mutateExistingOnPolicyUpdate: false
#precondition: has whitelist annotation or #precondition: has whitelist annotation or
rules: rules:
- name: ensure-nginx-whitelist-exists - name: ensure-nginx-whitelist-exists
match: match:
@@ -32,7 +32,7 @@ data:
} }
], ],
"__elements":{ "__elements":{
}, },
"__requires":[ "__requires":[
{ {
@@ -70,7 +70,7 @@ data:
"limit":100, "limit":100,
"matchAny":false, "matchAny":false,
"tags":[ "tags":[
], ],
"type":"dashboard" "type":"dashboard"
}, },
@@ -83,7 +83,7 @@ data:
"graphTooltip":0, "graphTooltip":0,
"id":null, "id":null,
"links":[ "links":[
], ],
"liveNow":false, "liveNow":false,
"panels":[ "panels":[
@@ -130,7 +130,7 @@ data:
} }
}, },
"mappings":[ "mappings":[
], ],
"thresholds":{ "thresholds":{
"mode":"absolute", "mode":"absolute",
@@ -195,7 +195,7 @@ data:
"options":{ "options":{
"legend":{ "legend":{
"calcs":[ "calcs":[
], ],
"displayMode":"list", "displayMode":"list",
"placement":"bottom", "placement":"bottom",
@@ -255,7 +255,7 @@ data:
"multi":false, "multi":false,
"name":"DS_PROMETHEUS", "name":"DS_PROMETHEUS",
"options":[ "options":[
], ],
"query":"prometheus", "query":"prometheus",
"refresh":1, "refresh":1,
@@ -266,7 +266,7 @@ data:
}, },
{ {
"current":{ "current":{
}, },
"datasource":{ "datasource":{
"type":"prometheus", "type":"prometheus",
@@ -279,7 +279,7 @@ data:
"multi":false, "multi":false,
"name":"namespace", "name":"namespace",
"options":[ "options":[
], ],
"query":{ "query":{
"query":"label_values(rabbitmq_identity_info, namespace)", "query":"label_values(rabbitmq_identity_info, namespace)",
@@ -296,7 +296,7 @@ data:
}, },
{ {
"current":{ "current":{
}, },
"datasource":{ "datasource":{
"type":"prometheus", "type":"prometheus",
@@ -309,7 +309,7 @@ data:
"multi":false, "multi":false,
"name":"rabbitmq_cluster", "name":"rabbitmq_cluster",
"options":[ "options":[
], ],
"query":{ "query":{
"query":"label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)", "query":"label_values(rabbitmq_identity_info{namespace=\"$namespace\"}, rabbitmq_cluster)",
@@ -326,7 +326,7 @@ data:
}, },
{ {
"current":{ "current":{
}, },
"datasource":{ "datasource":{
"type":"prometheus", "type":"prometheus",
@@ -339,7 +339,7 @@ data:
"multi":false, "multi":false,
"name":"queue", "name":"queue",
"options":[ "options":[
], ],
"query":{ "query":{
"query":"query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})", "query":"query_result(rabbitmq_detailed_queue_messages{namespace=\"$namespace\"} * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{namespace=\"$namespace\", rabbitmq_cluster=\"$rabbitmq_cluster\"})",
@@ -361,7 +361,7 @@ data:
"to":"now" "to":"now"
}, },
"timepicker":{ "timepicker":{
}, },
"timezone":"", "timezone":"",
"title":"RabbitMQ-Queue", "title":"RabbitMQ-Queue",
@@ -37,7 +37,7 @@ rules:
resources: resources:
- events - events
verbs: ["*"] verbs: ["*"]
- nonResourceURLs: ["*"] - nonResourceURLs: ["*"]
verbs: ["*"] verbs: ["*"]
- apiGroups: - apiGroups:
@@ -139,8 +139,8 @@ spec:
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
command: command:
- "/bin/sh" - "/bin/sh"
- -c - -c
- /tmp/renew-certs/renew-certs.sh - /tmp/renew-certs/renew-certs.sh
volumeMounts: volumeMounts:
@@ -216,7 +216,7 @@ metadata:
name: default-deny-egress name: default-deny-egress
namespace: cert-manager namespace: cert-manager
spec: spec:
podSelector: podSelector:
matchLabels: matchLabels:
block-egress: "true" block-egress: "true"
policyTypes: policyTypes:
@@ -42,8 +42,8 @@ spec:
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
command: command:
- "/bin/sh" - "/bin/sh"
- -c - -c
- /tmp/renew-certs/renew-certs.sh - /tmp/renew-certs/renew-certs.sh
volumeMounts: volumeMounts:
@@ -119,7 +119,7 @@ metadata:
name: default-deny-egress name: default-deny-egress
namespace: gitlab namespace: gitlab
spec: spec:
podSelector: podSelector:
matchLabels: matchLabels:
block-egress: "true" block-egress: "true"
policyTypes: policyTypes:
+3
View File
@@ -1,4 +1,7 @@
{ {
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0", "version": "0.2.0",
"configurations": [ "configurations": [
{ {
+1 -1
View File
@@ -1,4 +1,4 @@
# Oceanbox IdP # Oceanbox IdP
``` ```
npm install && npm start npm install && npm start
+7 -7
View File
@@ -2,16 +2,16 @@
server="root@fs1-0" server="root@fs1-0"
path="/vol/brick0/nfs0/k1/pv-oceanbox-dex" path="/vol/brick0/nfs0/k1/pv-oceanbox-dex"
dest="${server}:${path}" dest="$server:$path"
index=$(basename dist/assets/index-*.js) index=$(basename dist/assets/index-*.js)
ssh "${server}" -- rm "${path}"/static/js/*.js ssh $server -- rm $path/static/js/*.js
scp dist/assets/*.js "${dest}"/static/js/ scp dist/assets/*.js $dest/static/js/
sed -r "s/@index@/${index}/" ./dex/templates/login.html > login.html.$$ sed -r "s/@index@/$index/" ./dex/templates/login.html > login.html.$$
scp ./dex/templates/* "${dest}"/templates/ scp ./dex/templates/* $dest/templates/
scp ./dex/static/*.* "${dest}"/static/ scp ./dex/static/*.* $dest/static/
scp login.html.$$ "${dest}"/templates/login.html scp login.html.$$ $dest/templates/login.html
rm login.html.$$ rm login.html.$$
ssh admin@k1-0.itpartner.intern -- kubectl rollout restart -n oceanbox deployment/dex ssh admin@k1-0.itpartner.intern -- kubectl rollout restart -n oceanbox deployment/dex
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -66,7 +66,7 @@ let MyApp() =
if isNullOrUndefined localStorage["user_id"] then if isNullOrUndefined localStorage["user_id"] then
"" ""
else else
localStorage["user_id"] localStorage["user_id"]
// Browser.Dom.document.cookie // Browser.Dom.document.cookie
// |> fun s -> s.Split ';' // |> fun s -> s.Split ';'
// |> Array.filter (fun s -> s.StartsWith "user_id=") // |> Array.filter (fun s -> s.StartsWith "user_id=")
@@ -75,7 +75,7 @@ let MyApp() =
// |> Option.defaultValue "" // |> Option.defaultValue ""
let toggleAmnesia _ = setAmnesia (not amnesia) let toggleAmnesia _ = setAmnesia (not amnesia)
html $""" html $"""
<div class="centering"> <div class="centering">
<div @keydown={Ev(onEnter)}> <div @keydown={Ev(onEnter)}>
+17 -18
View File
@@ -1,5 +1,4 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# shellcheck disable=SC2034 # Unused variables left for readability
helmfile () { helmfile () {
@@ -11,30 +10,30 @@ bases:
- ../envs/environments.yaml.gotmpl - ../envs/environments.yaml.gotmpl
commonLabels: commonLabels:
tier: ${tier} tier: $tier
releases: releases:
- name: ${name} - name: $name
namespace: {{ .Environment.Name }}-${name} namespace: {{ .Environment.Name }}-$name
chart: ../charts/${name} chart: ../charts/$name
condition: ${name}.enabled condition: $name.enabled
values: values:
- ../values/${name}/values/values.yaml.gotmpl - ../values/$name/values/values.yaml.gotmpl
- ../values/${name}/values/values-{{ .Environment.Name }}.yaml - ../values/$name/values/values-{{ .Environment.Name }}.yaml
postRenderer: ../bin/kustomizer postRenderer: ../bin/kustomizer
postRendererArgs: postRendererArgs:
- ../values/${name}/kustomize/{{ .Environment.Name }} - ../values/$name/kustomize/{{ .Environment.Name }}
missingFileHandler: Info missingFileHandler: Info
- name: manifests - name: manifests
namespace: {{ .Environment.Name }}-${name} namespace: {{ .Environment.Name }}-$name
chart: manifests chart: manifests
condition: ${name}.enabled condition: $name.enabled
missingFileHandler: Info missingFileHandler: Info
values: values:
- ../values/env.yaml - ../values/env.yaml
- ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml - ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml
- ../values/${name}/env.yaml.gotmpl - ../values/$name/env.yaml.gotmpl
- ../values/${name}/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl - ../values/$name/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl
hooks: hooks:
- events: [ prepare, cleanup ] - events: [ prepare, cleanup ]
showlogs: true showlogs: true
@@ -43,7 +42,7 @@ releases:
- '{{\`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}\`}}' - '{{\`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}\`}}'
- '{{\`{{ .Release.Chart }}\`}}' - '{{\`{{ .Release.Chart }}\`}}'
- '{{\`{{ .Environment.Name }}\`}}' - '{{\`{{ .Environment.Name }}\`}}'
- ../values/${name}/manifests - ../values/$name/manifests
- manifests - manifests
EOF EOF
} }
@@ -60,10 +59,10 @@ done
name=$1 name=$1
tier=$2 tier=$2
if [[ -n "${ns}" ]]; then if [ -n "$ns" ]; then
namespace="namespace: {{ .Environment.Name }}-${name}" namespace="namespace: {{ .Environment.Name }}-$name"
else else
namespace="namespace: ${name}" namespace="namespace: $name"
fi fi
helmfile "$1" "$2" helmfile $1 $2
+14 -13
View File
@@ -4,38 +4,39 @@ set -o pipefail
cmd=$1 cmd=$1
chart=$2 chart=$2
env=$3
manifests=${4:-manifests} manifests=${4:-manifests}
outdir=${5:-_manifests} outdir=${5:-_manifests}
build() { build() {
mkdir -p "${outdir}"/templates mkdir -p $outdir/templates
echo "Creating ${outdir}/templates" echo "Creating $outdir/templates"
echo "generating ${outdir}/Chart.yaml" 1>&2 echo "generating $outdir/Chart.yaml" 1>&2
cat <<EOF > "${outdir}"/Chart.yaml cat <<EOF > $outdir/Chart.yaml
apiVersion: v1 apiVersion: v1
appVersion: "1.0" appVersion: "1.0"
# description: A Helm chart for Kubernetes # description: A Helm chart for Kubernetes
name: ${chart} name: $chart
version: 0.1.0 version: 0.1.0
EOF EOF
if [[ -d "${manifests}" ]]; then if [ -d $manifests ]; then
cp -r "${manifests}"/* "${outdir}"/templates cp -r $manifests/* $outdir/templates
elif [[ -f "${manifests}" ]]; then elif [ -f $manifests ]; then
cp "${manifests}" "${outdir}"/templates cp $manifests $outdir/templates
fi fi
} }
clean() { clean() {
echo "cleaning ${outdir}" 1>&2 echo "cleaning $outdir" 1>&2
rm -rf "${outdir}" rm -rf $outdir
} }
case "${cmd}" in case "$cmd" in
"build" ) build ;; "build" ) build ;;
"clean" ) clean ;; "clean" ) clean ;;
* ) echo "unsupported command: ${cmd}" 1>&2; exit 1 ;; * ) echo "unsupported command: $cmd" 1>&2; exit 1 ;;
esac esac
+5 -5
View File
@@ -1,13 +1,13 @@
#!/usr/bin/env bash #!/usr/bin/env bash
[[ $# != 1 ]] && exit 1 [ $# != 1 ] && exit 1
dir=$1 dir=$1
base=${dir}/../base base=$dir/../base
if [[ -f "${base}"/kustomization.yaml ]] && [[ -f "${dir}"/kustomization.yaml ]]; then if [ -f $base/kustomization.yaml -a -f $dir/kustomization.yaml ]; then
cat > "${base}"/_manifest.yaml cat > $base/_manifest.yaml
kubectl kustomize "${dir}" kubectl kustomize $dir
else else
cat cat
fi fi
+13 -13
View File
@@ -3,16 +3,16 @@ kind: ClusterRole
metadata: metadata:
name: argocd-cluster-admin name: argocd-cluster-admin
rules: rules:
- apiGroups: - apiGroups:
- "*" - '*'
resources: resources:
- "*" - '*'
verbs: verbs:
- "*" - '*'
- nonResourceURLs: - nonResourceURLs:
- "*" - '*'
verbs: verbs:
- "*" - '*'
--- ---
apiVersion: rbac.authorization.k8s.io/v1 apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding kind: ClusterRoleBinding
@@ -23,9 +23,9 @@ roleRef:
kind: ClusterRole kind: ClusterRole
name: argocd-cluster-admin name: argocd-cluster-admin
subjects: subjects:
- kind: ServiceAccount - kind: ServiceAccount
name: argocd-cluster-admin name: argocd-cluster-admin
namespace: kube-system namespace: kube-system
--- ---
apiVersion: v1 apiVersion: v1
kind: ServiceAccount kind: ServiceAccount
+2
View File
@@ -6,3 +6,5 @@ metadata:
name: cluster-admin-token name: cluster-admin-token
namespace: kube-system namespace: kube-system
type: kubernetes.io/service-account-token type: kubernetes.io/service-account-token
+2
View File
@@ -10,3 +10,5 @@ metadata:
name: cluster-ekman name: cluster-ekman
namespace: argocd namespace: argocd
type: Opaque type: Opaque
+2 -2
View File
@@ -3,5 +3,5 @@
img=registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp img=registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp
tag=${1:-latest} tag=${1:-latest}
docker build -t "${img}":"${tag}" . docker build -t $img:$tag .
docker push "${img}":"${tag}" docker push $img:$tag
+15 -16
View File
@@ -1,15 +1,14 @@
#!/bin/sh #!/bin/sh
# shellcheck disable=SC2154
export HOME=/plugin export HOME=/plugin
env > /tmp/"${ARGOCD_APP_NAME}".env env > /tmp/$ARGOCD_APP_NAME.env
echo "${ARGOCD_APP_PARAMETERS}" | jq '.[] | select(.name == "helm-parameters") | .map' | yq -P -oy > parameters.yaml echo "$ARGOCD_APP_PARAMETERS" | jq '.[] | select(.name == "helm-parameters") | .map' | yq -P -oy > parameters.yaml
cp parameters.yaml /tmp/"${ARGOCD_APP_NAME}"-parameters.yaml cp parameters.yaml /tmp/$ARGOCD_APP_NAME-parameters.yaml
if [ -n "${PARAM_CHART}" ] && [ "${PARAM_CHART}" != "." ]; then if [ -n "$PARAM_CHART" -a "$PARAM_CHART" != "." ]; then
CHART=${PARAM_CHART} CHART=$PARAM_CHART
elif [ -d chart ]; then elif [ -d chart ]; then
CHART=chart CHART=chart
elif [ -f chart ]; then elif [ -f chart ]; then
@@ -19,19 +18,19 @@ else
fi fi
[ -f chart/values.yaml ] && VALUES="-f chart/values.yaml" [ -f chart/values.yaml ] && VALUES="-f chart/values.yaml"
[ -f values-chart.yaml ] && VALUES="${VALUES} -f values-chart.yaml" [ -f values-chart.yaml ] && VALUES="$VALUES -f values-chart.yaml"
[ -f values.yaml ] && VALUES="${VALUES} -f values.yaml" [ -f values.yaml ] && VALUES="$VALUES -f values.yaml"
[ -f values-"${PARAM_ENV}".yaml ] && VALUES="${VALUES} -f values-${PARAM_ENV}.yaml" [ -f values-$PARAM_ENV.yaml ] && VALUES="$VALUES -f values-$PARAM_ENV.yaml"
VALUES="${VALUES} -f parameters.yaml" VALUES="$VALUES -f parameters.yaml"
helm dependency update "${CHART}" >/tmp/"${ARGOCD_APP_NAME}"-helm-dependency-build.out helm dependency update $CHART >/tmp/$ARGOCD_APP_NAME-helm-dependency-build.out
mkdir -p base mkdir -p base
echo "helm template -n ${ARGOCD_APP_NAMESPACE} ${PARAM_FLAGS} ${VALUES} ${ARGOCD_APP_NAME} ${CHART}" > /tmp/"${ARGOCD_APP_NAME}"-helm.sh echo "helm template -n $ARGOCD_APP_NAMESPACE $PARAM_FLAGS $VALUES $ARGOCD_APP_NAME $CHART" > /tmp/$ARGOCD_APP_NAME-helm.sh
helm template -n "${ARGOCD_APP_NAMESPACE}" "${PARAM_FLAGS}" "${VALUES}" "${ARGOCD_APP_NAME}" "${CHART}" > ./base/_manifest.yaml helm template -n $ARGOCD_APP_NAMESPACE $PARAM_FLAGS $VALUES $ARGOCD_APP_NAME $CHART > ./base/_manifest.yaml
cp ./base/_manifest.yaml /tmp/"${ARGOCD_APP_NAME}"-manifest.yaml cp ./base/_manifest.yaml /tmp/$ARGOCD_APP_NAME-manifest.yaml
[ -d "${PARAM_ENV}" ] && kubectl kustomize "${PARAM_ENV}" > /tmp/"${ARGOCD_APP_NAME}"-manifest.yaml [ -d "$PARAM_ENV" ] && kubectl kustomize $PARAM_ENV > /tmp/$ARGOCD_APP_NAME-manifest.yaml
cat /tmp/"${ARGOCD_APP_NAME}"-manifest.yaml cat /tmp/$ARGOCD_APP_NAME-manifest.yaml
+1 -1
View File
@@ -18,7 +18,7 @@ EOF
exit 0 exit 0
fi fi
yq e -o=p "${VALUES}" | jq --slurp --raw-input ' yq e -o=p $VALUES | jq --slurp --raw-input '
[{ [{
name: "helm-parameters", name: "helm-parameters",
title: "Helm Parameters", title: "Helm Parameters",
@@ -1,9 +1,8 @@
#!/bin/sh #!/bin/sh
# shellcheck disable=SC2154
export HOME=/plugin export HOME=/plugin
helm repo add --username argocd-helm --password "${OCEANBOX_HELM_ACCESS_TOKEN}" oceanbox \ helm repo add --username argocd-helm --password "$OCEANBOX_HELM_ACCESS_TOKEN" oceanbox \
https://gitlab.com/api/v4/projects/54396343/packages/helm/stable https://gitlab.com/api/v4/projects/54396343/packages/helm/stable
helm repo add bitnami https://charts.bitnami.com/bitnami helm repo add bitnami https://charts.bitnami.com/bitnami
+3 -3
View File
@@ -4,9 +4,9 @@ export HOME=/plugin
helm repo update oceanbox helm repo update oceanbox
if [ -n "${PARAM_CHART}" ] && [ "${PARAM_CHART}" != "." ]; then if [ -n "$PARAM_CHART" -a "$PARAM_CHART" != "." ]; then
helm show values "${PARAM_CHART}" > values-chart.yaml helm show values $PARAM_CHART > values-chart.yaml
elif [ -f chart ]; then elif [ -f chart ]; then
CHART=$(cat chart) CHART=$(cat chart)
helm show values "${CHART}" > values-chart.yaml helm show values $CHART > values-chart.yaml
fi fi
+25 -24
View File
@@ -9,7 +9,7 @@ spec:
init: init:
# Init always happens immediately before generate, but its output is not treated as manifests. # Init always happens immediately before generate, but its output is not treated as manifests.
# This is a good place to, for example, download chart dependencies. # This is a good place to, for example, download chart dependencies.
command: [/bin/sh] command: [ /bin/sh ]
args: args:
- /plugin/init.sh - /plugin/init.sh
# The generate command runs in the Application source directory each time manifests are generated. Standard output # The generate command runs in the Application source directory each time manifests are generated. Standard output
@@ -17,7 +17,7 @@ spec:
# To write log messages from the command, write them to stderr, it will always be displayed. # To write log messages from the command, write them to stderr, it will always be displayed.
# Error output will be sent to the UI, so avoid printing sensitive information (such as secrets). # Error output will be sent to the UI, so avoid printing sensitive information (such as secrets).
generate: generate:
command: [/bin/sh] command: [ /bin/sh ]
args: args:
- /plugin/generate.sh - /plugin/generate.sh
@@ -27,15 +27,15 @@ spec:
# Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the # Only one of fileName, find.glob, or find.command should be specified. If multiple are specified then only the
# first (in that order) is evaluated. # first (in that order) is evaluated.
# discover: # discover:
# fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source # fileName is a glob pattern (https://pkg.go.dev/path/filepath#Glob) that is applied to the Application's source
# directory. If there is a match, this plugin may be used for the Application. # directory. If there is a match, this plugin may be used for the Application.
# fileName: "./subdir/s*.yaml" # fileName: "./subdir/s*.yaml"
# find: # find:
# This does the same thing as fileName, but it supports double-start (nested directory) glob patterns. # This does the same thing as fileName, but it supports double-start (nested directory) glob patterns.
# glob: "**/Chart.yaml" # glob: "**/Chart.yaml"
# The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_ # The find command runs in the repository's root directory. To match, it must exit with status code 0 _and_
# produce non-empty output to standard out. # produce non-empty output to standard out.
# command: [sh, -c, find . -name env.yaml] # command: [sh, -c, find . -name env.yaml]
# The parameters config describes what parameters the UI should display for an Application. It is up to the user to # The parameters config describes what parameters the UI should display for an Application. It is up to the user to
# actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_ # actually set parameters in the Application manifest (in spec.source.plugin.parameters). The announcements _only_
# inform the "Parameters" tab in the App Details page of the UI. # inform the "Parameters" tab in the App Details page of the UI.
@@ -66,21 +66,22 @@ spec:
itemType: string itemType: string
collectionType: string collectionType: string
string: "" string: ""
# All the fields above besides 'string' apply to both the array and map type parameter announcements. # All the fields above besides "string" apply to both the array and map type parameter announcements.
# - name: array-param # - name: array-param
# # This field communicates the parameter's default value to the UI. Setting this field is optional. # # This field communicates the parameter's default value to the UI. Setting this field is optional.
# array: [default, items] # array: [default, items]
# collectionType: array # collectionType: array
# - name: map-param # - name: map-param
# # This field communicates the parameter's default value to the UI. Setting this field is optional. # # This field communicates the parameter's default value to the UI. Setting this field is optional.
# map: # map:
# some: value # some: value
# collectionType: map # collectionType: map
# dynamic: # dynamic:
# The command is run in an Application's source directory. Standard output must be JSON matching the schema of the # The command is run in an Application's source directory. Standard output must be JSON matching the schema of the
# static parameter announcements list. # static parameter announcements list.
# command: [ /bin/sh, /plugin/get-values.sh ] # command: [ /bin/sh, /plugin/get-values.sh ]
# If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository # If set to `true` then the plugin receives repository files with original file mode. Dangerous since the repository
# might have executable files. Set to true only if you trust the CMP plugin authors. # might have executable files. Set to true only if you trust the CMP plugin authors.
preserveFileMode: false preserveFileMode: false
+1 -1
View File
@@ -1,4 +1,4 @@
FROM ghcr.io/helmfile/helmfile:v1.1.9 FROM ghcr.io/helmfile/helmfile:v1.0.0
RUN mkdir -p /home/argocd/cmp-server/config/ RUN mkdir -p /home/argocd/cmp-server/config/
COPY plugin.yaml /home/argocd/cmp-server/config/ COPY plugin.yaml /home/argocd/cmp-server/config/
+417 -417
View File
@@ -45,432 +45,432 @@ spec:
affinity: affinity:
podAntiAffinity: podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm: - podAffinityTerm:
labelSelector: labelSelector:
matchLabels: matchLabels:
app.kubernetes.io/name: argocd-repo-server app.kubernetes.io/name: argocd-repo-server
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
weight: 100 weight: 100
automountServiceAccountToken: true automountServiceAccountToken: true
containers: containers:
- args: - args:
- /usr/local/bin/argocd-repo-server - /usr/local/bin/argocd-repo-server
- --port=8081 - --port=8081
- --metrics-port=8084 - --metrics-port=8084
env: env:
- name: ARGOCD_REPO_SERVER_NAME - name: ARGOCD_REPO_SERVER_NAME
value: argocd-repo-server value: argocd-repo-server
- name: ARGOCD_RECONCILIATION_TIMEOUT - name: ARGOCD_RECONCILIATION_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: timeout.reconciliation key: timeout.reconciliation
name: argocd-cm name: argocd-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGFORMAT - name: ARGOCD_REPO_SERVER_LOGFORMAT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.format key: reposerver.log.format
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGLEVEL - name: ARGOCD_REPO_SERVER_LOGLEVEL
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.level key: reposerver.log.level
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.parallelism.limit key: reposerver.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.listen.address key: reposerver.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.metrics.listen.address key: reposerver.metrics.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_TLS - name: ARGOCD_REPO_SERVER_DISABLE_TLS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.tls key: reposerver.disable.tls
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MIN_VERSION - name: ARGOCD_TLS_MIN_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.minversion key: reposerver.tls.minversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MAX_VERSION - name: ARGOCD_TLS_MAX_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.maxversion key: reposerver.tls.maxversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_CIPHERS - name: ARGOCD_TLS_CIPHERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.ciphers key: reposerver.tls.ciphers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_CACHE_EXPIRATION - name: ARGOCD_REPO_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.repo.cache.expiration key: reposerver.repo.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_SERVER - name: REDIS_SERVER
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.server key: redis.server
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_COMPRESSION - name: REDIS_COMPRESSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.compression key: redis.compression
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDISDB - name: REDISDB
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.db key: redis.db
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_USERNAME - name: REDIS_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-username key: redis-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_PASSWORD - name: REDIS_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: auth key: auth
name: argocd-redis name: argocd-redis
- name: REDIS_SENTINEL_USERNAME - name: REDIS_SENTINEL_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-sentinel-username key: redis-sentinel-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_SENTINEL_PASSWORD - name: REDIS_SENTINEL_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-sentinel-password key: redis-sentinel-password
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION - name: ARGOCD_DEFAULT_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.default.cache.expiration key: reposerver.default.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.address key: otlp.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE - name: ARGOCD_REPO_SERVER_OTLP_INSECURE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.insecure key: otlp.insecure
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS - name: ARGOCD_REPO_SERVER_OTLP_HEADERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.headers key: otlp.headers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.max.combined.directory.manifests.size key: reposerver.max.combined.directory.manifests.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.plugin.tar.exclusions key: reposerver.plugin.tar.exclusions
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.allow.oob.symlinks key: reposerver.allow.oob.symlinks
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.tar.size key: reposerver.streamed.manifest.max.tar.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.extracted.size key: reposerver.streamed.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.helm.manifest.max.extracted.size key: reposerver.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.helm.manifest.max.extracted.size key: reposerver.disable.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_MODULES_ENABLED - name: ARGOCD_GIT_MODULES_ENABLED
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.enable.git.submodule key: reposerver.enable.git.submodule
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.lsremote.parallelism.limit key: reposerver.git.lsremote.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_REQUEST_TIMEOUT - name: ARGOCD_GIT_REQUEST_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.request.timeout key: reposerver.git.request.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT - name: ARGOCD_REVISION_CACHE_LOCK_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.revision.cache.lock.timeout key: reposerver.revision.cache.lock.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES - name: ARGOCD_REPO_SERVER_INCLUDE_HIDDEN_DIRECTORIES
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.include.hidden.directories key: reposerver.include.hidden.directories
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: HELM_CACHE_HOME - name: HELM_CACHE_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_CONFIG_HOME - name: HELM_CONFIG_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_DATA_HOME - name: HELM_DATA_HOME
value: /helm-working-dir value: /helm-working-dir
image: quay.io/argoproj/argocd:v2.12.3 image: quay.io/argoproj/argocd:v2.12.3
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
livenessProbe: livenessProbe:
failureThreshold: 3 failureThreshold: 3
httpGet: httpGet:
path: /healthz?full=true path: /healthz?full=true
port: metrics port: metrics
scheme: HTTP scheme: HTTP
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
successThreshold: 1 successThreshold: 1
timeoutSeconds: 1 timeoutSeconds: 1
name: repo-server
ports:
- containerPort: 8081
name: repo-server name: repo-server
ports: protocol: TCP
- containerPort: 8081 - containerPort: 8084
name: repo-server name: metrics
protocol: TCP protocol: TCP
- containerPort: 8084 readinessProbe:
name: metrics failureThreshold: 3
protocol: TCP httpGet:
readinessProbe: path: /healthz
failureThreshold: 3 port: metrics
httpGet: scheme: HTTP
path: /healthz initialDelaySeconds: 10
port: metrics periodSeconds: 10
scheme: HTTP successThreshold: 1
initialDelaySeconds: 10 timeoutSeconds: 1
periodSeconds: 10 securityContext:
successThreshold: 1 allowPrivilegeEscalation: false
timeoutSeconds: 1 capabilities:
securityContext: drop:
allowPrivilegeEscalation: false - ALL
capabilities: readOnlyRootFilesystem: true
drop: runAsNonRoot: true
- ALL seccompProfile:
readOnlyRootFilesystem: true type: RuntimeDefault
runAsNonRoot: true terminationMessagePath: /dev/termination-log
seccompProfile: terminationMessagePolicy: File
type: RuntimeDefault volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /app/config/ssh
terminationMessagePolicy: File name: ssh-known-hosts
volumeMounts: - mountPath: /app/config/tls
- mountPath: /app/config/ssh name: tls-certs
name: ssh-known-hosts - mountPath: /app/config/gpg/source
- mountPath: /app/config/tls name: gpg-keys
name: tls-certs - mountPath: /app/config/gpg/keys
- mountPath: /app/config/gpg/source name: gpg-keyring
name: gpg-keys - mountPath: /app/config/reposerver/tls
- mountPath: /app/config/gpg/keys name: argocd-repo-server-tls
name: gpg-keyring - mountPath: /helm-working-dir
- mountPath: /app/config/reposerver/tls name: helm-working-dir
name: argocd-repo-server-tls - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: tmp
name: plugins - command:
- mountPath: /tmp - /var/run/argocd/argocd-cmp-server
name: tmp image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
- command: imagePullPolicy: Always
- /var/run/argocd/argocd-cmp-server name: kustomize-helm-with-rewrite
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest securityContext:
imagePullPolicy: Always runAsNonRoot: true
name: kustomize-helm-with-rewrite runAsUser: 999
securityContext: terminationMessagePath: /dev/termination-log
runAsNonRoot: true terminationMessagePolicy: File
runAsUser: 999 volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /var/run/argocd
terminationMessagePolicy: File name: var-files
volumeMounts: - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /var/run/argocd name: plugins
name: var-files - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: cmp-tmp
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: cmp-tmp - command:
- mountPath: /helm-working-dir - /var/run/argocd/argocd-cmp-server
name: helm-working-dir image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest
- command: imagePullPolicy: Always
- /var/run/argocd/argocd-cmp-server name: helm-kustomize-cmp
image: registry.gitlab.com/oceanbox/manifests/helm-kustomize-cmp:latest securityContext:
imagePullPolicy: Always runAsNonRoot: true
name: helm-kustomize-cmp runAsUser: 999
securityContext: terminationMessagePath: /dev/termination-log
runAsNonRoot: true terminationMessagePolicy: File
runAsUser: 999 volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /var/run/argocd
terminationMessagePolicy: File name: var-files
volumeMounts: - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /var/run/argocd name: plugins
name: var-files - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: cmp-tmp
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: cmp-tmp - command:
- mountPath: /helm-working-dir - /var/run/argocd/argocd-cmp-server
name: helm-working-dir image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
- command: imagePullPolicy: Always
- /var/run/argocd/argocd-cmp-server name: helmfile-cmp
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest securityContext:
imagePullPolicy: Always runAsNonRoot: true
name: helmfile-cmp runAsUser: 999
securityContext: terminationMessagePath: /dev/termination-log
runAsNonRoot: true terminationMessagePolicy: File
runAsUser: 999 volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /var/run/argocd
terminationMessagePolicy: File name: var-files
volumeMounts: - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /var/run/argocd name: plugins
name: var-files - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: cmp-tmp
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: cmp-tmp
- mountPath: /helm-working-dir
name: helm-working-dir
dnsPolicy: ClusterFirst dnsPolicy: ClusterFirst
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
initContainers: initContainers:
- command: - command:
- /bin/cp - /bin/cp
- -n - -n
- /usr/local/bin/argocd - /usr/local/bin/argocd
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: quay.io/argoproj/argocd:v2.12.3 image: quay.io/argoproj/argocd:v2.12.3
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
name: copyutil name: copyutil
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- command: - command:
- /bin/sh - /bin/sh
- /plugin/init-helm-repos.sh - /plugin/init-helm-repos.sh
env: env:
- name: OCEANBOX_HELM_ACCESS_TOKEN - name: OCEANBOX_HELM_ACCESS_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: token key: token
name: oceanbox-helm name: oceanbox-helm
optional: false optional: false
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
imagePullPolicy: Always imagePullPolicy: Always
name: init-helm-repos name: init-helm-repos
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
runAsUser: 999 runAsUser: 999
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
restartPolicy: Always restartPolicy: Always
schedulerName: default-scheduler schedulerName: default-scheduler
serviceAccount: argocd-repo-server serviceAccount: argocd-repo-server
serviceAccountName: argocd-repo-server serviceAccountName: argocd-repo-server
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
volumes: volumes:
- name: cmp-tmp - name: cmp-tmp
- name: helm-working-dir - name: helm-working-dir
- name: plugins - name: plugins
- name: var-files - name: var-files
- name: tmp - name: tmp
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-ssh-known-hosts-cm name: argocd-ssh-known-hosts-cm
name: ssh-known-hosts name: ssh-known-hosts
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-tls-certs-cm name: argocd-tls-certs-cm
name: tls-certs name: tls-certs
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-gpg-keys-cm name: argocd-gpg-keys-cm
name: gpg-keys name: gpg-keys
- name: gpg-keyring - name: gpg-keyring
- name: argocd-repo-server-tls - name: argocd-repo-server-tls
secret: secret:
defaultMode: 420 defaultMode: 420
items: items:
- key: tls.crt - key: tls.crt
path: tls.crt path: tls.crt
- key: tls.key - key: tls.key
path: tls.key path: tls.key
- key: ca.crt - key: ca.crt
path: ca.crt path: ca.crt
optional: true optional: true
secretName: argocd-repo-server-tls secretName: argocd-repo-server-tls
@@ -4,24 +4,24 @@ spec:
template: template:
spec: spec:
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
containers: containers:
- command: - command:
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest image: registry.gitlab.com/oceanbox/manifests/helmfile-cmp:latest
imagePullPolicy: Always imagePullPolicy: Always
name: helmfile-cmp name: helmfile-cmp
securityContext: securityContext:
runAsNonRoot: true runAsNonRoot: true
runAsUser: 999 runAsUser: 999
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- mountPath: /home/argocd/cmp-server/plugins - mountPath: /home/argocd/cmp-server/plugins
name: plugins name: plugins
- mountPath: /tmp - mountPath: /tmp
name: tmp name: tmp
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
+2 -2
View File
@@ -3,5 +3,5 @@
img=registry.gitlab.com/oceanbox/manifests/helmfile-cmp img=registry.gitlab.com/oceanbox/manifests/helmfile-cmp
tag=${1:-latest} tag=${1:-latest}
docker build -t "${img}":"${tag}" . docker build -t $img:$tag .
docker push "${img}":"${tag}" docker push $img:$tag
+3 -4
View File
@@ -1,5 +1,4 @@
#!/bin/sh #!/bin/sh
# shellcheck disable=SC2154
# NOTE: Ensure errors are part of exitcode # NOTE: Ensure errors are part of exitcode
# set -o pipefail # set -o pipefail
@@ -11,7 +10,7 @@ export HELM_CONFIG_HOME=/tmp/helm/config
export HELMFILE_CACHE_HOME=/tmp/helmfile/cache export HELMFILE_CACHE_HOME=/tmp/helmfile/cache
export HELMFILE_TEMPDIR=/tmp/helmfile/tmp export HELMFILE_TEMPDIR=/tmp/helmfile/tmp
test -n ARGOCD_ENV_HELMFILE_ENVIRONMENT && export HELMFILE_ENVIRONMENT="${ARGOCD_ENV_HELMFILE_ENVIRONMENT}" test -n ARGOCD_ENV_HELMFILE_ENVIRONMENT && export HELMFILE_ENVIRONMENT=$ARGOCD_ENV_HELMFILE_ENVIRONMENT
test -n ARGOCD_ENV_HELMFILE_FILE_PATH && export HELMFILE_FILE_PATH="${ARGOCD_ENV_HELMFILE_FILE_PATH}" test -n ARGOCD_ENV_HELMFILE_FILE_PATH && export HELMFILE_FILE_PATH=$ARGOCD_ENV_HELMFILE_FILE_PATH
helmfile -n "${ARGOCD_APP_NAMESPACE}" "${ARGS}" template -q --include-crds helmfile -n "$ARGOCD_APP_NAMESPACE" $ARGS template --include-crds -q
+1 -1
View File
@@ -4,7 +4,7 @@ metadata:
name: helmfile-cmp name: helmfile-cmp
spec: spec:
generate: generate:
command: ["/bin/sh"] command: [ "/bin/sh" ]
args: args:
- /plugin/generate.sh - /plugin/generate.sh
lockRepo: false lockRepo: false
@@ -44,341 +44,341 @@ spec:
affinity: affinity:
podAntiAffinity: podAntiAffinity:
preferredDuringSchedulingIgnoredDuringExecution: preferredDuringSchedulingIgnoredDuringExecution:
- podAffinityTerm: - podAffinityTerm:
labelSelector: labelSelector:
matchLabels: matchLabels:
app.kubernetes.io/name: argocd-repo-server app.kubernetes.io/name: argocd-repo-server
topologyKey: kubernetes.io/hostname topologyKey: kubernetes.io/hostname
weight: 100 weight: 100
containers: containers:
- args: - args:
- /usr/local/bin/argocd-repo-server - /usr/local/bin/argocd-repo-server
- --port=8081 - --port=8081
- --metrics-port=8084 - --metrics-port=8084
env: env:
- name: ARGOCD_REPO_SERVER_NAME - name: ARGOCD_REPO_SERVER_NAME
value: argocd-repo-server value: argocd-repo-server
- name: ARGOCD_RECONCILIATION_TIMEOUT - name: ARGOCD_RECONCILIATION_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: timeout.reconciliation key: timeout.reconciliation
name: argocd-cm name: argocd-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGFORMAT - name: ARGOCD_REPO_SERVER_LOGFORMAT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.format key: reposerver.log.format
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LOGLEVEL - name: ARGOCD_REPO_SERVER_LOGLEVEL
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.log.level key: reposerver.log.level
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT - name: ARGOCD_REPO_SERVER_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.parallelism.limit key: reposerver.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.listen.address key: reposerver.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS - name: ARGOCD_REPO_SERVER_LISTEN_METRICS_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.metrics.listen.address key: reposerver.metrics.listen.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_TLS - name: ARGOCD_REPO_SERVER_DISABLE_TLS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.tls key: reposerver.disable.tls
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MIN_VERSION - name: ARGOCD_TLS_MIN_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.minversion key: reposerver.tls.minversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_MAX_VERSION - name: ARGOCD_TLS_MAX_VERSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.maxversion key: reposerver.tls.maxversion
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_TLS_CIPHERS - name: ARGOCD_TLS_CIPHERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.tls.ciphers key: reposerver.tls.ciphers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_CACHE_EXPIRATION - name: ARGOCD_REPO_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.repo.cache.expiration key: reposerver.repo.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_SERVER - name: REDIS_SERVER
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.server key: redis.server
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_COMPRESSION - name: REDIS_COMPRESSION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.compression key: redis.compression
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDISDB - name: REDISDB
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: redis.db key: redis.db
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: REDIS_USERNAME - name: REDIS_USERNAME
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-username key: redis-username
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: REDIS_PASSWORD - name: REDIS_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: redis-password key: redis-password
name: argocd-redis name: argocd-redis
optional: true optional: true
- name: ARGOCD_DEFAULT_CACHE_EXPIRATION - name: ARGOCD_DEFAULT_CACHE_EXPIRATION
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.default.cache.expiration key: reposerver.default.cache.expiration
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_ADDRESS - name: ARGOCD_REPO_SERVER_OTLP_ADDRESS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.address key: otlp.address
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_INSECURE - name: ARGOCD_REPO_SERVER_OTLP_INSECURE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.insecure key: otlp.insecure
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_OTLP_HEADERS - name: ARGOCD_REPO_SERVER_OTLP_HEADERS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: otlp.headers key: otlp.headers
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE - name: ARGOCD_REPO_SERVER_MAX_COMBINED_DIRECTORY_MANIFESTS_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.max.combined.directory.manifests.size key: reposerver.max.combined.directory.manifests.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS - name: ARGOCD_REPO_SERVER_PLUGIN_TAR_EXCLUSIONS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.plugin.tar.exclusions key: reposerver.plugin.tar.exclusions
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS - name: ARGOCD_REPO_SERVER_ALLOW_OUT_OF_BOUNDS_SYMLINKS
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.allow.oob.symlinks key: reposerver.allow.oob.symlinks
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_TAR_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.tar.size key: reposerver.streamed.manifest.max.tar.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_STREAMED_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.streamed.manifest.max.extracted.size key: reposerver.streamed.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.helm.manifest.max.extracted.size key: reposerver.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE - name: ARGOCD_REPO_SERVER_DISABLE_HELM_MANIFEST_MAX_EXTRACTED_SIZE
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.disable.helm.manifest.max.extracted.size key: reposerver.disable.helm.manifest.max.extracted.size
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_MODULES_ENABLED - name: ARGOCD_GIT_MODULES_ENABLED
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.enable.git.submodule key: reposerver.enable.git.submodule
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT - name: ARGOCD_GIT_LS_REMOTE_PARALLELISM_LIMIT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.lsremote.parallelism.limit key: reposerver.git.lsremote.parallelism.limit
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: ARGOCD_GIT_REQUEST_TIMEOUT - name: ARGOCD_GIT_REQUEST_TIMEOUT
valueFrom: valueFrom:
configMapKeyRef: configMapKeyRef:
key: reposerver.git.request.timeout key: reposerver.git.request.timeout
name: argocd-cmd-params-cm name: argocd-cmd-params-cm
optional: true optional: true
- name: HELM_CACHE_HOME - name: HELM_CACHE_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_CONFIG_HOME - name: HELM_CONFIG_HOME
value: /helm-working-dir value: /helm-working-dir
- name: HELM_DATA_HOME - name: HELM_DATA_HOME
value: /helm-working-dir value: /helm-working-dir
image: quay.io/argoproj/argocd:v2.10.4 image: quay.io/argoproj/argocd:v2.10.4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
livenessProbe: livenessProbe:
failureThreshold: 3 failureThreshold: 3
httpGet: httpGet:
path: /healthz?full=true path: /healthz?full=true
port: metrics port: metrics
scheme: HTTP scheme: HTTP
initialDelaySeconds: 10 initialDelaySeconds: 10
periodSeconds: 10 periodSeconds: 10
successThreshold: 1 successThreshold: 1
timeoutSeconds: 1 timeoutSeconds: 1
name: repo-server
ports:
- containerPort: 8081
name: repo-server name: repo-server
ports: protocol: TCP
- containerPort: 8081 - containerPort: 8084
name: repo-server name: metrics
protocol: TCP protocol: TCP
- containerPort: 8084 readinessProbe:
name: metrics failureThreshold: 3
protocol: TCP httpGet:
readinessProbe: path: /healthz
failureThreshold: 3 port: metrics
httpGet: scheme: HTTP
path: /healthz initialDelaySeconds: 10
port: metrics periodSeconds: 10
scheme: HTTP successThreshold: 1
initialDelaySeconds: 10 timeoutSeconds: 1
periodSeconds: 10 resources: {}
successThreshold: 1 securityContext:
timeoutSeconds: 1 allowPrivilegeEscalation: false
resources: {} capabilities:
securityContext: drop:
allowPrivilegeEscalation: false - ALL
capabilities: readOnlyRootFilesystem: true
drop: runAsNonRoot: true
- ALL seccompProfile:
readOnlyRootFilesystem: true type: RuntimeDefault
runAsNonRoot: true terminationMessagePath: /dev/termination-log
seccompProfile: terminationMessagePolicy: File
type: RuntimeDefault volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /app/config/ssh
terminationMessagePolicy: File name: ssh-known-hosts
volumeMounts: - mountPath: /app/config/tls
- mountPath: /app/config/ssh name: tls-certs
name: ssh-known-hosts - mountPath: /app/config/gpg/source
- mountPath: /app/config/tls name: gpg-keys
name: tls-certs - mountPath: /app/config/gpg/keys
- mountPath: /app/config/gpg/source name: gpg-keyring
name: gpg-keys - mountPath: /app/config/reposerver/tls
- mountPath: /app/config/gpg/keys name: argocd-repo-server-tls
name: gpg-keyring - mountPath: /helm-working-dir
- mountPath: /app/config/reposerver/tls name: helm-working-dir
name: argocd-repo-server-tls - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /helm-working-dir name: plugins
name: helm-working-dir - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: tmp
name: plugins - command:
- mountPath: /tmp - /var/run/argocd/argocd-cmp-server
name: tmp image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
- command: imagePullPolicy: Always
- /var/run/argocd/argocd-cmp-server name: kustomize-helm-with-rewrite
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest resources: {}
imagePullPolicy: Always securityContext:
name: kustomize-helm-with-rewrite runAsNonRoot: true
resources: {} runAsUser: 999
securityContext: terminationMessagePath: /dev/termination-log
runAsNonRoot: true terminationMessagePolicy: File
runAsUser: 999 volumeMounts:
terminationMessagePath: /dev/termination-log - mountPath: /var/run/argocd
terminationMessagePolicy: File name: var-files
volumeMounts: - mountPath: /home/argocd/cmp-server/plugins
- mountPath: /var/run/argocd name: plugins
name: var-files - mountPath: /tmp
- mountPath: /home/argocd/cmp-server/plugins name: cmp-tmp
name: plugins - mountPath: /helm-working-dir
- mountPath: /tmp name: helm-working-dir
name: cmp-tmp
- mountPath: /helm-working-dir
name: helm-working-dir
dnsPolicy: ClusterFirst dnsPolicy: ClusterFirst
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
initContainers: initContainers:
- command: - command:
- /bin/cp - /bin/cp
- -n - -n
- /usr/local/bin/argocd - /usr/local/bin/argocd
- /var/run/argocd/argocd-cmp-server - /var/run/argocd/argocd-cmp-server
image: quay.io/argoproj/argocd:v2.10.4 image: quay.io/argoproj/argocd:v2.10.4
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
name: copyutil name: copyutil
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
volumeMounts: volumeMounts:
- mountPath: /var/run/argocd - mountPath: /var/run/argocd
name: var-files name: var-files
- command: - command:
- /bin/sh - /bin/sh
- /plugin/init-helm-repos.sh - /plugin/init-helm-repos.sh
image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest image: registry.gitlab.com/oceanbox/manifests/kustomize-helm-with-rewrite:latest
imagePullPolicy: Always imagePullPolicy: Always
name: init-helm-repos name: init-helm-repos
resources: {} resources: {}
securityContext: securityContext:
allowPrivilegeEscalation: false allowPrivilegeEscalation: false
capabilities: capabilities:
drop: drop:
- ALL - ALL
readOnlyRootFilesystem: true readOnlyRootFilesystem: true
runAsUser: 999 runAsUser: 999
runAsNonRoot: true runAsNonRoot: true
seccompProfile: seccompProfile:
type: RuntimeDefault type: RuntimeDefault
terminationMessagePath: /dev/termination-log terminationMessagePath: /dev/termination-log
terminationMessagePolicy: File terminationMessagePolicy: File
env: env:
- name: OCEANBOX_HELM_ACCESS_TOKEN - name: OCEANBOX_HELM_ACCESS_TOKEN
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
key: token key: token
name: oceanbox-helm name: oceanbox-helm
optional: false optional: false
volumeMounts: volumeMounts:
- mountPath: /helm-working-dir - mountPath: /helm-working-dir
name: helm-working-dir name: helm-working-dir
restartPolicy: Always restartPolicy: Always
schedulerName: default-scheduler schedulerName: default-scheduler
securityContext: {} securityContext: {}
@@ -386,39 +386,40 @@ spec:
serviceAccountName: argocd-repo-server serviceAccountName: argocd-repo-server
terminationGracePeriodSeconds: 30 terminationGracePeriodSeconds: 30
volumes: volumes:
- emptyDir: {} - emptyDir: {}
name: cmp-tmp name: cmp-tmp
- emptyDir: {} - emptyDir: {}
name: helm-working-dir name: helm-working-dir
- emptyDir: {} - emptyDir: {}
name: plugins name: plugins
- emptyDir: {} - emptyDir: {}
name: var-files name: var-files
- emptyDir: {} - emptyDir: {}
name: tmp name: tmp
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-ssh-known-hosts-cm name: argocd-ssh-known-hosts-cm
name: ssh-known-hosts name: ssh-known-hosts
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-tls-certs-cm name: argocd-tls-certs-cm
name: tls-certs name: tls-certs
- configMap: - configMap:
defaultMode: 420 defaultMode: 420
name: argocd-gpg-keys-cm name: argocd-gpg-keys-cm
name: gpg-keys name: gpg-keys
- emptyDir: {} - emptyDir: {}
name: gpg-keyring name: gpg-keyring
- name: argocd-repo-server-tls - name: argocd-repo-server-tls
secret: secret:
defaultMode: 420 defaultMode: 420
items: items:
- key: tls.crt - key: tls.crt
path: tls.crt path: tls.crt
- key: tls.key - key: tls.key
path: tls.key path: tls.key
- key: ca.crt - key: ca.crt
path: ca.crt path: ca.crt
optional: true optional: true
secretName: argocd-repo-server-tls secretName: argocd-repo-server-tls
+1 -1
View File
@@ -13,7 +13,7 @@ kubectl --context ekman apply -f cluster-admin-token.yaml
# kubectl --context oceanbox apply -f _cluster-ekman.yaml # kubectl --context oceanbox apply -f _cluster-ekman.yaml
token=$(kubectl --context ekman get secret -n kube-system argocd-manager-token -o yaml | grep ' token:' | cut -d' ' -f4 | base64 -d) token=$(kubectl --context ekman get secret -n kube-system argocd-manager-token -o yaml | grep ' token:' | cut -d' ' -f4 | base64 -d)
sed "s/@token@/${token}/" cluster-ekman.yaml > _cluster-ekman.yaml sed "s/@token@/$token/" cluster-ekman.yaml > _cluster-ekman.yaml
echo "configure argocd ekman-cluster..." echo "configure argocd ekman-cluster..."
cat _cluster-ekman.yaml cat _cluster-ekman.yaml
kubectl --context oceanbox apply -f _cluster-ekman.yaml kubectl --context oceanbox apply -f _cluster-ekman.yaml
+1
View File
@@ -13,3 +13,4 @@ stringData:
name: staging-vcluster name: staging-vcluster
server: https://staging-vcluster.staging-vcluster server: https://staging-vcluster.staging-vcluster
type: Opaque type: Opaque
+11 -11
View File
@@ -19,12 +19,12 @@ applications:
plugin: plugin:
name: helmfile-cmp name: helmfile-cmp
env: env:
- name: CLUSTER_NAME - name: CLUSTER_NAME
value: replaceme value: replaceme
- name: HELMFILE_ENVIRONMENT - name: HELMFILE_ENVIRONMENT
value: default value: default
- name: HELMFILE_FILE_PATH - name: HELMFILE_FILE_PATH
value: system.yaml.gotmpl value: system.yaml.gotmpl
projects: projects:
sys: sys:
namespace: argocd namespace: argocd
@@ -32,12 +32,12 @@ projects:
additionalAnnotations: {} additionalAnnotations: {}
description: sys components project description: sys components project
sourceRepos: sourceRepos:
- "*" - '*'
destinations: destinations:
- namespace: "*" - namespace: '*'
server: https://kubernetes.default.svc server: https://kubernetes.default.svc
clusterResourceWhitelist: clusterResourceWhitelist:
- group: "*" - group: '*'
kind: "*" kind: '*'
orphanedResources: orphanedResources:
warn: false warn: false
-5
View File
@@ -8,8 +8,3 @@ version: v1.35.2
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. # incremented each time you make changes to the application.
appVersion: v1.35.2 appVersion: v1.35.2
dependencies:
- name: diagrid-dashboard
version: "0.1.0"
repository: "file://../diagrid-dashboard"
condition: diagrid-dashboard.enabled
-3
View File
@@ -116,6 +116,3 @@ serviceMonitor:
nodeSelector: {} nodeSelector: {}
tolerations: [] tolerations: []
affinity: {} affinity: {}
diagrid-dashboard:
enabled: false
@@ -59,18 +59,12 @@ spec:
resources: resources:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
volumeMounts:
- name: statestore
mountPath: /app/components/statestore.yaml
subPath: statestore.yaml
{{- with .Values.volumeMounts }} {{- with .Values.volumeMounts }}
volumeMounts:
{{- toYaml . | nindent 12 }} {{- toYaml . | nindent 12 }}
{{- end }} {{- end }}
volumes:
- name: statestore
configMap:
name: {{ include "diagrid-dashboard.fullname" . }}-statestore
{{- with .Values.volumes }} {{- with .Values.volumes }}
volumes:
{{- toYaml . | nindent 8 }} {{- toYaml . | nindent 8 }}
{{- end }} {{- end }}
{{- with .Values.nodeSelector }} {{- with .Values.nodeSelector }}
@@ -1,7 +1,7 @@
apiVersion: v1 apiVersion: v1
kind: ConfigMap kind: ConfigMap
metadata: metadata:
name: {{ include "diagrid-dashboard.fullname" . }}-statestore name: diadash-statestore
data: data:
statestore.yaml: | statestore.yaml: |
apiVersion: dapr.io/v1alpha1 apiVersion: dapr.io/v1alpha1
@@ -17,7 +17,10 @@ data:
- name: redisUsername - name: redisUsername
value: default value: default
- name: redisPassword - name: redisPassword
value: secret value: mrtz-password
# secretKeyRef:
# key: redis-password
# name: {{ .Values.statestore.redis }}
- name: actorStateStore - name: actorStateStore
value: "true" value: "true"
- name: redisDB - name: redisDB
+20 -14
View File
@@ -2,10 +2,6 @@
# This is a YAML-formatted file. # This is a YAML-formatted file.
# Declare variables to be passed into your templates. # Declare variables to be passed into your templates.
statestore:
scope: my-scope
redis: my-redis
# This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/ # This will set the replicaset count more information can be found here: https://kubernetes.io/docs/concepts/workloads/controllers/replicaset/
replicaCount: 1 replicaCount: 1
@@ -130,14 +126,14 @@ resources: {}
# memory: 128Mi # memory: 128Mi
# This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ # This is to setup the liveness and readiness probes more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/
livenessProbe: # livenessProbe:
httpGet: # httpGet:
path: / # path: /
port: http # port: http
readinessProbe: # readinessProbe:
httpGet: # httpGet:
path: / # path: /
port: http # port: http
# This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/ # This section is for setting up autoscaling more information can be found here: https://kubernetes.io/docs/concepts/workloads/autoscaling/
autoscaling: autoscaling:
@@ -148,13 +144,23 @@ autoscaling:
# targetMemoryUtilizationPercentage: 80 # targetMemoryUtilizationPercentage: 80
# Additional volumes on the output Deployment definition. # Additional volumes on the output Deployment definition.
volumes: {} volumes:
- name: statestore
configMap:
name: diadash-statestore
# Additional volumeMounts on the output Deployment definition. # Additional volumeMounts on the output Deployment definition.
volumeMounts: {} volumeMounts:
- name: statestore
mountPath: /app/components/statestore.yaml
subPath: statestore.yaml
nodeSelector: {} nodeSelector: {}
tolerations: [] tolerations: []
affinity: {} affinity: {}
statestore:
scope: mrtz-sorcerer
redis: mrtz-sorcerer-redis
+2 -2
View File
@@ -13,9 +13,9 @@ type: application
# This is the chart version. This version number should be incremented each time you make changes # This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version. # to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/) # Versions are expected to follow Semantic Versioning (https://semver.org/)
version: v1.6.0 version: v1.2.4
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to # incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using. # follow Semantic Versioning. They should reflect the version the application is using.
# It is recommended to use it with quotes. # It is recommended to use it with quotes.
appVersion: "v1.6.0" appVersion: "v1.2.4"
+1 -1
View File
@@ -12,7 +12,7 @@ image:
# This sets the pull policy for images. # This sets the pull policy for images.
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
# Overrides the image tag whose default is the chart appVersion. # Overrides the image tag whose default is the chart appVersion.
tag: v1.6.0 tag: v1.2.4
# This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/ # This is for the secrets for pulling an image from a private repository more information can be found here: https://kubernetes.io/docs/tasks/configure-pod-container/pull-image-private-registry/
imagePullSecrets: imagePullSecrets:
- name: gitlab-pull-secret - name: gitlab-pull-secret
+1 -1
View File
@@ -3,7 +3,7 @@
# Declare variables to be passed into your templates. # Declare variables to be passed into your templates.
replicaCount: 1 replicaCount: 1
image: image:
repository: registry.gitlab.com/oceanbox/makai repository: registry.gitlab.com/oceanbox/makai/makai
tag: v0.1.0 tag: v0.1.0
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
init: init:
-6
View File
@@ -1,6 +0,0 @@
dependencies:
- name: diagrid-dashboard
repository: file://../diagrid-dashboard
version: 0.1.0
digest: sha256:4fdb3148a2a6439223d7844a3083da2de324dd47e5cb3ac4a5d9c436e6e2c775
generated: "2025-12-16T19:38:21.939708629+01:00"
-5
View File
@@ -8,8 +8,3 @@ version: v1.35.2
# This is the version number of the application being deployed. This version number should be # This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. # incremented each time you make changes to the application.
appVersion: v1.35.2 appVersion: v1.35.2
dependencies:
- name: diagrid-dashboard
version: "0.1.0"
repository: "file://../diagrid-dashboard"
condition: diagrid-dashboard.enabled
-6
View File
@@ -108,9 +108,3 @@ serviceMonitor:
nodeSelector: {} nodeSelector: {}
tolerations: [] tolerations: []
affinity: {} affinity: {}
diagrid-dashboard:
enabled: false
statestore:
scope: sorcerer
redis: sorcerer-redis
+1 -1
View File
@@ -5,7 +5,7 @@
replicaCount: 1 replicaCount: 1
image: image:
repository: registry repository: registry
tag: 3 tag: 2
pullPolicy: IfNotPresent pullPolicy: IfNotPresent
init: init:
enabled: false enabled: false
+1 -1
View File
@@ -27,7 +27,7 @@ releases:
- name: argocd-apps - name: argocd-apps
namespace: argocd namespace: argocd
chart: argo/argocd-apps chart: argo/argocd-apps
version: 2.0.3 version: 0.0.9
condition: argo.apps.enabled condition: argo.apps.enabled
values: values:
- ../values/argo/values/apps.yaml.gotmpl - ../values/argo/values/apps.yaml.gotmpl
+2 -3
View File
@@ -3,8 +3,7 @@ bases:
repositories: repositories:
- name: cert-manager - name: cert-manager
oci: true url: 'https://charts.jetstack.io'
url: 'quay.io/jetstack/charts'
commonLabels: commonLabels:
tier: system tier: system
@@ -13,7 +12,7 @@ releases:
- name: cert-manager - name: cert-manager
namespace: cert-manager namespace: cert-manager
chart: cert-manager/cert-manager chart: cert-manager/cert-manager
version: v1.19.2 version: 1.12.13
condition: cert_manager.enabled condition: cert_manager.enabled
values: values:
- ../values/cert-manager/values/cert-manager.yaml.gotmpl - ../values/cert-manager/values/cert-manager.yaml.gotmpl
-44
View File
@@ -1,44 +0,0 @@
bases:
- ../envs/environments.yaml.gotmpl
repositories:
- name: forgejo
oci: true
url: code.forgejo.org/forgejo-helm
commonLabels:
tier: system
releases:
- name: forgejo
namespace: forgejo
chart: forgejo/forgejo
version: 16.0.0
condition: forgejo.enabled
values:
- ../values/forgejo/values/values.yaml
- ../values/forgejo/values/values-{{ .Environment.Name }}.yaml
postRenderer: ../bin/kustomizer
postRendererArgs:
- ../values/forgejo/kustomize/{{ .Environment.Name }}
missingFileHandler: Info
- name: manifests
namespace: forgejo
chart: manifests
condition: forgejo.enabled
missingFileHandler: Info
values:
- ../values/env.yaml
- ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml
- ../values/forgejo/env.yaml.gotmpl
- ../values/forgejo/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl
hooks:
- events: [ prepare, cleanup ]
showlogs: true
command: ../bin/helmify
args:
- '{{`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}`}}'
- '{{`{{ .Release.Chart }}`}}'
- '{{`{{ .Environment.Name }}`}}'
- ../values/forgejo/manifests
- manifests
+1 -1
View File
@@ -12,7 +12,7 @@ releases:
- name: ingress-nginx - name: ingress-nginx
namespace: ingress-nginx namespace: ingress-nginx
chart: ingress-nginx/ingress-nginx chart: ingress-nginx/ingress-nginx
version: 4.14.1 version: 4.8.3
condition: nginx.enabled condition: nginx.enabled
values: values:
- ../values/ingress-nginx/values/ingress-nginx.yaml.gotmpl - ../values/ingress-nginx/values/ingress-nginx.yaml.gotmpl
+1 -1
View File
@@ -15,7 +15,7 @@ releases:
- name: kyverno - name: kyverno
namespace: kyverno namespace: kyverno
chart: kyverno/kyverno chart: kyverno/kyverno
version: 3.6.1 version: 3.5.1
condition: kyverno.enabled condition: kyverno.enabled
values: values:
- ../values/kyverno/values/kyverno.yaml.gotmpl - ../values/kyverno/values/kyverno.yaml.gotmpl
+1 -1
View File
@@ -12,7 +12,7 @@ releases:
- name: mariadb-operator - name: mariadb-operator
namespace: mariadb-operator namespace: mariadb-operator
chart: mariadb-operator/mariadb-operator chart: mariadb-operator/mariadb-operator
version: 25.10.3 version: 25.8.4
condition: mariadb_operator.enabled condition: mariadb_operator.enabled
values: values:
- ../values/mariadb-operator/values/mariadb-operator.yaml.gotmpl - ../values/mariadb-operator/values/mariadb-operator.yaml.gotmpl
+1 -1
View File
@@ -16,7 +16,7 @@ releases:
namespace: {{ .Environment.Name }}-openfga namespace: {{ .Environment.Name }}-openfga
{{- end }} {{- end }}
chart: openfga/openfga chart: openfga/openfga
version: 0.2.50 version: 0.2.45
condition: openfga.enabled condition: openfga.enabled
values: values:
- ../values/openfga/values/values.yaml - ../values/openfga/values/values.yaml
@@ -12,7 +12,7 @@ releases:
- name: opentelemetry-collector - name: opentelemetry-collector
namespace: otel namespace: otel
chart: open-telemetry/opentelemetry-collector chart: open-telemetry/opentelemetry-collector
version: 0.142.1 version: 0.134.1
condition: otel.enabled condition: otel.enabled
values: values:
- ../values/opentelemetry-collector/values/values.yaml - ../values/opentelemetry-collector/values/values.yaml
+1 -1
View File
@@ -15,7 +15,7 @@ releases:
- name: postgres-operator - name: postgres-operator
namespace: cnpg namespace: cnpg
chart: cloudnative-pg/cloudnative-pg chart: cloudnative-pg/cloudnative-pg
version: 0.27.0 version: 0.26.1
condition: postgres_operator.enabled condition: postgres_operator.enabled
values: values:
- ../values/postgres-operator/values/postgres-operator.yaml.gotmpl - ../values/postgres-operator/values/postgres-operator.yaml.gotmpl
+1 -1
View File
@@ -13,7 +13,7 @@ releases:
- name: {{ .Environment.Name }}-rabbitmq - name: {{ .Environment.Name }}-rabbitmq
namespace: rabbitmq namespace: rabbitmq
chart: bitnami/rabbitmq chart: bitnami/rabbitmq
version: 13.0.3 version: 12.9.0
condition: rabbitmq.enabled condition: rabbitmq.enabled
values: values:
- ../values/rabbitmq/values/values.yaml - ../values/rabbitmq/values/values.yaml
+1 -1
View File
@@ -13,7 +13,7 @@ releases:
- name: slurm-operator - name: slurm-operator
namespace: slinky namespace: slinky
chart: slurm-operator/slurm-operator chart: slurm-operator/slurm-operator
version: 0.4.1 version: 0.4.0
condition: slurm_operator.enabled condition: slurm_operator.enabled
values: values:
- ../values/slurm-operator/values/slurm-operator.yaml.gotmpl - ../values/slurm-operator/values/slurm-operator.yaml.gotmpl
+42
View File
@@ -0,0 +1,42 @@
# Helmfile for the (stub) Sonatype Nexus deployment.
# Defines two releases in the sonatype-nexus namespace: the upstream chart and
# a locally generated "manifests" chart. Both are gated on the same
# `sonatype-nexus.enabled` condition so they are enabled/disabled together.
bases:
  - ../envs/environments.yaml.gotmpl

repositories:
  - name: sonatype-nexus
    # NOTE(review): a git+https:// repository URL presumably relies on the
    # helm-git plugin being installed wherever helmfile runs -- confirm.
    url: git+https://github.com/sonatype/helm3-charts@deploy?ref=master

commonLabels:
  tier: system

releases:
  - name: sonatype-nexus
    namespace: sonatype-nexus
    chart: sonatype-nexus/sonatype-nexus
    # NOTE(review): no `version:` is pinned for this chart, unlike the other
    # helmfiles in this repository -- consider pinning once the stub is fleshed out.
    condition: sonatype-nexus.enabled
    values:
      - ../values/sonatype-nexus/values/values.yaml
      - ../values/sonatype-nexus/values/values-{{ .Environment.Name }}.yaml.gotmpl
    postRenderer: ../bin/kustomizer
    postRendererArgs:
      - ../values/sonatype-nexus/kustomize/{{ .Environment.Name }}
    missingFileHandler: Info
  - name: manifests
    namespace: sonatype-nexus
    chart: manifests
    # Must match the condition key of the chart release above
    # (was `nsonatype-nexus.enabled`, a typo that pointed at a different key).
    condition: sonatype-nexus.enabled
    missingFileHandler: Info
    values:
      - ../values/env.yaml
      - ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml
      - ../values/sonatype-nexus/env.yaml.gotmpl
      - ../values/sonatype-nexus/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl
    hooks:
      # Runs ../bin/helmify on both events; the first argument renders to
      # "build" on prepare and "clean" otherwise (i.e. on cleanup).
      - events: [ prepare, cleanup ]
        showlogs: true
        command: ../bin/helmify
        args:
          - '{{`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}`}}'
          - '{{`{{ .Release.Chart }}`}}'
          - '{{`{{ .Environment.Name }}`}}'
          - ../values/sonatype-nexus/manifests
          - manifests
+1 -1
View File
@@ -13,7 +13,7 @@ releases:
- name: spegel - name: spegel
namespace: spegel namespace: spegel
chart: spegel/spegel chart: spegel/spegel
version: 0.6.0 version: 0.5.1
condition: spegel.enabled condition: spegel.enabled
values: values:
- ../values/spegel/values/spegel.yaml.gotmpl - ../values/spegel/values/spegel.yaml.gotmpl
+1 -1
View File
@@ -15,7 +15,7 @@ releases:
- name: velero - name: velero
namespace: velero namespace: velero
chart: velero/velero chart: velero/velero
version: 11.3.2 version: 11.1.1
condition: velero.enabled condition: velero.enabled
values: values:
- ../values/velero/values/velero.yaml.gotmpl - ../values/velero/values/velero.yaml.gotmpl
-70
View File
@@ -1,70 +0,0 @@
let
sources = import ./default.nix;
pkgs = import sources.nixpkgs { };
pre-commit = import sources.git-hooks;
globalExcludes = [
"nix/default.nix"
"attic"
"vcluster"
".*vendor"
".*chart/.*"
".*schema.json"
];
in
pre-commit.run {
src = pkgs.nix-gitignore.gitignoreSource [ ] ../.;
# Do not run at pre-commit time
default_stages = [
"pre-push"
];
# TODO(mrtz): Remove when default
package = pkgs.prek;
# Linters From https://github.com/cachix/pre-commit-hooks.nix
hooks = {
nixfmt-rfc-style = {
enable = true;
excludes = globalExcludes;
};
trim-trailing-whitespace.enable = true;
shellcheck = {
enable = true;
excludes = [
"vcluster/"
"attic/"
];
args = [
"-x"
"-o"
"all"
];
};
yamllint = {
enable = true;
excludes = [
"attic/"
"charts/templates/"
"charts/"
"values/"
"vcluster/"
];
settings = {
strict = true;
configData = ''{ extends: default, rules: { document-start: disable, line-length: {max: 300} } }'';
};
};
check-json.enable = true;
renovate-config-validator = {
enable = true;
files = "renovate.json$";
entry = "renovate-config-validator";
};
};
}
+24 -127
View File
@@ -9,15 +9,8 @@
*/ */
# Generated by npins. Do not modify; will be overwritten regularly # Generated by npins. Do not modify; will be overwritten regularly
let let
# Backwards-compatibly make something that previously didn't take any arguments take some data = builtins.fromJSON (builtins.readFile ./sources.json);
# The function must return an attrset, and will unfortunately be eagerly evaluated version = data.version;
# Same thing, but it catches eval errors on the default argument so that one may still call it with other arguments
mkFunctor =
fn:
let
e = builtins.tryEval (fn { });
in
(if e.success then e.value else { error = fn { }; }) // { __functor = _self: fn; };
# https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295 # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/lists.nix#L295
range = range =
@@ -28,6 +21,7 @@ let
# https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269 # https://github.com/NixOS/nixpkgs/blob/0258808f5744ca980b9a1f24fe0b1e6f0fecee9c/lib/strings.nix#L269
stringAsChars = f: s: concatStrings (map f (stringToCharacters s)); stringAsChars = f: s: concatStrings (map f (stringToCharacters s));
concatMapStrings = f: list: concatStrings (map f list);
concatStrings = builtins.concatStringsSep ""; concatStrings = builtins.concatStringsSep "";
# If the environment variable NPINS_OVERRIDE_${name} is set, then use # If the environment variable NPINS_OVERRIDE_${name} is set, then use
@@ -54,87 +48,41 @@ let
mkSource = mkSource =
name: spec: name: spec:
{
pkgs ? null,
}:
assert spec ? type; assert spec ? type;
let let
# Unify across builtin and pkgs fetchers.
# `fetchGit` requires a wrapper because of slight API differences.
fetchers =
if pkgs == null then
{
inherit (builtins) fetchTarball fetchurl;
# For some fucking reason, fetchGit has a different signature than the other builtin fetchers …
fetchGit = args: (builtins.fetchGit args).outPath;
}
else
{
fetchTarball =
{
url,
sha256,
}:
pkgs.fetchzip {
inherit url sha256;
extension = "tar";
};
inherit (pkgs) fetchurl;
fetchGit =
{
url,
submodules,
rev,
name,
narHash,
}:
pkgs.fetchgit {
inherit url rev name;
fetchSubmodules = submodules;
hash = narHash;
};
};
# Dispatch to the correct code path based on the type
path = path =
if spec.type == "Git" then if spec.type == "Git" then
mkGitSource fetchers spec mkGitSource spec
else if spec.type == "GitRelease" then else if spec.type == "GitRelease" then
mkGitSource fetchers spec mkGitSource spec
else if spec.type == "PyPi" then else if spec.type == "PyPi" then
mkPyPiSource fetchers spec mkPyPiSource spec
else if spec.type == "Channel" then else if spec.type == "Channel" then
mkChannelSource fetchers spec mkChannelSource spec
else if spec.type == "Tarball" then else if spec.type == "Tarball" then
mkTarballSource fetchers spec mkTarballSource spec
else if spec.type == "Container" then
mkContainerSource pkgs spec
else else
builtins.throw "Unknown source type ${spec.type}"; builtins.throw "Unknown source type ${spec.type}";
in in
spec // { outPath = mayOverride name path; }; spec // { outPath = mayOverride name path; };
mkGitSource = mkGitSource =
{
fetchTarball,
fetchGit,
...
}:
{ {
repository, repository,
revision, revision,
url ? null, url ? null,
submodules, submodules,
hash, hash,
branch ? null,
... ...
}: }:
assert repository ? type; assert repository ? type;
# At the moment, either it is a plain git repository (which has an url), or it is a GitHub/GitLab repository # At the moment, either it is a plain git repository (which has an url), or it is a GitHub/GitLab repository
# In the latter case, there we will always be an url to the tarball # In the latter case, there we will always be an url to the tarball
if url != null && !submodules then if url != null && !submodules then
fetchTarball { builtins.fetchTarball {
inherit url; inherit url;
sha256 = hash; sha256 = hash; # FIXME: check nix version & use SRI hashes
} }
else else
let let
@@ -145,8 +93,6 @@ let
"https://github.com/${repository.owner}/${repository.repo}.git" "https://github.com/${repository.owner}/${repository.repo}.git"
else if repository.type == "GitLab" then else if repository.type == "GitLab" then
"${repository.server}/${repository.repo_path}.git" "${repository.server}/${repository.repo_path}.git"
else if repository.type == "Forgejo" then
"${repository.server}/${repository.owner}/${repository.repo}.git"
else else
throw "Unrecognized repository type ${repository.type}"; throw "Unrecognized repository type ${repository.type}";
urlToName = urlToName =
@@ -161,89 +107,40 @@ let
"${if matched == null then "source" else builtins.head matched}${appendShort}"; "${if matched == null then "source" else builtins.head matched}${appendShort}";
name = urlToName url revision; name = urlToName url revision;
in in
fetchGit { builtins.fetchGit {
rev = revision; rev = revision;
narHash = hash; inherit name;
# hash = hash;
inherit name submodules url; inherit url submodules;
}; };
mkPyPiSource = mkPyPiSource =
{ fetchurl, ... }: { url, hash, ... }:
{ builtins.fetchurl {
url,
hash,
...
}:
fetchurl {
inherit url; inherit url;
sha256 = hash; sha256 = hash;
}; };
mkChannelSource = mkChannelSource =
{ fetchTarball, ... }: { url, hash, ... }:
{ builtins.fetchTarball {
url,
hash,
...
}:
fetchTarball {
inherit url; inherit url;
sha256 = hash; sha256 = hash;
}; };
mkTarballSource = mkTarballSource =
{ fetchTarball, ... }:
{ {
url, url,
locked_url ? url, locked_url ? url,
hash, hash,
... ...
}: }:
fetchTarball { builtins.fetchTarball {
url = locked_url; url = locked_url;
sha256 = hash; sha256 = hash;
}; };
mkContainerSource =
pkgs:
{
image_name,
image_tag,
image_digest,
...
}:
if pkgs == null then
builtins.throw "container sources require passing in a Nixpkgs value: https://github.com/andir/npins/blob/master/README.md#using-the-nixpkgs-fetchers"
else
pkgs.dockerTools.pullImage {
imageName = image_name;
imageDigest = image_digest;
finalImageTag = image_tag;
};
in in
mkFunctor ( if version == 5 then
{ builtins.mapAttrs mkSource data.pins
input ? ./sources.json, else
}: throw "Unsupported format version ${toString version} in sources.json. Try running `npins upgrade`"
let
data =
if builtins.isPath input then
# while `readFile` will throw an error anyways if the path doesn't exist,
# we still need to check beforehand because *our* error can be caught but not the one from the builtin
# *piegames sighs*
if builtins.pathExists input then
builtins.fromJSON (builtins.readFile input)
else
throw "Input path ${toString input} does not exist"
else if builtins.isAttrs input then
input
else
throw "Unsupported input type ${builtins.typeOf input}, must be a path or an attrset";
version = data.version;
in
if version == 7 then
builtins.mapAttrs (name: spec: mkFunctor (mkSource name spec)) data.pins
else
throw "Unsupported format version ${toString version} in sources.json. Try running `npins upgrade`"
)
+3 -16
View File
@@ -1,24 +1,11 @@
{ {
"pins": { "pins": {
"git-hooks": {
"type": "Git",
"repository": {
"type": "GitHub",
"owner": "cachix",
"repo": "git-hooks.nix"
},
"branch": "master",
"submodules": false,
"revision": "f0927703b7b1c8d97511c4116eb9b4ec6645a0fa",
"url": "https://github.com/cachix/git-hooks.nix/archive/f0927703b7b1c8d97511c4116eb9b4ec6645a0fa.tar.gz",
"hash": "sha256-6MkqajPICgugsuZ92OMoQcgSHnD6sJHwk8AxvMcIgTE="
},
"nixpkgs": { "nixpkgs": {
"type": "Channel", "type": "Channel",
"name": "nixpkgs-unstable", "name": "nixpkgs-unstable",
"url": "https://releases.nixos.org/nixpkgs/nixpkgs-26.05pre927565.13868c071cc7/nixexprs.tar.xz", "url": "https://releases.nixos.org/nixpkgs/nixpkgs-26.05pre903996.59b6c96beacc/nixexprs.tar.xz",
"hash": "sha256-wufp5c0nWh/87f9eK7xy1eZXms5zd4yl6S4SR+LfA08=" "hash": "0b0yr9d1xyfwgpaj68bimsbjjbj7yis4whjvkrfdycfnasdf0gf0"
} }
}, },
"version": 7 "version": 5
} }
+6 -6
View File
@@ -3,7 +3,7 @@
# Simple script for uploading a base64 encoded image into our database. For # Simple script for uploading a base64 encoded image into our database. For
# grafana business image panels. # grafana business image panels.
if [[ $# -ne 2 ]] if [ $# -ne 2 ]
then then
echo "Usage: $0 <image-name> <file>.png" echo "Usage: $0 <image-name> <file>.png"
exit 1 exit 1
@@ -12,9 +12,9 @@ fi
filename=$1 filename=$1
file=$2 file=$2
if [[ ! -e "${file}" ]] if [ ! -e $file ]
then then
echo "file ${file} does not exist" echo "file $file does not exist"
exit 1 exit 1
fi fi
@@ -22,9 +22,9 @@ function create_image() {
local filename=$1 local filename=$1
local data=$2 local data=$2
cat << EOF cat << EOF
INSERT INTO images VALUES('${filename}', '${data}'); INSERT INTO images VALUES('$filename', '$data');
EOF EOF
} }
data=$(base64 -w0 < "${file}") data=$(cat $file | base64 -w0)
create_image "${filename}" "${data}" create_image $filename $data
+1
View File
@@ -1,3 +1,4 @@
// -*- mode: jsonc -*-
{ {
"$schema": "https://docs.renovatebot.com/renovate-schema.json", "$schema": "https://docs.renovatebot.com/renovate-schema.json",
"extends": [ "extends": [
+182 -203
View File
@@ -1,204 +1,183 @@
groups: groups:
- name: etcd - name: etcd
rules: rules:
- alert: etcdMembersDown - alert: etcdMembersDown
annotations: annotations:
description: description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value
'etcd cluster "{{ $labels.job }}": members are down ({{ $value }}).'
}}).' summary: etcd cluster members are down.
summary: etcd cluster members are down. expr: |-
expr: |- max without (endpoint) (
max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0)
sum without (instance) (up{job=~".*etcd.*"} == bool 0) or
or count without (To) (
count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01
sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )
) )
) > 0
> 0 for: 10m
for: 10m labels:
labels: severity: critical
severity: critical - alert: etcdInsufficientMembers
- alert: etcdInsufficientMembers annotations:
annotations: description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value
description: }}).'
'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value summary: etcd cluster has insufficient number of members.
}}).' expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"})
summary: etcd cluster has insufficient number of members. without (instance) + 1) / 2)
expr: for: 3m
sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) labels:
without (instance) + 1) / 2) severity: critical
for: 3m - alert: etcdNoLeader
labels: annotations:
severity: critical description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }}
- alert: etcdNoLeader has no leader.'
annotations: summary: etcd cluster has no leader.
description: expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} for: 1m
has no leader.' labels:
summary: etcd cluster has no leader. severity: critical
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 - alert: etcdHighNumberOfLeaderChanges
for: 1m annotations:
labels: description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes
severity: critical within the last 15 minutes. Frequent elections may be a sign of insufficient
- alert: etcdHighNumberOfLeaderChanges resources, high network latency, or disruptions by other components and should
annotations: be investigated.'
description: summary: etcd cluster has high number of leader changes.
'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"})
within the last 15 minutes. Frequent elections may be a sign of insufficient or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m])
resources, high network latency, or disruptions by other components and should >= 4
be investigated.' for: 5m
summary: etcd cluster has high number of leader changes. labels:
expr: severity: warning
increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) - alert: etcdHighNumberOfFailedGRPCRequests
or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) annotations:
>= 4 description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
for: 5m {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
labels: summary: etcd cluster has high number of failed grpc requests.
severity: warning expr: |-
- alert: etcdHighNumberOfFailedGRPCRequests 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
annotations: /
description: sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for > 1
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' for: 10m
summary: etcd cluster has high number of failed grpc requests. labels:
expr: |- severity: warning
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) - alert: etcdHighNumberOfFailedGRPCRequests
/ annotations:
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for
> 1 {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.'
for: 10m summary: etcd cluster has high number of failed grpc requests.
labels: expr: |-
severity: warning 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code)
- alert: etcdHighNumberOfFailedGRPCRequests /
annotations: sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code)
description: > 5
'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for for: 5m
{{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' labels:
summary: etcd cluster has high number of failed grpc requests. severity: critical
expr: |- - alert: etcdGRPCRequestsSlow
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) annotations:
/ description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method
> 5 }} method.'
for: 5m summary: etcd grpc requests are slow
labels: expr: |-
severity: critical histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type))
- alert: etcdGRPCRequestsSlow > 0.15
annotations: for: 10m
description: labels:
'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests severity: critical
is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method - alert: etcdMemberCommunicationSlow
}} method.' annotations:
summary: etcd grpc requests are slow description: 'etcd cluster "{{ $labels.job }}": member communication with {{
expr: |- $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) }}.'
> 0.15 summary: etcd cluster member communication is slow.
for: 10m expr: |-
labels: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m]))
severity: critical > 0.15
- alert: etcdMemberCommunicationSlow for: 10m
annotations: labels:
description: severity: warning
'etcd cluster "{{ $labels.job }}": member communication with {{ - alert: etcdHighNumberOfFailedProposals
$labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance annotations:
}}.' description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures
summary: etcd cluster member communication is slow. within the last 30 minutes on etcd instance {{ $labels.instance }}.'
expr: |- summary: etcd cluster has high number of proposal failures.
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
> 0.15 for: 15m
for: 10m labels:
labels: severity: warning
severity: warning - alert: etcdHighFsyncDurations
- alert: etcdHighNumberOfFailedProposals annotations:
annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
description: are {{ $value }}s on etcd instance {{ $labels.instance }}.'
'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures summary: etcd cluster 99th percentile fsync durations are too high.
within the last 30 minutes on etcd instance {{ $labels.instance }}.' expr: |-
summary: etcd cluster has high number of proposal failures. histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 > 0.5
for: 15m for: 10m
labels: labels:
severity: warning severity: warning
- alert: etcdHighFsyncDurations - alert: etcdHighFsyncDurations
annotations: annotations:
description: description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations are {{ $value }}s on etcd instance {{ $labels.instance }}.'
are {{ $value }}s on etcd instance {{ $labels.instance }}.' summary: etcd cluster 99th percentile fsync durations are too high.
summary: etcd cluster 99th percentile fsync durations are too high. expr: |-
expr: |- histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) > 1
> 0.5 for: 10m
for: 10m labels:
labels: severity: critical
severity: warning - alert: etcdHighCommitDurations
- alert: etcdHighFsyncDurations annotations:
annotations: description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations
description: {{ $value }}s on etcd instance {{ $labels.instance }}.'
'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations summary: etcd cluster 99th percentile commit durations are too high.
are {{ $value }}s on etcd instance {{ $labels.instance }}.' expr: |-
summary: etcd cluster 99th percentile fsync durations are too high. histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m]))
expr: |- > 0.25
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) for: 10m
> 1 labels:
for: 10m severity: warning
labels: - alert: etcdDatabaseQuotaLowSpace
severity: critical annotations:
- alert: etcdHighCommitDurations description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined
annotations: quota on etcd instance {{ $labels.instance }}, please defrag or increase the
description: quota as the writes to etcd will be disabled when it is full.'
'etcd cluster "{{ $labels.job }}": 99th percentile commit durations summary: etcd cluster database is running full.
{{ $value }}s on etcd instance {{ $labels.instance }}.' expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) /
summary: etcd cluster 99th percentile commit durations are too high. last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 >
expr: |- 95
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) for: 10m
> 0.25 labels:
for: 10m severity: critical
labels: - alert: etcdExcessiveDatabaseGrowth
severity: warning annotations:
- alert: etcdDatabaseQuotaLowSpace description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk
annotations: space in the next four hours, based on write observations within the past
description: four hours on etcd instance {{ $labels.instance }}, please check as it might
'etcd cluster "{{ $labels.job }}": database size exceeds the defined be disruptive.'
quota on etcd instance {{ $labels.instance }}, please defrag or increase the summary: etcd cluster database growing very fast.
quota as the writes to etcd will be disabled when it is full.' expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60)
summary: etcd cluster database is running full. > etcd_server_quota_backend_bytes{job=~".*etcd.*"}
expr: for: 10m
(last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / labels:
last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > severity: warning
95 - alert: etcdDatabaseHighFragmentationRatio
for: 10m annotations:
labels: description: 'etcd cluster "{{ $labels.job }}": database size in use on instance
severity: critical {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
- alert: etcdExcessiveDatabaseGrowth allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
annotations: retrieve the unused fragmented disk space.'
description: runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
'etcd cluster "{{ $labels.job }}": Predicting running out of disk summary: etcd database size in use is less than 50% of the actual allocated
space in the next four hours, based on write observations within the past storage.
four hours on etcd instance {{ $labels.instance }}, please check as it might expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
be disruptive.' / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
summary: etcd cluster database growing very fast. and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
expr: for: 10m
predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) labels:
> etcd_server_quota_backend_bytes{job=~".*etcd.*"} severity: warning
for: 10m
labels:
severity: warning
- alert: etcdDatabaseHighFragmentationRatio
annotations:
description:
'etcd cluster "{{ $labels.job }}": database size in use on instance
{{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual
allocated disk space, please run defragmentation (e.g. etcdctl defrag) to
retrieve the unused fragmented disk space.'
runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation
summary:
etcd database size in use is less than 50% of the actual allocated
storage.
expr:
(last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m])
/ last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5
and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600
for: 10m
labels:
severity: warning
+42 -46
View File
@@ -1,47 +1,43 @@
groups: groups:
- name: general.rules - name: general.rules
rules: rules:
- alert: TargetDown - alert: TargetDown
annotations: annotations:
description: description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service
'{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
}} targets in {{ $labels.namespace }} namespace are down.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown summary: One or more targets are unreachable.
summary: One or more targets are unreachable. expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up)
expr: BY (cluster, job, namespace, service)) > 10
100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) for: 10m
BY (cluster, job, namespace, service)) > 10 labels:
for: 10m severity: warning
labels: - alert: Watchdog
severity: warning annotations:
- alert: Watchdog description: |
annotations: This is an alert meant to ensure that the entire alerting pipeline is functional.
description: | This alert is always firing, therefore it should always be firing in Alertmanager
This is an alert meant to ensure that the entire alerting pipeline is functional. and always fire against a receiver. There are integrations with various notification
This alert is always firing, therefore it should always be firing in Alertmanager mechanisms that send a notification when this alert is not firing. For example the
and always fire against a receiver. There are integrations with various notification "DeadMansSnitch" integration in PagerDuty.
mechanisms that send a notification when this alert is not firing. For example the runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog
"DeadMansSnitch" integration in PagerDuty. summary: An alert that should always be firing to certify that Alertmanager
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog is working properly.
summary: expr: vector(1)
An alert that should always be firing to certify that Alertmanager labels:
is working properly. severity: none
expr: vector(1) - alert: InfoInhibitor
labels: annotations:
severity: none description: |
- alert: InfoInhibitor This is an alert that is used to inhibit info alerts.
annotations: By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with
description: | other alerts.
This is an alert that is used to inhibit info alerts. This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a
By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with severity of 'warning' or 'critical' starts firing on the same namespace.
other alerts. This alert should be routed to a null receiver and configured to inhibit alerts with severity="info".
This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor
severity of 'warning' or 'critical' starts firing on the same namespace. summary: Info-level alert inhibition.
This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
summary: Info-level alert inhibition. labels:
expr: severity: none
ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname !=
"InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1
labels:
severity: none
+258 -277
View File
@@ -1,281 +1,262 @@
groups: groups:
- name: kubernetes-apps - name: kubernetes-apps
rules: rules:
- alert: KubePodCrashLooping - alert: KubePodCrashLooping
annotations: annotations:
description: description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container
'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is in waiting state (reason: "CrashLoopBackOff").'
}}) is in waiting state (reason: "CrashLoopBackOff").' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping summary: Pod is crash looping.
summary: Pod is crash looping. expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff",
expr: job="kube-state-metrics", namespace=~".*"}[5m]) >= 1
max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", for: 15m
job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 labels:
for: 15m severity: warning
labels: - alert: KubePodNotReady
severity: warning annotations:
- alert: KubePodNotReady description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready
annotations: state for longer than 15 minutes.
description: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready summary: Pod has been in a non-ready state for more than 15 minutes.
state for longer than 15 minutes. expr: |-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready sum by (namespace, pod, cluster) (
summary: Pod has been in a non-ready state for more than 15 minutes. max by (namespace, pod, cluster) (
expr: |- kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"}
sum by (namespace, pod, cluster) ( ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) (
max by (namespace, pod, cluster) ( 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"})
kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} )
) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( ) > 0
1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) for: 15m
) labels:
) > 0 severity: warning
for: 15m - alert: KubeDeploymentGenerationMismatch
labels: annotations:
severity: warning description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment
- alert: KubeDeploymentGenerationMismatch }} does not match, this indicates that the Deployment has failed but has not
annotations: been rolled back.
description: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch
Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment summary: Deployment generation mismatch due to possible roll-back
}} does not match, this indicates that the Deployment has failed but has not expr: |-
been rolled back. kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch !=
summary: Deployment generation mismatch due to possible roll-back kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"}
expr: |- for: 15m
kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} labels:
severity: warning
- alert: KubeDeploymentReplicasMismatch
annotations:
description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch
summary: Deployment has not matched the expected number of replicas.
expr: |-
(
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
max by (namespace, statefulset) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!= !=
kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
for: 15m )
labels: ) and (
severity: warning changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
- alert: KubeDeploymentReplicasMismatch ==
annotations: 0
description: )
Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has for: 15m
not matched the expected number of replicas for longer than 15 minutes. labels:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch severity: warning
summary: Deployment has not matched the expected number of replicas. - alert: KubeDaemonSetRolloutStuck
expr: |- annotations:
( description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} finished or progressed for at least 15 minutes.
> runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} summary: DaemonSet rollout is stuck.
) and ( expr: |-
changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) (
== (
0 kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) !=
for: 15m
labels:
severity: warning
- alert: KubeDeploymentRolloutStuck
annotations:
description:
Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment
}} is not progressing for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck
summary: Deployment rollout is not progressing.
expr: |-
kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"}
!= 0
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetReplicasMismatch
annotations:
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has
not matched the expected number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch
summary: StatefulSet has not matched the expected number of replicas.
expr: |-
(
kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"}
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetGenerationMismatch
annotations:
description:
StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset
}} does not match, this indicates that the StatefulSet has failed but has
not been rolled back.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch
summary: StatefulSet generation mismatch due to possible roll-back
expr: |-
kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
- alert: KubeStatefulSetUpdateNotRolledOut
annotations:
description:
StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update
has not been rolled out.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout
summary: StatefulSet update has not been rolled out.
expr: |-
(
max by (namespace, statefulset) (
kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"}
unless
kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"}
)
*
(
kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeDaemonSetRolloutStuck
annotations:
description:
DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not
finished or progressed for at least 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck
summary: DaemonSet rollout is stuck.
expr: |-
(
(
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
!=
0
) or (
kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
) or (
kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
!=
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
)
) and (
changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
==
0
)
for: 15m
labels:
severity: warning
- alert: KubeContainerWaiting
annotations:
description:
pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
{{ $labels.container}} has been in waiting state for longer than 1 hour.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
summary: Pod container waiting longer than 1 hour
expr:
sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
namespace=~".*"}) > 0
for: 1h
labels:
severity: warning
- alert: KubeDaemonSetNotScheduled
annotations:
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are not scheduled."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
summary: DaemonSet pods are not scheduled.
expr: |-
kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
- ) or (
kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
for: 10m
labels:
severity: warning
- alert: KubeDaemonSetMisScheduled
annotations:
description:
"{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
}} are running where they are not supposed to run."
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
summary: DaemonSet pods are misscheduled.
expr:
kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
> 0 !=
for: 15m 0
labels: ) or (
severity: warning kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}
- alert: KubeJobNotCompleted !=
annotations: kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
description: ) or (
Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"}
than {{ "43200" | humanizeDuration }} to complete. !=
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
summary: Job did not complete in time )
expr: |- ) and (
time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m])
and ==
kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 0
labels: )
severity: warning for: 15m
- alert: KubeJobFailed labels:
annotations: severity: warning
description: - alert: KubeContainerWaiting
Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. annotations:
Removing failed job after investigation should clear this alert. description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed {{ $labels.container}} has been in waiting state for longer than 1 hour.
summary: Job failed to complete. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 summary: Pod container waiting longer than 1 hour
for: 15m expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics",
labels: namespace=~".*"}) > 0
severity: warning for: 1h
- alert: KubeHpaReplicasMismatch labels:
annotations: severity: warning
description: - alert: KubeDaemonSetNotScheduled
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} annotations:
has not matched the desired number of replicas for longer than 15 minutes. description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch }} are not scheduled.'
summary: HPA has not matched desired number of replicas. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled
expr: |- summary: DaemonSet pods are not scheduled.
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} expr: |-
!= kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"}
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) -
and kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} for: 10m
> labels:
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) severity: warning
and - alert: KubeDaemonSetMisScheduled
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} annotations:
< description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) }} are running where they are not supposed to run.'
and runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 summary: DaemonSet pods are misscheduled.
for: 15m expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"}
labels: > 0
severity: warning for: 15m
- alert: KubeHpaMaxedOut labels:
annotations: severity: warning
description: - alert: KubeJobNotCompleted
HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} annotations:
has been running at max replicas for longer than 15 minutes. description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout than {{ "43200" | humanizeDuration }} to complete.
summary: HPA is running at max replicas runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted
expr: |- summary: Job did not complete in time
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} expr: |-
== time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"}
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} and
for: 15m kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200
labels: labels:
severity: warning severity: warning
- alert: KubeJobFailed
annotations:
description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete.
Removing failed job after investigation should clear this alert.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed
summary: Job failed to complete.
expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0
for: 15m
labels:
severity: warning
- alert: KubeHpaReplicasMismatch
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has not matched the desired number of replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch
summary: HPA has not matched desired number of replicas.
expr: |-
(kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"}
!=
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
>
kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"})
and
(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
<
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"})
and
changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0
for: 15m
labels:
severity: warning
- alert: KubeHpaMaxedOut
annotations:
description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }}
has been running at max replicas for longer than 15 minutes.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout
summary: HPA is running at max replicas
expr: |-
kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}
==
kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}
for: 15m
labels:
severity: warning
+114 -122
View File
@@ -1,123 +1,115 @@
groups: groups:
- name: kubernetes-resources - name: kubernetes-resources
rules: rules:
- alert: KubeCPUOvercommit - alert: KubeCPUOvercommit
annotations: annotations:
description: description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests for Pods by {{ $value }} CPU shares and cannot tolerate node failure.
for Pods by {{ $value }} CPU shares and cannot tolerate node failure. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit summary: Cluster has overcommitted CPU resource requests.
summary: Cluster has overcommitted CPU resource requests. expr: |-
expr: |- sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 and
and (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0
(sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 for: 10m
for: 10m labels:
labels: severity: warning
severity: warning - alert: KubeMemoryOvercommit
- alert: KubeMemoryOvercommit annotations:
annotations: description: Cluster {{ $labels.cluster }} has overcommitted memory resource
description: requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node
Cluster {{ $labels.cluster }} has overcommitted memory resource failure.
requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit
failure. summary: Cluster has overcommitted memory resource requests.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit expr: |-
summary: Cluster has overcommitted memory resource requests. sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
expr: |- and
sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0
and for: 10m
(sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 labels:
for: 10m severity: warning
labels: - alert: KubeCPUQuotaOvercommit
severity: warning annotations:
- alert: KubeCPUQuotaOvercommit description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests
annotations: for Namespaces.
description: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit
Cluster {{ $labels.cluster }} has overcommitted CPU resource requests summary: Cluster has overcommitted CPU resource requests.
for Namespaces. expr: |-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster)
summary: Cluster has overcommitted CPU resource requests. /
expr: |- sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster)
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) > 1.5
/ for: 5m
sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) labels:
> 1.5 severity: warning
for: 5m - alert: KubeMemoryQuotaOvercommit
labels: annotations:
severity: warning description: Cluster {{ $labels.cluster }} has overcommitted memory resource
- alert: KubeMemoryQuotaOvercommit requests for Namespaces.
annotations: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit
description: summary: Cluster has overcommitted memory resource requests.
Cluster {{ $labels.cluster }} has overcommitted memory resource expr: |-
requests for Namespaces. sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster)
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit /
summary: Cluster has overcommitted memory resource requests. sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)
expr: |- > 1.5
sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) for: 5m
/ labels:
sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) severity: warning
> 1.5 - alert: KubeQuotaAlmostFull
for: 5m annotations:
labels: description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
severity: warning }} of its {{ $labels.resource }} quota.
- alert: KubeQuotaAlmostFull runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull
annotations: summary: Namespace quota is going to be full.
description: expr: |-
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage kube_resourcequota{job="kube-state-metrics", type="used"}
}} of its {{ $labels.resource }} quota. / ignoring(instance, job, type)
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
summary: Namespace quota is going to be full. > 0.9 < 1
expr: |- for: 15m
kube_resourcequota{job="kube-state-metrics", type="used"} labels:
/ ignoring(instance, job, type) severity: info
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) - alert: KubeQuotaFullyUsed
> 0.9 < 1 annotations:
for: 15m description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
labels: }} of its {{ $labels.resource }} quota.
severity: info runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused
- alert: KubeQuotaFullyUsed summary: Namespace quota is fully used.
annotations: expr: |-
description: kube_resourcequota{job="kube-state-metrics", type="used"}
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage / ignoring(instance, job, type)
}} of its {{ $labels.resource }} quota. (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused == 1
summary: Namespace quota is fully used. for: 15m
expr: |- labels:
kube_resourcequota{job="kube-state-metrics", type="used"} severity: info
/ ignoring(instance, job, type) - alert: KubeQuotaExceeded
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) annotations:
== 1 description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage
for: 15m }} of its {{ $labels.resource }} quota.
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded
severity: info summary: Namespace quota has exceeded the limits.
- alert: KubeQuotaExceeded expr: |-
annotations: kube_resourcequota{job="kube-state-metrics", type="used"}
description: / ignoring(instance, job, type)
Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0)
}} of its {{ $labels.resource }} quota. > 1
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded for: 15m
summary: Namespace quota has exceeded the limits. labels:
expr: |- severity: warning
kube_resourcequota{job="kube-state-metrics", type="used"} - alert: CPUThrottlingHigh
/ ignoring(instance, job, type) annotations:
(kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) description: '{{ $value | humanizePercentage }} throttling of CPU in namespace
> 1 {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod
for: 15m }}.'
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh
severity: warning summary: Processes experience elevated CPU throttling.
- alert: CPUThrottlingHigh expr: |-
annotations: sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
description: /
"{{ $value | humanizePercentage }} throttling of CPU in namespace sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
{{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod > ( 25 / 100 )
}}." for: 15m
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh labels:
summary: Processes experience elevated CPU throttling. severity: info
expr: |-
sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace)
/
sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace)
> ( 25 / 100 )
for: 15m
labels:
severity: info
+108 -113
View File
@@ -1,114 +1,109 @@
groups: groups:
- name: kubernetes-storage - name: kubernetes-storage
rules: rules:
- alert: KubePersistentVolumeFillingUp - alert: KubePersistentVolumeFillingUp
annotations: annotations:
description: description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free.
{{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup summary: PersistentVolume is filling up.
summary: PersistentVolume is filling up. expr: |-
expr: |- (
( kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} /
/ kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} ) < 0.03
) < 0.03 and
and kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 unless on (cluster, namespace, persistentvolumeclaim)
unless on (cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 unless on (cluster, namespace, persistentvolumeclaim)
unless on (cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 for: 1m
for: 1m labels:
labels: severity: critical
severity: critical - alert: KubePersistentVolumeFillingUp
- alert: KubePersistentVolumeFillingUp annotations:
annotations: description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
description: }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster | humanizePercentage }} is available.
{{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup
| humanizePercentage }} is available. summary: PersistentVolume is filling up.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup expr: |-
summary: PersistentVolume is filling up. (
expr: |- kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
( /
kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
/ ) < 0.15
kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} and
) < 0.15 kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
and and
kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
and unless on (cluster, namespace, persistentvolumeclaim)
predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
unless on (cluster, namespace, persistentvolumeclaim) unless on (cluster, namespace, persistentvolumeclaim)
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
unless on (cluster, namespace, persistentvolumeclaim) for: 1h
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 labels:
for: 1h severity: warning
labels: - alert: KubePersistentVolumeInodesFillingUp
severity: warning annotations:
- alert: KubePersistentVolumeInodesFillingUp description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim
annotations: }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
description: {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes.
The PersistentVolume claimed by {{ $labels.persistentvolumeclaim runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster summary: PersistentVolumeInodes are filling up.
{{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. expr: |-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup (
summary: PersistentVolumeInodes are filling up. kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
expr: |- /
( kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} ) < 0.03
/ and
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
) < 0.03 unless on (cluster, namespace, persistentvolumeclaim)
and kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 unless on (cluster, namespace, persistentvolumeclaim)
unless on (cluster, namespace, persistentvolumeclaim) kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 for: 1m
unless on (cluster, namespace, persistentvolumeclaim) labels:
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 severity: critical
for: 1m - alert: KubePersistentVolumeInodesFillingUp
labels: annotations:
severity: critical description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim
- alert: KubePersistentVolumeInodesFillingUp }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster
annotations: {{ . }} {{- end }} is expected to run out of inodes within four days. Currently
description: {{ $value | humanizePercentage }} of its inodes are free.
Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup
}} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster summary: PersistentVolumeInodes are filling up.
{{ . }} {{- end }} is expected to run out of inodes within four days. Currently expr: |-
{{ $value | humanizePercentage }} of its inodes are free. (
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}
summary: PersistentVolumeInodes are filling up. /
expr: |- kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"}
( ) < 0.15
kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} and
/ kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0
kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} and
) < 0.15 predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0
and unless on (cluster, namespace, persistentvolumeclaim)
kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1
and unless on (cluster, namespace, persistentvolumeclaim)
predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1
unless on (cluster, namespace, persistentvolumeclaim) for: 1h
kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 labels:
unless on (cluster, namespace, persistentvolumeclaim) severity: warning
kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 - alert: KubePersistentVolumeErrors
for: 1h annotations:
labels: description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster
severity: warning -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}.
- alert: KubePersistentVolumeErrors runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors
annotations: summary: PersistentVolume is having issues with provisioning.
description: expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster > 0
-}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. for: 5m
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors labels:
summary: PersistentVolume is having issues with provisioning. severity: critical
expr:
kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"}
> 0
for: 5m
labels:
severity: critical
+339 -366
View File
@@ -1,367 +1,340 @@
groups: groups:
- name: node-exporter - name: node-exporter
rules: rules:
- alert: NodeFilesystemSpaceFillingUp - alert: NodeFilesystemSpaceFillingUp
annotations: annotations:
description: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available space left and is filling up.
space left and is filling up. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup summary: Filesystem is predicted to run out of space within the next 24 hours.
summary: Filesystem is predicted to run out of space within the next 24 hours. expr: |-
expr: |- (
( node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 and
and predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 and
and node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 )
) for: 1h
for: 1h labels:
labels: severity: warning
severity: warning - alert: NodeFilesystemSpaceFillingUp
- alert: NodeFilesystemSpaceFillingUp annotations:
annotations: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
description: }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint space left and is filling up fast.
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup
space left and is filling up fast. summary: Filesystem is predicted to run out of space within the next 4 hours.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup expr: |-
summary: Filesystem is predicted to run out of space within the next 4 hours. (
expr: |- node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10
( and
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
and and
predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
and )
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 for: 1h
) labels:
for: 1h severity: critical
labels: - alert: NodeFilesystemAlmostOutOfSpace
severity: critical annotations:
- alert: NodeFilesystemAlmostOutOfSpace description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
annotations: }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
description: space left.
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available summary: Filesystem has less than 5% space left.
space left. expr: |-
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace (
summary: Filesystem has less than 5% space left. node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
expr: |- and
( node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 )
and for: 30m
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 labels:
) severity: warning
for: 30m - alert: NodeFilesystemAlmostOutOfSpace
labels: annotations:
severity: warning description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
- alert: NodeFilesystemAlmostOutOfSpace }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
annotations: space left.
description: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint summary: Filesystem has less than 3% space left.
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available expr: |-
space left. (
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
summary: Filesystem has less than 3% space left. and
expr: |- node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
( )
node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 for: 30m
and labels:
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 severity: critical
) - alert: NodeFilesystemFilesFillingUp
for: 30m annotations:
labels: description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
severity: critical }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
- alert: NodeFilesystemFilesFillingUp inodes left and is filling up.
annotations: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
description: summary: Filesystem is predicted to run out of inodes within the next 24 hours.
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint expr: |-
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available (
inodes left and is filling up. node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup and
summary: Filesystem is predicted to run out of inodes within the next 24 hours. predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0
expr: |- and
( node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 )
and for: 1h
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 labels:
and severity: warning
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 - alert: NodeFilesystemFilesFillingUp
) annotations:
for: 1h description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
labels: }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
severity: warning inodes left and is filling up fast.
- alert: NodeFilesystemFilesFillingUp runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup
annotations: summary: Filesystem is predicted to run out of inodes within the next 4 hours.
description: expr: |-
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint (
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20
inodes left and is filling up fast. and
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0
summary: Filesystem is predicted to run out of inodes within the next 4 hours. and
expr: |- node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
( )
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 for: 1h
and labels:
predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 severity: critical
and - alert: NodeFilesystemAlmostOutOfFiles
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 annotations:
) description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
for: 1h }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
labels: inodes left.
severity: critical runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
- alert: NodeFilesystemAlmostOutOfFiles summary: Filesystem has less than 5% inodes left.
annotations: expr: |-
description: (
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available and
inodes left. node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles )
summary: Filesystem has less than 5% inodes left. for: 1h
expr: |- labels:
( severity: warning
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 - alert: NodeFilesystemAlmostOutOfFiles
and annotations:
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint
) }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available
for: 1h inodes left.
labels: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles
severity: warning summary: Filesystem has less than 3% inodes left.
- alert: NodeFilesystemAlmostOutOfFiles expr: |-
annotations: (
description: node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3
Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint and
}}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0
inodes left. )
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles for: 1h
summary: Filesystem has less than 3% inodes left. labels:
expr: |- severity: critical
( - alert: NodeNetworkReceiveErrs
node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 annotations:
and description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 {{ printf "%.0f" $value }} receive errors in the last two minutes.'
) runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs
for: 1h summary: Network interface is reporting many receive errors.
labels: expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m])
severity: critical > 0.01
- alert: NodeNetworkReceiveErrs for: 1h
annotations: labels:
description: severity: warning
'{{ $labels.instance }} interface {{ $labels.device }} has encountered - alert: NodeNetworkTransmitErrs
{{ printf "%.0f" $value }} receive errors in the last two minutes.' annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered
summary: Network interface is reporting many receive errors. {{ printf "%.0f" $value }} transmit errors in the last two minutes.'
expr: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs
rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) summary: Network interface is reporting many transmit errors.
> 0.01 expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m])
for: 1h > 0.01
labels: for: 1h
severity: warning labels:
- alert: NodeNetworkTransmitErrs severity: warning
annotations: - alert: NodeHighNumberConntrackEntriesUsed
description: annotations:
'{{ $labels.instance }} interface {{ $labels.device }} has encountered description: '{{ $value | humanizePercentage }} of conntrack entries are used.'
{{ printf "%.0f" $value }} transmit errors in the last two minutes.' runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs summary: Number of conntrack are getting close to the limit.
summary: Network interface is reporting many transmit errors. expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit)
expr: > 0.75
rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) labels:
> 0.01 severity: warning
for: 1h - alert: NodeTextFileCollectorScrapeError
labels: annotations:
severity: warning description: Node Exporter text file collector on {{ $labels.instance }} failed
- alert: NodeHighNumberConntrackEntriesUsed to scrape.
annotations: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror
description: "{{ $value | humanizePercentage }} of conntrack entries are used." summary: Node Exporter text file collector failed to scrape.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused expr: node_textfile_scrape_error{job="node-exporter"} == 1
summary: Number of conntrack are getting close to the limit. labels:
expr: severity: warning
(node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) - alert: NodeClockSkewDetected
> 0.75 annotations:
labels: description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s.
severity: warning Ensure NTP is configured correctly on this host.
- alert: NodeTextFileCollectorScrapeError runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected
annotations: summary: Clock skew detected.
description: expr: |-
Node Exporter text file collector on {{ $labels.instance }} failed (
to scrape. node_timex_offset_seconds{job="node-exporter"} > 0.05
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror and
summary: Node Exporter text file collector failed to scrape. deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0
expr: node_textfile_scrape_error{job="node-exporter"} == 1 )
labels: or
severity: warning (
- alert: NodeClockSkewDetected node_timex_offset_seconds{job="node-exporter"} < -0.05
annotations: and
description: deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0
Clock at {{ $labels.instance }} is out of sync by more than 0.05s. )
Ensure NTP is configured correctly on this host. for: 10m
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected labels:
summary: Clock skew detected. severity: warning
expr: |- - alert: NodeClockNotSynchronising
( annotations:
node_timex_offset_seconds{job="node-exporter"} > 0.05 description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP
and is configured on this host.
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising
) summary: Clock not synchronising.
or expr: |-
( min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0
node_timex_offset_seconds{job="node-exporter"} < -0.05 and
and node_timex_maxerror_seconds{job="node-exporter"} >= 16
deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 for: 10m
) labels:
for: 10m severity: warning
labels: - alert: NodeRAIDDegraded
severity: warning annotations:
- alert: NodeClockNotSynchronising description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is
annotations: in degraded state due to one or more disks failures. Number of spare drives
description: is insufficient to fix issue automatically.
Clock at {{ $labels.instance }} is not synchronising. Ensure NTP runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded
is configured on this host. summary: RAID Array is degraded.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
summary: Clock not synchronising. - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"})
expr: |- > 0
min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 for: 15m
and labels:
node_timex_maxerror_seconds{job="node-exporter"} >= 16 severity: critical
for: 10m - alert: NodeRAIDDiskFailure
labels: annotations:
severity: warning description: At least one device in RAID array at {{ $labels.instance }} failed.
- alert: NodeRAIDDegraded Array '{{ $labels.device }}' needs attention and possibly a disk swap.
annotations: runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure
description: summary: Failed device in RAID array.
RAID array '{{ $labels.device }}' at {{ $labels.instance }} is expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}
in degraded state due to one or more disks failures. Number of spare drives > 0
is insufficient to fix issue automatically. labels:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded severity: warning
summary: RAID Array is degraded. - alert: NodeFileDescriptorLimit
expr: annotations:
node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} description: File descriptors limit at {{ $labels.instance }} is currently at
- ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) {{ printf "%.2f" $value }}%.
> 0 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
for: 15m summary: Kernel is predicted to exhaust file descriptors limit soon.
labels: expr: |-
severity: critical (
- alert: NodeRAIDDiskFailure node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70
annotations: )
description: for: 15m
At least one device in RAID array at {{ $labels.instance }} failed. labels:
Array '{{ $labels.device }}' needs attention and possibly a disk swap. severity: warning
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure - alert: NodeFileDescriptorLimit
summary: Failed device in RAID array. annotations:
expr: description: File descriptors limit at {{ $labels.instance }} is currently at
node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} {{ printf "%.2f" $value }}%.
> 0 runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit
labels: summary: Kernel is predicted to exhaust file descriptors limit soon.
severity: warning expr: |-
- alert: NodeFileDescriptorLimit (
annotations: node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90
description: )
File descriptors limit at {{ $labels.instance }} is currently at for: 15m
{{ printf "%.2f" $value }}%. labels:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit severity: critical
summary: Kernel is predicted to exhaust file descriptors limit soon. - alert: NodeCPUHighUsage
expr: |- annotations:
( description: |
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
) runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage
for: 15m summary: High CPU usage.
labels: expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter",
severity: warning mode!="idle"}[2m]))) * 100 > 90
- alert: NodeFileDescriptorLimit for: 15m
annotations: labels:
description: severity: info
File descriptors limit at {{ $labels.instance }} is currently at - alert: NodeSystemSaturation
{{ printf "%.2f" $value }}%. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit description: |
summary: Kernel is predicted to exhaust file descriptors limit soon. System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
expr: |- This might indicate this instance resources saturation and can cause it becoming unresponsive.
( runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation
node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 summary: System saturated, load per core is very high.
) expr: |-
for: 15m node_load1{job="node-exporter"}
labels: / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2
severity: critical for: 15m
- alert: NodeCPUHighUsage labels:
annotations: severity: warning
description: | - alert: NodeMemoryMajorPagesFaults
CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. annotations:
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage description: |
summary: High CPU usage. Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
expr: Please check that there is enough memory available at this instance.
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults
mode!="idle"}[2m]))) * 100 > 90 summary: Memory major page faults are occurring at very high rate.
for: 15m expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500
labels: for: 15m
severity: info labels:
- alert: NodeSystemSaturation severity: warning
annotations: - alert: NodeMemoryHighUtilization
description: | annotations:
System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. description: |
This might indicate this instance resources saturation and can cause it becoming unresponsive. Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization
summary: System saturated, load per core is very high. summary: Host is running out of memory.
expr: |- expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"}
node_load1{job="node-exporter"} * 100) > 90
/ count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 for: 15m
for: 15m labels:
labels: severity: warning
severity: warning - alert: NodeDiskIOSaturation
- alert: NodeMemoryMajorPagesFaults annotations:
annotations: description: |
description: | Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}.
Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. This symptom might indicate disk saturation.
Please check that there is enough memory available at this instance. runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults summary: Disk IO queue is high.
summary: Memory major page faults are occurring at very high rate. expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 > 10
for: 15m for: 30m
labels: labels:
severity: warning severity: warning
- alert: NodeMemoryHighUtilization - alert: NodeSystemdServiceFailed
annotations: annotations:
description: | description: Systemd service {{ $labels.name }} has entered failed state at
Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. {{ $labels.instance }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Host is running out of memory. summary: Systemd service has entered failed state.
expr: expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} for: 5m
* 100) > 90 labels:
for: 15m severity: warning
labels: - alert: NodeBondingDegraded
severity: warning annotations:
- alert: NodeDiskIOSaturation description: Bonding interface {{ $labels.master }} on {{ $labels.instance }}
annotations: is in degraded state due to one or more slave failures.
description: | runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. summary: Bonding interface is degraded
This symptom might indicate disk saturation. expr: (node_bonding_slaves - node_bonding_active) != 0
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation for: 5m
summary: Disk IO queue is high. labels:
expr: severity: warning
rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m])
> 10
for: 30m
labels:
severity: warning
- alert: NodeSystemdServiceFailed
annotations:
description:
Systemd service {{ $labels.name }} has entered failed state at
{{ $labels.instance }}
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed
summary: Systemd service has entered failed state.
expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1
for: 5m
labels:
severity: warning
- alert: NodeBondingDegraded
annotations:
description:
Bonding interface {{ $labels.master }} on {{ $labels.instance }}
is in degraded state due to one or more slave failures.
runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded
summary: Bonding interface is degraded
expr: (node_bonding_slaves - node_bonding_active) != 0
for: 5m
labels:
severity: warning
+69 -75
View File
@@ -1,76 +1,70 @@
groups: groups:
- name: node-resource-utilization.rules - name: node-resource-utilization.rules
rules: rules:
- alert: HostHighCpuLoad - alert: HostHighCpuLoad
annotations: annotations:
description: |- description: |-
CPU load is > 90% CPU load is > 90%
VALUE = {{ $value }} VALUE = {{ $value }}
LABELS = {{ $labels }} LABELS = {{ $labels }}
summary: Host high CPU load (instance {{ $labels.instance }}) summary: Host high CPU load (instance {{ $labels.instance }})
expr: expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
(sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
> 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} for: 10m
for: 10m labels:
labels: severity: critical
severity: critical - alert: MemoryUtilizationHighWarning
- alert: MemoryUtilizationHighWarning annotations:
annotations: dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
dashboard: $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ description: Node {{ $labels.instance }} has less than 10% available memory.
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D summary: Node Memory utilization warning
description: Node {{ $labels.instance }} has less than 10% available memory. expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
summary: Node Memory utilization warning for: 5m
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 labels:
for: 5m severity: critical
labels: - alert: MemoryUtilizationHighCritical
severity: critical annotations:
- alert: MemoryUtilizationHighCritical dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
annotations: $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
dashboard: description: Node {{ $labels.instance }} has less than 5% available memory.
https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ summary: Node Memory utilization critical
$labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
description: Node {{ $labels.instance }} has less than 5% available memory. for: 1m
summary: Node Memory utilization critical labels:
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5 severity: critical
for: 1m - alert: NodeNotReady
labels: annotations:
severity: critical description: Node {{ $labels.node }} has CPU utilization over 90%.
- alert: NodeNotReady summary: Node has been in not-ready state for longer than 3 minutes
annotations: expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
description: Node {{ $labels.node }} has CPU utilization over 90%. <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
summary: Node has been in not-ready state for longer than 3 minutes > 0
expr: for: 5m
(sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) labels:
<= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"})) severity: critical
> 0 - alert: KubernetesNodeMemoryPressure
for: 5m annotations:
labels: description: |-
severity: critical Node {{ $labels.node }} has MemoryPressure condition
- alert: KubernetesNodeMemoryPressure VALUE = {{ $value }}
annotations: LABELS = {{ $labels }}
description: |- summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
Node {{ $labels.node }} has MemoryPressure condition expr: kube_node_status_condition{condition="MemoryPressure",status="true"} ==
VALUE = {{ $value }} 1
LABELS = {{ $labels }} for: 2m
summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) labels:
expr: severity: critical
kube_node_status_condition{condition="MemoryPressure",status="true"} == - alert: KubernetesContainerOomKiller
1 annotations:
for: 2m description: |-
labels: Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
severity: critical VALUE = {{ $value }}
- alert: KubernetesContainerOomKiller LABELS = {{ $labels }}
annotations: summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
description: |- expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes. offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
VALUE = {{ $value }} == 1
LABELS = {{ $labels }} for: 0m
summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) labels:
expr: severity: warning
(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
== 1
for: 0m
labels:
severity: warning
+20 -24
View File
@@ -1,25 +1,21 @@
groups: groups:
- name: velero - name: velero
rules: rules:
- alert: VeleroBackupPartialFailures - alert: VeleroBackupPartialFailures
annotations: annotations:
message: message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy
Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy failed backups.
failed backups. expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
expr: > 0.25
velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} for: 15m
> 0.25 labels:
for: 15m severity: critical
labels: - alert: VeleroBackupFailures
severity: critical annotations:
- alert: VeleroBackupFailures message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed
annotations: backups.
message: expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""}
Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed > 0.25
backups. for: 15m
expr: labels:
velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} severity: critical
> 0.25
for: 15m
labels:
severity: critical
+45 -51
View File
@@ -1,52 +1,46 @@
groups: groups:
- name: x509-certificate-exporter.rules - name: x509-certificate-exporter.rules
rules: rules:
- alert: X509ExporterReadErrors - alert: X509ExporterReadErrors
annotations: annotations:
description: description: Over the last 15 minutes, this x509-certificate-exporter instance
Over the last 15 minutes, this x509-certificate-exporter instance has experienced errors reading certificate files or querying the Kubernetes
has experienced errors reading certificate files or querying the Kubernetes API. This could be caused by a misconfiguration if triggered when the exporter
API. This could be caused by a misconfiguration if triggered when the exporter starts.
starts. summary: Increasing read errors for x509-certificate-exporter
summary: Increasing read errors for x509-certificate-exporter expr: delta(x509_read_errors[15m]) > 0
expr: delta(x509_read_errors[15m]) > 0 for: 5m
for: 5m labels:
labels: severity: warning
severity: warning - alert: CertificateError
- alert: CertificateError annotations:
annotations: description: Certificate could not be decoded {{if $labels.secret_name }} in
description: Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at
Certificate could not be decoded {{if $labels.secret_name }} in location "{{ $labels.filepath }}"{{end}}
Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at summary: Certificate cannot be decoded
location "{{ $labels.filepath }}"{{end}} expr: x509_cert_error > 0
summary: Certificate cannot be decoded for: 15m
expr: x509_cert_error > 0 labels:
for: 15m severity: warning
labels: - alert: CertificateRenewal
severity: warning annotations:
- alert: CertificateRenewal description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if
annotations: $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
description: $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
Certificate for "{{ $labels.subject_CN }}" should be renewed {{if summary: Certificate should be renewed
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28
summary: Certificate should be renewed for: 15m
expr: labels:
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", severity: warning
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28 - alert: CertificateExpiration
for: 15m annotations:
labels: description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if
severity: warning $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{
- alert: CertificateExpiration $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}}
annotations: summary: Certificate is about to expire
description: expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
Certificate for "{{ $labels.subject_CN }}" is about to expire {{if issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
$labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ for: 15m
$labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} labels:
summary: Certificate is about to expire severity: critical
expr:
((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="",
issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14
for: 15m
labels:
severity: critical
+21 -48
View File
@@ -6,62 +6,35 @@ let
config = { }; config = { };
overlays = [ ]; overlays = [ ];
}; };
checks = import ./nix/checks.nix;
in in
pkgs.mkShellNoCC { pkgs.mkShellNoCC {
name = "clstr"; name = "clstr";
packages = packages = with pkgs; [
with pkgs; just
[ npins
# dev tools
just
npins
# helm # helm
helmfile helmfile
kubernetes-helm kubernetes-helm
# kubectl tools # kubectl tools
kubectl-cnpg kubectl-cnpg
kubectl-neat kubectl-neat
kubelogin kubelogin
kubelogin-oidc kubelogin-oidc
kubectl-rook-ceph kubectl-rook-ceph
kubectl-graph
kubectl-klock
graphviz
# other tools activate when needed # other tools
# step-cli step-cli
# linkerd linkerd
# cmctl velero
# rclone cmctl
# velero
# renovate
# dapr # dapr
dapr-cli dapr-cli
]
++ checks.enabledPackages;
# Environment variables
ARGOCD_ENV_CLUSTER_NAME = "hel1";
HELM_GIT_ACCESS_TOKEN = "glpat-xxx";
shellHook = builtins.concatStringsSep "\n" [
checks.shellHook
]; ];
# Alternative shells ARGOCD_ENV_CLUSTER_NAME = "rossby";
passthru = pkgs.lib.mapAttrs (name: value: pkgs.mkShellNoCC (value // { inherit name; })) { HELM_GIT_ACCESS_TOKEN = "glpat-xxx";
ci-shell = {
packages = [
pkgs.npins
];
shellHook = ''
export NPINS_DIRECTORY="nix"
'';
};
};
} }
-3
View File
@@ -88,8 +88,6 @@ spec:
server: https://kubernetes.default.svc server: https://kubernetes.default.svc
- namespace: uptime - namespace: uptime
server: https://kubernetes.default.svc server: https://kubernetes.default.svc
- namespace: forgejo
server: https://kubernetes.default.svc
sourceRepos: sourceRepos:
- https://argoproj.github.io/argo-helm - https://argoproj.github.io/argo-helm
- https://kubernetes-sigs.github.io/metrics-server/ - https://kubernetes-sigs.github.io/metrics-server/
@@ -125,7 +123,6 @@ spec:
- ghcr.io/slinkyproject/charts/slurm-operator-crds - ghcr.io/slinkyproject/charts/slurm-operator-crds
- ghcr.io/spegel-org/helm-charts - ghcr.io/spegel-org/helm-charts
- ghcr.io/dragonflydb/dragonfly-operator/helm/dragonfly-operator - ghcr.io/dragonflydb/dragonfly-operator/helm/dragonfly-operator
- code.forgejo.org/forgejo-helm
- https://operator.mariadb.com/mariadb-enterprise-operator - https://operator.mariadb.com/mariadb-enterprise-operator
- https://operator.mariadb.com - https://operator.mariadb.com
- https://ot-container-kit.github.io/helm-charts - https://ot-container-kit.github.io/helm-charts
@@ -73,7 +73,7 @@
"connString": "Username=postgres;Password=secret;Host=localhost;Port=5432;Database=app;Pooling=true;", "connString": "Username=postgres;Password=secret;Host=localhost;Port=5432;Database=app;Pooling=true;",
"sorcerer" : "https://sorcerer.data.oceanbox.io", "sorcerer" : "https://sorcerer.data.oceanbox.io",
"allowedOrigins": [ "allowedOrigins": [
"https://maps.oceanbox.io" "https://maps.oceanbox.io",
], ],
"appName": "atlantis", "appName": "atlantis",
"appEnv": "prod", "appEnv": "prod",
@@ -79,9 +79,3 @@ resources:
requests: requests:
cpu: 500m cpu: 500m
memory: 1Gi memory: 1Gi
diagrid-dashboard:
enabled: false
statestore:
scope: prod-atlantis
redis: prod-atlantis-redis
@@ -1,6 +1,6 @@
replicaCount: 1 replicaCount: 1
image: image:
tag: 503ccbb2-debug tag: faa0a853-debug
podAnnotations: podAnnotations:
dapr.io/app-id: "staging-atlantis" dapr.io/app-id: "staging-atlantis"
env: env:
@@ -26,12 +26,12 @@ env:
- name: DB_USER - name: DB_USER
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: staging-atlantis-db-app name: staging-atlantis-db-superuser
key: username key: username
- name: DB_PASSWORD - name: DB_PASSWORD
valueFrom: valueFrom:
secretKeyRef: secretKeyRef:
name: staging-atlantis-db-app name: staging-atlantis-db-superuser
key: password key: password
- name: DAPR_API_TOKEN - name: DAPR_API_TOKEN
valueFrom: valueFrom:
@@ -116,6 +116,9 @@ cluster:
db: prod-atlantis-db db: prod-atlantis-db
namespace: prod-atlantis namespace: prod-atlantis
resources: resources:
limits:
cpu: 250m
memory: 1Gi
requests: requests:
cpu: 250m cpu: 250m
memory: 1Gi memory: 1Gi
@@ -130,8 +133,3 @@ redis:
resources: resources:
cpu: 150m cpu: 150m
memory: 256Mi memory: 256Mi
diagrid-dashboard:
enabled: false
statestore:
scope: staging-atlantis
redis: staging-atlantis-redis
@@ -10,4 +10,3 @@ podAnnotations:
dapr.io/sidecar-memory-request: "50Mi" dapr.io/sidecar-memory-request: "50Mi"
# dapr.io/sidecar-cpu-limit: "100m" # dapr.io/sidecar-cpu-limit: "100m"
# dapr.io/sidecar-memory-limit: "1000Mi" # dapr.io/sidecar-memory-limit: "1000Mi"
-4
View File
@@ -1,8 +1,4 @@
codex: codex:
enabled: false enabled: false
{{- if eq .Environment.Name "prod" }}
autosync: false autosync: false
{{- else }}
autosync: true
{{- end }}
env: {{ .Environment.Name }} env: {{ .Environment.Name }}
@@ -1,67 +0,0 @@
{
"Logging": {
"LogLevel": {
"Default": "Information",
"Microsoft": "Warning",
"Microsoft.Hosting": "Error"
}
},
"Debug": {
"LogLevel": {
"Default": "Debug"
}
},
"Console": {
"IncludeScopes": true,
"LogLevel": {
"Default": "Debug"
}
},
"OIDC": {
"issuer": "https://auth.oceanbox.io/realms/oceanbox",
"authorization_endpoint": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/auth",
"token_endpoint": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/token",
"jwks_uri": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/certs",
"userinfo_endpoint": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/userinfo",
"end_session_endpoint": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/logout",
"device_authorization_endpoint": "https://auth.oceanbox.io/realms/oceanbox/protocol/openid-connect/auth/device",
"clientId": "atlantis",
"clientSecret": "",
"scopes": [
"openid",
"email",
"offline_access",
"profile"
],
"audiences": [
"atlantis"
]
},
"SSO": {
"cookieDomain": ".oceanbox.io",
"cookieName": ".obx.prod",
"ttl": 12.0,
"signedOutRedirectUri": "https://maps.oceanbox.io/",
"realm": "atlantis",
"environment": "prod",
"keyStore": {
"kind": "azure",
"uri": "https://atlantis.blob.core.windows.net",
"key": "dataprotection-keys"
},
"keyVault": {
"kind": "azure",
"uri": "https://atlantisvault.vault.azure.net",
"key": "dataencryption-keys"
}
},
"plainAuthUsers": [
{
"username": "admin",
"password": "en-to-tre-fire",
"groups": [ "/oceanbox" ],
"roles": [ "admin" ]
}
]
}
@@ -1,66 +0,0 @@
- op: add
path: /spec/template/spec/containers/0/envFrom
value:
- secretRef:
name: azure-keyvault
- op: add
path: /spec/template/spec/containers/0/env
value:
- name: APP_NAMESPACE
value: prod-atlantis
- name: DOTNET_ENVIRONMENT
value: Production
- name: ASPNETCORE_ENVIRONMENT
value: Production
- name: DB_HOST
valueFrom:
secretKeyRef:
name: prod-atlantis-db-app
key: host
- name: DB_PORT
valueFrom:
secretKeyRef:
name: prod-atlantis-db-app
key: port
- name: DB_DATABASE
valueFrom:
secretKeyRef:
name: prod-atlantis-db-app
key: dbname
- name: DB_USER
valueFrom:
secretKeyRef:
name: prod-atlantis-db-app
key: user
- name: DB_PASSWORD
valueFrom:
secretKeyRef:
name: prod-atlantis-db-app
key: password
- name: FGA_URL
value: http://prod-openfga.openfga.svc.cluster.local:8080
- name: FGA_DB_HOST
valueFrom:
secretKeyRef:
name: prod-openfga-db-app
key: host
- name: FGA_DB_PORT
valueFrom:
secretKeyRef:
name: prod-openfga-db-app
key: port
- name: FGA_DB_DATABASE
valueFrom:
secretKeyRef:
name: prod-openfga-db-app
key: dbname
- name: FGA_DB_USER
valueFrom:
secretKeyRef:
name: prod-openfga-db-app
key: user
- name: FGA_DB_PASSWORD
valueFrom:
secretKeyRef:
name: prod-openfga-db-app
key: password
@@ -1,15 +0,0 @@
generatorOptions:
disableNameSuffixHash: true
configMapGenerator:
- name: prod-codex-appsettings
files:
- appsettings.json
patches:
- target:
group: apps
version: v1
kind: Deployment
path: deployment_patch.yaml
resources:
- ../base
@@ -62,3 +62,4 @@
secretKeyRef: secretKeyRef:
name: staging-openfga-db-app name: staging-openfga-db-app
key: password key: password
name: azure-keyvault
@@ -1,14 +0,0 @@
{{- if .Values.clusterConfig.cilium.enabled }}
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
name: codex-allow-external-services
namespace: {{ .Release.Namespace }}
spec:
egress:
- toFQDNs:
- matchName: cacerts.digicert.com
endpointSelector:
matchLabels: {}
{{- end }}
-27
View File
@@ -1,27 +0,0 @@
replicaCount: 1
ingress:
enabled: true
className: "nginx"
annotations:
cert-manager.io/cluster-issuer: letsencrypt-production
nginx.ingress.kubernetes.io/backend-protocol: HTTP
nginx.ingress.kubernetes.io/ssl-redirect: "true"
oceanbox.io/expose: internal
hosts:
- host: codex.adm.oceanbox.io
paths:
- path: /
pathType: ImplementationSpecific
tls:
- hosts:
- codex.adm.oceanbox.io
secretName: prod-codex-tls
volumes:
- name: appsettings
configMap:
name: prod-codex-appsettings
volumeMounts:
- name: appsettings
mountPath: "/app/appsettings.json"
readOnly: true
subPath: appsettings.json
+2 -2
View File
@@ -1,4 +1,6 @@
replicaCount: 1 replicaCount: 1
image:
tag: 70878e14-debug
ingress: ingress:
enabled: true enabled: true
className: "nginx" className: "nginx"
@@ -30,5 +32,3 @@ volumeMounts:
mountPath: "/app/appsettings.Development.json" mountPath: "/app/appsettings.Development.json"
readOnly: true readOnly: true
subPath: appsettings.json subPath: appsettings.json
image:
tag: 2e1165d9-debug
+2 -7
View File
@@ -34,8 +34,6 @@ spec:
name: http name: http
protocol: TCP protocol: TCP
env: env:
- name: BASE_URL
value: https://fornix.hel1.oceanbox.io
- name: DRUPAL_DATABASE_HOST - name: DRUPAL_DATABASE_HOST
value: drupal-db-rw value: drupal-db-rw
- name: DRUPAL_DATABASE_PREFIX - name: DRUPAL_DATABASE_PREFIX
@@ -65,15 +63,12 @@ spec:
- mountPath: /opt/drupal/web/sites - mountPath: /opt/drupal/web/sites
name: drupal name: drupal
subPath: sites subPath: sites
- mountPath: /opt/drupal/patches
name: drupal
subPath: modules/patches
- mountPath: /opt/drupal/composer.json - mountPath: /opt/drupal/composer.json
name: drupal name: drupal
subPath: modules/composer.json subPath: modules/composer.json
- mountPath: /opt/drupal/composer.lock - mountPath: /opt/drupal/patches
name: drupal name: drupal
subPath: modules/composer.lock subPath: modules/patches
volumes: volumes:
- name: drupal - name: drupal
persistentVolumeClaim: persistentVolumeClaim:
+1 -2
View File
@@ -2,14 +2,13 @@ clusterConfig:
manifests: https://gitlab.com/oceanbox/manifests.git manifests: https://gitlab.com/oceanbox/manifests.git
env: "prod" env: "prod"
distro: "talos" distro: "talos"
domain: "adm.hel1.obx" domain: "hel1.oceanbox.io"
initca: "" initca: ""
apiserver: "" apiserver: ""
apiserverip: "" apiserverip: ""
etcd_nodes: ["10.0.1.2, 10.0.1.4, 10.0.1.5"] etcd_nodes: ["10.0.1.2, 10.0.1.4, 10.0.1.5"]
k8s_nodes: [""] k8s_nodes: [""]
cluster: "hel1" cluster: "hel1"
ingress_clusterissuer: "ca-issuer"
ingress_nodes: ["controlplane-1, controlplane-2, controlplane-3"] ingress_nodes: ["controlplane-1, controlplane-2, controlplane-3"]
ingress_replica_count: 3 ingress_replica_count: 3
ingress_loadbalancer: true ingress_loadbalancer: true

Some files were not shown because too many files have changed in this diff Show More