wip: Match chart to k8s state

This commit is contained in:
2025-06-05 13:52:36 +02:00
parent 1bb720840d
commit 9249f0eb18
82 changed files with 28211 additions and 329 deletions
File diff suppressed because it is too large Load Diff
File diff suppressed because it is too large Load Diff
@@ -0,0 +1,12 @@
# CiliumNetworkPolicy "allow-slack": egress allow-list for the prometheus
# namespace that names Slack's hostnames.
# NOTE(review): toFQDNs rules depend on Cilium's DNS proxy observing the
# lookups - confirm another policy supplies the DNS (rules.dns) egress rule.
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  namespace: prometheus
  name: allow-slack
spec:
  # Empty label set; presumably selects every endpoint in this namespace.
  endpointSelector:
    matchLabels: {}
  egress:
    - toFQDNs:
        # Webhook host, matched by exact name.
        - matchName: hooks.slack.com
        # NOTE(review): the pattern has no wildcard, so it looks equivalent
        # to an exact match on "slack.com" - confirm that is intended.
        - matchPattern: slack.com
@@ -1,4 +1,4 @@
{{- if and (.Values.kyverno.enabled) (.Values.prometheus.enabled) }}
{{- if .Values.prometheus.enabled }}
apiVersion: kyverno.io/v1
kind: Policy
metadata:
+10 -3
View File
@@ -10,13 +10,18 @@ spec:
server: 'https://kubernetes.default.svc'
sources:
- repoURL: {{ .Values.clusterConfig.manifests }}
targetRevision: HEAD
path: helmfiles/prometheus
# targetRevision: HEAD
targetRevision: mrtz/helmify
path: helmfile.d
plugin:
name: helmfile
name: helmfile-cmp
env:
- name: CLUSTER_NAME
value: {{ .Values.clusterConfig.cluster }}
- name: HELMFILE_ENVIRONMENT
value: default
- name: HELMFILE_FILE_PATH
value: prometheus.yaml.gotmpl
project: sys
syncPolicy:
managedNamespaceMetadata:
@@ -24,6 +29,8 @@ spec:
component: sys
syncOptions:
- ServerSideApply=true
- CreateNamespace=true
- ApplyOutOfSyncOnly=true
{{- if .Values.prometheus.autosync }}
automated:
prune: true
+18 -14
View File
@@ -1,15 +1,19 @@
cilium:
enabled: true
nodePort:
enable: true
l2announcement:
enable: true
loadbalancerPool:
# Per-cluster overrides for the prometheus stack values template.
prometheus:
# Webhook URL used by the "snitch" Alertmanager receiver (Watchdog heartbeat).
# NOTE(review): this URL embeds a check token and is committed in plain text -
# consider sourcing it from a secret store.
snitchUrl: "https://nosnch.in/136c1b564f"
# Routing key rendered into the "pagerduty" receiver.
# NOTE(review): this is a credential committed in plain text - rotate it and
# load it from a secret store instead of version control.
pagerdutyRoutingKey: a5cff1fc46414d0bc02851e4af159ee7
certRenewCronEnabled: false
# Rendered as the chart's fullnameOverride.
fullname: prom
# Prometheus feature flags, copied verbatim into prometheusSpec.enableFeatures.
# Flag names must match Prometheus' own spelling ("receiver"); a misspelled
# flag does not enable the feature.
# NOTE(review): prometheusSpec already sets enableRemoteWriteReceiver: true, so
# remote-write-receiver may be redundant here - confirm before removing.
enableFeatures:
  - otlp-write-receiver
  - remote-write-receiver
grafana:
persistence: true
thanos:
enabled: true
cidr:
- 10.255.241.11/32
- 10.255.241.12/32
- 10.255.241.13/32
- 10.255.241.14/32
- 10.255.241.15/32
coredns:
targetPort: 9153
scheduler:
targetPort: 10259
kubelet:
enabled: true
https: true
+4
View File
@@ -13,6 +13,10 @@ prometheus:
additionalScrapeConfigs: []
additionalDataSources: []
enableFeatures: []
# Alertmanager routing overrides. When left empty, the values template falls
# back to grouping by ["alertname"] and to the "pagerduty" receiver.
# The "oncall" receiver is only rendered when prometheus.oncallUrl is set.
# Example:
#   alert_receiver: oncall
#   alert_group_by: ["cluster", "namespace", "alertname"]
alert_group_by: []
alert_receiver: ""
storage:
size: 50Gi
grafana:
+540 -101
View File
@@ -1,106 +1,545 @@
authentication:
mutual:
spire:
enabled: {{ .Values.cilium.spire.enabled }}
cgroup:
autoMount:
enabled: false
hostRoot: /sys/fs/cgroup
dashboards:
enabled: true
namespace: prometheus
enableXTSocketFallback: false
encryption:
enabled: {{ .Values.cilium.encryption.enabled }}
type: {{ .Values.cilium.encryption.type}}
envoy:
enabled: {{ .Values.cilium.envoy.enabled }}
prometheus:
serviceMonitor:
enabled: {{ .Values.cilium.envoy.enabled }}
extraConfig:
enable-envoy-config: "true"
hubble:
enabled: true
tls:
auto:
method: cronJob
metrics:
crds:
enabled: false
fullnameOverride: {{ .Values.prometheus.fullname | default "prometheus-kube-prometheus" }}
{{- with .Values.prometheus.defaultRules }}
defaultRules:
{{- . | toYaml | nindent 10 }}
{{- end}}
## Configuration for alertmanager
## ref: https://prometheus.io/docs/alerting/alertmanager/
##
alertmanager:
config:
route:
{{- if .Values.prometheus.alert_group_by }}
group_by:
{{- range .Values.prometheus.alert_group_by }}
- {{ . | quote }}
{{- end }}
{{- else }}
group_by: ["alertname"]
{{- end }}
group_wait: 60s
group_interval: 15m
repeat_interval: 24h
receiver: {{ .Values.prometheus.alert_receiver | default "pagerduty" }}
# Child routes of the root route. Each one is rendered only when the receiver
# it names is rendered too, so the config never references an undefined
# receiver.
routes:
  {{- if .Values.prometheus.snitchUrl }}
  # Heartbeat: forward the always-firing Watchdog alert to the snitch webhook.
  - match:
      alertname: Watchdog
    group_wait: 0s
    group_interval: 1m
    repeat_interval: 50s
    receiver: snitch
  {{- end }}
  {{- if .Values.prometheus.oncallUrl }}
  # Catch-all for the on-call webhook. `match` compares for equality, so the
  # regular expression has to go through `match_re`.
  # NOTE(review): alerts matched here are handled by this route and no longer
  # reach the root receiver - confirm that is the intended fan-out.
  - match_re:
      alertname: ".*"
    receiver: oncall
    continue: true
  {{- end }}
receivers:
# Default receiver: PagerDuty Events API v2.
- name: pagerduty
  pagerduty_configs:
    # Quoted so a key made only of digits (or digits plus "e") is still
    # parsed as a string; falls back to the placeholder "key" when unset.
    - routing_key: {{ .Values.prometheus.pagerdutyRoutingKey | default "key" | quote }}
      url: "https://events.pagerduty.com/v2/enqueue"
      # Passed through verbatim as an Alertmanager template: lower-cased
      # severity label when present, otherwise "critical".
      severity: {{`'{{ if .CommonLabels.severity }}{{ .CommonLabels.severity | toLower }}{{ else }}critical{{ end }}'`}}
{{- if .Values.prometheus.snitchUrl}}
- name: snitch
webhook_configs:
- url: "{{ .Values.prometheus.snitchUrl }}"
send_resolved: false
{{- end }}
- name: teams
webhook_configs:
- url: "https://prometheus-msteams.{{ .Values.clusterConfig.domain }}/{{ .Values.clusterConfig.cluster }}"
http_config:
tls_config:
insecure_skip_verify: true
{{- if .Values.prometheus.oncallUrl}}
- name: oncall
webhook_configs:
- url: "{{ .Values.prometheus.oncallUrl }}"
send_resolved: true
{{- end }}
storage: {}
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.clusterConfig.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
{{- with .Values.clusterConfig.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- alertmanager.{{ .Values.clusterConfig.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: alertmanager-general-tls
hosts:
- alertmanager.{{ .Values.clusterConfig.domain }}
ingressPerReplica:
pathType: ImplementationSpecific
alertmanagerSpec:
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/hostname
# operator: In
# values:
# - {{ .Values.clusterConfig.cluster }}-0.itpartner.intern
tolerations: []
# - key: unschedulable
# operator: Exists
# effect: NoSchedule
grafana:
defaultDashboardsEnabled: {{ .Values.prometheus.grafana.defaultDashboardsEnabled }}
{{- if .Values.prometheus.grafana.plugins }}
plugins:
{{- range .Values.prometheus.grafana.plugins }}
- {{ . }}
{{- end }}
{{- end }}
grafana.ini:
server:
root_url: "https://grafana.{{.Values.clusterConfig.domain}}:443"
security:
allow_embedding: "true"
auth:
disable_login_form: "{{ .Values.prometheus.grafana.disable_login_form }}"
users:
auto_assign_org_role: "Admin"
{{- range .Values.clusterConfig.oidc }}
{{- if eq .provider "azuread" }}
auth.{{ .provider }}:
enabled: true
name: {{ .name }}
client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
scopes: openid email profile
auth_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/authorize
token_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/token
allowed_groups: {{ .group_id }}
allow_sign_up: true
role_attribute_strict: false
allow_assign_grafana_admin: true
{{- else if eq .provider "github" }}
auth.{{ .provider }}:
name: {{ .name }}
enabled: true
client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
allowed_organizations: {{ .allowed_organizations }}
{{- if .allowed_teams }}
allowed_teams: "{{ .allowed_teams }}"
{{- end }}
scopes: user:email,read:org
auth_url: https://github.com/login/oauth/authorize
token_url: https://github.com/login/oauth/access_token
allow_sign_up: true
role_attribute_strict: false
allow_assign_grafana_admin: true
{{- end }}
{{- end }}
extraSecretMounts:
{{- range .Values.clusterConfig.oidc }}
- name: {{ .name }}
secretName: {{ .secret_ref.name }}
defaultMode: 0440
mountPath: /etc/secrets/oauth/{{ .name }}
readOnly: true
{{- end }}
{{- if .Values.prometheus.grafana.persistence }}
persistence:
enabled: true
size: 10Gi
{{- end }}
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.clusterConfig.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
{{- with .Values.clusterConfig.ingress_whitelist_ips}}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- grafana.{{ .Values.clusterConfig.domain }}
path: /
tls:
- secretName: grafana-general-tls
hosts:
- grafana.{{ .Values.clusterConfig.domain }}
sidecar:
dashboards:
enabled: true
namespace: prometheus
enabled:
- dns:query;ignoreAAAA
- drop
- tcp
- flow
- icmp
- policy:sourceContext=app|workload-name|pod|reserved-identity;destinationContext=app|workload-name|pod|dns|reserved-identity;labelsContext=source_namespace,destination_namespace
- httpV2:exemplars=false;labelsContext=source_ip,source_namespace,source_workload,destination_ip,destination_namespace,destination_workload,traffic_direction
port: 12304
serviceMonitor:
label: grafana_dashboard
folderAnnotation: grafana_folder
annotations: {}
multicluster:
global:
enabled: true
etcd:
enabled: false
provider:
allowUiUpdates: false
foldersFromFilesStructure: true
{{- if .Values.prometheus.thanos.datasource.enabled }}
datasources:
enabled: true
redact:
enabled: true
relay:
enabled: true
prometheus:
enabled: true
serviceMonitor:
enabled: true
ui:
enabled: {{ .Values.cilium.hubble.ui }}
ipam:
mode: kubernetes
kubeProxyReplacement: {{ .Values.cilium.kubeProxyReplacement }}
l2announcements:
enabled: {{ .Values.cilium.l2announcement.enabled }}
k8sServiceHost: {{ .Values.cilium.k8sServiceHost }}
k8sServicePort: {{ .Values.cilium.k8sServicePort }}
nodePort:
enabled: {{ .Values.cilium.nodePort.enabled }}
gatewayAPI:
enabled: {{ .Values.cilium.gatewayAPI.enabled }}
ingressController:
enabled: {{ .Values.cilium.ingressController.enabled }}
default: {{ .Values.cilium.ingressController.defaultClass }}
loadbalancerMode: {{ .Values.cilium.ingressController.loadbalancerMode }}
operator:
dashboards:
enabled: true
namespace: prometheus
prometheus:
enabled: true
port: 12301
serviceMointor:
enabled: true
port: 12302
rollOutPods: true
policyAuditMode: {{ .Values.cilium.policyAuditMode }}
prometheus:
enabled: true
port: 12300
defaultDatasourceEnabled: true
url: http://thanos-query-frontend.thanos.svc:9090/
# defaultDatasourceScrapeInterval: 15s
annotations: {}
## Create datasource for each Pod of Prometheus StatefulSet;
## this uses headless service `prometheus-operated` which is
## created by Prometheus Operator
## ref: https://git.io/fjaBS
createPrometheusReplicasDatasources: false
label: grafana_datasource
{{ end }}
{{- /*
  Emit the key whenever at least one entry below is rendered. The Tempo entry
  is gated on tempo.enabled alone, so it must be part of this condition or its
  list item is written without a parent key.
*/ -}}
{{- if or .Values.tempo.enabled .Values.loki.enabled .Values.prometheus.additionalDataSources }}
additionalDataSources:
{{- end }}
{{- if .Values.tempo.enabled }}
- name: Tempo
type: tempo
uid: tempo
orgId: 1
url: http://tempo.tempo:3100
isDefault: false
version: 1
access: proxy
jsonData:
nodeGraph:
enabled: true
serviceMap:
datasourceUid: 'Prometheus'
tracesToLogs:
datasourceUid: loki
filterByTraceID: false
spanEndTimeShift: "500ms"
spanStartTimeShift: "-500ms"
timeInterval: 30s
{{- end }}
{{- if .Values.loki.enabled }}
- name: loki
type: loki
uid: loki
access: proxy
basicAuth: false
editable: false
jsonData:
tlsSkipVerify: false
{{- if .Values.tempo.enabled }}
derivedFields:
- datasourceUid: tempo
matcherRegex: trace_id
matcherType: label
name: Trace ID
url: $${__value.raw}
urlDisplayLabel: 'Trace ID: $${__value.raw}'
{{- end }}
orgId: 1
url: http://loki-read-headless.loki:3100
version: 1
{{- end }}
{{- with .Values.prometheus.additionalDataSources }}
{{- toYaml . | nindent 10 }}
{{- end }}
kubeApiServer:
tlsConfig:
serverName: kubernetes
insecureSkipVerify: true
kubelet:
serviceMonitor:
https: {{ .Values.prometheus.kubelet.https }}
cAdvisor: true
# NOTE(simkir): Including throttling seconds by setting this. We use that in some dashboards, and could be useful
## MetricRelabelConfigs to apply to samples after scraping, but before ingestion.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
##
cAdvisorMetricRelabelings:
# Drop less useful container CPU metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_cpu_(load_average_10s|system_seconds_total|user_seconds_total)'
# Drop less useful container / always zero filesystem metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)'
# Drop less useful / always zero container memory metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_memory_(mapped_file|swap)'
# Drop less useful container process metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_(file_descriptors|tasks_state|threads_max)'
# Drop container_memory_failures_total{scope="hierarchy"} metrics,
# we only need the container scope.
- sourceLabels: [__name__, scope]
action: drop
regex: 'container_memory_failures_total;hierarchy'
# Drop container_network_... metrics that match various interfaces that
# correspond to CNI and similar interfaces. This avoids capturing network
# metrics for host network containers.
- sourceLabels: [__name__, interface]
action: drop
regex: 'container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*'
# Drop container spec metrics that overlap with kube-state-metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_spec.*'
# Drop cgroup metrics with no pod.
- sourceLabels: [id, pod]
action: drop
regex: '.+;'
# - sourceLabels: [__name__, image]
# separator: ;
# regex: container_([a-z_]+);
# replacement: $1
# action: drop
# - sourceLabels: [__name__]
# separator: ;
# regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
# replacement: $1
# action: drop
kubeControllerManager:
enabled: false
{{- if .Values.clusterConfig.apiserverip }}
endpoints:
- {{ .Values.clusterConfig.apiserverip }}
{{- end }}
service:
port: 10252
selector:
k8s-app: kube-controller-manager
serviceMonitor:
enabled: true
rollOutCiliumPods: true
securityContext:
capabilities:
ciliumAgent:
- CHOWN
- KILL
- NET_ADMIN
- NET_RAW
- IPC_LOCK
- SYS_ADMIN
- SYS_RESOURCE
- DAC_OVERRIDE
- FOWNER
- SETGID
- SETUID
cleanCiliumState:
- NET_ADMIN
- SYS_ADMIN
- SYS_RESOURCE
{{- with .Values.cilium.upgradeCompatability}}
upgradeCompatability: {{ . }}
{{- end }}
https: true
insecureSkipVerify: true
coreDns:
enabled: true
service:
targetPort: {{ .Values.prometheus.coredns.targetPort | default 10055 }}
selector:
k8s-app: kube-dns
kubeEtcd:
enabled: true
{{- /*
  Printing a list directly produces Go's "[a b]" form, which YAML reads as a
  single string element. Lists are serialised with toJson (valid YAML flow
  syntax); a value that is already a string is passed through unchanged.
*/ -}}
{{- with .Values.clusterConfig.etcd_nodes }}
endpoints: {{ if kindIs "string" . }}{{ . }}{{ else }}{{ toJson . }}{{ end }}
{{- end }}
service:
port: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
targetPort: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
serviceMonitor:
enabled: true
scheme: https
insecureSkipVerify: true
caFile: /etc/prometheus/secrets/etcd-client-cert/ca.pem
certFile: /etc/prometheus/secrets/etcd-client-cert/etcd.pem
keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-key.pem
kubeScheduler:
enabled: false
{{- if .Values.clusterConfig.apiserverip }}
endpoints:
- {{ .Values.clusterConfig.apiserverip }}
{{- end }}
service:
port: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
targetPort: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
selector:
k8s-app: kube-scheduler
kubeProxy:
enabled: false
{{- /*
  Use explicit node endpoints when provided, otherwise select the kube-proxy
  pods by label. Lists are serialised with toJson because printing a list
  directly produces Go's "[a b]" form; string values pass through unchanged.
*/ -}}
{{- with .Values.clusterConfig.k8s_nodes }}
endpoints: {{ if kindIs "string" . }}{{ . }}{{ else }}{{ toJson . }}{{ end }}
{{- else }}
service:
  selector:
    k8s-app: kube-proxy
{{- end }}
prometheusOperator:
enabled: true
admissionWebhooks:
certManager:
enabled: true
issuerRef:
name: "ca-issuer"
kind: "ClusterIssuer"
kubeletService:
enabled: {{ .Values.prometheus.kubelet.enabled }}
prometheus:
enabled: true
# Service settings for the Thanos sidecar ports (gRPC 10901, HTTP 10902).
# NOTE(review): this is hard-coded to disabled, while the thanosServiceMonitor
# block below is enabled with prometheus.thanos.enabled and its own comment
# says it needs thanosService enabled as well - confirm whether this should
# follow prometheus.thanos.enabled.
thanosService:
enabled: false
type: ClusterIP
## gRPC port config
portName: grpc
port: 10901
targetPort: "grpc"
## HTTP port config (for metrics)
httpPortName: http
httpPort: 10902
targetHttpPort: "http"
# Default is to make this a headless service ("None")
# clusterIP: "None"
## Port to expose on each node, if service type is NodePort
##
nodePort: 30901
httpNodePort: 30902
{{- if .Values.prometheus.thanos.enabled }}
# ServiceMonitor to scrape Sidecar metrics
# Needs thanosService to be enabled as well
thanosServiceMonitor:
enabled: true
interval: ""
thanosIngress:
enabled: true
servicePort: 10901
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.clusterConfig.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
{{- with .Values.clusterConfig.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
kubernetes.io/ingress.allow-http: "false"
hosts:
- thanos-gateway.{{ .Values.clusterConfig.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: thanos-gateway-tls
hosts:
- thanos-gateway.{{ .Values.clusterConfig.domain }}
{{- end }}
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.clusterConfig.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
{{- with .Values.clusterConfig.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- prometheus.{{ .Values.clusterConfig.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: prometheus-general-tls
hosts:
- prometheus.{{ .Values.clusterConfig.domain }}
ingressPerReplica:
enabled: false
pathType: ImplementationSpecific
prometheusSpec:
{{- /*
  Always enable the remote-write receiver. This used to sit between
  "#"-prefixed if/end actions on .Values.install.otel.enabled; a YAML "#" does
  not disable a template action, so that condition was still being evaluated.
  NOTE(review): re-add a real if/end here if the otel gate is actually wanted.
*/}}
enableRemoteWriteReceiver: true
tolerations: []
# - key: unschedulable
# operator: Exists
# effect: NoSchedule
secrets:
- etcd-client-cert
storageSpec:
volumeClaimTemplate:
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: {{ .Values.prometheus.storage.size }}
{{- with .Values.prometheus.enableFeatures}}
enableFeatures:
{{- range . }}
- {{ . }}
{{- end }}
{{- end }}
## External labels to add to any time series or alerts when communicating with external systems
##
externalLabels:
cluster: {{ .Values.clusterConfig.cluster }}
## Name of the external label used to denote replica name
##
replicaExternalLabelName: ""
## If true, the Operator won't add the external label used to denote replica name
##
replicaExternalLabelNameClear: true
## Name of the external label used to denote Prometheus instance name
##
prometheusExternalLabelName: ""
## If true, the Operator won't add the external label used to denote Prometheus instance name
##
prometheusExternalLabelNameClear: true
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
ruleSelectorNilUsesHelmValues: false
## Thanos configuration allows configuring various aspects of a Prometheus server in a Thanos environment.
## This section is experimental, it may change significantly without deprecation notice in any release.
## This is experimental and may change significantly without backward compatibility in any release.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#thanosspec
##
{{- if .Values.prometheus.thanos.enabled }}
thanos:
objectStorageConfig:
key: thanos.yaml
name: thanos-objstore-config
{{- end }}
# remoteWrite:
# - url: https://thanos-receive.k1.itpartner.no/api/v1/receive
# name: {{ .Values.clusterConfig.cluster }}
{{- with .Values.prometheus.additionalScrapeConfigs}}
additionalScrapeConfigs:
{{- toYaml . | nindent 12 }}
{{- end }}