{{- /*
  Argo CD Applications for the Prometheus monitoring stack.

  Renders two Application manifests when .Values.prometheus.enabled is set:
    1. "prometheus"     - a multi-source app: cluster policy manifests from the
                          config repo plus the kube-prometheus-stack Helm chart
                          (CRDs skipped; see below).
    2. "prometheus-crd" - the prometheus-operator-crds chart, annotated with
                          sync-wave "-1".

  Notes added here are template comments, so they do not appear in the
  rendered manifests or in the chart values.

  NOTE(review): the nesting of alertmanager `storage` and of the Tempo
  datasource `timeInterval` was reconstructed from context - verify both
  against the intended chart values.
*/}}
{{- if .Values.prometheus.enabled }}
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus
  namespace: argocd
  finalizers:
  - resources-finalizer.argocd.argoproj.io
spec:
  destination:
    namespace: prometheus
    server: 'https://kubernetes.default.svc'
  sources:
  - repoURL: {{ .Values.cluster_config.manifests }}
    path: {{ .Values.cluster_config.policies }}/prometheus
    targetRevision: HEAD
  - repoURL: 'https://prometheus-community.github.io/helm-charts'
    targetRevision: '{{ .Values.prometheus.version }}'
    chart: kube-prometheus-stack
    helm:
      skipCrds: true
      {{- /* Everything below `values: |` is a literal block handed to the chart;
             its content is indented 8 spaces, which the nindent 10 / nindent 12
             calls further down rely on. */}}
      values: |
        crds:
          enabled: false
        fullnameOverride: {{ .Values.prometheus.fullname | default "prometheus-kube-prometheus" }}
        {{- with .Values.prometheus.defaultRules }}
        defaultRules:
          {{- . | toYaml | nindent 10 }}
        {{- end}}
        ## Configuration for alertmanager
        ## ref: https://prometheus.io/docs/alerting/alertmanager/
        ##
        alertmanager:
          config:
            route:
              {{- if .Values.prometheus.alert_group_by }}
              group_by:
              {{- range .Values.prometheus.alert_group_by }}
              - {{ . | quote }}
              {{- end }}
              {{- else }}
              group_by: ["alertname"]
              {{- end }}
              group_wait: 60s
              group_interval: 15m
              repeat_interval: 24h
              receiver: {{ .Values.prometheus.alert_receiver | default "pagerduty" }}
              routes:
              - match:
                  alertname: Watchdog
                group_wait: 0s
                group_interval: 1m
                repeat_interval: 50s
                receiver: snitch
              {{- if .Values.prometheus.oncallUrl}}
              {{- /* `match` compares for exact equality, so a `.*` pattern has to
                     go through `match_re` to match every alert name.
                     NOTE(review): this is the last sibling route, so matching
                     alerts are delivered to `oncall` and do not fall back to the
                     default receiver - confirm that is the intended routing. */}}
              - match_re:
                  alertname: .*
                receiver: oncall
                continue: true
              {{- end }}
            receivers:
            - name: pagerduty
              pagerduty_configs:
              {{- /* Quoted so an all-digit or otherwise number-like key stays a string. */}}
              - routing_key: {{ default "key" .Values.prometheus.pagerdutyRoutingKey | quote }}
                url: "https://events.pagerduty.com/v2/enqueue"
                severity: {{`'{{ if .CommonLabels.severity }}{{ .CommonLabels.severity | toLower }}{{ else }}critical{{ end }}'`}}
            {{- /* The Watchdog route above always targets `snitch`, so the receiver
                   must always exist; without a snitchUrl it has no integrations and
                   simply discards the notification. */}}
            - name: snitch
              {{- if .Values.prometheus.snitchUrl}}
              webhook_configs:
              - url: "{{ .Values.prometheus.snitchUrl }}"
                send_resolved: false
              {{- end }}
            - name: teams
              webhook_configs:
              - url: "https://prometheus-msteams.{{ .Values.cluster_config.domain }}/{{ .Values.cluster_config.cluster }}"
                http_config:
                  tls_config:
                    insecure_skip_verify: true
            {{- if .Values.prometheus.oncallUrl}}
            - name: oncall
              webhook_configs:
              - url: "{{ .Values.prometheus.oncallUrl }}"
                send_resolved: true
            {{- end }}
          storage: {}
          ingress:
            enabled: true
            ingressClassName: nginx
            annotations:
              cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
              nginx.ingress.kubernetes.io/ssl-redirect: "true"
              {{- with .Values.cluster_config.ingress_whitelist_ips }}
              nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
              {{- end }}
            hosts:
            - alertmanager.{{ .Values.cluster_config.domain }}
            paths:
            - /
            pathType: ImplementationSpecific
            tls:
            - secretName: alertmanager-general-tls
              hosts:
              - alertmanager.{{ .Values.cluster_config.domain }}
          ingressPerReplica:
            pathType: ImplementationSpecific
          alertmanagerSpec:
            affinity: {}
            #   nodeAffinity:
            #     requiredDuringSchedulingIgnoredDuringExecution:
            #       nodeSelectorTerms:
            #       - matchExpressions:
            #         - key: kubernetes.io/hostname
            #           operator: In
            #           values:
            #           - {{ .Values.cluster_config.cluster }}-0.itpartner.intern
            tolerations: []
            # - key: unschedulable
            #   operator: Exists
            #   effect: NoSchedule
        grafana:
          defaultDashboardsEnabled: {{ .Values.prometheus.grafana.defaultDashboardsEnabled }}
          {{- if .Values.prometheus.grafana.plugins }}
          plugins:
          {{- range .Values.prometheus.grafana.plugins }}
          - {{ . }}
          {{- end }}
          {{- end }}
          grafana.ini:
            server:
              root_url: "https://grafana.{{.Values.cluster_config.domain}}:443"
            security:
              allow_embedding: "true"
            auth:
              disable_login_form: "{{ .Values.prometheus.grafana.disable_login_form }}"
            users:
              auto_assign_org_role: "Admin"
            {{- /* One auth.<provider> section per configured OIDC entry; client
                   credentials are read from the files mounted by extraSecretMounts
                   below. Only the azuread and github providers are rendered. */}}
            {{- range .Values.cluster_config.oidc }}
            {{- if eq .provider "azuread" }}
            auth.{{ .provider }}:
              enabled: true
              name: {{ .name }}
              client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
              client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
              scopes: openid email profile
              auth_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/authorize
              token_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/token
              allowed_groups: {{ .group_id }}
              allow_sign_up: true
              role_attribute_strict: false
              allow_assign_grafana_admin: true
            {{- else if eq .provider "github" }}
            auth.{{ .provider }}:
              name: {{ .name }}
              enabled: true
              client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
              client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
              allowed_organizations: {{ .allowed_organizations }}
              {{- if .allowed_teams }}
              allowed_teams: "{{ .allowed_teams }}"
              {{- end }}
              scopes: user:email,read:org
              auth_url: https://github.com/login/oauth/authorize
              token_url: https://github.com/login/oauth/access_token
              allow_sign_up: true
              role_attribute_strict: false
              allow_assign_grafana_admin: true
            {{- end }}
            {{- end }}
          extraSecretMounts:
          {{- range .Values.cluster_config.oidc }}
          - name: {{ .name }}
            secretName: {{ .secret_ref.name }}
            defaultMode: 0440
            mountPath: /etc/secrets/oauth/{{ .name }}
            readOnly: true
          {{- end }}
          {{- if .Values.prometheus.grafana.persistence }}
          persistence:
            enabled: true
            size: 10Gi
          {{- end }}
          ingress:
            enabled: true
            ingressClassName: nginx
            annotations:
              cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
              nginx.ingress.kubernetes.io/ssl-redirect: "true"
              {{- with .Values.cluster_config.ingress_whitelist_ips}}
              nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
              {{- end }}
            hosts:
            - grafana.{{ .Values.cluster_config.domain }}
            path: /
            tls:
            - secretName: grafana-general-tls
              hosts:
              - grafana.{{ .Values.cluster_config.domain }}
          sidecar:
            dashboards:
              enabled: true
              label: grafana_dashboard
              folderAnnotation: grafana_folder
              annotations: {}
              multicluster:
                global:
                  enabled: true
                etcd:
                  enabled: false
              provider:
                allowUiUpdates: false
                foldersFromFilesStructure: true
            {{- if .Values.prometheus.thanos.datasource.enabled }}
            datasources:
              enabled: true
              defaultDatasourceEnabled: true
              url: http://thanos-query-frontend.thanos.svc:9090/
              # defaultDatasourceScrapeInterval: 15s
              annotations: {}
              ## Create datasource for each Pod of Prometheus StatefulSet;
              ## this uses headless service `prometheus-operated` which is
              ## created by Prometheus Operator
              ## ref: https://git.io/fjaBS
              createPrometheusReplicasDatasources: false
              label: grafana_datasource
            {{- end }}
          {{- /* The key must be emitted whenever ANY of the list items below is
                 rendered, including the Tempo one; otherwise a Tempo-only setup
                 produces list items with no parent key. All items sit at the
                 same column as the nindent 10 output of the user-supplied list. */}}
          {{- if or .Values.loki.enabled .Values.tempo.enabled .Values.prometheus.additionalDataSources }}
          additionalDataSources:
          {{- end }}
          {{- if .Values.tempo.enabled }}
          - name: Tempo
            type: tempo
            uid: tempo
            orgId: 1
            url: http://tempo.tempo:3100
            isDefault: false
            version: 1
            access: proxy
            jsonData:
              nodeGraph:
                enabled: true
              serviceMap:
                datasourceUid: 'Prometheus'
              tracesToLogs:
                datasourceUid: loki
                filterByTraceID: false
                spanEndTimeShift: "500ms"
                spanStartTimeShift: "-500ms"
              timeInterval: 30s
          {{- end }}
          {{- if .Values.loki.enabled }}
          - name: loki
            type: loki
            uid: loki
            access: proxy
            basicAuth: false
            editable: false
            jsonData:
              tlsSkipVerify: false
              {{- if .Values.tempo.enabled }}
              derivedFields:
              - datasourceUid: tempo
                matcherRegex: trace_id
                matcherType: label
                name: Trace ID
                url: $${__value.raw}
                urlDisplayLabel: 'Trace ID: $${__value.raw}'
              {{- end }}
            orgId: 1
            url: http://loki-read-headless.loki:3100
            version: 1
          {{- end }}
          {{- with .Values.prometheus.additionalDataSources }}
          {{- toYaml . | nindent 10 }}
          {{- end }}
        kubeApiServer:
          tlsConfig:
            serverName: kubernetes
            insecureSkipVerify: true
        kubelet:
          serviceMonitor:
            https: {{ .Values.prometheus.kubelet.https }}
            cAdvisor: true
            # NOTE(simkir): Including throttling seconds by setting this. We use that in some dashboards, and could be useful
            ## MetricRelabelConfigs to apply to samples after scraping, but before ingestion.
            ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
            ##
            cAdvisorMetricRelabelings:
            # Drop less useful container CPU metrics.
            - sourceLabels: [__name__]
              action: drop
              regex: 'container_cpu_(load_average_10s|system_seconds_total|user_seconds_total)'
            # Drop less useful container / always zero filesystem metrics.
            - sourceLabels: [__name__]
              action: drop
              regex: 'container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)'
            # Drop less useful / always zero container memory metrics.
            - sourceLabels: [__name__]
              action: drop
              regex: 'container_memory_(mapped_file|swap)'
            # Drop less useful container process metrics.
            - sourceLabels: [__name__]
              action: drop
              regex: 'container_(file_descriptors|tasks_state|threads_max)'
            # Drop container_memory_failures_total{scope="hierarchy"} metrics,
            # we only need the container scope.
            - sourceLabels: [__name__, scope]
              action: drop
              regex: 'container_memory_failures_total;hierarchy'
            # Drop container_network_... metrics that match various interfaces that
            # correspond to CNI and similar interfaces. This avoids capturing network
            # metrics for host network containers.
            - sourceLabels: [__name__, interface]
              action: drop
              regex: 'container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*'
            # Drop container spec metrics that overlap with kube-state-metrics.
            - sourceLabels: [__name__]
              action: drop
              regex: 'container_spec.*'
            # Drop cgroup metrics with no pod.
            - sourceLabels: [id, pod]
              action: drop
              regex: '.+;'
            # - sourceLabels: [__name__, image]
            #   separator: ;
            #   regex: container_([a-z_]+);
            #   replacement: $1
            #   action: drop
            # - sourceLabels: [__name__]
            #   separator: ;
            #   regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
            #   replacement: $1
            #   action: drop
        kubeControllerManager:
          enabled: false
          {{- if .Values.cluster_config.apiserverip }}
          endpoints:
          - {{ .Values.cluster_config.apiserverip }}
          {{- end }}
          service:
            port: 10252
            selector:
              k8s-app: kube-controller-manager
          serviceMonitor:
            enabled: true
            https: true
            insecureSkipVerify: true
        coreDns:
          enabled: true
          service:
            targetPort: {{ .Values.prometheus.coredns.targetPort | default 10055 }}
            selector:
              k8s-app: kube-dns
        kubeEtcd:
          enabled: true
          {{- if .Values.cluster_config.etcd_nodes }}
          {{- /* NOTE(review): the value is interpolated as-is. If etcd_nodes is a
                 list, it renders in Go's `[a b]` form, which YAML reads as a
                 one-element sequence. Confirm it is supplied as a pre-formatted
                 string; otherwise render it with toJson. */}}
          endpoints: {{ .Values.cluster_config.etcd_nodes }}
          {{- end }}
          service:
            port: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
            targetPort: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
          serviceMonitor:
            enabled: true
            scheme: https
            insecureSkipVerify: true
            caFile: /etc/prometheus/secrets/etcd-client-cert/ca.pem
            certFile: /etc/prometheus/secrets/etcd-client-cert/etcd.pem
            keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-key.pem
        kubeScheduler:
          enabled: false
          {{- if .Values.cluster_config.apiserverip }}
          endpoints:
          - {{ .Values.cluster_config.apiserverip }}
          {{- end }}
          service:
            port: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
            targetPort: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
            selector:
              k8s-app: kube-scheduler
        kubeProxy:
          enabled: false
          {{- if .Values.cluster_config.k8s_nodes }}
          {{- /* NOTE(review): same direct interpolation as kubeEtcd.endpoints -
                 confirm the format of k8s_nodes. */}}
          endpoints: {{ .Values.cluster_config.k8s_nodes }}
          {{- else }}
          service:
            selector:
              k8s-app: kube-proxy
          {{- end }}
        prometheusOperator:
          enabled: true
          admissionWebhooks:
            certManager:
              enabled: true
              issuerRef:
                name: "ca-issuer"
                kind: "ClusterIssuer"
          kubeletService:
            enabled: {{ .Values.prometheus.kubelet.enabled }}
        prometheus:
          enabled: true
          thanosService:
            {{- /* NOTE(review): hard-coded off, while the thanosServiceMonitor
                   comment below says the service must be enabled - confirm. */}}
            enabled: false
            type: ClusterIP
            ## gRPC port config
            portName: grpc
            port: 10901
            targetPort: "grpc"
            ## HTTP port config (for metrics)
            httpPortName: http
            httpPort: 10902
            targetHttpPort: "http"
            # Default is to make this a headless service ("None")
            # clusterIP: "None"
            ## Port to expose on each node, if service type is NodePort
            ##
            nodePort: 30901
            httpNodePort: 30902
          {{- if .Values.prometheus.thanos.enabled }}
          # ServiceMonitor to scrape Sidecar metrics
          # Needs thanosService to be enabled as well
          thanosServiceMonitor:
            enabled: true
            interval: ""
          thanosIngress:
            enabled: true
            servicePort: 10901
            ingressClassName: nginx
            annotations:
              cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
              nginx.ingress.kubernetes.io/ssl-redirect: "true"
              nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
              {{- with .Values.cluster_config.ingress_whitelist_ips }}
              nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
              {{- end }}
              kubernetes.io/ingress.allow-http: "false"
            hosts:
            - thanos-gateway.{{ .Values.cluster_config.domain }}
            paths:
            - /
            pathType: ImplementationSpecific
            tls:
            - secretName: thanos-gateway-tls
              hosts:
              - thanos-gateway.{{ .Values.cluster_config.domain }}
          {{- end }}
          ingress:
            enabled: true
            ingressClassName: nginx
            annotations:
              cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
              nginx.ingress.kubernetes.io/ssl-redirect: "true"
              {{- with .Values.cluster_config.ingress_whitelist_ips }}
              nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
              {{- end }}
            hosts:
            - prometheus.{{ .Values.cluster_config.domain }}
            paths:
            - /
            pathType: ImplementationSpecific
            tls:
            - secretName: prometheus-general-tls
              hosts:
              - prometheus.{{ .Values.cluster_config.domain }}
          ingressPerReplica:
            enabled: false
            pathType: ImplementationSpecific
          prometheusSpec:
            {{- /* The chart reads this flag from prometheusSpec; directly under
                   `prometheus:` it is not picked up. */}}
            {{- if .Values.otel.enabled }}
            enableRemoteWriteReceiver: true
            {{- end }}
            tolerations: []
            # - key: unschedulable
            #   operator: Exists
            #   effect: NoSchedule
            secrets:
            - etcd-client-cert
            storageSpec:
              volumeClaimTemplate:
                spec:
                  accessModes: ["ReadWriteOnce"]
                  resources:
                    requests:
                      storage: {{ .Values.prometheus.storage.size }}
            {{- with .Values.prometheus.enableFeatures}}
            enableFeatures:
            {{- range . }}
            - {{ . }}
            {{- end }}
            {{- end }}
            ## External labels to add to any time series or alerts when communicating with external systems
            ##
            externalLabels:
              cluster: {{ .Values.cluster_config.cluster }}
            ## Name of the external label used to denote replica name
            ##
            replicaExternalLabelName: ""
            ## If true, the Operator won't add the external label used to denote replica name
            ##
            replicaExternalLabelNameClear: true
            ## Name of the external label used to denote Prometheus instance name
            ##
            prometheusExternalLabelName: ""
            ## If true, the Operator won't add the external label used to denote Prometheus instance name
            ##
            prometheusExternalLabelNameClear: true
            serviceMonitorSelectorNilUsesHelmValues: false
            podMonitorSelectorNilUsesHelmValues: false
            ruleSelectorNilUsesHelmValues: false
            ## Thanos configuration allows configuring various aspects of a Prometheus server in a Thanos environment.
            ## This section is experimental; it may change significantly, without deprecation notice or
            ## backward compatibility, in any release.
            ## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#thanosspec
            ##
            {{- if .Values.prometheus.thanos.enabled }}
            thanos:
              objectStorageConfig:
                key: thanos.yaml
                name: thanos-objstore-config
            {{- end }}
            # remoteWrite:
            # - url: https://thanos-receive.k1.itpartner.no/api/v1/receive
            #   name: {{ .Values.cluster_config.cluster }}
            {{- with .Values.prometheus.additionalScrapeConfigs}}
            additionalScrapeConfigs:
            {{- toYaml . | nindent 12 }}
            {{- end }}
  project: sys
  syncPolicy:
    managedNamespaceMetadata:
      labels:
        component: sys
    syncOptions:
    - ServerSideApply=true
    - CreateNamespace=true
    - ApplyOutOfSyncOnly=true
    {{- if .Values.prometheus.autosync }}
    automated:
      prune: true
      # selfHeal: false
    {{- end }}
  {{- /* Fields excluded from Argo CD diffing for the listed resource kinds. */}}
  ignoreDifferences:
  - group: apps
    kind: Deployment
    jqPathExpressions:
    - '.spec.template.spec.containers[]?.resources'
  - group: monitoring.coreos.com
    kind: ServiceMonitor
    jqPathExpressions:
    - '.spec.endpoints[]?.relabelings'
  - group: admissionregistration.k8s.io
    kind: MutatingWebhookConfiguration
    jqPathExpressions:
    - '.webhooks[]?.clientConfig.caBundle'
  - group: admissionregistration.k8s.io
    kind: ValidatingWebhookConfiguration
    jqPathExpressions:
    - '.webhooks[]?.clientConfig.caBundle'
---
{{- /* CRDs are shipped by their own Application because the main chart is
       installed with skipCrds: true and crds.enabled: false. */}}
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: prometheus-crd
  namespace: argocd
  annotations:
    argocd.argoproj.io/sync-wave: "-1"
  finalizers:
  - resources-finalizer.argocd.argoproj.io
spec:
  destination:
    namespace: prometheus
    server: 'https://kubernetes.default.svc'
  source:
    repoURL: 'https://prometheus-community.github.io/helm-charts'
    targetRevision: '{{ .Values.prometheus.crd_version }}'
    chart: prometheus-operator-crds
  project: sys
  syncPolicy:
    managedNamespaceMetadata:
      labels:
        component: sys
    automated: {}
    syncOptions:
    - ServerSideApply=true
    - CreateNamespace=true
    - ApplyOutOfSyncOnly=true
{{- end }}