Files
manifests/attic/templates/prometheus.yaml

626 lines
23 KiB
YAML

{{- if .Values.prometheus.enabled }}
# Argo CD Application for the kube-prometheus-stack chart. Everything in this
# file is rendered only when prometheus.enabled is set.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: prometheus
namespace: argocd
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
destination:
namespace: prometheus
server: 'https://kubernetes.default.svc'
sources:
# Source 1: extra manifests from the cluster's own repository, taken from the
# prometheus directory under cluster_config.policies.
- repoURL: {{ .Values.cluster_config.manifests }}
path: {{ .Values.cluster_config.policies }}/prometheus
targetRevision: HEAD
# Source 2: the upstream kube-prometheus-stack chart, pinned to
# prometheus.version.
- repoURL: 'https://prometheus-community.github.io/helm-charts'
targetRevision: '{{ .Values.prometheus.version }}'
chart: kube-prometheus-stack
helm:
# CRDs are skipped here (skipCrds, and crds.enabled false below); the
# prometheus-crd Application at the end of this file points at the
# prometheus-operator-crds chart instead.
skipCrds: true
# Inline chart values. This is a block scalar, so the templated text below
# must render to valid YAML on its own.
values: |
crds:
enabled: false
# Falls back to prometheus-kube-prometheus when prometheus.fullname is unset.
fullnameOverride: {{ .Values.prometheus.fullname | default "prometheus-kube-prometheus" }}
# Optional: copy prometheus.defaultRules verbatim into the chart's
# defaultRules value.
{{- with .Values.prometheus.defaultRules }}
defaultRules:
{{- . | toYaml | nindent 10 }}
{{- end}}
## Configuration for alertmanager
## ref: https://prometheus.io/docs/alerting/alertmanager/
##
alertmanager:
config:
route:
# Group alerts by the labels listed in prometheus.alert_group_by, falling
# back to alertname when none are configured.
{{- if .Values.prometheus.alert_group_by }}
group_by:
{{- range .Values.prometheus.alert_group_by }}
- {{ . | quote }}
{{- end }}
{{- else }}
group_by: ["alertname"]
{{- end }}
group_wait: 60s
group_interval: 15m
repeat_interval: 24h
# Receiver for alerts that match none of the routes below; pagerduty unless
# prometheus.alert_receiver names another receiver declared in this file.
receiver: {{ .Values.prometheus.alert_receiver | default "pagerduty" }}
routes:
# Heartbeat route: the Watchdog alert is re-sent to the snitch receiver with
# a short repeat interval. The snitch receiver is always declared below, so
# this route stays valid even when no snitch URL is configured.
- match:
alertname: Watchdog
group_wait: 0s
group_interval: 1m
repeat_interval: 50s
receiver: snitch
{{- if .Values.prometheus.oncallUrl}}
# match_re is required for the pattern to be treated as a regular expression;
# plain match compares the label value literally, so a literal ".*" never
# matched any alert and the oncall receiver received nothing.
# NOTE(review): once this route matches, alerts are delivered to oncall rather
# than to the default receiver; continue only affects later sibling routes and
# this is the last one -- confirm that is the intended fan-out.
- match_re:
alertname: ".*"
receiver: oncall
continue: true
{{- end }}
receivers:
- name: pagerduty
pagerduty_configs:
# NOTE(review): falls back to the placeholder string "key" when
# prometheus.pagerdutyRoutingKey is unset.
- routing_key: {{ default "key" .Values.prometheus.pagerdutyRoutingKey }}
url: "https://events.pagerduty.com/v2/enqueue"
# Backtick-quoted so Helm emits the inner template text literally instead of
# evaluating it: lower-cased severity label, or critical when it is missing.
severity: {{`'{{ if .CommonLabels.severity }}{{ .CommonLabels.severity | toLower }}{{ else }}critical{{ end }}'`}}
# The snitch receiver is declared unconditionally because the Watchdog route
# above always references it. Without prometheus.snitchUrl it has no
# integrations, so Watchdog notifications are simply dropped instead of the
# configuration referring to an undefined receiver.
- name: snitch
{{- if .Values.prometheus.snitchUrl}}
webhook_configs:
- url: "{{ .Values.prometheus.snitchUrl }}"
send_resolved: false
{{- end }}
# Microsoft Teams bridge, addressed per cluster under the cluster domain.
# TLS verification is disabled for this webhook.
- name: teams
webhook_configs:
- url: "https://prometheus-msteams.{{ .Values.cluster_config.domain }}/{{ .Values.cluster_config.cluster }}"
http_config:
tls_config:
insecure_skip_verify: true
{{- if .Values.prometheus.oncallUrl}}
# On-call webhook; only declared when prometheus.oncallUrl is set, matching
# the conditional oncall route above.
- name: oncall
webhook_configs:
- url: "{{ .Values.prometheus.oncallUrl }}"
send_resolved: true
{{- end }}
# No storage settings are supplied for Alertmanager (empty map).
storage: {}
# Alertmanager UI, exposed through the nginx ingress class at
# alertmanager.<cluster domain>, with the certificate issued by the configured
# cluster issuer.
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
# Optional source-IP allow list: the configured entries joined with commas.
{{- with .Values.cluster_config.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- alertmanager.{{ .Values.cluster_config.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: alertmanager-general-tls
hosts:
- alertmanager.{{ .Values.cluster_config.domain }}
ingressPerReplica:
pathType: ImplementationSpecific
alertmanagerSpec:
# No affinity or tolerations are applied; the commented lines below are kept
# as examples. Note that Helm still evaluates template actions that appear
# inside YAML comments.
affinity: {}
# nodeAffinity:
# requiredDuringSchedulingIgnoredDuringExecution:
# nodeSelectorTerms:
# - matchExpressions:
# - key: kubernetes.io/hostname
# operator: In
# values:
# - {{ .Values.cluster_config.cluster }}-0.itpartner.intern
tolerations: []
# - key: unschedulable
# operator: Exists
# effect: NoSchedule
# Grafana sub-chart values.
grafana:
defaultDashboardsEnabled: {{ .Values.prometheus.grafana.defaultDashboardsEnabled }}
# Optional plugin list, one entry per item in prometheus.grafana.plugins.
{{- if .Values.prometheus.grafana.plugins }}
plugins:
{{- range .Values.prometheus.grafana.plugins }}
- {{ . }}
{{- end }}
{{- end }}
grafana.ini:
server:
root_url: "https://grafana.{{.Values.cluster_config.domain}}:443"
security:
allow_embedding: "true"
auth:
disable_login_form: "{{ .Values.prometheus.grafana.disable_login_form }}"
users:
# NOTE(review): presumably every auto-provisioned user gets the Admin org
# role -- confirm this is intended.
auto_assign_org_role: "Admin"
# One auth.<provider> section per entry in cluster_config.oidc. Only the
# azuread and github providers are handled; other providers produce no
# section. Client id and secret are referenced as files under
# /etc/secrets/oauth/<name>/, the path used by the extraSecretMounts entries
# in this file.
{{- range .Values.cluster_config.oidc }}
{{- if eq .provider "azuread" }}
auth.{{ .provider }}:
enabled: true
name: {{ .name }}
client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
scopes: openid email profile
auth_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/authorize
token_url: https://login.microsoftonline.com/{{ .tenant }}/oauth2/v2.0/token
allowed_groups: {{ .group_id }}
allow_sign_up: true
role_attribute_strict: false
allow_assign_grafana_admin: true
{{- else if eq .provider "github" }}
auth.{{ .provider }}:
name: {{ .name }}
enabled: true
client_id: $__file{/etc/secrets/oauth/{{ .name }}/client_id}
client_secret: $__file{/etc/secrets/oauth/{{ .name }}/client_secret}
allowed_organizations: {{ .allowed_organizations }}
# allowed_teams is only emitted when the entry defines it.
{{- if .allowed_teams }}
allowed_teams: "{{ .allowed_teams }}"
{{- end }}
scopes: user:email,read:org
auth_url: https://github.com/login/oauth/authorize
token_url: https://github.com/login/oauth/access_token
allow_sign_up: true
role_attribute_strict: false
allow_assign_grafana_admin: true
{{- end }}
{{- end }}
# Mount each OIDC entry's secret read-only at /etc/secrets/oauth/<name>, the
# path the $__file references in grafana.ini read from.
# NOTE(review): if cluster_config.oidc is empty this key renders with no
# value -- confirm the chart accepts that.
extraSecretMounts:
{{- range .Values.cluster_config.oidc }}
- name: {{ .name }}
secretName: {{ .secret_ref.name }}
# NOTE(review): written as a leading-zero literal; relies on the parser
# reading 0440 as an octal file mode -- confirm.
defaultMode: 0440
mountPath: /etc/secrets/oauth/{{ .name }}
readOnly: true
{{- end }}
# Optional persistent volume with a fixed 10Gi size.
{{- if .Values.prometheus.grafana.persistence }}
persistence:
enabled: true
size: 10Gi
{{- end }}
# Grafana UI, exposed at grafana.<cluster domain> through the nginx ingress
# class with a certificate from the configured cluster issuer.
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
# Optional source-IP allow list: the configured entries joined with commas.
{{- with .Values.cluster_config.ingress_whitelist_ips}}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- grafana.{{ .Values.cluster_config.domain }}
path: /
tls:
- secretName: grafana-general-tls
hosts:
- grafana.{{ .Values.cluster_config.domain }}
sidecar:
# Dashboard sidecar: selects resources by the grafana_dashboard label and
# takes the folder from the grafana_folder annotation.
dashboards:
enabled: true
label: grafana_dashboard
folderAnnotation: grafana_folder
annotations: {}
multicluster:
global:
enabled: true
etcd:
enabled: false
provider:
allowUiUpdates: false
foldersFromFilesStructure: true
# When the Thanos datasource is enabled, the sidecar-provisioned default
# datasource is pointed at the Thanos query frontend.
{{- if .Values.prometheus.thanos.datasource.enabled }}
datasources:
enabled: true
defaultDatasourceEnabled: true
url: http://thanos-query-frontend.thanos.svc:9090/
# defaultDatasourceScrapeInterval: 15s
annotations: {}
## Create datasource for each Pod of Prometheus StatefulSet;
## this uses headless service `prometheus-operated` which is
## created by Prometheus Operator
## ref: https://git.io/fjaBS
createPrometheusReplicasDatasources: false
label: grafana_datasource
{{- end }}
# Emit the additionalDataSources key whenever at least one list entry follows
# (Tempo, Loki or user-supplied). Tempo is part of the condition because its
# entry is rendered on tempo.enabled alone; without the key that entry would
# have no parent.
{{- if or .Values.loki.enabled .Values.tempo.enabled .Values.prometheus.additionalDataSources }}
additionalDataSources:
{{- end }}
{{- if .Values.tempo.enabled }}
# Tempo tracing datasource; trace-to-logs links point at the datasource with
# uid loki.
- name: Tempo
type: tempo
uid: tempo
orgId: 1
url: http://tempo.tempo:3100
isDefault: false
version: 1
access: proxy
jsonData:
nodeGraph:
enabled: true
serviceMap:
datasourceUid: 'Prometheus'
tracesToLogs:
datasourceUid: loki
filterByTraceID: false
spanEndTimeShift: "500ms"
spanStartTimeShift: "-500ms"
timeInterval: 30s
{{- end }}
{{- if .Values.loki.enabled }}
# Loki logs datasource.
- name: loki
type: loki
uid: loki
access: proxy
basicAuth: false
editable: false
jsonData:
tlsSkipVerify: false
{{- if .Values.tempo.enabled }}
# With Tempo enabled, the trace_id label becomes a link to the tempo
# datasource. The doubled dollar sign is presumably an escape so that a
# literal dollar reaches Grafana -- confirm.
derivedFields:
- datasourceUid: tempo
matcherRegex: trace_id
matcherType: label
name: Trace ID
url: $${__value.raw}
urlDisplayLabel: 'Trace ID: $${__value.raw}'
{{- end }}
orgId: 1
url: http://loki-read-headless.loki:3100
version: 1
{{- end }}
# User-supplied datasources are appended verbatim after the built-in ones.
{{- with .Values.prometheus.additionalDataSources }}
{{- toYaml . | nindent 10 }}
{{- end }}
# API server scraping: certificate verification is skipped, with the TLS
# server name set to kubernetes.
kubeApiServer:
tlsConfig:
serverName: kubernetes
insecureSkipVerify: true
# Kubelet / cAdvisor scraping. Whether HTTPS is used comes from
# prometheus.kubelet.https.
kubelet:
serviceMonitor:
https: {{ .Values.prometheus.kubelet.https }}
cAdvisor: true
# NOTE(simkir): Including throttling seconds by setting this. We use that in some dashboards, and could be useful
## MetricRelabelConfigs to apply to samples after scraping, but before ingestion.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/api-reference/api.md#relabelconfig
##
# Each rule below drops the series whose joined source labels match its regex.
cAdvisorMetricRelabelings:
# Drop less useful container CPU metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_cpu_(load_average_10s|system_seconds_total|user_seconds_total)'
# Drop less useful container / always zero filesystem metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_fs_(io_current|io_time_seconds_total|io_time_weighted_seconds_total|reads_merged_total|sector_reads_total|sector_writes_total|writes_merged_total)'
# Drop less useful / always zero container memory metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_memory_(mapped_file|swap)'
# Drop less useful container process metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_(file_descriptors|tasks_state|threads_max)'
# Drop container_memory_failures_total{scope="hierarchy"} metrics,
# we only need the container scope.
- sourceLabels: [__name__, scope]
action: drop
regex: 'container_memory_failures_total;hierarchy'
# Drop container_network_... metrics that match various interfaces that
# correspond to CNI and similar interfaces. This avoids capturing network
# metrics for host network containers.
- sourceLabels: [__name__, interface]
action: drop
regex: 'container_network_.*;(cali|cilium|cni|lxc|nodelocaldns|tunl).*'
# Drop container spec metrics that overlap with kube-state-metrics.
- sourceLabels: [__name__]
action: drop
regex: 'container_spec.*'
# Drop cgroup metrics with no pod.
- sourceLabels: [id, pod]
action: drop
regex: '.+;'
# Disabled alternatives, kept for reference:
# - sourceLabels: [__name__, image]
# separator: ;
# regex: container_([a-z_]+);
# replacement: $1
# action: drop
# - sourceLabels: [__name__]
# separator: ;
# regex: container_(network_tcp_usage_total|network_udp_usage_total|tasks_state|cpu_load_average_10s)
# replacement: $1
# action: drop
# Controller-manager scraping is switched off at the component level; the
# remaining keys are presumably ignored by the chart while enabled is false.
kubeControllerManager:
enabled: false
# When an API server IP is configured it is used as the single static endpoint.
{{- if .Values.cluster_config.apiserverip }}
endpoints:
- {{ .Values.cluster_config.apiserverip }}
{{- end }}
service:
port: 10252
selector:
k8s-app: kube-controller-manager
serviceMonitor:
enabled: true
https: true
insecureSkipVerify: true
# CoreDNS scraping; metrics port from prometheus.coredns.targetPort,
# defaulting to 10055.
coreDns:
enabled: true
service:
targetPort: {{ .Values.prometheus.coredns.targetPort | default 10055 }}
selector:
k8s-app: kube-dns
# etcd scraping over HTTPS with client certificates read from the
# etcd-client-cert secret path; server certificate verification is skipped.
kubeEtcd:
enabled: true
# NOTE(review): etcd_nodes is interpolated directly. If it is a list in the
# values, Go templates print it as [a b c], which YAML reads as a single
# string element; toJson would render a real list -- confirm the value's type.
{{- if .Values.cluster_config.etcd_nodes }}
endpoints: {{ .Values.cluster_config.etcd_nodes }}
{{- end }}
service:
port: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
targetPort: {{ .Values.prometheus.etcd.targetPort | default 2379 }}
serviceMonitor:
enabled: true
scheme: https
insecureSkipVerify: true
caFile: /etc/prometheus/secrets/etcd-client-cert/ca.pem
certFile: /etc/prometheus/secrets/etcd-client-cert/etcd.pem
keyFile: /etc/prometheus/secrets/etcd-client-cert/etcd-key.pem
# Scheduler scraping is switched off; port defaults to 10251 unless
# prometheus.scheduler.targetPort is set.
kubeScheduler:
enabled: false
{{- if .Values.cluster_config.apiserverip }}
endpoints:
- {{ .Values.cluster_config.apiserverip }}
{{- end }}
service:
port: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
targetPort: {{ .Values.prometheus.scheduler.targetPort | default 10251 }}
selector:
k8s-app: kube-scheduler
# kube-proxy scraping is switched off. Static endpoints are used when
# cluster_config.k8s_nodes is set, otherwise a label selector.
kubeProxy:
enabled: false
# NOTE(review): same direct list interpolation concern as the etcd endpoints
# above -- confirm the type of k8s_nodes.
{{- if .Values.cluster_config.k8s_nodes }}
endpoints: {{ .Values.cluster_config.k8s_nodes }}
{{- else }}
service:
selector:
k8s-app: kube-proxy
{{- end }}
# Prometheus Operator. Admission webhook certificates are handled through
# cert-manager, using the ClusterIssuer named ca-issuer.
prometheusOperator:
enabled: true
admissionWebhooks:
certManager:
enabled: true
issuerRef:
name: "ca-issuer"
kind: "ClusterIssuer"
# Follows prometheus.kubelet.enabled.
kubeletService:
enabled: {{ .Values.prometheus.kubelet.enabled }}
# Prometheus server values.
prometheus:
enabled: true
# enableRemoteWriteReceiver is set only when otel.enabled is set.
{{- if .Values.otel.enabled }}
enableRemoteWriteReceiver: true
{{- end }}
thanosService:
# Follows prometheus.thanos.enabled. The thanosServiceMonitor below is turned
# on together with Thanos and, per its own comment, needs this service to be
# enabled as well; a hard-coded false left the ServiceMonitor without its
# service. Renders false when the value is unset.
enabled: {{ .Values.prometheus.thanos.enabled | default false }}
type: ClusterIP
## gRPC port config
portName: grpc
port: 10901
targetPort: "grpc"
## HTTP port config (for metrics)
httpPortName: http
httpPort: 10902
targetHttpPort: "http"
# Default is to make this a headless service ("None")
# clusterIP: "None"
## Port to expose on each node, if service type is NodePort
##
nodePort: 30901
httpNodePort: 30902
{{- if .Values.prometheus.thanos.enabled }}
# ServiceMonitor to scrape Sidecar metrics
# Needs thanosService to be enabled as well
thanosServiceMonitor:
enabled: true
interval: ""
# gRPC ingress for the Thanos sidecar at thanos-gateway.<cluster domain>;
# plain HTTP is disallowed and the backend protocol is GRPC.
thanosIngress:
enabled: true
servicePort: 10901
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
nginx.ingress.kubernetes.io/backend-protocol: "GRPC"
# Optional source-IP allow list: the configured entries joined with commas.
{{- with .Values.cluster_config.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
kubernetes.io/ingress.allow-http: "false"
hosts:
- thanos-gateway.{{ .Values.cluster_config.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: thanos-gateway-tls
hosts:
- thanos-gateway.{{ .Values.cluster_config.domain }}
{{- end }}
# Prometheus UI, exposed at prometheus.<cluster domain> through the nginx
# ingress class with a certificate from the configured cluster issuer.
ingress:
enabled: true
ingressClassName: nginx
annotations:
cert-manager.io/cluster-issuer: {{ .Values.cluster_config.ingress_clusterissuer }}
nginx.ingress.kubernetes.io/ssl-redirect: "true"
# Optional source-IP allow list: the configured entries joined with commas.
{{- with .Values.cluster_config.ingress_whitelist_ips }}
nginx.ingress.kubernetes.io/whitelist-source-range: {{ join "," . }}
{{- end }}
hosts:
- prometheus.{{ .Values.cluster_config.domain }}
paths:
- /
pathType: ImplementationSpecific
tls:
- secretName: prometheus-general-tls
hosts:
- prometheus.{{ .Values.cluster_config.domain }}
# Per-replica ingresses are not created.
ingressPerReplica:
enabled: false
pathType: ImplementationSpecific
# Settings for the Prometheus resource itself.
prometheusSpec:
tolerations: []
# - key: unschedulable
# operator: Exists
# effect: NoSchedule
# Secret made available to Prometheus; the kubeEtcd serviceMonitor in this
# file reads its certificate files from
# /etc/prometheus/secrets/etcd-client-cert/.
secrets:
- etcd-client-cert
# Persistent storage sized by prometheus.storage.size.
storageSpec:
volumeClaimTemplate:
spec:
accessModes: ["ReadWriteOnce"]
resources:
requests:
storage: {{ .Values.prometheus.storage.size }}
# Optional feature flags, one list entry per item in prometheus.enableFeatures.
{{- with .Values.prometheus.enableFeatures}}
enableFeatures:
{{- range . }}
- {{ . }}
{{- end }}
{{- end }}
## External labels to add to any time series or alerts when communicating with external systems
##
externalLabels:
cluster: {{ .Values.cluster_config.cluster }}
## Name of the external label used to denote replica name
##
replicaExternalLabelName: ""
## If true, the Operator won't add the external label used to denote replica name
##
replicaExternalLabelNameClear: true
## Name of the external label used to denote Prometheus instance name
##
prometheusExternalLabelName: ""
## If true, the Operator won't add the external label used to denote Prometheus instance name
##
prometheusExternalLabelNameClear: true
# NOTE(review): with these set to false the selectors presumably stop being
# restricted to this Helm release's labels -- confirm against the chart docs.
serviceMonitorSelectorNilUsesHelmValues: false
podMonitorSelectorNilUsesHelmValues: false
ruleSelectorNilUsesHelmValues: false
## Thanos configuration allows configuring various aspects of a Prometheus server in a Thanos environment.
## This section is experimental; it may change significantly, without deprecation notice or backward compatibility, in any release.
## ref: https://github.com/prometheus-operator/prometheus-operator/blob/master/Documentation/api.md#thanosspec
##
# Object storage configuration is read from key thanos.yaml of the
# thanos-objstore-config secret.
{{- if .Values.prometheus.thanos.enabled }}
thanos:
objectStorageConfig:
key: thanos.yaml
name: thanos-objstore-config
{{- end }}
# Disabled remote-write example. Helm still evaluates the template action
# inside the comment below.
# remoteWrite:
# - url: https://thanos-receive.k1.itpartner.no/api/v1/receive
# name: {{ .Values.cluster_config.cluster }}
# Optional extra scrape configs, copied verbatim from
# prometheus.additionalScrapeConfigs.
{{- with .Values.prometheus.additionalScrapeConfigs}}
additionalScrapeConfigs:
{{- toYaml . | nindent 12 }}
{{- end }}
# Argo CD project and sync behaviour for the prometheus Application.
project: sys
syncPolicy:
# Label applied to the namespace that Argo CD manages for this Application.
managedNamespaceMetadata:
labels:
component: sys
syncOptions:
- ServerSideApply=true
- CreateNamespace=true
- ApplyOutOfSyncOnly=true
# Automated sync (with pruning) only when prometheus.autosync is set;
# self-heal is left commented out.
{{- if .Values.prometheus.autosync }}
automated:
prune: true
# selfHeal: false
{{- end }}
# Fields excluded from diffing: container resources on Deployments,
# endpoint relabelings on ServiceMonitors, and the webhook caBundle
# (presumably because cert-manager injects it -- confirm).
ignoreDifferences:
- group: apps
kind: Deployment
jqPathExpressions:
- '.spec.template.spec.containers[]?.resources'
- group: monitoring.coreos.com
kind: ServiceMonitor
jqPathExpressions:
- '.spec.endpoints[]?.relabelings'
- group: admissionregistration.k8s.io
kind: MutatingWebhookConfiguration
jqPathExpressions:
- '.webhooks[]?.clientConfig.caBundle'
- group: admissionregistration.k8s.io
kind: ValidatingWebhookConfiguration
jqPathExpressions:
- '.webhooks[]?.clientConfig.caBundle'
---
# Second Argo CD Application: the prometheus-operator-crds chart, pinned to
# prometheus.crd_version. The main Application in this file skips CRDs.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
name: prometheus-crd
namespace: argocd
annotations:
# Negative sync wave; presumably so the CRDs are applied before resources
# in the default wave -- confirm.
argocd.argoproj.io/sync-wave: "-1"
finalizers:
- resources-finalizer.argocd.argoproj.io
spec:
destination:
namespace: prometheus
server: 'https://kubernetes.default.svc'
source:
repoURL: 'https://prometheus-community.github.io/helm-charts'
targetRevision: '{{ .Values.prometheus.crd_version }}'
chart: prometheus-operator-crds
project: sys
syncPolicy:
managedNamespaceMetadata:
labels:
component: sys
# Automated sync is always on here (empty options map), unlike the main
# Application where it depends on prometheus.autosync.
automated: {}
syncOptions:
- ServerSideApply=true
- CreateNamespace=true
- ApplyOutOfSyncOnly=true
# Closes the prometheus.enabled guard opened at the top of the file.
{{- end }}