diff --git a/rules/etcd.yaml b/rules/etcd.yaml new file mode 100644 index 00000000..3cf28cb0 --- /dev/null +++ b/rules/etcd.yaml @@ -0,0 +1,183 @@ +groups: +- name: etcd + rules: + - alert: etcdMembersDown + annotations: + description: 'etcd cluster "{{ $labels.job }}": members are down ({{ $value + }}).' + summary: etcd cluster members are down. + expr: |- + max without (endpoint) ( + sum without (instance) (up{job=~".*etcd.*"} == bool 0) + or + count without (To) ( + sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 + ) + ) + > 0 + for: 10m + labels: + severity: critical + - alert: etcdInsufficientMembers + annotations: + description: 'etcd cluster "{{ $labels.job }}": insufficient members ({{ $value + }}).' + summary: etcd cluster has insufficient number of members. + expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) + without (instance) + 1) / 2) + for: 3m + labels: + severity: critical + - alert: etcdNoLeader + annotations: + description: 'etcd cluster "{{ $labels.job }}": member {{ $labels.instance }} + has no leader.' + summary: etcd cluster has no leader. + expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 + for: 1m + labels: + severity: critical + - alert: etcdHighNumberOfLeaderChanges + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} leader changes + within the last 15 minutes. Frequent elections may be a sign of insufficient + resources, high network latency, or disruptions by other components and should + be investigated.' + summary: etcd cluster has high number of leader changes. + expr: increase((max without (instance) (etcd_server_leader_changes_seen_total{job=~".*etcd.*"}) + or 0*absent(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}))[15m:1m]) + >= 4 + for: 5m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for + {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of failed grpc requests. + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 1 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedGRPCRequests + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }}% of requests for + {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of failed grpc requests. + expr: |- + 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) + / + sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) + > 5 + for: 5m + labels: + severity: critical + - alert: etcdGRPCRequestsSlow + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile of gRPC requests + is {{ $value }}s on etcd instance {{ $labels.instance }} for {{ $labels.grpc_method + }} method.' + summary: etcd grpc requests are slow + expr: |- + histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) + > 0.15 + for: 10m + labels: + severity: critical + - alert: etcdMemberCommunicationSlow + annotations: + description: 'etcd cluster "{{ $labels.job }}": member communication with {{ + $labels.To }} is taking {{ $value }}s on etcd instance {{ $labels.instance + }}.' + summary: etcd cluster member communication is slow. + expr: |- + histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.15 + for: 10m + labels: + severity: warning + - alert: etcdHighNumberOfFailedProposals + annotations: + description: 'etcd cluster "{{ $labels.job }}": {{ $value }} proposal failures + within the last 30 minutes on etcd instance {{ $labels.instance }}.' + summary: etcd cluster has high number of proposal failures. + expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 + for: 15m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations + are {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile fsync durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.5 + for: 10m + labels: + severity: warning + - alert: etcdHighFsyncDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile fsync durations + are {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile fsync durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 1 + for: 10m + labels: + severity: critical + - alert: etcdHighCommitDurations + annotations: + description: 'etcd cluster "{{ $labels.job }}": 99th percentile commit durations + {{ $value }}s on etcd instance {{ $labels.instance }}.' + summary: etcd cluster 99th percentile commit durations are too high. + expr: |- + histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) + > 0.25 + for: 10m + labels: + severity: warning + - alert: etcdDatabaseQuotaLowSpace + annotations: + description: 'etcd cluster "{{ $labels.job }}": database size exceeds the defined + quota on etcd instance {{ $labels.instance }}, please defrag or increase the + quota as the writes to etcd will be disabled when it is full.' + summary: etcd cluster database is running full. + expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m]) / + last_over_time(etcd_server_quota_backend_bytes{job=~".*etcd.*"}[5m]))*100 > + 95 + for: 10m + labels: + severity: critical + - alert: etcdExcessiveDatabaseGrowth + annotations: + description: 'etcd cluster "{{ $labels.job }}": Predicting running out of disk + space in the next four hours, based on write observations within the past + four hours on etcd instance {{ $labels.instance }}, please check as it might + be disruptive.' + summary: etcd cluster database growing very fast. + expr: predict_linear(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[4h], 4*60*60) + > etcd_server_quota_backend_bytes{job=~".*etcd.*"} + for: 10m + labels: + severity: warning + - alert: etcdDatabaseHighFragmentationRatio + annotations: + description: 'etcd cluster "{{ $labels.job }}": database size in use on instance + {{ $labels.instance }} is {{ $value | humanizePercentage }} of the actual + allocated disk space, please run defragmentation (e.g. etcdctl defrag) to + retrieve the unused fragmented disk space.' + runbook_url: https://etcd.io/docs/v3.5/op-guide/maintenance/#defragmentation + summary: etcd database size in use is less than 50% of the actual allocated + storage. + expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"}[5m]) + / last_over_time(etcd_mvcc_db_total_size_in_bytes{job=~".*etcd.*"}[5m])) < 0.5 + and etcd_mvcc_db_total_size_in_use_in_bytes{job=~".*etcd.*"} > 104857600 + for: 10m + labels: + severity: warning \ No newline at end of file diff --git a/rules/general.yaml b/rules/general.yaml new file mode 100644 index 00000000..1c692e6f --- /dev/null +++ b/rules/general.yaml @@ -0,0 +1,43 @@ +groups: +- name: general.rules + rules: + - alert: TargetDown + annotations: + description: '{{ printf "%.4g" $value }}% of the {{ $labels.job }}/{{ $labels.service + }} targets in {{ $labels.namespace }} namespace are down.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/targetdown + summary: One or more targets are unreachable. + expr: 100 * (count(up == 0) BY (cluster, job, namespace, service) / count(up) + BY (cluster, job, namespace, service)) > 10 + for: 10m + labels: + severity: warning + - alert: Watchdog + annotations: + description: | + This is an alert meant to ensure that the entire alerting pipeline is functional. + This alert is always firing, therefore it should always be firing in Alertmanager + and always fire against a receiver. There are integrations with various notification + mechanisms that send a notification when this alert is not firing. For example the + "DeadMansSnitch" integration in PagerDuty. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/watchdog + summary: An alert that should always be firing to certify that Alertmanager + is working properly. + expr: vector(1) + labels: + severity: none + - alert: InfoInhibitor + annotations: + description: | + This is an alert that is used to inhibit info alerts. + By themselves, the info-level alerts are sometimes very noisy, but they are relevant when combined with + other alerts. + This alert fires whenever there's a severity="info" alert, and stops firing when another alert with a + severity of 'warning' or 'critical' starts firing on the same namespace. + This alert should be routed to a null receiver and configured to inhibit alerts with severity="info". + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/general/infoinhibitor + summary: Info-level alert inhibition. + expr: ALERTS{severity = "info"} == 1 unless on (namespace) ALERTS{alertname != + "InfoInhibitor", severity =~ "warning|critical", alertstate="firing"} == 1 + labels: + severity: none \ No newline at end of file diff --git a/rules/kubernetes-apps.yaml b/rules/kubernetes-apps.yaml new file mode 100644 index 00000000..896f9b7e --- /dev/null +++ b/rules/kubernetes-apps.yaml @@ -0,0 +1,262 @@ +groups: +- name: kubernetes-apps + rules: + - alert: KubePodCrashLooping + annotations: + description: 'Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container + }}) is in waiting state (reason: "CrashLoopBackOff").' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodcrashlooping + summary: Pod is crash looping. + expr: max_over_time(kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff", + job="kube-state-metrics", namespace=~".*"}[5m]) >= 1 + for: 15m + labels: + severity: warning + - alert: KubePodNotReady + annotations: + description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready + state for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepodnotready + summary: Pod has been in a non-ready state for more than 15 minutes. + expr: |- + sum by (namespace, pod, cluster) ( + max by (namespace, pod, cluster) ( + kube_pod_status_phase{job="kube-state-metrics", namespace=~".*", phase=~"Pending|Unknown|Failed"} + ) * on (namespace, pod, cluster) group_left(owner_kind) topk by (namespace, pod, cluster) ( + 1, max by (namespace, pod, owner_kind, cluster) (kube_pod_owner{owner_kind!="Job"}) + ) + ) > 0 + for: 15m + labels: + severity: warning + - alert: KubeDeploymentGenerationMismatch + annotations: + description: Deployment generation for {{ $labels.namespace }}/{{ $labels.deployment + }} does not match, this indicates that the Deployment has failed but has not + been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentgenerationmismatch + summary: Deployment generation mismatch due to possible roll-back + expr: |- + kube_deployment_status_observed_generation{job="kube-state-metrics", namespace=~".*"} + != + kube_deployment_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeDeploymentReplicasMismatch + annotations: + description: Deployment {{ $labels.namespace }}/{{ $labels.deployment }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentreplicasmismatch + summary: Deployment has not matched the expected number of replicas. + expr: |- + ( + kube_deployment_spec_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_deployment_status_replicas_available{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_deployment_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDeploymentRolloutStuck + annotations: + description: Rollout of deployment {{ $labels.namespace }}/{{ $labels.deployment + }} is not progressing for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedeploymentrolloutstuck + summary: Deployment rollout is not progressing. + expr: |- + kube_deployment_status_condition{condition="Progressing", status="false",job="kube-state-metrics", namespace=~".*"} + != 0 + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetReplicasMismatch + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} has + not matched the expected number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetreplicasmismatch + summary: StatefulSet has not matched the expected number of replicas. + expr: |- + ( + kube_statefulset_status_replicas_ready{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas{job="kube-state-metrics", namespace=~".*"} + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[10m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetGenerationMismatch + annotations: + description: StatefulSet generation for {{ $labels.namespace }}/{{ $labels.statefulset + }} does not match, this indicates that the StatefulSet has failed but has + not been rolled back. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetgenerationmismatch + summary: StatefulSet generation mismatch due to possible roll-back + expr: |- + kube_statefulset_status_observed_generation{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_metadata_generation{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning + - alert: KubeStatefulSetUpdateNotRolledOut + annotations: + description: StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} update + has not been rolled out. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubestatefulsetupdatenotrolledout + summary: StatefulSet update has not been rolled out. + expr: |- + ( + max by (namespace, statefulset) ( + kube_statefulset_status_current_revision{job="kube-state-metrics", namespace=~".*"} + unless + kube_statefulset_status_update_revision{job="kube-state-metrics", namespace=~".*"} + ) + * + ( + kube_statefulset_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_statefulset_status_replicas_updated{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeDaemonSetRolloutStuck + annotations: + description: DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not + finished or progressed for at least 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetrolloutstuck + summary: DaemonSet rollout is stuck. + expr: |- + ( + ( + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} + != + 0 + ) or ( + kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) or ( + kube_daemonset_status_number_available{job="kube-state-metrics", namespace=~".*"} + != + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + ) + ) and ( + changes(kube_daemonset_status_updated_number_scheduled{job="kube-state-metrics", namespace=~".*"}[5m]) + == + 0 + ) + for: 15m + labels: + severity: warning + - alert: KubeContainerWaiting + annotations: + description: pod/{{ $labels.pod }} in namespace {{ $labels.namespace }} on container + {{ $labels.container}} has been in waiting state for longer than 1 hour. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecontainerwaiting + summary: Pod container waiting longer than 1 hour + expr: sum by (namespace, pod, container, cluster) (kube_pod_container_status_waiting_reason{job="kube-state-metrics", + namespace=~".*"}) > 0 + for: 1h + labels: + severity: warning + - alert: KubeDaemonSetNotScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are not scheduled.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetnotscheduled + summary: DaemonSet pods are not scheduled. + expr: |- + kube_daemonset_status_desired_number_scheduled{job="kube-state-metrics", namespace=~".*"} + - + kube_daemonset_status_current_number_scheduled{job="kube-state-metrics", namespace=~".*"} > 0 + for: 10m + labels: + severity: warning + - alert: KubeDaemonSetMisScheduled + annotations: + description: '{{ $value }} Pods of DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset + }} are running where they are not supposed to run.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubedaemonsetmisscheduled + summary: DaemonSet pods are misscheduled. + expr: kube_daemonset_status_number_misscheduled{job="kube-state-metrics", namespace=~".*"} + > 0 + for: 15m + labels: + severity: warning + - alert: KubeJobNotCompleted + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} is taking more + than {{ "43200" | humanizeDuration }} to complete. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobnotcompleted + summary: Job did not complete in time + expr: |- + time() - max by (namespace, job_name, cluster) (kube_job_status_start_time{job="kube-state-metrics", namespace=~".*"} + and + kube_job_status_active{job="kube-state-metrics", namespace=~".*"} > 0) > 43200 + labels: + severity: warning + - alert: KubeJobFailed + annotations: + description: Job {{ $labels.namespace }}/{{ $labels.job_name }} failed to complete. + Removing failed job after investigation should clear this alert. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubejobfailed + summary: Job failed to complete. + expr: kube_job_failed{job="kube-state-metrics", namespace=~".*"} > 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaReplicasMismatch + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} + has not matched the desired number of replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpareplicasmismatch + summary: HPA has not matched desired number of replicas. + expr: |- + (kube_horizontalpodautoscaler_status_desired_replicas{job="kube-state-metrics", namespace=~".*"} + != + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + > + kube_horizontalpodautoscaler_spec_min_replicas{job="kube-state-metrics", namespace=~".*"}) + and + (kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + < + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"}) + and + changes(kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"}[15m]) == 0 + for: 15m + labels: + severity: warning + - alert: KubeHpaMaxedOut + annotations: + description: HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} + has been running at max replicas for longer than 15 minutes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubehpamaxedout + summary: HPA is running at max replicas + expr: |- + kube_horizontalpodautoscaler_status_current_replicas{job="kube-state-metrics", namespace=~".*"} + == + kube_horizontalpodautoscaler_spec_max_replicas{job="kube-state-metrics", namespace=~".*"} + for: 15m + labels: + severity: warning diff --git a/rules/kubernetes-resources.yaml b/rules/kubernetes-resources.yaml new file mode 100644 index 00000000..048cfbfe --- /dev/null +++ b/rules/kubernetes-resources.yaml @@ -0,0 +1,115 @@ +groups: +- name: kubernetes-resources + rules: + - alert: KubeCPUOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests + for Pods by {{ $value }} CPU shares and cannot tolerate node failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(namespace_cpu:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster) - max(kube_node_status_allocatable{job="kube-state-metrics",resource="cpu"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeMemoryOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted memory resource + requests for Pods by {{ $value | humanize }} bytes and cannot tolerate node + failure. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryovercommit + summary: Cluster has overcommitted memory resource requests. + expr: |- + sum(namespace_memory:kube_pod_container_resource_requests:sum{}) by (cluster) - (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + and + (sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) - max(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster)) > 0 + for: 10m + labels: + severity: warning + - alert: KubeCPUQuotaOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted CPU resource requests + for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubecpuquotaovercommit + summary: Cluster has overcommitted CPU resource requests. + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(cpu|requests.cpu)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="cpu", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeMemoryQuotaOvercommit + annotations: + description: Cluster {{ $labels.cluster }} has overcommitted memory resource + requests for Namespaces. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubememoryquotaovercommit + summary: Cluster has overcommitted memory resource requests. + expr: |- + sum(min without(resource) (kube_resourcequota{job="kube-state-metrics", type="hard", resource=~"(memory|requests.memory)"})) by (cluster) + / + sum(kube_node_status_allocatable{resource="memory", job="kube-state-metrics"}) by (cluster) + > 1.5 + for: 5m + labels: + severity: warning + - alert: KubeQuotaAlmostFull + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaalmostfull + summary: Namespace quota is going to be full. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 0.9 < 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaFullyUsed + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotafullyused + summary: Namespace quota is fully used. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + == 1 + for: 15m + labels: + severity: info + - alert: KubeQuotaExceeded + annotations: + description: Namespace {{ $labels.namespace }} is using {{ $value | humanizePercentage + }} of its {{ $labels.resource }} quota. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubequotaexceeded + summary: Namespace quota has exceeded the limits. + expr: |- + kube_resourcequota{job="kube-state-metrics", type="used"} + / ignoring(instance, job, type) + (kube_resourcequota{job="kube-state-metrics", type="hard"} > 0) + > 1 + for: 15m + labels: + severity: warning + - alert: CPUThrottlingHigh + annotations: + description: '{{ $value | humanizePercentage }} throttling of CPU in namespace + {{ $labels.namespace }} for container {{ $labels.container }} in pod {{ $labels.pod + }}.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/cputhrottlinghigh + summary: Processes experience elevated CPU throttling. + expr: |- + sum(increase(container_cpu_cfs_throttled_periods_total{container!="", }[5m])) by (cluster, container, pod, namespace) + / + sum(increase(container_cpu_cfs_periods_total{}[5m])) by (cluster, container, pod, namespace) + > ( 25 / 100 ) + for: 15m + labels: + severity: info \ No newline at end of file diff --git a/rules/kubernetes-storage.yaml b/rules/kubernetes-storage.yaml new file mode 100644 index 00000000..24f06ca3 --- /dev/null +++ b/rules/kubernetes-storage.yaml @@ -0,0 +1,109 @@ + +groups: +- name: kubernetes-storage + rules: + - alert: KubePersistentVolumeFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is only {{ $value | humanizePercentage }} free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is expected to fill up within four days. Currently {{ $value + | humanizePercentage }} is available. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumefillingup + summary: PersistentVolume is filling up. + expr: |- + ( + kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_capacity_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_used_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_available_bytes{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: The PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} only has {{ $value | humanizePercentage }} free inodes. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.03 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1m + labels: + severity: critical + - alert: KubePersistentVolumeInodesFillingUp + annotations: + description: Based on recent sampling, the PersistentVolume claimed by {{ $labels.persistentvolumeclaim + }} in Namespace {{ $labels.namespace }} {{ with $labels.cluster -}} on Cluster + {{ . }} {{- end }} is expected to run out of inodes within four days. Currently + {{ $value | humanizePercentage }} of its inodes are free. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeinodesfillingup + summary: PersistentVolumeInodes are filling up. + expr: |- + ( + kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"} + / + kubelet_volume_stats_inodes{job="kubelet", namespace=~".*", metrics_path="/metrics"} + ) < 0.15 + and + kubelet_volume_stats_inodes_used{job="kubelet", namespace=~".*", metrics_path="/metrics"} > 0 + and + predict_linear(kubelet_volume_stats_inodes_free{job="kubelet", namespace=~".*", metrics_path="/metrics"}[6h], 4 * 24 * 3600) < 0 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_access_mode{ access_mode="ReadOnlyMany"} == 1 + unless on (cluster, namespace, persistentvolumeclaim) + kube_persistentvolumeclaim_labels{label_excluded_from_alerts="true"} == 1 + for: 1h + labels: + severity: warning + - alert: KubePersistentVolumeErrors + annotations: + description: The persistent volume {{ $labels.persistentvolume }} {{ with $labels.cluster + -}} on Cluster {{ . }} {{- end }} has status {{ $labels.phase }}. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/kubernetes/kubepersistentvolumeerrors + summary: PersistentVolume is having issues with provisioning. + expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} + > 0 + for: 5m + labels: + severity: critical \ No newline at end of file diff --git a/rules/node-exporter.yaml b/rules/node-exporter.yaml new file mode 100644 index 00000000..2a24cb4d --- /dev/null +++ b/rules/node-exporter.yaml @@ -0,0 +1,340 @@ +groups: +- name: node-exporter + rules: + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 24 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 15 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemSpaceFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemspacefillingup + summary: Filesystem is predicted to run out of space within the next 4 hours. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 10 + and + predict_linear(node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 5% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfSpace + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + space left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutofspace + summary: Filesystem has less than 3% space left. + expr: |- + ( + node_filesystem_avail_bytes{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_size_bytes{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 30m + labels: + severity: critical + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 24 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 40 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 24*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemFilesFillingUp + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left and is filling up fast. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemfilesfillingup + summary: Filesystem is predicted to run out of inodes within the next 4 hours. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 20 + and + predict_linear(node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""}[6h], 4*60*60) < 0 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 5% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 5 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: warning + - alert: NodeFilesystemAlmostOutOfFiles + annotations: + description: Filesystem on {{ $labels.device }}, mounted on {{ $labels.mountpoint + }}, at {{ $labels.instance }} has only {{ printf "%.2f" $value }}% available + inodes left. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefilesystemalmostoutoffiles + summary: Filesystem has less than 3% inodes left. + expr: |- + ( + node_filesystem_files_free{job="node-exporter",fstype!="",mountpoint!=""} / node_filesystem_files{job="node-exporter",fstype!="",mountpoint!=""} * 100 < 3 + and + node_filesystem_readonly{job="node-exporter",fstype!="",mountpoint!=""} == 0 + ) + for: 1h + labels: + severity: critical + - alert: NodeNetworkReceiveErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} receive errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworkreceiveerrs + summary: Network interface is reporting many receive errors. + expr: rate(node_network_receive_errs_total{job="node-exporter"}[2m]) / rate(node_network_receive_packets_total{job="node-exporter"}[2m]) + > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeNetworkTransmitErrs + annotations: + description: '{{ $labels.instance }} interface {{ $labels.device }} has encountered + {{ printf "%.0f" $value }} transmit errors in the last two minutes.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodenetworktransmiterrs + summary: Network interface is reporting many transmit errors. + expr: rate(node_network_transmit_errs_total{job="node-exporter"}[2m]) / rate(node_network_transmit_packets_total{job="node-exporter"}[2m]) + > 0.01 + for: 1h + labels: + severity: warning + - alert: NodeHighNumberConntrackEntriesUsed + annotations: + description: '{{ $value | humanizePercentage }} of conntrack entries are used.' + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodehighnumberconntrackentriesused + summary: Number of conntrack are getting close to the limit. + expr: (node_nf_conntrack_entries{job="node-exporter"} / node_nf_conntrack_entries_limit) + > 0.75 + labels: + severity: warning + - alert: NodeTextFileCollectorScrapeError + annotations: + description: Node Exporter text file collector on {{ $labels.instance }} failed + to scrape. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodetextfilecollectorscrapeerror + summary: Node Exporter text file collector failed to scrape. + expr: node_textfile_scrape_error{job="node-exporter"} == 1 + labels: + severity: warning + - alert: NodeClockSkewDetected + annotations: + description: Clock at {{ $labels.instance }} is out of sync by more than 0.05s. + Ensure NTP is configured correctly on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclockskewdetected + summary: Clock skew detected. + expr: |- + ( + node_timex_offset_seconds{job="node-exporter"} > 0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) >= 0 + ) + or + ( + node_timex_offset_seconds{job="node-exporter"} < -0.05 + and + deriv(node_timex_offset_seconds{job="node-exporter"}[5m]) <= 0 + ) + for: 10m + labels: + severity: warning + - alert: NodeClockNotSynchronising + annotations: + description: Clock at {{ $labels.instance }} is not synchronising. Ensure NTP + is configured on this host. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodeclocknotsynchronising + summary: Clock not synchronising. + expr: |- + min_over_time(node_timex_sync_status{job="node-exporter"}[5m]) == 0 + and + node_timex_maxerror_seconds{job="node-exporter"} >= 16 + for: 10m + labels: + severity: warning + - alert: NodeRAIDDegraded + annotations: + description: RAID array '{{ $labels.device }}' at {{ $labels.instance }} is + in degraded state due to one or more disks failures. Number of spare drives + is insufficient to fix issue automatically. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddegraded + summary: RAID Array is degraded. + expr: node_md_disks_required{job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} + - ignoring (state) (node_md_disks{state="active",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}) + > 0 + for: 15m + labels: + severity: critical + - alert: NodeRAIDDiskFailure + annotations: + description: At least one device in RAID array at {{ $labels.instance }} failed. + Array '{{ $labels.device }}' needs attention and possibly a disk swap. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/noderaiddiskfailure + summary: Failed device in RAID array. + expr: node_md_disks{state="failed",job="node-exporter",device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"} + > 0 + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at + {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 70 + ) + for: 15m + labels: + severity: warning + - alert: NodeFileDescriptorLimit + annotations: + description: File descriptors limit at {{ $labels.instance }} is currently at + {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodefiledescriptorlimit + summary: Kernel is predicted to exhaust file descriptors limit soon. + expr: |- + ( + node_filefd_allocated{job="node-exporter"} * 100 / node_filefd_maximum{job="node-exporter"} > 90 + ) + for: 15m + labels: + severity: critical + - alert: NodeCPUHighUsage + annotations: + description: | + CPU usage at {{ $labels.instance }} has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodecpuhighusage + summary: High CPU usage. + expr: sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{job="node-exporter", + mode!="idle"}[2m]))) * 100 > 90 + for: 15m + labels: + severity: info + - alert: NodeSystemSaturation + annotations: + description: | + System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + This might indicate this instance resources saturation and can cause it becoming unresponsive. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemsaturation + summary: System saturated, load per core is very high. + expr: |- + node_load1{job="node-exporter"} + / count without (cpu, mode) (node_cpu_seconds_total{job="node-exporter", mode="idle"}) > 2 + for: 15m + labels: + severity: warning + - alert: NodeMemoryMajorPagesFaults + annotations: + description: | + Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}. + Please check that there is enough memory available at this instance. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememorymajorpagesfaults + summary: Memory major page faults are occurring at very high rate. + expr: rate(node_vmstat_pgmajfault{job="node-exporter"}[5m]) > 500 + for: 15m + labels: + severity: warning + - alert: NodeMemoryHighUtilization + annotations: + description: | + Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodememoryhighutilization + summary: Host is running out of memory. + expr: 100 - (node_memory_MemAvailable_bytes{job="node-exporter"} / node_memory_MemTotal_bytes{job="node-exporter"} + * 100) > 90 + for: 15m + labels: + severity: warning + - alert: NodeDiskIOSaturation + annotations: + description: | + Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 30 minutes, is currently at {{ printf "%.2f" $value }}. + This symptom might indicate disk saturation. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodediskiosaturation + summary: Disk IO queue is high. + expr: rate(node_disk_io_time_weighted_seconds_total{job="node-exporter", device=~"(/dev/)?(mmcblk.p.+|nvme.+|rbd.+|sd.+|vd.+|xvd.+|dm-.+|md.+|dasd.+)"}[5m]) + > 10 + for: 30m + labels: + severity: warning + - alert: NodeSystemdServiceFailed + annotations: + description: Systemd service {{ $labels.name }} has entered failed state at + {{ $labels.instance }} + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodesystemdservicefailed + summary: Systemd service has entered failed state. + expr: node_systemd_unit_state{job="node-exporter", state="failed"} == 1 + for: 5m + labels: + severity: warning + - alert: NodeBondingDegraded + annotations: + description: Bonding interface {{ $labels.master }} on {{ $labels.instance }} + is in degraded state due to one or more slave failures. + runbook_url: https://runbooks.prometheus-operator.dev/runbooks/node/nodebondingdegraded + summary: Bonding interface is degraded + expr: (node_bonding_slaves - node_bonding_active) != 0 + for: 5m + labels: + severity: warning diff --git a/rules/node-resource-utilization.yaml b/rules/node-resource-utilization.yaml new file mode 100644 index 00000000..7a5fd536 --- /dev/null +++ b/rules/node-resource-utilization.yaml @@ -0,0 +1,70 @@ +groups: +- name: node-resource-utilization.rules + rules: + - alert: HostHighCpuLoad + annotations: + description: |- + CPU load is > 90% + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Host high CPU load (instance {{ $labels.instance }}) + expr: (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) + > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"} + for: 10m + labels: + severity: critical + - alert: MemoryUtilizationHighWarning + annotations: + dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ + $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D + description: Node {{ $labels.instance }} has less than 10% available memory. + summary: Node Memory utilization warning + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: critical + - alert: MemoryUtilizationHighCritical + annotations: + dashboard: https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{ + $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D + description: Node {{ $labels.instance }} has less than 5% available memory. + summary: Node Memory utilization critical + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5 + for: 1m + labels: + severity: critical + - alert: NodeNotReady + annotations: + description: Node {{ $labels.node }} has CPU utilization over 90%. + summary: Node has been in not-ready state for longer than 3 minutes + expr: (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) + <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"})) + > 0 + for: 5m + labels: + severity: critical + - alert: KubernetesNodeMemoryPressure + annotations: + description: |- + Node {{ $labels.node }} has MemoryPressure condition + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Kubernetes Node memory pressure (instance {{ $labels.instance }}) + expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == + 1 + for: 2m + labels: + severity: critical + - alert: KubernetesContainerOomKiller + annotations: + description: |- + Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes. + VALUE = {{ $value }} + LABELS = {{ $labels }} + summary: Kubernetes Container oom killer (instance {{ $labels.instance }}) + expr: (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total + offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) + == 1 + for: 0m + labels: + severity: warning diff --git a/rules/velero.yaml b/rules/velero.yaml new file mode 100644 index 00000000..1c8eb91c --- /dev/null +++ b/rules/velero.yaml @@ -0,0 +1,21 @@ +groups: +- name: velero + rules: + - alert: VeleroBackupPartialFailures + annotations: + message: Velero backup {{ $labels.schedule }} has {{$value | humanizePercentage}} partialy + failed backups. + expr: velero_backup_partial_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} + > 0.25 + for: 15m + labels: + severity: critical + - alert: VeleroBackupFailures + annotations: + message: Velero backup {{$labels.schedule}} has {{$value | humanizePercentage}} failed + backups. + expr: velero_backup_failure_total{schedule!=""} / velero_backup_attempt_total{schedule!=""} + > 0.25 + for: 15m + labels: + severity: critical diff --git a/rules/x509-exporter.yaml b/rules/x509-exporter.yaml new file mode 100644 index 00000000..90cb7064 --- /dev/null +++ b/rules/x509-exporter.yaml @@ -0,0 +1,46 @@ +groups: +- name: x509-certificate-exporter.rules + rules: + - alert: X509ExporterReadErrors + annotations: + description: Over the last 15 minutes, this x509-certificate-exporter instance + has experienced errors reading certificate files or querying the Kubernetes + API. This could be caused by a misconfiguration if triggered when the exporter + starts. + summary: Increasing read errors for x509-certificate-exporter + expr: delta(x509_read_errors[15m]) > 0 + for: 5m + labels: + severity: warning + - alert: CertificateError + annotations: + description: Certificate could not be decoded {{if $labels.secret_name }} in + Kubernetes secret "{{ $labels.secret_namespace }}/{{ $labels.secret_name }}"{{else}}at + location "{{ $labels.filepath }}"{{end}} + summary: Certificate cannot be decoded + expr: x509_cert_error > 0 + for: 15m + labels: + severity: warning + - alert: CertificateRenewal + annotations: + description: Certificate for "{{ $labels.subject_CN }}" should be renewed {{if + $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ + $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} + summary: Certificate should be renewed + expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", + issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 28 + for: 15m + labels: + severity: warning + - alert: CertificateExpiration + annotations: + description: Certificate for "{{ $labels.subject_CN }}" is about to expire {{if + $labels.secret_name }}in Kubernetes secret "{{ $labels.secret_namespace }}/{{ + $labels.secret_name }}"{{else}}at location "{{ $labels.filepath }}"{{end}} + summary: Certificate is about to expire + expr: ((x509_cert_not_after{secret_name!="linkerd-identity-issuer", issuer_O="", + issuer_CN!="webhook.linkerd.cluster.local"} - time()) / 86400) < 14 + for: 15m + labels: + severity: critical diff --git a/values/x509-exporter/values/x509-exporter.yaml.gotmpl b/values/x509-exporter/values/x509-exporter.yaml.gotmpl index 0e7ff0f3..173a6e9a 100644 --- a/values/x509-exporter/values/x509-exporter.yaml.gotmpl +++ b/values/x509-exporter/values/x509-exporter.yaml.gotmpl @@ -4,8 +4,6 @@ secretsExporter: excludeLabels: - cert-manager.io/* resources: - limits: - memory: 100Mi requests: cpu: 20m memory: 100Mi