Files
manifests/rules/node-resource-utilization.yaml

77 lines
3.9 KiB
YAML

# Prometheus-style alerting rules: one rule group whose alerts cover node CPU
# load, node memory availability, node readiness, node memory pressure and
# container OOM kills.
groups:
# Name of the rule group; the alert entries below belong to its `rules` list.
- name: node-resource-utilization.rules
rules:
- alert: HostHighCpuLoad
  # Per instance: sum of the per-mode average CPU rate over 2m, excluding the
  # idle mode. Instances above 0.9 are joined with node_uname_info so the
  # resulting series also carries the nodename label.
  expr: >-
    (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.9)
    * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  # The condition has to hold for 10 minutes before the alert fires.
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: 'Host high CPU load (instance {{ $labels.instance }})'
    description: |-
      CPU load is > 90%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
- alert: MemoryUtilizationHighWarning
  # Early-warning tier: available memory is below 10% of total memory.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  # The condition has to hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    # Warning tier. The 5% rule (MemoryUtilizationHighCritical) carries
    # severity critical; this 10% rule previously used the same severity,
    # which made the two tiers indistinguishable.
    severity: warning
  annotations:
    summary: Node Memory utilization warning
    description: Node {{ $labels.instance }} has less than 10% available memory.
    # Grafana Explore link listing the top 10 memory consumers.
    # NOTE(review): the URL looks truncated around "POD%5{" - the template
    # opens with a single "{" and the label matcher that should precede
    # "$labels.instance }}" is missing. Kept byte-for-byte; restore the
    # original link from its source of truth.
    dashboard: >-
      https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
      $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
- alert: MemoryUtilizationHighCritical
  # Critical tier: available memory is below 5% of total memory.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
  # The condition has to hold for 1 minute before the alert fires.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Node Memory utilization critical
    description: Node {{ $labels.instance }} has less than 5% available memory.
    # Grafana Explore link listing the top 10 memory consumers.
    # NOTE(review): the URL looks truncated around "POD%5{" - the template
    # opens with a single "{" and the label matcher that should precede
    # "$labels.instance }}" is missing. Value reproduced unchanged; confirm
    # against the original link.
    dashboard: >-
      https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5{
      $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
- alert: NodeNotReady
  # Matches nodes whose Ready condition (status="true") never rose above 0
  # during the last 3 minutes, or the case where that series is absent
  # altogether. In the absent() case there is no `node` label to report.
  expr: >-
    (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) <= 0) by (node))
    or (absent(kube_node_status_condition{condition="Ready",status="true"})) > 0
  # NOTE(review): the summary mentions 3 minutes (the max_over_time window),
  # while `for` adds a further 5 minute hold before firing - confirm the
  # wording is intended.
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Node has been in not-ready state for longer than 3 minutes
    # The previous text ("has CPU utilization over 90%") was copied from a CPU
    # alert and did not describe this rule.
    description: Node {{ $labels.node }} is not in the Ready state.
- alert: KubernetesNodeMemoryPressure
  # Selects node condition series where MemoryPressure has status "true" and
  # the sample value equals 1.
  expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
  # The condition has to hold for 2 minutes before the alert fires.
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Kubernetes Node memory pressure (instance {{ $labels.instance }})'
    description: |-
      Node {{ $labels.node }} has MemoryPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
- alert: KubernetesContainerOomKiller
  # Left side: the container restart counter grew by at least 1 compared with
  # its value 10 minutes ago. Right side: the last-terminated reason was
  # OOMKilled throughout the 10m window. The `reason` label is ignored when
  # matching the two sides.
  expr: >-
    (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
    and ignoring (reason)
    min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
  # Zero hold time: the alert fires as soon as the expression matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Kubernetes Container oom killer (instance {{ $labels.instance }})'
    description: |-
      Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}