# File-viewer metadata for the original listing: 77 lines, 3.9 KiB, YAML
# Prometheus alerting rules covering host CPU load, node memory availability,
# Kubernetes node readiness / memory pressure, and container OOM kills.
groups:
  - name: node-resource-utilization.rules
    rules:
      # Per-instance sum of the non-idle CPU rate (averaged per mode over a
      # 2m window) above 0.9, sustained for 10m. The join against
      # node_uname_info attaches the `nodename` label to the result.
      - alert: HostHighCpuLoad
        annotations:
          description: |-
            CPU load is > 90%
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Host high CPU load (instance {{ $labels.instance }})
        expr: >-
          (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
          > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
        for: 10m
        labels:
          severity: critical

      # MemAvailable below 10% of MemTotal for 5m. This is the lower tier of
      # the two memory rules; the 5% rule below carries `critical`.
      - alert: MemoryUtilizationHighWarning
        annotations:
          # NOTE(review): the source listing was truncated between
          # "POD%5" and "{ $labels.instance }}". The span
          # "C%22,%20instance%3D%5C%22{" was reconstructed; the label name
          # `instance` in the matcher is an assumption - confirm against the
          # deployed rule before relying on this link.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 10% available memory.
          summary: Node Memory utilization warning
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          # Was `critical`, which made this rule indistinguishable from
          # MemoryUtilizationHighCritical for routing purposes.
          severity: warning

      # MemAvailable below 5% of MemTotal for 1m.
      - alert: MemoryUtilizationHighCritical
        annotations:
          # NOTE(review): the source listing was truncated between
          # "POD%5" and "{ $labels.instance }}". The span
          # "C%22,%20instance%3D%5C%22{" was reconstructed; the label name
          # `instance` in the matcher is an assumption - confirm against the
          # deployed rule before relying on this link.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 5% available memory.
          summary: Node Memory utilization critical
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 1m
        labels:
          severity: critical

      # Left operand: nodes whose Ready/"true" series never rose above 0 in
      # the last 3m. Right operand: the Ready/"true" series is absent
      # altogether. Comparison operators bind tighter than `or` in PromQL, so
      # the trailing `> 0` applies to the absent() operand only.
      - alert: NodeNotReady
        annotations:
          description: Node {{ $labels.node }} has not reported a Ready condition for more than 3 minutes.
          summary: Node has been in not-ready state for longer than 3 minutes
        expr: >-
          (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
          <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
          > 0
        for: 5m
        labels:
          severity: critical

      # Node reports the MemoryPressure condition as true for 2m.
      - alert: KubernetesNodeMemoryPressure
        annotations:
          description: |-
            Node {{ $labels.node }} has MemoryPressure condition
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical

      # Container restart counter grew by at least 1 over 10m AND the last
      # terminated reason was OOMKilled for that whole window. `for: 0m`
      # fires on the first evaluation that matches.
      - alert: KubernetesContainerOomKiller
        annotations:
          description: |-
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
        expr: >-
          (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
          offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
          == 1
        for: 0m
        labels:
          severity: warning