---
# Prometheus alerting rules for node-level resource utilization and
# Kubernetes node/container health.
#
# Long `expr` values use folded scalars (`>-`): the line breaks fold to
# single spaces, so each expression is loaded as one single-line PromQL string.
groups:
  - name: node-resource-utilization.rules
    rules:
      # Fires when the per-instance sum over non-idle modes of the
      # per-mode average CPU rate (2m window) stays above 0.9 for 10 minutes.
      # The join with node_uname_info copies the `nodename` label onto the
      # result.
      - alert: HostHighCpuLoad
        annotations:
          description: |-
            CPU load is > 90%
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
          summary: Host high CPU load (instance {{ $labels.instance }})
        expr: >-
          (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m]))) > 0.9)
          * on(instance) group_left (nodename)
          node_uname_info{nodename=~".+"}
        for: 10m
        labels:
          severity: critical

      # Fires when available memory stays below 10% of total for 5 minutes.
      - alert: MemoryUtilizationHighWarning
        annotations:
          # Grafana Explore link: top 10 containers by memory usage over the
          # last hour. The instance matcher was reconstructed from a corrupted
          # template (`POD%5{ $labels.instance }}`).
          # NOTE(review): assumes container_memory_usage_bytes carries an
          # `instance` label matching this alert's instance - confirm.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 10% available memory.
          summary: Node Memory utilization warning
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
        for: 5m
        labels:
          # NOTE(review): alert name and summary say "warning" but the
          # severity is critical - confirm this is intentional before changing,
          # since it affects alert routing.
          severity: critical

      # Fires when available memory stays below 5% of total for 1 minute.
      - alert: MemoryUtilizationHighCritical
        annotations:
          # Grafana Explore link: top 10 containers by memory usage over the
          # last hour. The instance matcher was reconstructed from a corrupted
          # template (`POD%5{ $labels.instance }}`).
          # NOTE(review): assumes container_memory_usage_bytes carries an
          # `instance` label matching this alert's instance - confirm.
          dashboard: 'https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{ $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D'
          description: Node {{ $labels.instance }} has less than 5% available memory.
          summary: Node Memory utilization critical
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
        for: 1m
        labels:
          severity: critical

      # Fires when a node's Ready/true condition has had a 3m maximum of <= 0,
      # or when the Ready/true series is absent altogether. The trailing
      # `> 0` applies to the absent() branch only, because comparison
      # operators bind tighter than `or`.
      # NOTE(review): the 3m lookback plus `for: 5m` means the alert fires
      # later than the "3 minutes" stated in the summary - confirm intent.
      - alert: NodeNotReady
        annotations:
          description: Node {{ $labels.node }} has been in not-ready state.
          summary: Node has been in not-ready state for longer than 3 minutes
        expr: >-
          (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m]) <= 0) by (node))
          or (absent(kube_node_status_condition{condition="Ready",status="true"})) > 0
        for: 5m
        labels:
          severity: critical

      # Fires when a node reports the MemoryPressure condition as true for
      # 2 minutes.
      - alert: KubernetesNodeMemoryPressure
        annotations:
          description: |-
            Node {{ $labels.node }} has MemoryPressure condition
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
          summary: Kubernetes Node memory pressure (instance {{ $labels.instance }})
        expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
        for: 2m
        labels:
          severity: critical

      # Fires immediately (`for: 0m`) when a container's restart counter grew
      # by at least 1 over the last 10 minutes and its last-terminated reason
      # was OOMKilled throughout that window.
      - alert: KubernetesContainerOomKiller
        annotations:
          description: |-
            Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
          summary: Kubernetes Container oom killer (instance {{ $labels.instance }})
        expr: >-
          (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1)
          and ignoring (reason)
          min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        for: 0m
        labels:
          severity: warning