fix: Add rules and remove CPU limit for x509
This commit is contained in:
@@ -0,0 +1,70 @@
|
||||
groups:
# Rule group; its alert definitions are listed under `rules`.
- name: node-resource-utilization.rules
  rules:
|
||||
- alert: HostHighCpuLoad
  # Per instance: sum over the non-idle modes of the per-mode average CPU
  # rate (2m window), compared against 0.9. The multiplication with
  # node_uname_info copies the `nodename` label onto the result.
  expr: >-
    (sum by (instance) (avg by (mode, instance) (rate(node_cpu_seconds_total{mode!="idle"}[2m])))
    > 0.9) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
  # The condition must hold for 10 minutes before the alert fires.
  for: 10m
  labels:
    severity: critical
  annotations:
    summary: 'Host high CPU load (instance {{ $labels.instance }})'
    description: |-
      CPU load is > 90%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
|
||||
- alert: MemoryUtilizationHighWarning
  # Available memory as a percentage of total memory, below 10%.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  # The condition must hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    # Warning tier: the alert name and summary both say "warning", and the
    # 5% threshold is already covered by MemoryUtilizationHighCritical.
    severity: warning
  annotations:
    summary: Node Memory utilization warning
    description: 'Node {{ $labels.instance }} has less than 10% available memory.'
    # Grafana Explore link: top 10 containers by memory usage for the node.
    # NOTE(review): the URL was cut off in the middle of the label matchers
    # ("...POD%5{"), leaving a broken template action. The closing `%5C%22`
    # and the `instance%3D%5C%22{{` matcher were reconstructed; confirm the
    # label name (`instance` vs. e.g. `node`) against the original file.
    dashboard: >-
      https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{
      $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
- alert: MemoryUtilizationHighCritical
  # Available memory as a percentage of total memory, below 5%.
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 5
  # The condition must hold for 1 minute before the alert fires.
  for: 1m
  labels:
    severity: critical
  annotations:
    summary: Node Memory utilization critical
    description: 'Node {{ $labels.instance }} has less than 5% available memory.'
    # Grafana Explore link: top 10 containers by memory usage for the node.
    # NOTE(review): the URL was cut off in the middle of the label matchers
    # ("...POD%5{"), leaving a broken template action. The closing `%5C%22`
    # and the `instance%3D%5C%22{{` matcher were reconstructed; confirm the
    # label name (`instance` vs. e.g. `node`) against the original file.
    dashboard: >-
      https://grafana.ads1.itpartner.no/explore?orgId=1&left=%7B%22datasource%22:%22Prometheus%22,%22queries%22:%5B%7B%22refId%22:%22A%22,%22instant%22:true,%22range%22:true,%22exemplar%22:false,%22expr%22:%22topk(10,%20sum(container_memory_usage_bytes%7Bcontainer!%3D%5C%22%5C%22,%20container!%3D%5C%22POD%5C%22,%20instance%3D%5C%22{{
      $labels.instance }}%5C%22%7D)%20by%20(container,%20pod,%20namespace))%22%7D%5D,%22range%22:%7B%22from%22:%22now-1h%22,%22to%22:%22now%22%7D%7D
|
||||
- alert: NodeNotReady
  # First branch: per node, the Ready/"true" series never rose above 0 in the
  # last 3 minutes. Second branch: no Ready/"true" series exists at all.
  # The absent() branch carries no `node` label, so {{ $labels.node }} renders
  # empty when that branch is the one firing.
  expr: >-
    (sum(max_over_time(kube_node_status_condition{condition="Ready",status="true"}[3m])
    <= 0) by (node)) or (absent(kube_node_status_condition{condition="Ready",status="true"}))
    > 0
  # The condition must hold for 5 minutes before the alert fires.
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: Node has been in not-ready state for longer than 3 minutes
    # Previously read "has CPU utilization over 90%", which was copied from a
    # CPU alert and did not describe this rule.
    description: 'Node {{ $labels.node }} has not been Ready for more than 3 minutes.'
|
||||
- alert: KubernetesNodeMemoryPressure
  # Matches nodes whose MemoryPressure condition series with status="true"
  # has the value 1.
  expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1'
  # The condition must hold for 2 minutes before the alert fires.
  for: 2m
  labels:
    severity: critical
  annotations:
    summary: 'Kubernetes Node memory pressure (instance {{ $labels.instance }})'
    description: |-
      Node {{ $labels.node }} has MemoryPressure condition
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
|
||||
- alert: KubernetesContainerOomKiller
  # Left side: the container restart counter grew by at least 1 compared with
  # its value 10 minutes ago. Right side: the last-terminated-reason series
  # for "OOMKilled" stayed at 1 across the same 10-minute window. The `reason`
  # label is ignored when matching the two sides.
  expr: >-
    (kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total
    offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m])
    == 1
  # No hold period: fires on the first evaluation where the expression matches.
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: 'Kubernetes Container oom killer (instance {{ $labels.instance }})'
    description: |-
      Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
|
||||
Reference in New Issue
Block a user