# Alert rule definitions.
#
# Argument:
#   lib - attribute set providing `mapAttrsToList` (the only member used here).
#
# Result: a list with one attribute set per entry of the table at the bottom
# of this file, shaped as { alert, expr, for, labels, annotations }.
# NOTE(review): presumably consumed as Prometheus alerting rules -- confirm
# against the importing module.
#
# Recognised per-alert options:
#   condition   - rule expression, copied to `expr` (required)
#   summary     - copied to `annotations.summary` (required)
#   description - copied to `annotations.description` (required)
#   time        - copied to `for`; defaults to "2m"
#   page        - when true (the default) the rule is labelled severity = "page"
#   labels      - explicit label set; when present it replaces the label set
#                 derived from `page`
{ lib }:

let
  # Label matchers shared by the filesystem alerts: excludes the "ramfs"
  # fstype and the listed devices.
  deviceFilter = ''fstype!="ramfs",device!="rpc_pipefs",device!="lxcfs",device!="nsfs",device!="borgfs"'';

  # Turn one `name = { ... };` table entry into a rule attribute set.
  mkRule = name: opts: {
    alert = name;
    expr = opts.condition;
    for = opts.time or "2m";
    # An explicit `labels` wins; previously it was silently ignored, which
    # made the always-firing watchdog carry severity = "page".
    labels = opts.labels or (if (opts.page or true) then { severity = "page"; } else { });
    annotations = {
      summary = opts.summary;
      description = opts.description;
    };
  };
in
lib.mapAttrsToList mkRule {
  watchdog = {
    condition = "vector(1)";
    summary = "An alert that should always be firing to certify that Alertmanager is working properly.";
    # Kept on a single line on purpose: whitespace inside ''...'' is part of
    # the resulting string.
    description = '' This is an alert meant to ensure that the entire alerting pipeline is functional. This alert is always firing, therefore it should always be firing in Alertmanager and always fire against a receiver. There are integrations with various notification mechanisms that send a notification when this alert is not firing. For example the "DeadMansSnitch" integration in PagerDuty. '';
    time = "12h";
    labels = { severity = "none"; };
  };

  node_down = {
    condition = ''up{job="node"} == 0'';
    summary = "{{$labels.alias}}: Node is down.";
    time = "10m";
    description = "{{$labels.alias}} has been down for more than 10 minutes.";
  };

  node_collector_failed = {
    condition = ''node_scrape_collector_success{job="node"} == 0'';
    summary = "{{$labels.alias}}: Node collector {{$labels.collector}} failed.";
    description = "{{$labels.alias}}: The collector {{$labels.collector}} of node exporter instance {{$labels.instance}} failed.";
  };

  node_systemd_service_failed = {
    condition = ''node_systemd_unit_state{state="failed"} == 1'';
    summary = "{{$labels.alias}}: Service {{$labels.name}} failed to start.";
    description = "{{$labels.alias}} failed to (re)start service {{$labels.name}}.";
  };

  node_filesystem_full_90percent = {
    # Free bytes below 10% of the filesystem size; the value is divided by
    # 1024^3.
    condition = ''sort(node_filesystem_free_bytes{${deviceFilter}} < node_filesystem_size_bytes{${deviceFilter}} * 0.1) / 1024^3'';
    time = "10m";
    page = false;
    summary = "{{$labels.alias}}: Filesystem is running out of space soon.";
    # Single-line string: a line break inside the literal would end up in the
    # annotation text.
    description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} got less than 10% space left on its filesystem.";
  };

  node_load15 = {
    # load15 divided by the per-alias count of "system"-mode CPU series;
    # aliases matching c[0-9]-[0-9] are excluded.
    condition = ''node_load15 / on(alias) count(node_cpu_seconds_total{mode="system"}) by (alias) >= 1.0 AND node_load15{alias !~ "c[0-9]-[0-9]"}'';
    time = "10m";
    page = false;
    summary = "{{$labels.alias}}: Running on high load: {{$value}}";
    description = "{{$labels.alias}} is running with load15 > 1 for at least 10 minutes: {{$value}}";
  };

  node_ram_using_90percent = {
    # Free + buffers + cached below 10% of total memory; aliases matching
    # c[0-9]-[0-9] are excluded.
    condition = ''node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes < node_memory_MemTotal_bytes * 0.1 AND node_memory_MemFree_bytes{alias !~ "c[0-9]-[0-9]"}'';
    time = "1h";
    page = false;
    summary = "{{$labels.alias}}: Using lots of RAM.";
    description = "{{$labels.alias}} is using at least 90% of its RAM for at least 1 hour.";
  };

  node_hwmon_temp = {
    condition = "node_hwmon_temp_crit_alarm_celsius == 1";
    summary = "{{$labels.alias}}: Sensor {{$labels.sensor}}/{{$labels.chip}} temp is high: {{$value}} ";
    description = "{{$labels.alias}} reports hwmon sensor {{$labels.sensor}}/{{$labels.chip}} temperature value is nearly critical: {{$value}}";
  };

  node_reboot = {
    # Boot time less than 300 seconds ago.
    condition = "time() - node_boot_time_seconds < 300";
    summary = "{{$labels.alias}}: Reboot";
    description = "{{$labels.alias}} just rebooted.";
  };

  node_uptime = {
    # 2592000 s = 30 * 24 * 3600 s = 30 days.
    condition = "time() - node_boot_time_seconds > 2592000";
    page = false;
    summary = "{{$labels.alias}}: Uptime monster";
    description = "{{$labels.alias}} has been up for more than 30 days.";
  };

  slurm_nodes_offline = {
    condition = "slurm_node_down > 0 OR slurm_node_drain > 0 OR slurm_node_err > 0 OR slurm_node_fail > 0";
    # Matches the "more than 5m" promised by the description; without this
    # the rule inherited the 2m default.
    time = "5m";
    summary = "Slurm nodes offline: {{$value}}";
    description = "Slurm node(s) have been offline for more than 5m.";
  };

  node_filesystem_full_in_7d = {
    # Linear extrapolation over a 2d window, 7*24*3600 s (7 days) ahead,
    # predicting no free bytes left.
    condition = ''node_filesystem_free_bytes{${deviceFilter}} ''
      + ''and predict_linear(node_filesystem_free_bytes{${deviceFilter}}[2d], 7*24*3600) <= 0'';
    page = false;
    time = "1h";
    summary = "{{$labels.alias}}: Filesystem is running out of space in 7 days.";
    description = "{{$labels.alias}} device {{$labels.device}} on {{$labels.mountpoint}} is running out of space in approx. 7 days";
  };
}