Files
platform/modules/hpc/monitoring.nix
2025-06-30 12:21:05 +02:00

278 lines
6.7 KiB
Nix

{ config, lib, pkgs, ... }:
with lib;
let
cfg = config.features.monitoring;
# Turn an attribute set of `{ hostNames, port, ... }` job descriptions into a
# list of Prometheus scrape configs, one per attribute. The attribute name
# becomes the job name; any attributes other than `hostNames` and `port` are
# passed through and may override the defaults set here. Jobs whose host list
# is empty are dropped via `mkIf`.
mkScrapeConfigs = configs:
  let
    # One static target per host, labelled with the bare host name as `alias`.
    targetsFor = job: map
      (host: {
        targets = [ "${host}:${toString job.port}" ];
        labels.alias = host;
      })
      job.hostNames;

    mkJob = jobName: job:
      let
        hostTargets = targetsFor job;
        passthrough = removeAttrs job [ "hostNames" "port" ];
      in
      mkIf (hostTargets != []) ({
        static_configs = hostTargets;
        job_name = jobName;
        scrape_interval = "15s";
      } // passthrough);
  in
  mapAttrsToList mkJob configs;
# Configuration fragment for the Prometheus server: alerting rules, scrape
# jobs and a raised open-file limit for the service.
prometheus =
  let
    # Alert rules come from ./alert-rules.nix and are serialised with
    # builtins.toJSON into a single rule group.
    rulesFile = pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
      groups = [
        {
          name = "alerting-rules";
          rules = import ./alert-rules.nix { inherit lib; };
        }
      ];
    });

    # Job name -> hosts and port; expanded by mkScrapeConfigs.
    scrapeJobs = {
      node = {
        hostNames = cfg.server.scrapeHosts;
        port = 9100;
      };
      infiniband = {
        hostNames = [ "stokes" ];
        port = 9683;
      };
      slurm = {
        hostNames = [ "stokes" ];
        port = 6080;
      };
    };
  in
  {
    systemd.services.prometheus.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus = {
      enable = true;
      ruleFiles = [ rulesFile ];
      scrapeConfigs = mkScrapeConfigs scrapeJobs;
    };
  };
# Configuration fragment for hosts that run the Prometheus node exporter.
# Collectors are listed explicitly; additional ones can be appended through
# `features.monitoring.nodeExporter.extraCollectors`.
nodeExporter = {
  services.prometheus.exporters = {
    node = {
      enable = true;
      openFirewall = true;
      extraFlags = [
        "--collector.disable-defaults"
        # The textfile directory is a flag with an argument, not a collector
        # name, so it is passed here instead of as an `enabledCollectors`
        # entry of the form "textfile.directory <path>".
        "--collector.textfile.directory=/run/prometheus-node-exporter"
      ];
      enabledCollectors = [
        "netstat"
        "stat"
        "systemd"
        "textfile"
        "thermal_zone"
        "time"
        "udp_queues"
        "uname"
        "vmstat"
        "cpu"
        "cpufreq"
        "diskstats"
        "edac"
        "filesystem"
        "hwmon"
        "interrupts"
        "ksmd"
        "loadavg"
        "meminfo"
        "pressure"
        "timex"
        # "nfsd"
        # "nfs"
        # "rapl"
      ] ++ cfg.nodeExporter.extraCollectors;
    };
  };
  # 9100 is the port the `node` scrape job targets.
  # NOTE(review): 9093 is the Alertmanager default port; confirm it really
  # needs to be open on every exporter host.
  networking.firewall.allowedTCPPorts = [ 9093 9100 ];
};
# Configuration fragment for the web front end: Grafana bound to loopback,
# plus nginx virtual hosts with ACME certificates that reverse-proxy Grafana,
# Prometheus and Alertmanager. Access rules come from webUIExtraConfig.
webUI =
  let
    domain = config.networking.domain;

    # HTTPS-only virtual host forwarding every request to a service that
    # listens on 127.0.0.1 at the given port.
    proxyVhost = port: {
      forceSSL = true;
      enableACME = true;
      serverAliases = [];
      locations."/" = {
        proxyPass = "http://127.0.0.1:${toString port}";
        proxyWebsockets = true;
        extraConfig = webUIExtraConfig;
      };
    };
  in
  {
    services.grafana = {
      enable = true;
      domain = "grafana.${domain}";
      port = 2342;
      addr = "127.0.0.1";
    };

    security.acme = {
      acceptTerms = true;
      email = cfg.webUI.acmeEmail;
    };

    networking.firewall.allowedTCPPorts = [ 80 443 ];

    services.nginx = {
      enable = true;
      statusPage = true;
      virtualHosts = {
        # Disabled: dedicated ACME challenge host.
        # "acme.${domain}" = {
        #   serverAliases = [ "*.svc.${domain}" ];
        #   # /var/lib/acme/.challenges must be writable by the ACME user
        #   # and readable by the Nginx user.
        #   locations."/.well-known/acme-challenge" = {
        #     root = "/var/lib/acme/acme-challenge";
        #   };
        #   locations."/" = {
        #     return = "301 https://$host$request_uri";
        #   };
        # };
        ${config.services.grafana.domain} =
          proxyVhost config.services.grafana.port;
        "prometheus.${domain}" =
          proxyVhost config.services.prometheus.port;
        "alertmanager.${domain}" =
          proxyVhost config.services.prometheus.alertmanager.port;
      };
    };
  };
# nginx snippet placed in every proxied location: the user-supplied
# `webUI.extraConfig`, then one `allow` line per `webUI.allow` entry, then a
# final `deny all;`.
webUIExtraConfig =
  let
    allowLines = concatMapStrings (addr: "\nallow ${addr};") cfg.webUI.allow;
  in
  cfg.webUI.extraConfig + allowLines + "\ndeny all;";
# Configuration fragment for Alertmanager and for pointing the local
# Prometheus server at it. Receiver details are supplied through
# `features.monitoring.server.defaultAlertReceiver` and `pageAlertReceiver`.
alertmanager = {
  systemd.services.alertmanager.serviceConfig.LimitNOFILE = 1024000;
  services.prometheus.alertmanager = {
    enable = true;
    configuration = {
      route = {
        receiver = "default";
        routes = [
          # Alerts labelled severity = "page" go to the `page` receiver with
          # shorter wait and repeat timings.
          {
            group_by = [ "alertname" "alias" ];
            group_wait = "5s";
            group_interval = "3m";
            repeat_interval = "3h";
            match = { severity = "page"; };
            receiver = "page";
          }
          # No matcher on this route; it sends to the `default` receiver.
          {
            group_by = [ "alertname" "alias" ];
            group_wait = "30s";
            group_interval = "5m";
            repeat_interval = "6h";
            receiver = "default";
          }
        ];
      };
      # The fixed `name` is the left operand of `//`, so a `name` attribute
      # in the user-supplied receiver would override it.
      receivers = [
        ({ name = "default"; } // cfg.server.defaultAlertReceiver)
        ({ name = "page"; } // cfg.server.pageAlertReceiver)
      ];
      inhibit_rules = [
        # {
        #   target_match = {
        #     alertname = "node_collector_failed";
        #   };
        #   target_match_re = {
        #     alias = "c[0-9]-[0-9]";
        #     collector = "nfsd";
        #   };
        # }
      ];
    };
  };
  services.prometheus = {
    alertmanagers = [
      {
        static_configs = [
          {
            # Follow the configured Alertmanager port rather than repeating
            # the literal 9093, so the two cannot drift apart.
            targets = [
              "localhost:${toString config.services.prometheus.alertmanager.port}"
            ];
            # targets = flip map cfg.server.scrapeHosts (n: "${n}:9093");
          }
        ];
      }
    ];
  };
};
in {
# Option declarations for the monitoring feature. Each sub-feature (server,
# node exporter, web UI) has its own `enable` switch.
options.features.monitoring = {
  server = {
    enable = mkEnableOption "HPC cluster monitoring server with prometheus";
    scrapeHosts = mkOption {
      type = types.listOf types.str;
      default = [];
      description = "Host names scraped by the `node` job on port 9100.";
    };
    defaultAlertReceiver = mkOption {
      type = types.attrs;
      default = {};
      description = "Attributes merged into the Alertmanager receiver named `default`.";
    };
    pageAlertReceiver = mkOption {
      type = types.attrs;
      default = {};
      description = "Attributes merged into the Alertmanager receiver named `page`, which gets alerts with severity `page`.";
    };
  };
  # mkEnableOption already prefixes its argument with "Whether to enable",
  # so the names below must not start with "Enable".
  nodeExporter.enable = mkEnableOption "node exporter";
  nodeExporter.extraCollectors = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "Collector names appended to the node exporter's built-in collector list.";
  };
  webUI = {
    enable = mkEnableOption "web UI for monitoring";
    acmeEmail = mkOption {
      # nullOr makes the `null` default valid for the declared type; a plain
      # `types.str` rejects it as soon as the option is read.
      type = types.nullOr types.str;
      default = null;
      description = "E-mail address passed to `security.acme.email`.";
    };
    allow = mkOption {
      type = types.listOf types.str;
      default = [];
      description = "Entries rendered as nginx `allow <entry>;` lines ahead of a final `deny all;`.";
    };
    extraConfig = mkOption {
      type = types.str;
      default = "";
      description = "nginx configuration placed before the generated allow/deny lines in each proxied location.";
    };
  };
};
# Assemble the final configuration: each fragment is included only when its
# feature switch is on. The server switch controls both the Prometheus and
# the Alertmanager fragments.
config = mkMerge [
  (mkIf cfg.server.enable prometheus)
  (mkIf cfg.server.enable alertmanager)
  (mkIf cfg.nodeExporter.enable nodeExporter)
  (mkIf cfg.webUI.enable webUI)
];
imports = [ ./infiniband-exporter.nix ./slurm-exporter.nix ];
}