# NixOS module `features.monitoring`.
#
# Declares three independently switchable pieces:
#   * features.monitoring.server.enable       -> Prometheus + Alertmanager
#   * features.monitoring.nodeExporter.enable -> Prometheus node exporter
#   * features.monitoring.webUI.enable        -> Grafana plus nginx reverse
#                                                proxies with ACME certificates
{ config, lib, pkgs, ... }:

with lib;

let
  cfg = config.features.monitoring;

  # Turn an attribute set of `jobName -> { hostNames, port, ... }` into a list
  # of Prometheus scrape configs.
  #
  # Each host becomes its own static_config with target "<host>:<port>" and an
  # `alias` label set to the host name. Any attribute of the job other than
  # `hostNames` and `port` is merged on top of the generated scrape config (and
  # therefore overrides `job_name` / `scrape_interval` / `static_configs` if
  # given). Jobs with an empty host list are dropped via `mkIf`.
  mkScrapeConfigs = configs:
    mapAttrsToList
      (jobName: job:
        let
          static_configs = map
            (host: {
              targets = [ "${host}:${toString job.port}" ];
              labels.alias = host;
            })
            job.hostNames;
          generated = {
            inherit static_configs;
            job_name = jobName;
            scrape_interval = "15s";
          };
          passthrough = removeAttrs job [ "hostNames" "port" ];
        in
        mkIf (static_configs != [ ]) (generated // passthrough))
      configs;

  # Prometheus server: alerting rules plus the scrape jobs.
  prometheus = {
    # Raise the open-file limit of the prometheus unit.
    systemd.services.prometheus.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus = {
      enable = true;

      # The rules file is written as JSON into a file named *.yml.
      ruleFiles = singleton (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
        groups = singleton {
          name = "alerting-rules";
          rules = import ./alert-rules.nix { inherit lib; };
        };
      }));

      scrapeConfigs = mkScrapeConfigs {
        node = {
          hostNames = cfg.server.scrapeHosts;
          port = 9100;
        };
        infiniband = {
          hostNames = [ "stokes" ];
          port = 9683;
        };
        slurm = {
          hostNames = [ "stokes" ];
          port = 6080;
        };
      };
    };
  };

  # Node exporter with the default collectors disabled and an explicit
  # collector list, extended by `cfg.nodeExporter.extraCollectors`.
  nodeExporter = {
    services.prometheus.exporters = {
      node = {
        enable = true;
        openFirewall = true;
        extraFlags = [ "--collector.disable-defaults" ];
        enabledCollectors = [
          "netstat"
          "stat"
          "systemd"
          "textfile"
          "textfile.directory /run/prometheus-node-exporter"
          "thermal_zone"
          "time"
          "udp_queues"
          "uname"
          "vmstat"
          "cpu"
          "cpufreq"
          "diskstats"
          "edac"
          "filesystem"
          "hwmon"
          "interrupts"
          "ksmd"
          "loadavg"
          "meminfo"
          "pressure"
          "timex"
          # "nfsd"
          # "nfs"
          # "rapl"
        ] ++ cfg.nodeExporter.extraCollectors;
      };
    };

    # NOTE(review): 9093 is the port used for the Alertmanager target further
    # down in this file; confirm that hosts which only run the node exporter
    # really need it opened.
    networking.firewall.allowedTCPPorts = [ 9093 9100 ];
  };

  # nginx `extraConfig` shared by every proxied location: the user-supplied
  # snippet, one `allow <entry>;` line per element of `cfg.webUI.allow`, and a
  # final `deny all;`.
  webUIExtraConfig =
    cfg.webUI.extraConfig
    + concatMapStrings (entry: "\nallow ${entry};") cfg.webUI.allow
    + "\ndeny all;";

  # Build an nginx virtual host that terminates TLS (ACME certificate, HTTP
  # redirected to HTTPS) and proxies "/" to a service on 127.0.0.1:<port>.
  mkProxyVhost = port: {
    forceSSL = true;
    enableACME = true;
    serverAliases = [ ];
    locations."/" = {
      proxyPass = "http://127.0.0.1:${toString port}";
      proxyWebsockets = true;
      extraConfig = webUIExtraConfig;
    };
  };

  # Grafana bound to 127.0.0.1 plus nginx virtual hosts for Grafana,
  # Prometheus and Alertmanager under `networking.domain`.
  webUI =
    let
      net = config.networking;
    in
    {
      services.grafana = {
        enable = true;
        domain = "grafana.${net.domain}";
        port = 2342;
        addr = "127.0.0.1";
      };

      security.acme = {
        acceptTerms = true;
        email = cfg.webUI.acmeEmail;
      };

      networking.firewall.allowedTCPPorts = [ 80 443 ];

      services.nginx = {
        enable = true;
        statusPage = true;
        virtualHosts = {
          # "acme.${net.domain}" = {
          #   serverAliases = [ "*.svc.${net.domain}" ];
          #   # /var/lib/acme/.challenges must be writable by the ACME user
          #   # and readable by the Nginx user.
          #   locations."/.well-known/acme-challenge" = {
          #     root = "/var/lib/acme/acme-challenge";
          #   };
          #   locations."/" = {
          #     return = "301 https://$host$request_uri";
          #   };
          # };
          ${config.services.grafana.domain} =
            mkProxyVhost config.services.grafana.port;
          "prometheus.${net.domain}" =
            mkProxyVhost config.services.prometheus.port;
          "alertmanager.${net.domain}" =
            mkProxyVhost config.services.prometheus.alertmanager.port;
        };
      };
    };

  # Alertmanager with two routes and two receivers, and the Prometheus setting
  # that points at it.
  alertmanager = {
    # Raise the open-file limit of the alertmanager unit.
    systemd.services.alertmanager.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus.alertmanager = {
      enable = true;
      configuration = {
        route = {
          receiver = "default";
          routes = [
            # Alerts labelled severity=page go to the "page" receiver with
            # shorter wait/repeat intervals.
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "5s";
              group_interval = "3m";
              repeat_interval = "3h";
              match = { severity = "page"; };
              receiver = "page";
            }
            # No matcher on this route; it sends to the "default" receiver.
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "30s";
              group_interval = "5m";
              repeat_interval = "6h";
              receiver = "default";
            }
          ];
        };

        # The receiver bodies come from the module options; only the names
        # are fixed here (an option value containing `name` would override it).
        receivers = [
          ({ name = "default"; } // cfg.server.defaultAlertReceiver)
          ({ name = "page"; } // cfg.server.pageAlertReceiver)
        ];

        inhibit_rules = [
          # {
          #   target_match = {
          #     alertname = "node_collector_failed";
          #   };
          #   target_match_re = {
          #     alias = "c[0-9]-[0-9]";
          #     collector = "nfsd";
          #   };
          # }
        ];
      };
    };

    services.prometheus = {
      alertmanagers = singleton {
        static_configs = singleton {
          targets = [ "localhost:9093" ];
          # targets = flip map cfg.server.scrapeHosts (n: "${n}:9093");
        };
      };
    };
  };
in
{
  options.features.monitoring = {
    server = {
      enable = mkEnableOption "HPC cluster monitoring server with prometheus";

      # Hosts scraped by the "node" job on port 9100.
      scrapeHosts = mkOption {
        type = types.listOf types.str;
        default = [ ];
      };

      # Merged into the Alertmanager receiver named "default".
      defaultAlertReceiver = mkOption {
        type = types.attrs;
        default = { };
      };

      # Merged into the Alertmanager receiver named "page".
      pageAlertReceiver = mkOption {
        type = types.attrs;
        default = { };
      };
    };

    nodeExporter.enable = mkEnableOption "Enable node exporter";

    # Appended to the built-in list of enabled node exporter collectors.
    nodeExporter.extraCollectors = mkOption {
      type = types.listOf types.str;
      default = [ ];
    };

    webUI = {
      enable = mkEnableOption "Enable web UI for monitoring";

      # Passed through to `security.acme.email`. The type is `nullOr str`
      # because the default is `null`; a plain `str` type rejects that default
      # with a type error as soon as the option is read.
      acmeEmail = mkOption {
        type = types.nullOr types.str;
        default = null;
      };

      # Each entry becomes an nginx `allow <entry>;` line; everything else is
      # denied.
      allow = mkOption {
        type = types.listOf types.str;
        default = [ ];
      };

      # nginx configuration placed before the generated allow/deny lines.
      extraConfig = mkOption {
        type = types.str;
        default = "";
      };
    };
  };

  config = mkMerge [
    (mkIf cfg.server.enable (mkMerge [ prometheus alertmanager ]))
    (mkIf cfg.nodeExporter.enable nodeExporter)
    (mkIf cfg.webUI.enable webUI)
  ];

  imports = [
    ./infiniband-exporter.nix
    ./slurm-exporter.nix
  ];
}