278 lines
6.7 KiB
Nix
278 lines
6.7 KiB
Nix
{ config, lib, pkgs, ... }:
|
|
|
|
with lib;
|
|
|
|
let
|
|
# Shorthand for this module's own option values (declared under
# options.features.monitoring in this file).
cfg = config.features.monitoring;
|
|
|
|
# Turn an attribute set of scrape jobs into a list of Prometheus scrape
# configs.
#
# Each attribute name becomes the `job_name`; each value must carry
# `hostNames` (list of strings) and `port`.  Every host yields one
# static config targeting "<host>:<port>" with an `alias` label equal to
# the host name.  Any other attributes of the job are passed through
# unchanged and override the generated ones.  Jobs without hosts are
# wrapped in a false `mkIf` instead of being emitted.
mkScrapeConfigs = configs:
  mapAttrsToList
    (jobName: job:
      let
        # One static config per host so that each target gets its own alias.
        targetFor = host: {
          targets = [ "${host}:${toString job.port}" ];
          labels.alias = host;
        };

        static_configs = map targetFor job.hostNames;

        # Everything besides the two helper-only keys is forwarded verbatim.
        passthrough = removeAttrs job [ "hostNames" "port" ];
      in
      mkIf (static_configs != [])
        ({
          inherit static_configs;
          job_name = jobName;
          scrape_interval = "15s";
        } // passthrough))
    configs;
|
|
|
|
# Configuration fragment for the Prometheus server itself: raises the
# service's open-file limit, installs the alerting rules and declares the
# scrape jobs.
prometheus =
  let
    # Alerting rules are imported from ./alert-rules.nix and serialised
    # with builtins.toJSON into a single rule group.
    alertRulesFile = pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
      groups = [
        {
          name = "alerting-rules";
          rules = import ./alert-rules.nix { inherit lib; };
        }
      ];
    });

    # Job name -> hosts/port; expanded by mkScrapeConfigs.
    scrapeJobs = {
      node = {
        hostNames = cfg.server.scrapeHosts;
        port = 9100;
      };
      infiniband = {
        hostNames = [ "stokes" ];
        port = 9683;
      };
      slurm = {
        hostNames = [ "stokes" ];
        port = 6080;
      };
    };
  in
  {
    systemd.services.prometheus.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus = {
      enable = true;
      ruleFiles = [ alertRulesFile ];
      scrapeConfigs = mkScrapeConfigs scrapeJobs;
    };
  };
|
|
|
|
# Configuration fragment for hosts that run the Prometheus node exporter.
# The exporter's default collectors are disabled and an explicit list is
# enabled instead, extended by cfg.nodeExporter.extraCollectors.
nodeExporter =
  let
    # Order is kept exactly as-is because the entries are handed to the
    # exporter module in list order.
    baseCollectors = [
      "netstat"
      "stat"
      "systemd"
      "textfile"
      # Not a collector name: this entry carries the textfile directory
      # argument along with the collector list.
      "textfile.directory /run/prometheus-node-exporter"
      "thermal_zone"
      "time"
      "udp_queues"
      "uname"
      "vmstat"
      "cpu"
      "cpufreq"
      "diskstats"
      "edac"
      "filesystem"
      "hwmon"
      "interrupts"
      "ksmd"
      "loadavg"
      "meminfo"
      "pressure"
      "timex"
      # "nfsd"
      # "nfs"
      # "rapl"
    ];
  in
  {
    services.prometheus.exporters.node = {
      enable = true;
      openFirewall = true;
      extraFlags = [ "--collector.disable-defaults" ];
      enabledCollectors = baseCollectors ++ cfg.nodeExporter.extraCollectors;
    };

    # NOTE(review): 9100 matches the node scrape port used by the server;
    # 9093 is the port the server uses to reach Alertmanager on localhost.
    # Confirm that opening 9093 on every exporter host is intended.
    networking.firewall.allowedTCPPorts = [ 9093 9100 ];
  };
|
|
|
|
# Configuration fragment for the web front end: Grafana bound to
# localhost, plus nginx virtual hosts (TLS via ACME) that reverse-proxy
# Grafana, Prometheus and Alertmanager under <service>.<networking.domain>.
webUI =
  let
    net = config.networking;

    # Virtual host that forces TLS, requests an ACME certificate and
    # proxies "/" to a service listening on the given local port.  Access
    # rules come from webUIExtraConfig.
    proxyVhost = port: {
      forceSSL = true;
      enableACME = true;
      serverAliases = [];
      locations."/" = {
        proxyPass = "http://127.0.0.1:${toString port}";
        proxyWebsockets = true;
        extraConfig = webUIExtraConfig;
      };
    };
  in
  {
    services.grafana = {
      enable = true;
      domain = "grafana.${net.domain}";
      port = 2342;
      addr = "127.0.0.1";
    };

    security.acme = {
      acceptTerms = true;
      email = cfg.webUI.acmeEmail;
    };

    networking.firewall.allowedTCPPorts = [ 80 443 ];

    services.nginx = {
      enable = true;

      statusPage = true;

      virtualHosts = {
        # "acme.${net.domain}" = {
        #   serverAliases = [ "*.svc.${net.domain}" ];
        #   # /var/lib/acme/.challenges must be writable by the ACME user
        #   # and readable by the Nginx user.
        #   locations."/.well-known/acme-challenge" = {
        #     root = "/var/lib/acme/acme-challenge";
        #   };
        #   locations."/" = {
        #     return = "301 https://$host$request_uri";
        #   };
        # };

        ${config.services.grafana.domain} =
          proxyVhost config.services.grafana.port;

        "prometheus.${net.domain}" =
          proxyVhost config.services.prometheus.port;

        "alertmanager.${net.domain}" =
          proxyVhost config.services.prometheus.alertmanager.port;
      };
    };
  };
|
|
|
|
# nginx snippet applied to every proxied location: the user-supplied
# cfg.webUI.extraConfig, followed by one "allow <entry>;" line per item of
# cfg.webUI.allow, and finally "deny all;".
webUIExtraConfig =
  let
    allowLines = concatMapStrings (peer: "\nallow ${peer};") cfg.webUI.allow;
  in
  cfg.webUI.extraConfig + allowLines + "\ndeny all;";
|
|
|
|
# Configuration fragment for Alertmanager and its hookup to Prometheus.
#
# Routing: alerts labelled severity=page go to the "page" receiver with
# short grouping/repeat intervals; everything else falls through to the
# "default" receiver.  Both receivers are completed with the attribute
# sets from cfg.server.pageAlertReceiver / cfg.server.defaultAlertReceiver.
alertmanager =
  let
    # Follow the configured Alertmanager port rather than repeating its
    # numeric value, so Prometheus and the nginx proxy stay in agreement
    # if the port option is ever changed.
    alertmanagerPort = config.services.prometheus.alertmanager.port;
  in
  {
    systemd.services.alertmanager.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus.alertmanager = {
      enable = true;
      configuration = {
        route = {
          receiver = "default";
          routes = [
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "5s";
              group_interval = "3m";
              repeat_interval = "3h";
              match = { severity = "page"; };
              receiver = "page";
            }
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "30s";
              group_interval = "5m";
              repeat_interval = "6h";
              receiver = "default";
            }
          ];
        };
        receivers = [
          ({ name = "default"; } // cfg.server.defaultAlertReceiver)
          ({ name = "page"; } // cfg.server.pageAlertReceiver)
        ];
        inhibit_rules = [
          # {
          #   target_match = {
          #     alertname = "node_collector_failed";
          #   };
          #   target_match_re = {
          #     alias = "c[0-9]-[0-9]";
          #     collector = "nfsd";
          #   };
          # }
        ];
      };
    };

    services.prometheus = {
      alertmanagers = [
        {
          static_configs = [
            {
              targets = [ "localhost:${toString alertmanagerPort}" ];
              # targets = flip map cfg.server.scrapeHosts (n: "${n}:9093");
            }
          ];
        }
      ];
    };
  };
|
|
in {
|
|
# Option declarations for this module.  Three independent switches exist:
# `server` (Prometheus + Alertmanager), `nodeExporter` and `webUI`.
options.features.monitoring = {
  server = {
    enable = mkEnableOption "HPC cluster monitoring server with prometheus";

    scrapeHosts = mkOption {
      type = types.listOf types.str;
      default = [];
      description = "Host names scraped by the `node` job on port 9100.";
    };

    defaultAlertReceiver = mkOption {
      type = types.attrs;
      default = {};
      description = ''
        Attributes merged into the Alertmanager receiver named `default`.
      '';
    };

    pageAlertReceiver = mkOption {
      type = types.attrs;
      default = {};
      description = ''
        Attributes merged into the Alertmanager receiver named `page`,
        which handles alerts labelled `severity = "page"`.
      '';
    };
  };

  # mkEnableOption already prefixes its argument with "Whether to enable",
  # so the argument must be a noun phrase.
  nodeExporter.enable = mkEnableOption "the Prometheus node exporter";

  nodeExporter.extraCollectors = mkOption {
    type = types.listOf types.str;
    default = [];
    description = ''
      Collectors enabled in addition to this module's built-in list.
    '';
  };

  webUI = {
    enable = mkEnableOption "the web UI for monitoring";

    acmeEmail = mkOption {
      # The default is null, so the type has to admit null; with plain
      # `types.str` evaluation fails as soon as the web UI is enabled
      # without an explicit address.
      type = types.nullOr types.str;
      default = null;
      description = "Value assigned to `security.acme.email`.";
    };

    allow = mkOption {
      type = types.listOf types.str;
      default = [];
      description = ''
        Entries emitted as nginx `allow <entry>;` lines for every proxied
        location; all other clients are denied.
      '';
    };

    extraConfig = mkOption {
      type = types.str;
      default = "";
      description = ''
        Extra nginx configuration placed before the generated
        `allow`/`deny` lines of every proxied location.
      '';
    };
  };
};
|
|
|
|
# Assemble the final configuration: each fragment is gated by its own
# enable switch.  The server switch controls both the Prometheus and the
# Alertmanager fragments.
config =
  let
    whenServer = mkIf cfg.server.enable;
    whenNodeExporter = mkIf cfg.nodeExporter.enable;
    whenWebUI = mkIf cfg.webUI.enable;
  in
  mkMerge [
    (whenServer prometheus)
    (whenServer alertmanager)
    (whenNodeExporter nodeExporter)
    (whenWebUI webUI)
  ];
|
|
|
|
imports = [ ./infiniband-exporter.nix ./slurm-exporter.nix ];
|
|
}
|