fix: add k8s and hpc modules to main repo
modules/hpc/monitoring.nix (new file, 277 lines)
@@ -0,0 +1,277 @@
{ config, lib, pkgs, ... }:

with lib;

let
  cfg = config.features.monitoring;

  mkScrapeConfigs = configs: flip mapAttrsToList configs (k: v:
    let
      static_configs = flip map v.hostNames (name: {
        targets = [ "${name}:${toString v.port}" ];
        labels.alias = name;
      });
    in
    (mkIf (static_configs != []) ({
      inherit static_configs;
      job_name = k;
      scrape_interval = "15s";
    } // (removeAttrs v [ "hostNames" "port" ]))));
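  # Example: { node = { hostNames = [ "c1" ]; port = 9100; }; } (hostname is
  # illustrative) yields roughly:
  #   { job_name = "node"; scrape_interval = "15s";
  #     static_configs = [ { targets = [ "c1:9100" ]; labels.alias = "c1"; } ]; }
  # Any attributes besides hostNames and port are passed through unchanged.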

  prometheus = {
    systemd.services.prometheus.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus = {
      enable = true;

      ruleFiles = singleton (pkgs.writeText "prometheus-rules.yml" (builtins.toJSON {
        groups = singleton {
          name = "alerting-rules";
          rules = import ./alert-rules.nix { inherit lib; };
        };
      }));
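      # builtins.toJSON output is valid YAML, so Prometheus reads the generated
      # rule file as-is.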

      scrapeConfigs = (mkScrapeConfigs ({
        node = {
          hostNames = cfg.server.scrapeHosts;
          port = 9100;
        };
        infiniband = {
          hostNames = [ "stokes" ];
          port = 9683;
        };
        slurm = {
          hostNames = [ "stokes" ];
          port = 6080;
        };
      }));
    };
  };

  nodeExporter = {
    services.prometheus.exporters = {
      node = {
        enable = true;
        openFirewall = true;
        extraFlags = [ "--collector.disable-defaults" ];
        enabledCollectors = [
          "netstat"
          "stat"
          "systemd"
          "textfile"
          "textfile.directory /run/prometheus-node-exporter"
          "thermal_zone"
          "time"
          "udp_queues"
          "uname"
          "vmstat"
          "cpu"
          "cpufreq"
          "diskstats"
          "edac"
          "filesystem"
          "hwmon"
          "interrupts"
          "ksmd"
          "loadavg"
          "meminfo"
          "pressure"
          "timex"
          # "nfsd"
          # "nfs"
          # "rapl"
        ] ++ cfg.nodeExporter.extraCollectors;
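        # Entries above are expected to be passed to node_exporter as
        # "--collector.<entry>" flags; the "textfile.directory ..." entry relies
        # on this to also carry the textfile directory path.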
      };
    };

    networking.firewall.allowedTCPPorts = [ 9093 9100 ];
  };

  webUI = let net = config.networking; in {
    services.grafana = {
      enable = true;
      domain = "grafana.${net.domain}";
      port = 2342;
      addr = "127.0.0.1";
    };

    security.acme = {
      acceptTerms = true;
      email = cfg.webUI.acmeEmail;
    };

    networking.firewall.allowedTCPPorts = [ 80 443 ];

    services.nginx = {
      enable = true;

      statusPage = true;

      virtualHosts = {
        # "acme.${net.domain}" = {
        #   serverAliases = [ "*.svc.${net.domain}" ];
        #   # /var/lib/acme/.challenges must be writable by the ACME user
        #   # and readable by the Nginx user.
        #   locations."/.well-known/acme-challenge" = {
        #     root = "/var/lib/acme/acme-challenge";
        #   };
        #   locations."/" = {
        #     return = "301 https://$host$request_uri";
        #   };
        # };

        ${config.services.grafana.domain} = {
          forceSSL = true;
          enableACME = true;
          serverAliases = [];
          locations."/" = {
            proxyPass = "http://127.0.0.1:${toString config.services.grafana.port}";
            proxyWebsockets = true;
            extraConfig = webUIExtraConfig;
          };
        };

        "prometheus.${net.domain}" = {
          forceSSL = true;
          enableACME = true;
          serverAliases = [];
          locations."/" = {
            proxyPass = "http://127.0.0.1:${toString config.services.prometheus.port}";
            proxyWebsockets = true;
            extraConfig = webUIExtraConfig;
          };
        };

        "alertmanager.${net.domain}" = {
          forceSSL = true;
          enableACME = true;
          serverAliases = [];
          locations."/" = {
            proxyPass = "http://127.0.0.1:${toString config.services.prometheus.alertmanager.port}";
            proxyWebsockets = true;
            extraConfig = webUIExtraConfig;
          };
        };
      };
    };
  };

  webUIExtraConfig =
    (builtins.foldl' (a: x:
      a + "\nallow ${x};") cfg.webUI.extraConfig cfg.webUI.allow)
    + "\ndeny all;";

  alertmanager = {
    systemd.services.alertmanager.serviceConfig.LimitNOFILE = 1024000;

    services.prometheus.alertmanager = {
      enable = true;
      configuration = {
        route = {
          receiver = "default";
          routes = [
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "5s";
              group_interval = "3m";
              repeat_interval = "3h";
              match = { severity = "page"; };
              receiver = "page";
            }
            {
              group_by = [ "alertname" "alias" ];
              group_wait = "30s";
              group_interval = "5m";
              repeat_interval = "6h";
              receiver = "default";
            }
          ];
        };
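        # Alerts labelled severity = "page" match the first route above and go
        # to the "page" receiver with tighter timing; everything else falls back
        # to the top-level "default" receiver.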
        receivers = [
          ({ name = "default"; } // cfg.server.defaultAlertReceiver)
          ({ name = "page"; } // cfg.server.pageAlertReceiver)
        ];
        inhibit_rules = [
          # {
          #   target_match = {
          #     alertname = "node_collector_failed";
          #   };
          #   target_match_re = {
          #     alias = "c[0-9]-[0-9]";
          #     collector = "nfsd";
          #   };
          # }
        ];
      };
    };

    services.prometheus = {
      alertmanagers = singleton {
        static_configs = singleton {
          targets = [ "localhost:9093" ];
          # targets = flip map cfg.server.scrapeHosts (n: "${n}:9093");
        };
      };
    };
  };
in {
  options.features.monitoring = {
    server = {
      enable = mkEnableOption "HPC cluster monitoring server with prometheus";

      scrapeHosts = mkOption {
        type = types.listOf types.str;
        default = [];
      };

      defaultAlertReceiver = mkOption {
        type = types.attrs;
        default = {};
      };

      pageAlertReceiver = mkOption {
        type = types.attrs;
        default = {};
      };
    };

    nodeExporter.enable = mkEnableOption "the node exporter";

    nodeExporter.extraCollectors = mkOption {
      type = types.listOf types.str;
      default = [];
    };

    webUI = {
      enable = mkEnableOption "the web UI for monitoring";

      acmeEmail = mkOption {
        type = types.nullOr types.str;
        default = null;
      };

      allow = mkOption {
        type = types.listOf types.str;
        default = [];
      };

      extraConfig = mkOption {
        type = types.str;
        default = "";
      };
    };
  };

  config = mkMerge [
    (mkIf cfg.server.enable (mkMerge [
      prometheus
      alertmanager
    ]))

    (mkIf cfg.nodeExporter.enable nodeExporter)

    (mkIf cfg.webUI.enable webUI)
  ];

  imports = [ ./infiniband-exporter.nix ./slurm-exporter.nix ];
}
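
# Usage sketch (host names, email, and CIDR below are illustrative, not defined
# by this module):
#   features.monitoring = {
#     server = { enable = true; scrapeHosts = [ "stokes" ]; };
#     nodeExporter.enable = true;
#     webUI = {
#       enable = true;
#       acmeEmail = "admin@example.org";
#       allow = [ "10.0.0.0/16" ];
#     };
#   };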