Files
platform/modules/hpc/slurm.nix
2025-06-30 12:21:05 +02:00

235 lines
5.6 KiB
Nix

# NixOS module behind the `features.hpc.slurm` options: munge key
# installation, shared Slurm settings, and optional server, client,
# job-runner user and REST proxy parts selected by boolean options.
{ pkgs, lib, config, ... }:
with lib;
let
# Shorthand for this module's own option values.
cfg = config.features.hpc.slurm;
# Settings merged whenever the feature is enabled: munge with its key file,
# the common Slurm options, TCP port 6818 in the firewall and the slurm-ucx
# overlay.
configuration = {
  services.munge = {
    enable = true;
  };

  # Install the munge key from cfg.mungeKey, readable only by its owner.
  environment.etc."munge/munge.key" = {
    source = cfg.mungeKey;
    uid = cfg.mungeUid;
    gid = 0;
    mode = "0400";
  };

  services.slurm = {
    # Cluster topology is taken verbatim from this module's options.
    inherit (cfg) controlMachine nodeName partitionName;

    extraConfig = ''
      # AccountingStorageType=accounting_storage/none
      AccountingStorageType=accounting_storage/slurmdbd
      JobAcctGatherType=jobacct_gather/linux
      MailDomain=${cfg.mailDomain}
      MailProg=/run/wrappers/bin/sendmail
      SelectType=select/cons_tres
      SelectTypeParameters=CR_Core
      # AuthAltTypes=auth/jwt
      # AuthAltParameters=jwt_key=/var/spool/slurm/statesave/jwt_hs256.key
    '';
  };

  networking.firewall.allowedTCPPorts = [ 6818 ];

  # Replace the stock slurm package with the overlay-built variant.
  nixpkgs.overlays = [ slurm-ucx ];
};
# Server part: a MariaDB instance holding the slurm_acct_db database, the
# Slurm server and dbdserver services, and TCP port 6817 in the firewall.
slurmServer = {
  services.mysql.enable = true;
  services.mysql.package = pkgs.mariadb;
  services.mysql.initialDatabases = [
    { name = "slurm_acct_db"; }
  ];
  # The "slurm" database user gets full rights on the accounting database.
  services.mysql.ensureUsers = [
    {
      name = "slurm";
      ensurePermissions."slurm_acct_db.*" = "ALL PRIVILEGES";
    }
  ];

  services.slurm.server.enable = true;
  services.slurm.dbdserver.enable = true;
  # Disabled alternatives kept for reference:
  #   services.slurm.extraConfig = ''
  #     MailDomain=itpartner.no
  #     MailProg=${pkgs.ssmtp}/bin/ssmtp
  #   '';
  #   services.slurm.dbdserver.dbdHost = cfg.controlMachine;
  #   services.slurm.dbdserver.storagePass = cfg.storagePass;

  networking.firewall.allowedTCPPorts = [ 6817 ];
};
# Client part: enable the Slurm client and let systemd restart the slurmd
# unit when it fails.
slurmClient = {
  services.slurm.client = {
    enable = true;
  };
  systemd.services.slurmd.serviceConfig.Restart = "on-failure";
};
# Nixpkgs overlay that rebuilds slurm with ucx and http-parser as extra
# inputs, passes the matching configure flags (plus --enable-slurmrestd),
# and wraps slurmstepd and srun with preset environment variables.
slurm-ucx = final: prev:
  let
    # Same package set the original pulled into scope for these names.
    inherit (prev.pkgs) ucx http-parser pkg-config makeWrapper;

    extraConfigureFlags = [
      "--with-ucx=${ucx.dev}"
      "--with-http-parser=${http-parser}"
      "--enable-slurmrestd"
    ];
  in
  {
    slurm = prev.slurm.overrideAttrs (old: {
      buildInputs = old.buildInputs ++ [ ucx http-parser pkg-config ];
      # makeWrapper supplies the wrapProgram helper used in postFixup.
      nativeBuildInputs = old.nativeBuildInputs ++ [ makeWrapper ];
      configureFlags = old.configureFlags ++ extraConfigureFlags;
      postFixup = ''
        wrapProgram $out/bin/slurmstepd --set LD_LIBRARY_PATH ${ucx}/lib
        wrapProgram $out/bin/srun --set SLURM_MPI_TYPE "pmix"
      '';
      # Wrapper flags currently disabled:
      #   --set PSM3_PKEY "${cfg.pkey}" \
      #   --set PMIX_MCA_gds "^ds12" \
    });
  };
# The "hipster" job-runner account and its group, both with id 2001.
hipster = {
  users.groups.hipster = {
    gid = 2001;
  };

  users.users.hipster = {
    # Identity
    uid = 2001;
    description = "Job runner";
    isNormalUser = true;

    # Group membership
    group = "hipster";
    extraGroups = [ "users" ];

    # The home path is declared here, but NixOS is told not to create it.
    home = "/work/hipster";
    createHome = false;
    useDefaultShell = true;
  };
};
# REST part: run slurmrestd as the hipster user on a unix socket under
# /run/slurmrestd, and forward TCP port 6822 to that socket through a
# socket-activated socat instance per connection.
slurmrestd = {
  systemd.tmpfiles.rules = [ "d /run/slurmrestd 0750 hipster hipster -" ];

  systemd.services.slurmrestd = {
    description = "Slurm REST API service";
    wantedBy = [ "multi-user.target" ];
    after = [ "slurmd.service" ];
    # Single serviceConfig definition: identity and runtime directory are
    # kept together so no setting is hidden in a second, later attribute.
    serviceConfig = {
      Type = "simple";
      User = "hipster";
      Group = "hipster";
      RuntimeDirectory = "slurmrestd";
    };
    environment = {
      # SLURM_JWT = "daemon";
    };
    # Remove a stale socket from a previous run before binding again.
    script = ''
      rm -f /run/slurmrestd/hipster.socket
      /run/current-system/sw/bin/slurmrestd -v -a rest_auth/local unix:/run/slurmrestd/hipster.socket
    '';
  };

  systemd.sockets.slurm-http-proxy = {
    enable = true;
    description = "Proxy slurmrestd unix socket to port 6822";
    listenStreams = [ "0.0.0.0:6822" ];
    wantedBy = [ "sockets.target" ];
    # Allow multiple instances of corresponding service.
    socketConfig.Accept = true;
  };

  # Template unit instantiated per accepted connection; the connection is
  # handed to socat on stdin/stdout.
  # NOTE(review): the proxy connects to the socket as hipster for every TCP
  # client; together with rest_auth/local this presumably means remote
  # callers act as hipster. Confirm that port 6822 is suitably restricted.
  systemd.services."slurm-http-proxy@" = {
    enable = true;
    description = "Proxy slurmrestd unix socket to port 6822";
    serviceConfig = {
      ExecStart = "-${pkgs.socat}/bin/socat STDIO UNIX-CONNECT:/run/slurmrestd/hipster.socket";
      StandardInput = "socket";
      User = "hipster";
      Group = "hipster";
    };
  };
};
in
{
# Option declarations for the SLURM feature.
#
# mungeKey, controlMachine and mailDomain are read unconditionally by the
# enabled configuration, so they are declared without a default: leaving
# one unset yields the module system's "used but not defined" error rather
# than a type error caused by a null default on a non-nullable type.
options.features.hpc.slurm = {
  # mkEnableOption prepends "Whether to enable " to its argument.
  enable = mkEnableOption "SLURM batch system";

  mungeKey = mkOption {
    type = types.path;
    description = "File installed as /etc/munge/munge.key.";
  };

  mungeUid = mkOption {
    type = types.int;
    default = 997;
    description = "Numeric uid that owns /etc/munge/munge.key.";
  };

  pkey = mkOption {
    type = types.str;
    default = "0x7fff";
    description = ''
      Partition key. Only referenced from a commented-out wrapper flag in
      this module.
    '';
  };

  controlMachine = mkOption {
    type = types.str;
    description = "Value passed to services.slurm.controlMachine.";
  };

  server = mkOption {
    type = types.bool;
    default = false;
    description = ''
      Whether to apply the server part of this module (database, Slurm
      server and dbdserver).
    '';
  };

  client = mkOption {
    type = types.bool;
    default = false;
    description = "Whether to apply the client part of this module.";
  };

  hipster = mkOption {
    type = types.bool;
    default = true;
    description = ''
      Whether to create the hipster user and group. The REST service is
      only configured when this is true.
    '';
  };

  slurmrestd = mkOption {
    type = types.bool;
    default = false;
    description = ''
      Whether to run the slurmrestd service and its TCP proxy. The same
      configuration is also applied when server is true.
    '';
  };

  nodeName = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "Value passed to services.slurm.nodeName.";
  };

  partitionName = mkOption {
    type = types.listOf types.str;
    default = [];
    description = "Value passed to services.slurm.partitionName.";
  };

  storagePass = mkOption {
    # Nullable so that the null default is a valid value of the type.
    type = types.nullOr types.str;
    default = null;
    description = ''
      Accounting storage password. Not referenced by the active
      configuration in this module.
    '';
  };

  mailDomain = mkOption {
    type = types.str;
    description = "Value written as MailDomain in the Slurm extraConfig.";
  };
};
# Assemble the final configuration from the parts selected by the options.
# Nothing is applied unless cfg.enable is true.
config = mkIf cfg.enable (
  mkMerge [
    configuration
    (mkIf cfg.server slurmServer)
    (mkIf cfg.client slurmClient)
    (mkIf cfg.hipster hipster)
    # The REST part is wanted on servers and wherever cfg.slurmrestd is set,
    # always requiring the hipster user. It is merged exactly once: merging
    # the same set under two conditions would define its list-valued
    # options (tmpfiles rules, listenStreams, ...) twice when both hold.
    (mkIf ((cfg.server || cfg.slurmrestd) && cfg.hipster) slurmrestd)
  ]);
}