Files
platform/modules/hpc/slurm.nix

283 lines
6.2 KiB
Nix

{ pkgs, lib, config, ... }:
with lib;
let
# Shorthand for the values of the options declared under features.hpc.slurm.
cfg = config.features.hpc.slurm;
# Base configuration merged in whenever features.hpc.slurm.enable is set,
# regardless of the server/client role: munge, pinned slurm/munge ids, key
# files under /etc, the shared services.slurm settings, one firewall port and
# the slurm package overlay.
configuration = {
  services.munge.enable = true;

  # Fixed numeric ids for the slurm and munge accounts; mkForce makes these
  # values win over definitions coming from other modules.
  users = {
    groups = {
      slurm = { gid = lib.mkForce 401; };
      munge = { gid = lib.mkForce 402; };
    };
    users.slurm = {
      group = "slurm";
      uid = lib.mkForce 401;
    };
    users.munge = {
      group = "munge";
      uid = lib.mkForce 402;
    };
  };

  # Key files are installed with mode 0400, owned by the munge (402) or
  # slurm (401) uid and gid 0.
  environment.etc."munge/munge.key" = {
    source = cfg.mungeKey;
    mode = "0400";
    uid = 402;
    gid = 0;
  };
  environment.etc."slurm/jwt_hs256.key" = {
    source = cfg.jwtKey;
    mode = "0400";
    uid = 401;
    gid = 0;
  };
  # The slurmKey option is declared as `nullOr path`, and both slurmAuth and
  # slurmKey treat null as "no auth/slurm key". `source` cannot be null, so
  # the /etc entry is only emitted when a key is actually configured.
  environment.etc."slurm/slurm.key" = mkIf (cfg.slurmKey != null) {
    source = cfg.slurmKey;
    mode = "0400";
    uid = 401;
    gid = 0;
  };

  services.slurm = {
    clusterName = cfg.clusterName;
    controlMachine = cfg.controlMachine;
    nodeName = cfg.nodeName;
    partitionName = cfg.partitionName;
    # Extra slurm.conf lines; ${slurmAuth} expands to the AuthType line or to
    # an empty string.
    extraConfig = ''
      # AccountingStorageType=accounting_storage/none
      AccountingStorageType=accounting_storage/slurmdbd
      AccountingStorageHost=${cfg.dbdHost}
      JobAcctGatherType=jobacct_gather/linux
      MailDomain=${cfg.mailDomain}
      MailProg=/run/wrappers/bin/sendmail
      SelectType=select/cons_tres
      SelectTypeParameters=CR_Core
      AuthAltTypes=auth/jwt
      AuthAltParameters=jwt_key=/etc/slurm/jwt_hs256.key
      ${slurmAuth}
    '';
    # Either a directory with a slurm.key symlink to /etc/slurm/slurm.key or
    # the empty placeholder directory (see the slurmKey binding).
    extraConfigPaths = [ slurmKey ];
  };

  # NOTE(review): 6818 is presumably the slurmd port (Slurm's default
  # SlurmdPort) - confirm against the generated slurm.conf.
  networking.firewall.allowedTCPPorts = [ 6818 ];

  # Replace pkgs.slurm with the UCX / slurmrestd enabled build.
  nixpkgs.overlays = [ slurm-ucx ];
};
# slurm.conf line selecting auth/slurm; the empty string when cfg.slurmKey is null.
slurmAuth = lib.optionalString (cfg.slurmKey != null) "AuthType=auth/slurm";
# Store directory containing one empty file named "slurm-key". It is used as
# the default of the slurmKey option and as the extraConfigPaths entry when
# cfg.slurmKey is null.
emptyKey = pkgs.writeTextDir "slurm-key" "";
# Directory passed to services.slurm.extraConfigPaths.
# With a key configured it holds a `slurm.key` symlink pointing at
# /etc/slurm/slurm.key (so the key itself is not copied into this derivation);
# without one it is the empty placeholder directory.
slurmKey =
  let
    keyLinkDir = pkgs.stdenv.mkDerivation {
      name = "slurm-key";
      buildCommand = ''
        mkdir -p $out
        ln -s /etc/slurm/slurm.key $out/slurm.key
      '';
    };
  in
  if cfg.slurmKey != null then keyLinkDir else emptyKey;
# Additions for hosts with cfg.server set: a MariaDB instance holding the
# slurm_acct_db database, the Slurm server, the (optional) slurmdbd service
# and firewall port 6817.
slurmServer = {
  services = {
    mysql = {
      enable = true;
      package = pkgs.mariadb;
      initialDatabases = [
        { name = "slurm_acct_db"; }
      ];
      # The "slurm" database user gets full access to the accounting database.
      ensureUsers = [
        {
          name = "slurm";
          ensurePermissions."slurm_acct_db.*" = "ALL PRIVILEGES";
        }
      ];
    };

    slurm = {
      server.enable = true;
      # extraConfig = ''
      # MailDomain=itpartner.no
      # MailProg=${pkgs.ssmtp}/bin/ssmtp
      # '';

      # slurmdbd is switched by cfg.dbdServer and bound to the control machine.
      dbdserver = {
        enable = cfg.dbdServer;
        dbdHost = cfg.controlMachine;
        # storagePass = cfg.storagePass;
      };
    };
  };

  networking.firewall.allowedTCPPorts = [ 6817 ];
};
# Additions for hosts with cfg.client set: enable the Slurm client and have
# systemd restart slurmd when it fails.
slurmClient = {
  services.slurm.client.enable = true;
  systemd.services.slurmd.serviceConfig.Restart = "on-failure";
};
# nixpkgs overlay replacing `slurm` with a build that links against UCX and
# http-parser, enables slurmrestd, and wraps slurmstepd/srun with preset
# environment variables.
slurm-ucx = final: prev:
  let
    # Packages are taken from the previous package set.
    inherit (prev.pkgs) ucx http-parser pkg-config makeWrapper;
  in
  {
    slurm = prev.slurm.overrideAttrs (old: {
      buildInputs = old.buildInputs ++ [ ucx http-parser pkg-config ];
      # makeWrapper provides the wrapProgram helper used in postFixup.
      nativeBuildInputs = old.nativeBuildInputs ++ [ makeWrapper ];
      configureFlags = old.configureFlags ++ [
        "--with-ucx=${ucx.dev}"
        "--with-http-parser=${http-parser}"
        "--enable-slurmrestd"
      ];
      # slurmstepd gets the UCX library directory on LD_LIBRARY_PATH and srun
      # gets SLURM_MPI_TYPE preset to pmix.
      postFixup = ''
        wrapProgram $out/bin/slurmstepd --set LD_LIBRARY_PATH ${ucx}/lib
        wrapProgram $out/bin/srun --set SLURM_MPI_TYPE "pmix"
      '';
      # --set PSM3_PKEY "${cfg.pkey}" \
      # --set PMIX_MCA_gds "^ds12" \
    });
  };
# The "hipster" job-runner account and its group, both with id 2001. The home
# directory /work/hipster is referenced but not created by this module.
hipster = {
  users = {
    groups.hipster.gid = 2001;
    users.hipster = {
      isNormalUser = true;
      uid = 2001;
      group = "hipster";
      extraGroups = [ "users" ];
      description = "Job runner";
      home = "/work/hipster";
      createHome = false;
      useDefaultShell = true;
    };
  };
};
# systemd unit running slurmrestd as the hipster user with JWT authentication,
# listening on port 6822.
slurmrestd = {
  systemd.services.slurmrestd = {
    description = "Slurm REST API service";
    wantedBy = [ "multi-user.target" ];
    after = [ "slurmd.service" ];

    environment.SLURM_JWT = "daemon";

    # All unit settings live in a single serviceConfig set.
    serviceConfig = {
      Type = "simple";
      User = "hipster";
      Group = "hipster";
      RuntimeDirectory = "slurmrestd";
    };

    # The binary is taken from the system profile rather than a store path.
    script = ''
      /run/current-system/sw/bin/slurmrestd -v -a rest_auth/jwt :6822
    '';
  };
};
in
{
# Option declarations for features.hpc.slurm.
#
# Options that the configuration interpolates unconditionally (key files,
# cluster name, control machine, mail domain) carry no default: `null` is not
# a valid value for `types.str` / `types.path`, so a `default = null` only
# produced a type error when the option was left unset. Without a default the
# module system reports the option as "used but not defined" instead.
options.features.hpc.slurm = {
  # mkEnableOption prefixes the text with "Whether to enable ".
  enable = mkEnableOption "the SLURM batch system";

  jwtKey = mkOption {
    type = types.path;
    description = ''
      File installed as /etc/slurm/jwt_hs256.key and referenced as the
      jwt_key of AuthAltParameters.
    '';
  };

  mungeKey = mkOption {
    type = types.path;
    description = ''File installed as /etc/munge/munge.key.'';
  };

  clusterName = mkOption {
    type = types.str;
    description = ''Value passed to services.slurm.clusterName.'';
  };

  # Only referenced from a commented-out wrapper flag (PSM3_PKEY) in this module.
  pkey = mkOption {
    type = types.str;
    default = "0x7fff";
    description = ''Partition key; currently not used by the generated configuration.'';
  };

  controlMachine = mkOption {
    type = types.str;
    description = ''
      Value passed to services.slurm.controlMachine and, on the server, to
      services.slurm.dbdserver.dbdHost.
    '';
  };

  dbdHost = mkOption {
    type = types.str;
    default = "localhost";
    description = ''Host written to AccountingStorageHost in slurm.conf.'';
  };

  server = mkOption {
    type = types.bool;
    default = false;
    description = ''
      Apply the server role: MariaDB accounting database, the Slurm server
      and TCP port 6817.
    '';
  };

  dbdServer = mkOption {
    type = types.bool;
    default = false;
    description = ''
      Value of services.slurm.dbdserver.enable; only applied together with
      the server role.
    '';
  };

  client = mkOption {
    type = types.bool;
    default = false;
    description = ''Apply the client role (services.slurm.client).'';
  };

  hipster = mkOption {
    type = types.bool;
    default = true;
    description = ''
      Create the hipster job-runner user and group. The slurmrestd service
      runs as this user and is only set up when this option is true.
    '';
  };

  slurmrestd = mkOption {
    type = types.bool;
    default = false;
    description = ''Run the slurmrestd service on this host (requires hipster).'';
  };

  nodeName = mkOption {
    type = types.listOf types.str;
    default = [];
    description = ''Value passed to services.slurm.nodeName.'';
  };

  partitionName = mkOption {
    type = types.listOf types.str;
    default = [];
    description = ''Value passed to services.slurm.partitionName.'';
  };

  # Not consumed anywhere in this module (the dbdserver storagePass line is
  # commented out), hence optional.
  storagePass = mkOption {
    type = types.nullOr types.str;
    default = null;
    description = ''Storage password for slurmdbd; currently unused.'';
  };

  mailDomain = mkOption {
    type = types.str;
    description = ''Value written to MailDomain in slurm.conf.'';
  };

  # NOTE(review): the default is the (non-null) empty placeholder directory,
  # so with the option left unset `cfg.slurmKey != null` still holds and
  # AuthType=auth/slurm is emitted. Confirm whether `null` was the intended
  # default.
  slurmKey = lib.mkOption {
    type = lib.types.nullOr lib.types.path;
    default = emptyKey;
    description = ''File containing the slurm.key to be used for auth/slurm.'';
  };
};
# Assemble the final configuration from the role fragments.
# Nothing is applied unless features.hpc.slurm.enable is set.
config = mkIf cfg.enable (
  mkMerge [
    configuration
    (mkIf cfg.server slurmServer)
    (mkIf cfg.client slurmClient)
    (mkIf cfg.hipster hipster)
    # slurmrestd runs as the hipster user, so it needs cfg.hipster; it is
    # wanted on servers and on hosts that ask for it explicitly. The two
    # triggers are combined into one condition so the fragment is merged at
    # most once - merging it twice duplicates list entries and concatenates
    # the unit's `script` lines.
    (mkIf (cfg.hipster && (cfg.server || cfg.slurmrestd)) slurmrestd)
  ]);
}