Files
platform/ekman/default.nix
2025-11-18 14:19:23 +01:00

361 lines
10 KiB
Nix

{
pkgs,
lib,
config,
...
}:
with lib;
let
# Shorthand for the `features.host` settings; `name` and `address` are read
# from it when the kubelet SAN list is built.
cfg = config.features.host;
# Every compute node: the c0 node list followed by the c1 node list.
computeNodes = (import ./c0/nodes.nix) ++ (import ./c1/nodes.nix);
# Build the subject-alternative-name list for `host`: its name, its address
# and the loopback address, in that order.
mkSANs =
  host:
  let
    loopback = "127.0.0.1";
  in
  [
    host.name
    host.address
    loopback
  ];
configuration = {
# Automatic upgrades are forced off for these hosts.
system.autoUpgrade.enable = lib.mkForce false;
# Overlays, in order: the local overlay file, then the certmgr override.
nixpkgs.overlays = map import [
  ./overlays.nix
  ../modules/overrides/certmgr.nix
];
# Boot with systemd-boot on EFI and load the InfiniBand and Ceph kernel
# modules listed below.
boot.loader = {
  systemd-boot.enable = true;
  efi.canTouchEfiVariables = true;
};
# boot.kernelPackages = pkgs.linuxKernel.packages.linux_6_9;
boot.kernelModules = [
  "ib_umad"
  "ib_ipoib"
  "ceph"
];
# boot.kernelParams = [
#   "console=ttyS0,115200"
#   "console=tty0"
# ];
# systemd-resolved with DNSSEC, multicast DNS and LLMNR turned off, the local
# stub listener disabled, and 1.1.1.1 / 1.0.0.1 as fallback servers.
services.resolved = {
  enable = true;
  dnssec = "false";
  fallbackDns = [
    "1.1.1.1"
    "1.0.0.1"
  ];
  # domains = [ "ekman.tos.obx" "~." ];
  # systemd configuration files only treat a line as a comment when it starts
  # with "#" or ";". A trailing "# ..." after a value is read as part of the
  # value, which makes the setting unparsable and therefore ignored, so the
  # explanation for DNSStubListener lives on its own line.
  extraConfig = ''
    # The stub listener conflicts with dnsmasq and kubernetes dns
    DNSStubListener=no
    MulticastDNS=no
    LLMNR=no
  '';
};
# Console, locale and time zone settings.
console = {
  keyMap = "us";
  font = "Lat2-Terminus16";
};
i18n = {
  defaultLocale = "en_US.UTF-8";
  # Each of the categories below is set to the same en_DK.UTF-8 locale.
  extraLocaleSettings = lib.genAttrs [
    "LC_CTYPE"
    "LC_TIME"
    "LC_PAPER"
    "LC_NAME"
    "LC_ADDRESS"
    "LC_TELEPHONE"
    "LC_MEASUREMENT"
    "LC_IDENTIFICATION"
  ] (_: "en_DK.UTF-8");
};
time.timeZone = "Europe/Oslo";
# /etc/aliases: a single alias mapping root to an administrator mailbox.
environment.etc.aliases = {
  mode = "0644";
  text = ''
    root: jonas.juselius@oceanbox.io
  '';
};
# Switches for the project's own `features.*` options. Their meaning is
# defined by the imported modules, not in this file.
features = {
os = {
# boot.uefi = true;
# Public SSH keys of the administrators; the text after each key is its
# comment. Presumably these become authorized keys for the admin account --
# confirm in the module that defines features.os.
adminAuthorizedKeys = [
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKiAS30ZO+wgfAqDE9Y7VhRunn2QszPHA5voUwo+fGOf jonas"
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQDULdlLC8ZLu9qBZUYsjhpr6kv5RH4yPkekXQdD7prkqapyoptUkO1nOTDwy7ZsKDxmp9Zc6OtdhgoJbowhGW3VIZPmooWO8twcaYDpkxEBLUehY/n8SlAwBtiHJ4mTLLcynJMVrjmTQLF3FeWVof0Aqy6UtZceFpLp1eNkiHTCM3anwtb9+gfr91dX1YsAOqxqv7ooRDu5rCRUvOi4OvRowepyuBcCjeWpTkJHkC9WGxuESvDV3CySWkGC2fF2LHkAu6SFsFE39UA5ZHo0b1TK+AFqRFiBAb7ULmtuno1yxhpBxbozf8+Yyc7yLfMNCyBpL1ci7WnjKkghQv7yM1xN2XMJLpF56v0slSKMoAs7ThoIlmkRm/6o3NCChgu0pkpNg/YP6A3HfYiEDgChvA6rAHX6+to50L9xF3ajqk4BUzWd/sCk7Q5Op2lzj31L53Ryg8vMP8hjDjYcgEcCCsGOcjUVgcsmfC9LupwRIEz3aF14AWg66+3zAxVho8ozjes= jonas.juselius@juselius.io"
"ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQC2tox0uyFGfU1zPNU6yAVSoGOUkeU959aiTMrqu1U9MCCOP2o4IhZIlRpZ08XVnUU/AhycCUF4HgGqdcco8oIVX0P0Cn83KJoD/DOqAiz+1VwIUUV1ylrRdNqCgf4wnmLni3sUPHJdQnuq57+pzDDjHMr9CcBL2KzOHD/QanfR+jZmv9K3OS5oDcWquSCziXkpbkWQURPactmtyzGK2FRRxONZgYrB8gRTDstlWQg/t6GHNVelzuJ7SEf+t8pk/S2e/XAvfZyRJhrVJ35iZKpmxkIn5v0g1Z+z0yX/KRSAPRtNg9uM44cmto77MFx7iFs0CuleL3zHvRvZYW1ZnsKAiP07UkEK87luMpkTzFr9CSHJGpgk1RZYA3qidQti44n6NU9YRNhzO4v+KQE6XDqO80gZCJboSXr3fnYn/QHpPXzK5JcZNWmClyMURYj10qv9So3Fh0o3LV5GThA6JgN874vUywUZanPEdn8ePBcAsjLRzA4YBGEuvJCc6FELSuY2s+/pFba8NXQvrOdJKSRC0g5USQFfaWDln4Q4zZ1G5z76p1u6GtRWxvakkUQ0fze9KAW7msxeKaw+B7uMtyvCL8V2zEE8WKFP1sNyYEe7Sgp3RVfym2VPMNTZVhEImfM/3D+WbzfoJztnJvFKXeeMCcne4G8swyef3o1s3b+CvQ== Simen Kirkvik"
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHVwcJOtx9YTWy+aD4xGbyPFLOdMN6NqY8wcfDtHczyT Stig Rune Jensen"
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKfgY468dPNpdXZCkD9jw1p2qA0+z56Wi/c1VYE+riki Stig Rune Jensen"
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAII77Aa2MFZMTha8PdkNg32UR8y6Hwb4R0aR9Ad9qifNq mrtz@wurst"
];
docker.enable = false;
};
# Binary cache integration and the node exporter are off for these hosts.
cachix.enable = false;
monitoring.nodeExporter.enable = false;
mft.enable = true; # Mellanox MFT
};
# Static network identity: no DHCP, fixed name servers and search domain.
networking = {
  domain = "ekman.tos.obx";
  search = [ "ekman.tos.obx" ];
  useDHCP = false;
  nameservers = [
    "10.255.241.210"
    "10.255.241.99"
  ];
  # Shared host entries followed by the entries specific to this cluster.
  extraHosts = (import ../hosts.nix) + (import ./hosts.nix);
  # Accept all inbound traffic from the three listed source networks.
  firewall.extraCommands = ''
    iptables -I INPUT -s 10.255.241.0/24 -j ACCEPT
    iptables -I INPUT -s 10.255.243.0/24 -j ACCEPT
    iptables -I INPUT -s 100.64.0.0/24 -j ACCEPT
  '';
};
environment.variables = { };
# systemd.services."serial-getty@ttyS0".enable = true;
# environment.etc."beegfs/connauthfile" = {
#   source = ./connauthfile;
#   mode = "0400";
#   uid = 0;
#   gid = 0;
# };
# certmgr timing: validity threshold and renewal check interval.
services.certmgr = {
  validMin = "120h";
  renewInterval = "30m";
};
# Nix daemon settings; no substituters are added here.
nix.settings = {
  max-jobs = 32;
  trusted-users = [ "@wheel" ];
  substituters = [ ];
};
# Reset the mode of /work/kraken to 755 on every activation.
# NOTE(review): chmod fails when /work/kraken is absent at activation time --
# confirm the directory exists on every host that receives this config.
system.activationScripts.kraken-permissions.text = ''
  chmod 755 /work/kraken
'';
};
# Slurm client settings for the "ekman" cluster, applied when
# `cluster.slurm` is enabled.
slurm = {
  features = {
    hpc = {
      slurm = {
        enable = true;
        client = true;

        # Cluster identity and controller / accounting endpoints.
        clusterName = "ekman";
        slurmctldHosts = [ "ekman-manage(10.255.241.99)" ];
        dbdHost = "slurm-accounting";
        mailDomain = "oceanbox.io";

        # Authentication material, referenced by path.
        mungeKey = ../munge.key;
        jwtKey = ../jwt_hs256.key;
        # slurmKey = ../slurm.key;
        # pkey = "0x7666";

        # Node definitions, one entry per node or node range.
        nodeName = [
          "c0-[1-18] Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
          "c1-[1-8] Sockets=1 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=100000 State=UNKNOWN"
          "ekman Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
          "ekman-manage Sockets=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=92000 TmpDisk=200000 State=UNKNOWN"
          "fs-backup Sockets=2 CoresPerSocket=20 ThreadsPerCore=1 RealMemory=92000 TmpDisk=300000 State=UNKNOWN"
        ];

        # Partition definitions; "batch" is the default partition.
        partitionName = [
          "batch Nodes=c0-[1-17] Default=YES MaxTime=INFINITE State=UP"
          "ekman Nodes=ekman MaxTime=1:00:00 State=UP"
          "short Nodes=c1-[1-8],c0-18 MaxTime=INFINITE State=UP"
          "long Nodes=c1-[2-8],c0-18 MaxTime=INFINITE State=UP"
          "stats Nodes=c1-[7-8] MaxTime=INFINITE State=UP"
          "test Nodes=fs-backup MaxTime=INFINITE State=UP"
        ];
      };
    };
  };
};
# Compute-node role: enables the HPC feature set, turns on the slurm role
# and makes sure /work exists at activation.
compute = {
  cluster.slurm = true;
  features.hpc.enable = true;
  system.activationScripts.mkWorkDir.text = "mkdir -p /work";
};
# Kubernetes node role for the "ekman" cluster, applied when
# `cluster.k8sNode` is enabled.
k8s-node = {
  features.k8s = {
    enable = true;
    node.enable = true;
    clusterName = "ekman";
    initca = ./ca;
    cidr = "10.100.0.0/16";
    fileserver = "fs-work";
    charts.acme_email = "acme@oceanbox.io";
    master = {
      name = "ekman-manage";
      address = "10.255.241.99";
      # extraSANs = [
      #   "frontend.oceanbox.io"
      # ];
    };
    ingressNodes = [
      "ekman-manage.oceanbox.io"
      "ekman.oceanbox.io"
    ];
  };

  # Copy the CA private key into the kubernetes secrets directory on
  # activation.
  # NOTE(review): interpolating ./ca places the whole directory, key
  # included, in the Nix store -- confirm that exposure is acceptable.
  system.activationScripts.copyCaKey.text = "cp ${./ca}/ca-key.pem /var/lib/kubernetes/secrets";

  # Kubelet certificate SANs: this host's name and address plus loopback.
  services.kubernetes.kubelet.extraSANs = mkSANs {
    inherit (cfg) name address;
  };
};
# Host-based SSH trust between cluster machines: shosts.equiv, known host
# keys, an ssh-keysign wrapper, plus a package diff printed on activation.
shosts =
  let
    # One "address\n" line per compute node for shosts.equiv.
    computeAddressLines = lib.concatMapStrings (node: "${node.address}\n") computeNodes;

    # known_hosts entry for one compute node; `idx` supplies the last octet
    # of the node's address on both networks.
    knownHostFor =
      node:
      let
        octet = toString node.idx;
      in
      {
        "${node.name}" = {
          publicKeyFile = node.pubkey;
          hostNames = [
            "${node.name}"
            "${node.name}.ekman.tos.obx"
            "10.255.241.${octet}"
            "10.255.243.${octet}"
          ];
        };
      };

    # Merge left to right, so a later node wins if two share a name.
    computeKnownHosts = builtins.foldl' (acc: node: acc // knownHostFor node) { } computeNodes;

    # Hand-maintained entries for the management, login and file servers.
    fixedKnownHosts = {
      ekman-manage = {
        publicKeyFile = ./manage/ssh_host_key.pub;
        hostNames = [
          "ekman-manage"
          "ekman-manage.ekman.tos.obx"
          "frontend.oceanbox.io"
          "10.255.241.99"
          "10.255.243.99"
        ];
      };
      ekman = {
        publicKeyFile = ./login/ssh_host_key.pub;
        hostNames = [
          "ekman"
          "ekman.ekman.tos.obx"
          "ekman.oceanbox.io"
          "10.255.241.100"
          "10.255.243.100"
        ];
      };
      fs-work = {
        publicKeyFile = ./fs-work/ssh_host_key.pub;
        hostNames = [
          "fs-work"
          "fs-work.ekman.tos.obx"
          "10.255.241.90"
          "10.255.243.90"
        ];
      };
      fs-backup = {
        publicKeyFile = ./fs-backup/ssh_host_key.pub;
        hostNames = [
          "fs-backup"
          "fs-backup.ekman.tos.obx"
          "10.255.241.80"
          "10.255.243.80"
        ];
      };
    };
  in
  {
    # Two fixed addresses first, then every compute node address.
    environment.etc."ssh/shosts.equiv" = {
      uid = 0;
      gid = 0;
      mode = "0644";
      text = ''
        10.255.241.80
        10.255.241.90
      ''
      + computeAddressLines;
    };

    programs.ssh.knownHosts = fixedKnownHosts // computeKnownHosts;

    environment.systemPackages = [
      openssh-shosts
    ]
    ++ (with pkgs; [
      inotify-tools
      ceph
      ceph-client
    ]);

    # Wrapper for ssh-keysign from the rebuilt OpenSSH; the permission
    # string includes the setuid bit for the root owner.
    security.wrappers.ssh-keysign = {
      source = "${openssh-shosts}/libexec/ssh-keysign";
      owner = "root";
      group = "root";
      permissions = "u+rs,g+rx,o+rx";
    };

    # Use nvd to get package diff before apply
    system.activationScripts.system-diff = {
      supportsDryActivation = true; # safe: only outputs to stdout
      text = ''
        export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
        if [ -e /run/current-system ]; then
        ${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
        fi
      '';
    };
  };
# OpenSSH built with SSH_KEYSIGN pointing at /run/wrappers/bin/ssh-keysign
# instead of the copy inside the package.
openssh-shosts = pkgs.openssh.overrideAttrs (_: {
  doCheck = false; # the tests take hours
  buildFlags = [ "SSH_KEYSIGN=/run/wrappers/bin/ssh-keysign" ];
});
in
{
  imports = [
    ../modules
    ../nixos
    ../users.nix
  ];

  # Role switches for a host. mkEnableOption already prefixes the text with
  # "Whether to enable", so each description is just the noun phrase.
  options.cluster = {
    compute = mkEnableOption "compute node configs";
    k8sNode = mkEnableOption "k8s node";
    slurm = mkEnableOption "slurm";
  };

  # The base configuration and the shosts setup always apply; each role
  # fragment is merged in only when its switch is on.
  config = mkMerge [
    configuration
    shosts
    (mkIf config.cluster.compute compute)
    (mkIf config.cluster.k8sNode k8s-node)
    (mkIf config.cluster.slurm slurm)
  ];
}