Merge branch 'main' of gitlab.com:oceanbox/clusterfck

This commit is contained in:
2025-09-27 16:01:07 +02:00
19 changed files with 59 additions and 201 deletions

View File

@@ -11,7 +11,6 @@ let
compute = {
deployment.tags = [ "compute" "c0" ];
};
mkCompute = host:
@@ -39,8 +38,8 @@ let
name = host.name;
address = host.address;
};
os.externalInterface = "enp33s0f0np0";
hpc.compute = true;
os.externalInterface = host.iface;
hpc.computeNode = true;
hpc.knem = true;
# k8s = { inherit etcdCluster; };
};
@@ -54,7 +53,7 @@ let
networking = {
hostName = host.name;
useDHCP = false;
interfaces.enp33s0f0np0 = {
interfaces."${host.iface}" = {
useDHCP = false;
ipv4.addresses = [ {
address = host.address;

View File

@@ -1,9 +1,10 @@
with builtins;
let
nodes = genList (n: n + 1) 16; in
nodes = genList (n: n + 1) 18; in
map (n: (
rec {
idx = 100 + n;
iface = if n > 16 then "enp33s0f3np3" else "enp33s0f0np0";
name = "c0-${toString n}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";

View File

@@ -1 +0,0 @@
q丘

View File

@@ -1,88 +0,0 @@
# Deployment expression for a group of compute hosts.
# Evaluates to one attribute set with an entry per host listed in ./nodes.nix,
# keyed by the host's name.
# NOTE(review): the `pkgs` argument is not referenced by any active
# (uncommented) line in this file.
{ pkgs ? import <nixpkgs> {} }:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
# Host records; this file reads the `name`, `address` and `ipoib` fields.
nodes = import ./nodes.nix;
# Attributes merged into every host entry (see `// compute` below).
compute = {
deployment.tags = [ "compute" "c0" ];
};
# mkCompute: host record -> single-attribute set { "<host.name>" = <config>; }.
mkCompute = host:
let
# Per-host hardware configuration: ./hardware-configuration.d/<host.name>.nix
hw = ./hardware-configuration.d + "/${host.name}.nix";
in {
"${host.name}" = {
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
users = true;
opt = true;
work = true;
data = false;
backup = false;
ceph = false;
};
};
features = {
host = {
name = host.name;
address = host.address;
};
# NOTE(review): this names enp33s0f0np0, while the interface configured
# under `networking.interfaces` below is enp33s0f3np3 — confirm which
# one is intended for these hosts.
os.externalInterface = "enp33s0f0np0";
hpc.compute = true;
hpc.knem = true;
# k8s = { inherit etcdCluster; };
};
# The deployment target is the host's own address.
deployment.targetHost = host.address;
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
networking = {
hostName = host.name;
useDHCP = false;
# Static address on a /24, plus a route to 10.255.242.0/24 via 10.255.241.100.
interfaces.enp33s0f3np3 = {
useDHCP = false;
ipv4.addresses = [ {
address = host.address;
prefixLength = 24;
} ];
ipv4.routes = [ {
address = "10.255.242.0";
prefixLength = 24;
via = "10.255.241.100";
} ];
};
# interfaces."ibp1s0.7666" = {
# Second interface, statically addressed from the host's `ipoib` field.
interfaces."ibp1s0" = {
useDHCP = false;
ipv4.addresses = [ {
address = host.ipoib;
prefixLength = 24;
} ];
};
};
imports = [
hw
../default.nix
../mounts.nix
# ./kernel.nix
];
}
# Merge the shared `compute` attributes (deployment tags) into this host's set.
// compute;
};
# Fold every host's single-attribute set into one combined attribute set.
in builtins.foldl' (a: n: a // mkCompute n) {} nodes

View File

@@ -1,48 +0,0 @@
# Module that selects the 5.10 kernel package set and defines (but does not
# currently use) a derivation for an out-of-tree i40e network driver.
{pkgs, lib, ...}:
let
# Kernel the i40e derivation below is built against.
# NOTE(review): this is pkgs.linuxPackages.kernel, whereas the module sets
# boot.kernelPackages to linux_5_10 — confirm they match before re-enabling i40e.
kernel = pkgs.linuxPackages.kernel;
# Out-of-tree build of the i40e driver from github:dmarion/i40e at a pinned rev.
# NOTE(review): `i40e` is not referenced by the attribute set returned below
# (only by commented-out lines), so it is presumably never built.
i40e =
pkgs.stdenv.mkDerivation rec {
name = "i40e-${version}-${kernel.version}";
version = "2.13.10";
src = pkgs.fetchFromGitHub {
owner = "dmarion";
repo = "i40e";
rev = "7228a7c3b362c3170baa2f9a9c6870a900e78dbd";
sha256 = "087kvq9wrc1iw6vig8cqcx7cb6346wx8qxzb85c3n8638vq1vrxr";
};
hardeningDisable = [ "pic" ];
# Enters src/, rewrites /lib/modules references in Makefile and common.mk to
# point at ${kernel.dev}/lib/modules, and passes BUILD_KERNEL via makeFlags.
configurePhase = ''
cd src
kernel_version=${kernel.modDirVersion}
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' Makefile
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' common.mk
export makeFlags="BUILD_KERNEL=$kernel_version"
'';
# Installs the built i40e.ko under the name i40e2.ko.
# Relies on the $kernel_version shell variable assigned in configurePhase
# (presumably both phases run in the same builder shell — verify).
installPhase = ''
install -v -D -m 644 i40e.ko "$out/lib/modules/$kernel_version/kernel/drivers/net/i40e/i40e2.ko"
'';
dontStrip = true;
enableParallelBuilding = true;
meta = {
description = "Linux kernel drivers for Intel Ethernet adapters and LOMs (LAN On Motherboard)";
homepage = https://github.com/dmarion/i40e;
license = lib.licenses.gpl2;
};
};
in
{
# i40e2 = i40e;
# The only active setting of this module: use the linux_5_10 package set.
boot.kernelPackages = pkgs.linuxKernel.packages.linux_5_10;
# overlay = self: super: {
# linuxPackages_5_4 = super.linuxPackages_5_4 // { inherit i40e; };
# };
}

View File

@@ -1,11 +0,0 @@
# List of host records for node numbers 17 and 18.
# `with builtins;` brings genList, map and toString into scope.
with builtins;
let
# genList calls the function with 0 and 1, so this yields [ 17 18 ].
nodes = genList (n: n + 17) 2; in
map (n: (
rec {
# Last octet shared by both addresses below: 100 + node number.
idx = 100 + n;
name = "c0-${toString n}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
# Path, relative to this file's directory, of the host's public key file.
pubkey = ./. + "/ssh_host_key.d/c0-${toString n}.pub";
})) nodes

View File

@@ -39,7 +39,7 @@ let
address = host.address;
};
os.externalInterface = "eno33np0";
hpc.compute = true;
hpc.computeNode = true;
# k8s = { inherit etcdCluster; };
};

View File

@@ -100,6 +100,7 @@ let
};
networking = {
useDHCP = false;
domain = mkDefault "cluster.local";
defaultGateway = mkDefault "10.255.241.1";
nameservers = mkDefault [ "8.8.8.8" ];
@@ -144,12 +145,13 @@ let
features.hpc.slurm = {
enable = true;
client = true;
# clusterName = "ekman";
mungeKey = ./munge.key;
# jwtKey = ./jwt_hs256.key;
mungeUid = mkDefault 996; # hack
# pkey = "0x7666";
clusterName = "ekman";
controlMachine = "ekman-manage";
dbdHost = "10.255.241.15";
mungeKey = ./munge.key;
jwtKey = ./jwt_hs256.key;
slurmKey = ./slurm.key;
# pkey = "0x7666";
mailDomain = "oceanbox.io";
nodeName = [
"c0-[1-18] Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"

View File

@@ -69,7 +69,6 @@ in {
};
};
features.hpc.slurm.mungeUid = 991;
features.mft.enable = lib.mkForce true;
features = {

View File

@@ -46,8 +46,6 @@ in {
};
};
features.hpc.slurm.mungeUid = 994;
features = {
host = {
inherit address;

View File

@@ -17,9 +17,8 @@ let
login = import ./login { inherit pkgs; };
c0 = import ./c0 { inherit pkgs; };
c0x = import ./c0x { inherit pkgs; };
c1 = import ./c1 { inherit pkgs; };
fs-work = import ./fs-work { inherit pkgs; };
fs-backup = import ./fs-backup { inherit pkgs; };
in
{ inherit ekman-manage; frontend = ekman-manage; } // login // c0 // c0x // c1 // fs-work // fs-backup
{ inherit ekman-manage; frontend = ekman-manage; } // login // c0 // c1 // fs-work // fs-backup

View File

@@ -51,8 +51,8 @@ in
hpc = {
slurm.server = false;
slurm.slurmrestd = false;
manage = false;
login = true;
manageNode = false;
loginNode = true;
knem = false;
};

View File

@@ -86,9 +86,9 @@ in {
hpc = {
slurm.server = true;
slurm.slurmrestd = false;
slurm.mungeUid = 996;
manage = true;
slurm.slurmrestd = true;
slurm.dbdServer = false;
manageNode = true;
};
k8s = {
@@ -283,6 +283,18 @@ in {
'';
};
services.dnsmasq.enable = true;
services.dnsmasq.settings = {
domain = [ "cluster.local" ];
server = [
"/obx.hs/100.100.100.100" # headscale dns
];
address = [
"/slurmctld.cluster.local/127.0.0.1"
];
srv-host = "_slurmctld._tcp.cluster.local,slurmctld.cluster.local,6817,0,5";
};
# ssh-rsa is deprecated, but putty/winscp users use it
services.openssh.extraConfig = ''
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
@@ -323,11 +335,25 @@ in {
useRoutingFeatures = "both"; # for exit-node usage
extraUpFlags = [
"--login-server=https://headscale.svc.oceanbox.io"
"--accept-dns=false"
"--accept-dns=true" # see dnsmasq
"--accept-routes=true"
"--advertise-exit-node"
"--advertise-routes=10.255.241.0/24"
];
};
services.networkd-dispatcher = {
enable = true;
rules = {
"tailscale-router" = {
onState = [ "routable" ];
script = ''
#!${pkgs.runtimeShell}
${pkgs.ethtool}/bin/ethtool -K enp65s0np0 rx-udp-gro-forwarding on rx-gro-list off
exit 0
'';
};
};
};
imports = [
./hardware-configuration.nix

View File

@@ -94,7 +94,7 @@ let
};
cachix.enable = false;
monitoring.nodeExporter.enable = false;
mft.enable = true; # Mellanox MFT
mft.enable = false; # Mellanox MFT
};
networking = {
@@ -289,6 +289,18 @@ let
permissions = "u+rs,g+rx,o+rx";
};
};
# Use nvd to get package diff before apply
system.activationScripts.system-diff = {
supportsDryActivation = true; # safe: only outputs to stdout
text = ''
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
if [ -e /run/current-system ]; then
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
fi
'';
};
};
openssh-shosts = pkgs.openssh.overrideAttrs (attrs: {

View File

@@ -205,25 +205,6 @@ in {
# secret-key-files = /etc/nix/ekman.key
'';
programs.msmtp = {
enable = true;
accounts = {
default = {
auth = false;
tls = false;
tls_starttls = false;
port = 24;
from = "rossby@oceanbox.io";
host = "smtpgw.itpartner.no";
# user = "utvikling";
# password = "S0m3rp0m@de#21!";
};
};
defaults = {
aliases = "/etc/aliases";
};
};
services.prometheus.alertmanager.configuration.global = {
smtp_smarthost = "smtpgw.itpartner.no";
# smtp_auth_username = "utvikling";
@@ -278,17 +259,6 @@ in {
srv-host = "_slurmctld._tcp.cluster.local,slurmctld.cluster.local,6817,0,5";
};
# Use nvd to get package diff before apply
system.activationScripts.system-diff = {
supportsDryActivation = true; # safe: only outputs to stdout
text = ''
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
if [ -e /run/current-system ]; then
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
fi
'';
};
# ssh-rsa is deprecated, but putty/winscp users use it
services.openssh.extraConfig = ''
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256