Merge branch 'main' of gitlab.com:oceanbox/clusterfck
This commit is contained in:
@@ -11,7 +11,6 @@ let
|
||||
|
||||
compute = {
|
||||
deployment.tags = [ "compute" "c0" ];
|
||||
|
||||
};
|
||||
|
||||
mkCompute = host:
|
||||
@@ -39,8 +38,8 @@ let
|
||||
name = host.name;
|
||||
address = host.address;
|
||||
};
|
||||
os.externalInterface = "enp33s0f0np0";
|
||||
hpc.compute = true;
|
||||
os.externalInterface = host.iface;
|
||||
hpc.computeNode = true;
|
||||
hpc.knem = true;
|
||||
# k8s = { inherit etcdCluster; };
|
||||
};
|
||||
@@ -54,7 +53,7 @@ let
|
||||
networking = {
|
||||
hostName = host.name;
|
||||
useDHCP = false;
|
||||
interfaces.enp33s0f0np0 = {
|
||||
interfaces."${host.iface}" = {
|
||||
useDHCP = false;
|
||||
ipv4.addresses = [ {
|
||||
address = host.address;
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
with builtins;
|
||||
let
|
||||
nodes = genList (n: n + 1) 16; in
|
||||
nodes = genList (n: n + 1) 18; in
|
||||
map (n: (
|
||||
rec {
|
||||
idx = 100 + n;
|
||||
iface = if n > 16 then "enp33s0f3np3" else "enp33s0f0np0";
|
||||
name = "c0-${toString n}";
|
||||
address = "10.255.241.${toString idx}";
|
||||
ipoib = "10.255.243.${toString idx}";
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
莵q丘
|
||||
@@ -1,88 +0,0 @@
|
||||
{ pkgs ? import <nixpkgs> {} }:
|
||||
let
|
||||
# Pin the deployment package-set to a specific version of nixpkgs
|
||||
# pkgs = import (builtins.fetchTarball {
|
||||
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
|
||||
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
|
||||
# }) {};
|
||||
# pkgs = import <nixpkgs> {};
|
||||
|
||||
nodes = import ./nodes.nix;
|
||||
|
||||
compute = {
|
||||
deployment.tags = [ "compute" "c0" ];
|
||||
};
|
||||
|
||||
mkCompute = host:
|
||||
let
|
||||
hw = ./hardware-configuration.d + "/${host.name}.nix";
|
||||
in {
|
||||
"${host.name}" = {
|
||||
cluster = {
|
||||
compute = true;
|
||||
k8sNode = true;
|
||||
mounts = {
|
||||
rdma.enable = true;
|
||||
automount.enable = true;
|
||||
users = true;
|
||||
opt = true;
|
||||
work = true;
|
||||
data = false;
|
||||
backup = false;
|
||||
ceph = false;
|
||||
};
|
||||
};
|
||||
|
||||
features = {
|
||||
host = {
|
||||
name = host.name;
|
||||
address = host.address;
|
||||
};
|
||||
os.externalInterface = "enp33s0f0np0";
|
||||
hpc.compute = true;
|
||||
hpc.knem = true;
|
||||
# k8s = { inherit etcdCluster; };
|
||||
};
|
||||
|
||||
deployment.targetHost = host.address;
|
||||
|
||||
# services.udev.extraRules = ''
|
||||
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
|
||||
# '';
|
||||
|
||||
networking = {
|
||||
hostName = host.name;
|
||||
useDHCP = false;
|
||||
interfaces.enp33s0f3np3 = {
|
||||
useDHCP = false;
|
||||
ipv4.addresses = [ {
|
||||
address = host.address;
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
ipv4.routes = [ {
|
||||
address = "10.255.242.0";
|
||||
prefixLength = 24;
|
||||
via = "10.255.241.100";
|
||||
} ];
|
||||
|
||||
};
|
||||
# interfaces."ibp1s0.7666" = {
|
||||
interfaces."ibp1s0" = {
|
||||
useDHCP = false;
|
||||
ipv4.addresses = [ {
|
||||
address = host.ipoib;
|
||||
prefixLength = 24;
|
||||
} ];
|
||||
};
|
||||
};
|
||||
imports = [
|
||||
hw
|
||||
../default.nix
|
||||
../mounts.nix
|
||||
# ./kernel.nix
|
||||
];
|
||||
}
|
||||
// compute;
|
||||
};
|
||||
in builtins.foldl' (a: n: a // mkCompute n) {} nodes
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
{pkgs, lib, ...}:
|
||||
let
|
||||
kernel = pkgs.linuxPackages.kernel;
|
||||
i40e =
|
||||
pkgs.stdenv.mkDerivation rec {
|
||||
name = "i40e-${version}-${kernel.version}";
|
||||
version = "2.13.10";
|
||||
|
||||
src = pkgs.fetchFromGitHub {
|
||||
owner = "dmarion";
|
||||
repo = "i40e";
|
||||
rev = "7228a7c3b362c3170baa2f9a9c6870a900e78dbd";
|
||||
sha256 = "087kvq9wrc1iw6vig8cqcx7cb6346wx8qxzb85c3n8638vq1vrxr";
|
||||
};
|
||||
|
||||
hardeningDisable = [ "pic" ];
|
||||
|
||||
configurePhase = ''
|
||||
cd src
|
||||
kernel_version=${kernel.modDirVersion}
|
||||
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' Makefile
|
||||
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' common.mk
|
||||
export makeFlags="BUILD_KERNEL=$kernel_version"
|
||||
'';
|
||||
|
||||
installPhase = ''
|
||||
install -v -D -m 644 i40e.ko "$out/lib/modules/$kernel_version/kernel/drivers/net/i40e/i40e2.ko"
|
||||
'';
|
||||
|
||||
dontStrip = true;
|
||||
|
||||
enableParallelBuilding = true;
|
||||
|
||||
meta = {
|
||||
description = "Linux kernel drivers for Intel Ethernet adapters and LOMs (LAN On Motherboard)";
|
||||
homepage = https://github.com/dmarion/i40e;
|
||||
license = lib.licenses.gpl2;
|
||||
};
|
||||
};
|
||||
in
|
||||
{
|
||||
# i40e2 = i40e;
|
||||
boot.kernelPackages = pkgs.linuxKernel.packages.linux_5_10;
|
||||
# overlay = self: super: {
|
||||
# linuxPackages_5_4 = super.linuxPackages_5_4 // { inherit i40e; };
|
||||
# };
|
||||
}
|
||||
|
||||
@@ -1,11 +0,0 @@
|
||||
with builtins;
|
||||
let
|
||||
nodes = genList (n: n + 17) 2; in
|
||||
map (n: (
|
||||
rec {
|
||||
idx = 100 + n;
|
||||
name = "c0-${toString n}";
|
||||
address = "10.255.241.${toString idx}";
|
||||
ipoib = "10.255.243.${toString idx}";
|
||||
pubkey = ./. + "/ssh_host_key.d/c0-${toString n}.pub";
|
||||
})) nodes
|
||||
@@ -39,7 +39,7 @@ let
|
||||
address = host.address;
|
||||
};
|
||||
os.externalInterface = "eno33np0";
|
||||
hpc.compute = true;
|
||||
hpc.computeNode = true;
|
||||
# k8s = { inherit etcdCluster; };
|
||||
};
|
||||
|
||||
|
||||
@@ -100,6 +100,7 @@ let
|
||||
};
|
||||
|
||||
networking = {
|
||||
useDHCP = false;
|
||||
domain = mkDefault "cluster.local";
|
||||
defaultGateway = mkDefault "10.255.241.1";
|
||||
nameservers = mkDefault [ "8.8.8.8" ];
|
||||
@@ -144,12 +145,13 @@ let
|
||||
features.hpc.slurm = {
|
||||
enable = true;
|
||||
client = true;
|
||||
# clusterName = "ekman";
|
||||
mungeKey = ./munge.key;
|
||||
# jwtKey = ./jwt_hs256.key;
|
||||
mungeUid = mkDefault 996; # hack
|
||||
# pkey = "0x7666";
|
||||
clusterName = "ekman";
|
||||
controlMachine = "ekman-manage";
|
||||
dbdHost = "10.255.241.15";
|
||||
mungeKey = ./munge.key;
|
||||
jwtKey = ./jwt_hs256.key;
|
||||
slurmKey = ./slurm.key;
|
||||
# pkey = "0x7666";
|
||||
mailDomain = "oceanbox.io";
|
||||
nodeName = [
|
||||
"c0-[1-18] Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
|
||||
|
||||
@@ -69,7 +69,6 @@ in {
|
||||
};
|
||||
};
|
||||
|
||||
features.hpc.slurm.mungeUid = 991;
|
||||
features.mft.enable = lib.mkForce true;
|
||||
|
||||
features = {
|
||||
|
||||
@@ -46,8 +46,6 @@ in {
|
||||
};
|
||||
};
|
||||
|
||||
features.hpc.slurm.mungeUid = 994;
|
||||
|
||||
features = {
|
||||
host = {
|
||||
inherit address;
|
||||
|
||||
@@ -17,9 +17,8 @@ let
|
||||
|
||||
login = import ./login { inherit pkgs; };
|
||||
c0 = import ./c0 { inherit pkgs; };
|
||||
c0x = import ./c0x { inherit pkgs; };
|
||||
c1 = import ./c1 { inherit pkgs; };
|
||||
fs-work = import ./fs-work { inherit pkgs; };
|
||||
fs-backup = import ./fs-backup { inherit pkgs; };
|
||||
in
|
||||
{ inherit ekman-manage; frontend = ekman-manage; } // login // c0 // c0x // c1 // fs-work // fs-backup
|
||||
{ inherit ekman-manage; frontend = ekman-manage; } // login // c0 // c1 // fs-work // fs-backup
|
||||
|
||||
@@ -51,8 +51,8 @@ in
|
||||
hpc = {
|
||||
slurm.server = false;
|
||||
slurm.slurmrestd = false;
|
||||
manage = false;
|
||||
login = true;
|
||||
manageNode = false;
|
||||
loginNode = true;
|
||||
knem = false;
|
||||
};
|
||||
|
||||
|
||||
@@ -86,9 +86,9 @@ in {
|
||||
|
||||
hpc = {
|
||||
slurm.server = true;
|
||||
slurm.slurmrestd = false;
|
||||
slurm.mungeUid = 996;
|
||||
manage = true;
|
||||
slurm.slurmrestd = true;
|
||||
slurm.dbdServer = false;
|
||||
manageNode = true;
|
||||
};
|
||||
|
||||
k8s = {
|
||||
@@ -283,6 +283,18 @@ in {
|
||||
'';
|
||||
};
|
||||
|
||||
services.dnsmasq.enable = true;
|
||||
services.dnsmasq.settings = {
|
||||
domain = [ "cluster.local" ];
|
||||
server = [
|
||||
"/obx.hs/100.100.100.100" # headscale dns
|
||||
];
|
||||
address = [
|
||||
"/slurmctld.cluster.local/127.0.0.1"
|
||||
];
|
||||
srv-host = "_slurmctld._tcp.cluster.local,slurmctld.cluster.local,6817,0,5";
|
||||
};
|
||||
|
||||
# ssh-rsa is deprecated, but putty/winscp users use it
|
||||
services.openssh.extraConfig = ''
|
||||
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
|
||||
@@ -323,11 +335,25 @@ in {
|
||||
useRoutingFeatures = "both"; # for exit-node usage
|
||||
extraUpFlags = [
|
||||
"--login-server=https://headscale.svc.oceanbox.io"
|
||||
"--accept-dns=false"
|
||||
"--accept-dns=true" # see dnsmasq
|
||||
"--accept-routes=true"
|
||||
"--advertise-exit-node"
|
||||
"--advertise-routes=10.255.241.0/24"
|
||||
];
|
||||
};
|
||||
services.networkd-dispatcher = {
|
||||
enable = true;
|
||||
rules = {
|
||||
"tailscale-router" = {
|
||||
onState = [ "routable" ];
|
||||
script = ''
|
||||
#!${pkgs.runtimeShell}
|
||||
${pkgs.ethtool}/bin/ethtool -K enp65s0np0 rx-udp-gro-forwarding on rx-gro-list off
|
||||
exit 0
|
||||
'';
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
imports = [
|
||||
./hardware-configuration.nix
|
||||
|
||||
@@ -94,7 +94,7 @@ let
|
||||
};
|
||||
cachix.enable = false;
|
||||
monitoring.nodeExporter.enable = false;
|
||||
mft.enable = true; # Mellanox MFT
|
||||
mft.enable = false; # Mellanox MFT
|
||||
};
|
||||
|
||||
networking = {
|
||||
@@ -289,6 +289,18 @@ let
|
||||
permissions = "u+rs,g+rx,o+rx";
|
||||
};
|
||||
};
|
||||
|
||||
# Use nvd to get package diff before apply
|
||||
system.activationScripts.system-diff = {
|
||||
supportsDryActivation = true; # safe: only outputs to stdout
|
||||
text = ''
|
||||
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
|
||||
if [ -e /run/current-system ]; then
|
||||
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
|
||||
fi
|
||||
'';
|
||||
};
|
||||
|
||||
};
|
||||
|
||||
openssh-shosts = pkgs.openssh.overrideAttrs (attrs: {
|
||||
|
||||
@@ -205,25 +205,6 @@ in {
|
||||
# secret-key-files = /etc/nix/ekman.key
|
||||
'';
|
||||
|
||||
programs.msmtp = {
|
||||
enable = true;
|
||||
accounts = {
|
||||
default = {
|
||||
auth = false;
|
||||
tls = false;
|
||||
tls_starttls = false;
|
||||
port = 24;
|
||||
from = "rossby@oceanbox.io";
|
||||
host = "smtpgw.itpartner.no";
|
||||
# user = "utvikling";
|
||||
# password = "S0m3rp0m@de#21!";
|
||||
};
|
||||
};
|
||||
defaults = {
|
||||
aliases = "/etc/aliases";
|
||||
};
|
||||
};
|
||||
|
||||
services.prometheus.alertmanager.configuration.global = {
|
||||
smtp_smarthost = "smtpgw.itpartner.no";
|
||||
# smtp_auth_username = "utvikling";
|
||||
@@ -278,17 +259,6 @@ in {
|
||||
srv-host = "_slurmctld._tcp.cluster.local,slurmctld.cluster.local,6817,0,5";
|
||||
};
|
||||
|
||||
# Use nvd to get package diff before apply
|
||||
system.activationScripts.system-diff = {
|
||||
supportsDryActivation = true; # safe: only outputs to stdout
|
||||
text = ''
|
||||
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
|
||||
if [ -e /run/current-system ]; then
|
||||
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
|
||||
fi
|
||||
'';
|
||||
};
|
||||
|
||||
# ssh-rsa is deprecated, but putty/winscp users use it
|
||||
services.openssh.extraConfig = ''
|
||||
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
|
||||
|
||||
Reference in New Issue
Block a user