227 Commits
ekman ... main

Author SHA1 Message Date
320c15488a fix(gitea-runner): Use public ingress 2026-01-26 10:23:13 +01:00
6e57520557 hack: Use host dns and ca-cert for gitea runner 2026-01-24 21:04:39 +01:00
f19d7c2881 feat: Add gitea runner to hashmap
Also switches to podman
2026-01-23 13:12:48 +01:00
Administrator
454fe6e713 fix: make hel1 obx ingress point to internal hetzner lb 2025-12-28 14:27:42 +01:00
Administrator
410fabe78b fix: make hel1 obx ingress round-robin and internal 2025-12-28 11:04:33 +01:00
Administrator
a4ec5acb75 feat: add *.svc to obx domains 2025-12-28 10:56:55 +01:00
Administrator
53cf811713 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-12-14 11:46:09 +01:00
Administrator
591bfbfe15 fix: add obx wildcard domains 2025-12-14 11:45:56 +01:00
7b23c53032 Decrease tos/hashmap dashboard refresh rate
From 5m to 1m
2025-12-02 10:04:21 +01:00
bd8ab1b6b8 Switch to resolved on tos/hashmap 2025-12-02 10:01:21 +01:00
ec1c516d1b Simplify tos/hashmap 2025-12-02 09:57:46 +01:00
dfe73d6d71 npins: Add nixos-25.11 2025-12-02 09:57:04 +01:00
Simen Kirkvik
2866de1ce9 Add /etc/nix/nix.conf to gitlab nix runner
Add pipe-operators
2025-11-29 18:26:15 +01:00
Simen Kirkvik
2cc5b08a51 Revert "Remove gitlab-runner docker pull policies"
This reverts commit 027c20d9a6.
2025-11-29 18:23:08 +01:00
Simen Kirkvik
84d677d264 Place gl runner registration file another place
/run/secrets got nuked after reboot or whatever
2025-11-29 18:22:01 +01:00
Simen Kirkvik
695f4407cb Update request concurrency to 3 2025-11-29 18:21:48 +01:00
Simen Kirkvik
027c20d9a6 Remove gitlab-runner docker pull policies 2025-11-29 17:53:52 +01:00
042cace4f0 chore(nix): Set npins dir with direnv 2025-11-29 13:06:21 +01:00
24b586a4a0 fix: Restore configuration.nix for hashmap, add if-not-present pol for gitlab-runner 2025-11-28 20:31:43 +01:00
e9c0ce52b2 Merge branch 'simkir/coffee-kai' into 'main'
Add tos folder with coffee-kai@hashmap

See merge request oceanbox/platform!2
2025-11-28 17:01:19 +01:00
fdff8f3e48 Add .direnv to root .gitignore 2025-11-28 17:00:12 +01:00
f8a0269913 Add .direnv to .gitignore 2025-11-28 16:59:05 +01:00
ecf934e979 Oops, remove merge mistakes 2025-11-28 16:51:42 +01:00
d068384a44 Update new paths in tos/hive.nix 2025-11-28 16:45:33 +01:00
cd9d02a9d5 Create modules/gitlab-runner.nix 2025-11-28 16:45:04 +01:00
2216d589f0 Move tos/hosts/hashmap -> tos/hashmap
Also delete unused files
2025-11-28 16:44:25 +01:00
baf0547d7f Move tos packages to root 2025-11-28 16:43:31 +01:00
b4f6cd9b53 Move tos shell and sources to root 2025-11-28 16:42:56 +01:00
50dbe5183f Merge branch 'tmp/nixos-machines' into simkir/coffee-kai 2025-11-28 16:01:44 +01:00
05767f1976 Move hashmap to tos 2025-11-28 15:59:39 +01:00
b515338a54 Add skopeo to gitlab-runner 2025-11-28 14:01:31 +01:00
c7410e3978 Update gitlab runner concurrency 1 -> 16 2025-11-28 13:56:57 +01:00
a90bf694ad Disable printing and avahi 2025-11-28 13:56:46 +01:00
cf9ca8be08 Add htop and btop to coffee-kai 2025-11-28 12:44:40 +01:00
5c0e4e0388 Remove ssl path and add fancy PATH
From https://cobalt.rocks/posts/nix-gitlab/
2025-11-28 11:12:32 +01:00
cb41d50f12 Fix runner PATH 2025-11-28 11:06:15 +01:00
e21ae0780c Only start krdp on coffee-kai 2025-11-28 11:03:45 +01:00
351810aeaa Add gitlab-runner :^) 2025-11-28 11:03:36 +01:00
82a9b86531 Bump krdp 6.5.1 -> 6.5.3
Lamest releases ever. Just i18n.
2025-11-27 15:13:38 +01:00
a6617c9cce Bump sources 2025-11-27 15:13:32 +01:00
Administrator
710153d859 feat: add zellij and iperf to vtn/tos-gw 2025-11-22 17:48:17 +01:00
Administrator
dddd0fa88c fix: downgrade kernel to 5.10 on c0 on ekman 2025-11-21 13:24:30 +01:00
Administrator
422ac77c87 fix: fix and update knem 2025-11-21 13:24:02 +01:00
Administrator
cebc4e773d fix: fix fs-work to vtn routing 2025-11-20 10:11:49 +01:00
Administrator
b7c66e99fa feat: enable obx coredns on hel1-1 2025-11-18 14:31:51 +01:00
Administrator
6c05a71290 fix: misc hel1-1 tweaks 2025-11-18 14:21:19 +01:00
Administrator
0c25e3c660 fix: fix merge errors 2025-11-18 14:19:23 +01:00
Administrator
84e730b580 fix: update local hosts file 2025-11-18 13:34:07 +01:00
Administrator
644b9acf24 fix: update obx zone with proper cname:s 2025-11-18 13:31:02 +01:00
Administrator
65c5ce955f fix: add common hosts to toplevel hosts.nix 2025-11-18 13:13:44 +01:00
Administrator
665707969b fix: update fs-work for new raid-controller 2025-11-18 12:44:31 +01:00
Administrator
da994766e1 fix: ekman: add compute nodes to /etc/hosts 2025-11-18 12:25:40 +01:00
Administrator
131ffda9e8 feat: allow for multiple slurmctld hosts 2025-11-18 12:25:00 +01:00
Administrator
2634f392ca fix: add compute nodes to /etc/hosts, otherwise slurm becomes very unhappy! 2025-11-18 12:24:18 +01:00
Administrator
26f2b16bf9 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-11-17 17:57:46 +01:00
Administrator
a71473a548 fix: new pci risers have been installed 2025-11-17 17:56:58 +01:00
Administrator
6c405e5b91 refactor: reformat 2025-11-17 17:56:35 +01:00
Administrator
7c4d108026 fix: disable mDNS in resolved 2025-11-17 17:54:46 +01:00
ce121e7d2b fix: add talos firewalling (not used) 2025-11-14 20:00:41 +01:00
cafb434781 feat: add hetzner HCCM and CSI drivers 2025-11-13 20:58:05 +01:00
8405ce2d87 fix: update hetzner talos configs 2025-11-13 20:45:32 +01:00
81103a09c4 feat: add hetzner hel1 cluster 2025-11-13 17:46:58 +01:00
1a4318c075 feat: add rook cluster setup 2025-11-13 17:44:16 +01:00
Administrator
d36366fdb1 Merge branch 'main' of gitlab.com:oceanbox/nixos-clusters 2025-11-11 22:04:14 +01:00
Administrator
78fe8616e6 feat: add Hetzner hel1-gw-1 2025-11-11 22:04:01 +01:00
Administrator
7c3bc4ba47 feat: enable IPMI SOL on rossby nodes 2025-11-11 22:02:56 +01:00
Administrator
059fa13f3b feat: enable console on ttyS0 for IPMI SOL 2025-11-11 18:28:36 +01:00
Administrator
4a33496824 fix: add emkan.tos.obx to nameserver search 2025-11-11 13:26:39 +01:00
Administrator
2264ec5108 fix: rename nodes to match BMC ip 2025-11-11 13:25:16 +01:00
Administrator
0a3180fd2c feat: enable console on ttyS0 for IPMI SOL 2025-11-11 10:47:55 +01:00
Administrator
0f7f991cad feat: add routing to 244 via 241.99 to compute nodes 2025-11-10 18:48:25 +01:00
Administrator
92826cd6d2 fix: move kraken to /home 2025-11-10 17:51:00 +01:00
e1869e5c89 Spelling 2025-11-05 16:15:13 +01:00
ac652d366a Disable upower again 2025-11-05 16:14:14 +01:00
60b6b0e0cd Remove unused options 2025-11-05 16:12:22 +01:00
b38c934312 Set dashboard refresh to 5m 2025-11-05 16:12:22 +01:00
addee4268f fix: Build on Target 2025-11-04 15:09:52 +01:00
3aede769fd Add moritz to admin 2025-11-04 15:07:56 +01:00
2b95b82a6f Build krdp ourselves
But need to make hashmap build it itself
2025-11-04 14:56:33 +01:00
d8dbf1122b fix: Add additional options to autoUpgrade 2025-11-04 12:44:37 +01:00
02c407a579 Try disabling firewall to test RDP 2025-11-04 12:34:08 +01:00
da3fa53fe9 Change dashboard 2025-11-03 13:26:20 +01:00
78d3ce294d fix: change msmtp envelope from to tld for spf 2025-11-01 18:46:55 +01:00
Moritz Jörg
0c7f01a650 fix: Add quiet boot, automatic gc and disable flake registry 2025-10-31 10:34:37 +01:00
8a91ef2209 Add nixos-hardware
And import hashmap's cpu type comet-lake. So close to being coffee
lake!!
2025-10-31 09:43:09 +01:00
c3920d8015 Open RDP port 2025-10-31 09:07:22 +01:00
Moritz Jörg
6af717c807 fix: Format and add my ssh key 2025-10-30 20:34:52 +01:00
e64ecd2993 Add plasma and stuff 2025-10-30 16:44:30 +01:00
8d2b840bdf Initial commit 2025-10-30 14:47:01 +01:00
4b988488fb Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-30 11:25:21 +01:00
9af8c6e137 fix: add c0-[19-20] back in slurm 2025-10-30 11:25:17 +01:00
Jonas Juselius
c6b720d1ac feat: enable msmtp on ekman-manage 2025-10-30 11:24:19 +01:00
3b1f689273 fix: format 2025-10-29 11:30:25 +01:00
a845b237f9 fix: Use tmpl with generic 2025-10-29 11:29:50 +01:00
63e04f57db fix: Add tmpl 2025-10-29 11:27:40 +01:00
5e69d7adaf fix: Unflake nixos-anywhere install 2025-10-29 11:18:39 +01:00
c9aa712715 fix: Update old mailconfig 2025-10-29 11:01:24 +01:00
Jonas Juselius
85a315b1ec fix: remove lint 2025-10-29 10:32:27 +01:00
Jonas Juselius
73907361b6 fix: smtp fixes and tweaks 2025-10-29 10:32:15 +01:00
5328e5b645 feat: enable acme dns01 certs and msmtp relay via outlook.com 2025-10-29 09:53:42 +01:00
d94d4ab94c fix: revert slurm partitions and configure slurm mail 2025-10-29 09:51:57 +01:00
d7d45b0911 fix: use networking default dns domains 2025-10-29 09:51:25 +01:00
1671ea3ca1 fix: enable all users on gw-tos 2025-10-29 09:48:35 +01:00
aba775befe feat: enforce slurm accounting limits and enable mail 2025-10-29 09:47:44 +01:00
Jonas Juselius
752fe275f0 fix: fix ekman internal dns settings 2025-10-23 13:20:29 +02:00
Jonas Juselius
efb43acad1 fix: move firewall defs 2025-10-23 13:19:39 +02:00
Jonas Juselius
245435a508 feat: add external interface to tos-gw and fix firewalling 2025-10-23 13:18:52 +02:00
Jonas Juselius
31cea944f3 feat: make firewall piercing conditional 2025-10-23 13:17:18 +02:00
Jonas Juselius
50880b458b fix: update nixos module 2025-10-23 13:16:12 +02:00
cc4d50b87f fix: re-enable 239 routing on rossby-manage 2025-10-23 10:25:51 +02:00
537e4ba371 fix: fix faulty dns zone syntax (hallucination) 2025-10-23 10:25:04 +02:00
Jonas Juselius
34307093bb fix: fix tos-gw and vtn-gw dns 2025-10-23 08:01:55 +02:00
241454a2a1 fix: remove old and duplicate ssh-keys 2025-10-22 17:47:18 +02:00
530ff0aaf9 fix: add slurm-agent to the dns zone(s) 2025-10-22 17:45:58 +02:00
3ea15031a7 fix: use default route for vtn-tos and update dns 2025-10-21 14:57:33 +02:00
65ccb89fb9 fix: update vtn-gw to new server 2025-10-21 14:19:23 +02:00
Jonas Juselius
8b42df3e73 fix: fix resolved default route to avoid ib-ifaces 2025-10-21 11:29:43 +02:00
Jonas Juselius
a59452ff7a 2025-10-21 11:26:30 +02:00
Jonas Juselius
cd5e4e5c25 fix: fix slurm mysql params and rename 2025-10-21 11:25:54 +02:00
d307989e83 fix: use new slurm-accounting host and remove cruft 2025-10-20 10:15:25 +02:00
Jonas Juselius
5ff6f6d89b fix: disable dbdServer on rossby 2025-10-19 14:48:36 +02:00
Jonas Juselius
8e3272a4c1 fix: make tos-router a slurm accounting server 2025-10-19 14:47:13 +02:00
Jonas Juselius
657e289c78 fix: make extraHosts shared between all clusters and add slurm-accounting 2025-10-19 14:46:36 +02:00
Jonas Juselius
7d5a65e344 fix: fix independent slurmdbd setup 2025-10-19 14:45:32 +02:00
Jonas Juselius
4b80478ec8 fix: make dbdServer an independent config entity 2025-10-19 14:08:37 +02:00
ec3950de70 fix: enable dbdserver on rossby and disable auth/slurm and use munge instead 2025-10-19 09:56:53 +02:00
2b3dc17bf8 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-19 09:05:19 +02:00
21c6ca2fea fix: use new SlurmctldHost instead of ControlMachine 2025-10-19 09:05:05 +02:00
Jonas Juselius
2ecac6cf3c feat: add 100GbE to c0-18 and move to drifters partition 2025-10-16 18:31:56 +02:00
384d58ff84 fix: fix rossby tailscale settings 2025-10-14 13:36:25 +02:00
fc5a0923e1 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-14 13:15:33 +02:00
ee80f1ff9a fix: add cluster.obx domain to internal dns 2025-10-14 13:15:24 +02:00
Jonas Juselius
4b9a0f69c6 Merge branch 'main' of gitlab.com:oceanbox/nixos-clusters 2025-10-14 10:17:24 +02:00
Jonas Juselius
79bc969cfa fix: change relay to router due to too many typos 2025-10-14 10:17:10 +02:00
cb25301636 feat: tailscale tag rossby nodes 2025-10-13 08:25:39 +02:00
e985a8221b fix: disable resolved dns stub globally 2025-10-13 08:24:55 +02:00
7d82e3c44a Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-13 08:09:51 +02:00
Jonas Juselius
35c09d6c94 feat: tailscale tag tos-relay 2025-10-13 08:08:56 +02:00
Jonas Juselius
e5bcac154b fix: fix misc networking problems and typos 2025-10-13 08:08:08 +02:00
Jonas Juselius
4dd0b495ab feat: change ekman domain name from cluster.local to ekman.tos.obx 2025-10-13 08:07:27 +02:00
05e1d24048 fix: route tos traffic via 210 2025-10-11 18:31:52 +02:00
Jonas Juselius
389ad47745 feat: use resolved and networkd on ekman 2025-10-11 18:07:23 +02:00
e101653b7f fix: move tailscale relay to fs-work for now 2025-10-11 17:53:23 +02:00
Jonas Juselius
29e65c1598 fix: misc cleanups 2025-10-11 11:09:09 +02:00
Jonas Juselius
6611267c3b feat: enable knem 2025-10-11 11:08:57 +02:00
Jonas Juselius
75848674f2 fix: add initial ib hca uuids 2025-10-11 11:08:02 +02:00
Jonas Juselius
543eb35351 fix: remove old beegfs stuff 2025-10-11 11:06:34 +02:00
8d1d892f25 feat: route tos traffic via rossby (for now) 2025-10-10 18:27:30 +02:00
bb56b8f3a6 feat: set domain to rossby.vtn.obx and fix dns stuff 2025-10-10 18:26:36 +02:00
b398364cc7 fix: use dns instead of hosts 2025-10-10 18:25:48 +02:00
c314409d3b feat: make rossby-manage a subnet router and dns server 2025-10-10 18:24:38 +02:00
86f1e0d974 fix: remove superfluous dns names from etcd certs 2025-10-10 18:21:00 +02:00
1402192618 fix: refactor dns server settings to toplevel 2025-10-10 18:20:03 +02:00
1cfd24de44 fix: make etcd listen on all interfaces (localhost included) 2025-10-10 12:33:03 +02:00
7cc6cd0eba fix: all of rossby is now on networkd and resolved 2025-10-09 22:57:48 +02:00
670c75953d feat: convert rossby to networkd and resolved 2025-10-09 20:46:41 +02:00
88a72088f1 feat: add nixos-anywhere installer 2025-10-09 18:18:07 +02:00
bb959dae3a Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-09 18:04:49 +02:00
Jonas Juselius
455ad2076d fix: remove hosts in favor of coredns 2025-10-09 18:04:06 +02:00
Jonas Juselius
5c0e8cbb4c feat: enable systemd-resolved for all nodes by default 2025-10-09 18:00:55 +02:00
Jonas Juselius
3b78b4038e feat: generalize to tos and vtn relay (stub) 2025-10-09 17:54:18 +02:00
Jonas Juselius
63c852ef37 fix: add coredns and rename srv0 to tos-relay 2025-10-09 16:45:56 +02:00
b76f5e309a fix: fix ipoib address range 2025-10-08 20:03:38 +02:00
f51f30004e feat: convert rossby-manage to networkd and resolved and fix dnsmasq 2025-10-08 20:02:26 +02:00
Jonas Juselius
26cfbd7130 feat: convert rossby-manage to resolved and networkd 2025-10-08 18:52:51 +02:00
Jonas Juselius
4fa73f5a36 feat: use resolved and networkd on obx-tos-nfs0 2025-10-08 17:58:58 +02:00
Jonas Juselius
b1e713f113 feat: use resolved on obx-tos-nfs0 2025-10-08 15:38:02 +02:00
5271a33b7c fix(nfs0): Don't import self 2025-10-08 11:44:06 +02:00
Jonas Juselius
1ab635279a feat: add obx-tos-nfs0 2025-10-08 10:28:23 +02:00
Jonas Juselius
9e1deae147 fix: add jonas shellfish key 2025-10-02 20:44:59 +02:00
e1b15035cc fix: fix dnsmasq for the obx domain 2025-10-01 09:04:44 +02:00
981eda082c Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-10-01 09:02:31 +02:00
2bf922fb6d feat: set up the raid10 device on /work 2025-10-01 09:02:24 +02:00
Jonas Juselius
f6db232ca7 fix: move sudo settings from hpc module to actual nodes 2025-09-28 12:30:56 +02:00
Jonas Juselius
8c5ca68530 Merge branch 'main' of gitlab.com:oceanbox/nixos-clusters 2025-09-28 08:49:34 +02:00
Jonas Juselius
d481f6789c fix: disable tailscale dns 2025-09-28 08:49:26 +02:00
43547f45de fix: disable tailscale dns on login and manage 2025-09-28 08:48:35 +02:00
7be10b4457 fix: fix reuid-slurm.sh 2025-09-27 17:42:22 +02:00
Jonas Juselius
be13a10c8f fix: remove keys from git and share keys at toplevel 2025-09-27 16:13:16 +02:00
Jonas Juselius
799cb6cae1 fix: add 8.8.8.8 to ekman dnsmasq 2025-09-27 16:06:08 +02:00
eabb600641 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-09-27 16:01:07 +02:00
bc1ce00610 fix: add 8.8.8.8 to list of dnsmasq servers 2025-09-27 16:00:56 +02:00
Jonas Juselius
6d3d18bbe0 fix: remove extra msmtp stanza 2025-09-27 15:59:12 +02:00
Jonas Juselius
30d0180b59 feat: use central, off-site slurmdbd 2025-09-27 15:57:48 +02:00
Jonas Juselius
680330d569 fix: unify ekman c0 and c0x 2025-09-27 15:57:11 +02:00
e6cf1f6232 Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-09-27 14:04:11 +02:00
caab89f642 feat: use central slurmdbd, and misc fixes 2025-09-27 14:03:51 +02:00
a981f5e7ba fix: fix slurm and munge uid:s and gid:s 2025-09-27 13:42:38 +02:00
34c28e18bf fix: move bin to toplevel and add reuid-slurm.sh 2025-09-27 13:42:09 +02:00
Jonas Juselius
5dfc0743eb Merge branch 'main' of gitlab.com:oceanbox/nixos-clusters 2025-09-26 16:03:47 +02:00
398af17797 fix: slurm updates for rossby 2025-09-26 16:03:31 +02:00
998d551943 fix: use jwt, simplify slurmrestd, and make slurmdbd optional 2025-09-26 16:02:30 +02:00
Jonas Juselius
b2bf32dc73 fix: fix tailscale routing, etc. 2025-09-26 15:54:24 +02:00
Jonas Juselius
312b3906ab fix: disable raid on fs-backup (for now) 2025-09-26 15:53:46 +02:00
c9624213ed fix: fix slurmdbd setup 2025-09-25 15:52:30 +02:00
bcff2e6c2f Merge branch 'main' of gitlab.com:oceanbox/clusterfck 2025-09-25 12:38:44 +02:00
3c0a7f91f5 fix: slurm and stuff 2025-09-25 12:28:59 +02:00
46cf9da93f feat: allow tailnet access 2025-09-25 12:16:42 +02:00
2e919182d4 fix: remove /opt/singularity 2025-09-25 12:16:24 +02:00
ff3f897859 fix: rename features for better clarity 2025-09-25 12:15:51 +02:00
9b798444d1 feat: enable slurm jwt and remove slocket proxy 2025-09-25 12:15:24 +02:00
d2e27a7e87 feat: add slurm key generators and remove stale scripts 2025-09-25 12:08:05 +02:00
Jonas Juselius
d5cfcd2bf9 fix: reset systemd slurmrest socketConfig to true 2025-09-24 15:22:33 +02:00
Jonas Juselius
cf4ae97e1c feat: ekman on new cluster setup 2025-09-24 12:24:28 +02:00
Jonas Juselius
96f8215c52 feat: upgrade ekman to new cluster structure 2025-09-23 13:40:16 +02:00
Jonas Juselius
46473c88dd Merge branch 'main' of gitlab.com:oceanbox/nixos-clusters 2025-09-23 12:59:13 +02:00
fac7bdd62e fix: change feature manager to manage 2025-09-23 12:58:57 +02:00
Jonas Juselius
e38b0a2317 fix: change /frontend to /users 2025-09-23 12:30:18 +02:00
82a5328d7f feat: move /home and /opt to cephfs and tweak mounts 2025-09-23 12:11:53 +02:00
8894339216 fix: enable 100GbE and disable net mounts for now 2025-09-16 13:45:37 +02:00
f5679d39f9 fix: add missing nodes and disable net mounts for now 2025-09-16 13:43:00 +02:00
59db74b265 fix: misc fixes and tweaks 2025-09-16 13:42:25 +02:00
65aba0f69d fix: update Mellanox firmware tools 2025-09-16 13:41:01 +02:00
db794e6eea fix: fix extraSANs 2025-09-13 10:13:12 +02:00
4057a00143 fix: /work mount 2025-09-13 07:31:45 +02:00
14b5f07cc6 fix: move apiserver port to standard 6443 on (new) ekman 2025-09-13 07:11:04 +02:00
33a14d1509 fix: move IB network to 10.1.6.0/24 (get it? :) 2025-09-13 07:10:25 +02:00
3af5ba3fbd fix: add fs-work and etcd cluster 2025-09-13 07:03:17 +02:00
6767eb21e6 fix: move apiserver port to standard 6443 2025-09-13 07:00:49 +02:00
eb7b1f8130 fix: fix ekman part of botched merge 2025-09-12 14:38:36 +02:00
fcd136ed4e fix: partially fix a totally botched merge. 2025-09-12 14:32:42 +02:00
Jonas Juselius
c8814ec8d9 Merge remote-tracking branch 'origin/rossby' 2025-09-12 13:52:20 +02:00
f7f6eabb0f fix: misc fixes and tweaks 2025-09-12 13:49:29 +02:00
Jonas Juselius
69e47e60d0 fix: simplify ekman hive 2025-09-12 13:28:38 +02:00
Jonas Juselius
5c72112457 major: grand unified clusterfck (ekman not tested yet) 2025-09-12 13:12:36 +02:00
Jonas Juselius
ba5f1b8add wip: convert ekman to new cluster structure (not complete) 2025-09-12 12:53:56 +02:00
e0846164a7 major: initial rossby cluster and biggish refactor 2025-09-12 11:59:15 +02:00
294 changed files with 26899 additions and 1938 deletions

.envrc (new file, 12 lines)

@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# the shebang is ignored, but nice for editors
watch_file nix/sources.json
# Load .env file if it exists
dotenv_if_exists
# Set npins dir
export NPINS_DIRECTORY="nix"
# Activate development shell
use nix
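
The .envrc above points npins at nix/ (instead of the default npins/ directory) and then enters the project shell with use nix. A minimal shell.nix consuming those pins could look like the sketch below; the pin name nixos-25.11 is taken from the npins commit earlier in the log, while the package list is an assumption, not something shown in this diff.

let
  # npins writes nix/default.nix and nix/sources.json; NPINS_DIRECTORY above points the CLI at them
  sources = import ./nix;
  pkgs = import sources."nixos-25.11" { };
in
pkgs.mkShell {
  # tools one would expect in a deployment repo like this; assumed, not verified here
  packages = [ pkgs.npins pkgs.colmena ];
}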

.gitignore (vendored, 4 lines changed)

@@ -1,6 +1,10 @@
*.pem
*.csr
*.key
result
result-*
gcroots/
ca
configuration.nix
system
.direnv

bin/reuid-slurm.sh (new executable file, 28 lines)

@@ -0,0 +1,28 @@
#!/bin/sh
pwunconv
sed -i 's/slurm:!:[0-9]\+:[0-9]\+:/slurm:!:401:401:/;
s/munge:!:[0-9]\+:[0-9]\+:/munge:!:402:402:/' /etc/passwd
pwconv
grpunconv
sed -i 's/slurm:x:[0-9]\+:/slurm:x:401:/;
s/munge:x:[0-9]\+:/munge:x:402:/' /etc/group
grpconv
systemctl stop munged
rm -rf /run/munge
chown -R munge:munge /etc/munge /var/lib/munge
systemctl start munged
systemctl stop slurmd
rm -rf /run/slurm
chown -R slurm:slurm /etc/slurm
systemctl start slurmd
if [ -d /var/spool/slurmctld ]; then
systemctl stop slurmctld
rm -rf /run/slurm /run/slurmctld
systemctl start slurmctld
fi
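
reuid-slurm.sh rewrites /etc/passwd and /etc/group so the slurm and munge accounts get fixed ids (401/402), then re-chowns their config and state directories; munge in particular requires identical uids on every node. The same end state can be pinned declaratively on NixOS. A sketch, not taken from this repository (the repo's own modules expose a mungeUid option instead):

{
  # pin the service accounts to the ids the script above enforces
  users.users.slurm.uid = 401;
  users.groups.slurm.gid = 401;
  users.users.munge.uid = 402;
  users.groups.munge.gid = 402;
}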

bin/slurm-gen-jwt_hs256.sh (new executable file, 3 lines)

@@ -0,0 +1,3 @@
#!/bin/sh
dd if=/dev/random of=jwt_hs256.key bs=32 count=1
chmod 400 jwt_hs256.key

bin/slurm-gen-mungekey.sh (new executable file, 2 lines)

@@ -0,0 +1,2 @@
#!/bin/sh
mungekey -c -b 2048 -k munge.key
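
These two generators produce the shared secrets used by the slurm commits above: munge.key for the classic munge authentication path and jwt_hs256.key for the JWT support enabled in "feat: enable slurm jwt and remove slocket proxy". On the NixOS side the keys end up referenced from the munge and slurm configuration; a hedged sketch of that wiring (the file paths are assumptions, not taken from this diff):

{
  # munge reads its key from a path option in the NixOS module
  services.munge.password = "/etc/munge/munge.key";
  # slurm's alternative JWT auth is set through slurm.conf parameters
  services.slurm.extraConfig = ''
    AuthAltTypes=auth/jwt
    AuthAltParameters=jwt_key=/var/lib/slurm/jwt_hs256.key
  '';
}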

@@ -1,108 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
nodes = import ./nodes.nix;
compute = {
deployment.tags = [ "compute" "c0" ];
fileSystems = {
"/frontend" = {
device = "10.255.241.100:/home";
fsType = "nfs4";
options = [
"soft"
"defaults"
"noauto"
"x-systemd.automount"
];
};
};
systemd.automounts = [
{
where = "/frontend";
wantedBy = [ "default.target" ];
}
];
};
mkCompute = host:
let
hw = ./hw + "/${host.name}.nix";
in {
"${host.name}" = {
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
home = false;
opt = true;
work = true;
data = false;
backup = false;
ceph = false;
};
};
features = {
host = {
name = host.name;
address = host.address;
};
os.externalInterface = "enp33s0f0np0";
hpc.compute = true;
hpc.knem = true;
# k8s = { inherit etcdCluster; };
};
deployment.targetHost = host.address;
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
networking = {
hostName = host.name;
useDHCP = false;
interfaces.enp33s0f0np0 = {
useDHCP = false;
ipv4.addresses = [ {
address = host.address;
prefixLength = 24;
} ];
ipv4.routes = [ {
address = "10.255.242.0";
prefixLength = 24;
via = "10.255.241.100";
} ];
};
# interfaces."ibp1s0.7666" = {
interfaces."ibp1s0" = {
useDHCP = false;
ipv4.addresses = [ {
address = host.ipoib;
prefixLength = 24;
} ];
};
};
imports = [
hw
../cluster.nix
../mounts.nix
#./kernel.nix
];
}
// compute;
};
in builtins.foldl' (a: n: a // mkCompute n) {} nodes

@@ -1,11 +0,0 @@
with builtins;
let
nodes = genList (n: n + 1) 16; in
map (n: (
rec {
idx = 100 + n;
name = "c0-${toString n}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
pubkey = ./. + "/pubkeys/c0-${toString n}.pub";
})) nodes

@@ -1,108 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
nodes = import ./nodes.nix;
compute = {
deployment.tags = [ "compute" "c0" ];
fileSystems = {
"/frontend" = {
device = "10.255.241.100:/home";
fsType = "nfs4";
options = [
"soft"
"defaults"
"noauto"
"x-systemd.automount"
];
};
};
systemd.automounts = [
{
where = "/frontend";
wantedBy = [ "default.target" ];
}
];
};
mkCompute = host:
let
hw = ./hw + "/${host.name}.nix";
in {
"${host.name}" = {
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
home = false;
opt = true;
work = true;
data = true;
backup = true;
ceph = false;
};
};
features = {
host = {
name = host.name;
address = host.address;
};
os.externalInterface = "enp33s0f0np0";
hpc.compute = true;
hpc.knem = true;
# k8s = { inherit etcdCluster; };
};
deployment.targetHost = host.address;
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
networking = {
hostName = host.name;
useDHCP = false;
interfaces.enp33s0f3np3 = {
useDHCP = false;
ipv4.addresses = [ {
address = host.address;
prefixLength = 24;
} ];
ipv4.routes = [ {
address = "10.255.242.0";
prefixLength = 24;
via = "10.255.241.100";
} ];
};
# interfaces."ibp1s0.7666" = {
interfaces."ibp1s0" = {
useDHCP = false;
ipv4.addresses = [ {
address = host.ipoib;
prefixLength = 24;
} ];
};
};
imports = [
hw
../cluster.nix
../mounts.nix
# ./kernel.nix
];
}
// compute;
};
in builtins.foldl' (a: n: a // mkCompute n) {} nodes

@@ -1,48 +0,0 @@
{pkgs, lib, ...}:
let
kernel = pkgs.linuxPackages.kernel;
i40e =
pkgs.stdenv.mkDerivation rec {
name = "i40e-${version}-${kernel.version}";
version = "2.13.10";
src = pkgs.fetchFromGitHub {
owner = "dmarion";
repo = "i40e";
rev = "7228a7c3b362c3170baa2f9a9c6870a900e78dbd";
sha256 = "087kvq9wrc1iw6vig8cqcx7cb6346wx8qxzb85c3n8638vq1vrxr";
};
hardeningDisable = [ "pic" ];
configurePhase = ''
cd src
kernel_version=${kernel.modDirVersion}
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' Makefile
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' common.mk
export makeFlags="BUILD_KERNEL=$kernel_version"
'';
installPhase = ''
install -v -D -m 644 i40e.ko "$out/lib/modules/$kernel_version/kernel/drivers/net/i40e/i40e2.ko"
'';
dontStrip = true;
enableParallelBuilding = true;
meta = {
description = "Linux kernel drivers for Intel Ethernet adapters and LOMs (LAN On Motherboard)";
homepage = https://github.com/dmarion/i40e;
license = lib.licenses.gpl2;
};
};
in
{
# i40e2 = i40e;
boot.kernelPackages = pkgs.linuxKernel.packages.linux_5_10;
# overlay = self: super: {
# linuxPackages_5_4 = super.linuxPackages_5_4 // { inherit i40e; };
# };
}

@@ -1,11 +0,0 @@
with builtins;
let
nodes = genList (n: n + 17) 2; in
map (n: (
rec {
idx = 100 + n;
name = "c0-${toString n}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
pubkey = ./. + "/pubkeys/c0-${toString n}.pub";
})) nodes

@@ -1,119 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
nodes = import ./nodes.nix;
compute = {
deployment.tags = [ "compute" "c1" ];
systemd.automounts = [
{
where = "/frontend";
wantedBy = [ "default.target" ];
}
];
};
mkCompute = host:
let
hw = ./hw + "/${host.name}.nix";
in {
"${host.name}" = {
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = false;
gbe100.enable = true;
automount.enable = true;
home = true;
opt = true;
work = true;
data = false;
backup = false;
ceph = true;
};
};
features = {
host = {
name = host.name;
address = host.address;
};
os.externalInterface = "eno33np0";
hpc.compute = true;
# k8s = { inherit etcdCluster; };
};
deployment.targetHost = host.target;
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
# boot.kernel.sysctl = {
# "net.ipv4.tcp_timestamps" = 0;
# "net.ipv4.tcp_sack" = 1;
# "net.core.netdev_max_backlog" = 250000;
# "net.core.rmem_max" = 4194304;
# "net.core.wmem_max" = 4194304;
# "net.core.rmem_default" = 4194304;
# "net.core.wmem_default" = 4194304;
# "net.core.optmem_max" = 4194304;
# "net.ipv4.tcp_rmem" = "4096 87380 4194304";
# "net.ipv4.tcp_wmem" = "4096 65536 4194304";
# "net.ipv4.tcp_low_latency" = 1;
# "net.ipv4.tcp_adv_win_scale" = 1;
# };
networking = {
hostName = host.name;
useDHCP = false;
interfaces.eno33np0 = {
useDHCP = false;
ipv4.addresses = [ {
address = host.address;
prefixLength = 24;
} ];
ipv4.routes = [ {
address = "10.255.242.0";
prefixLength = 24;
via = "10.255.241.100";
} ];
};
# interfaces.ibp65s0 = {
# useDHCP = false;
# ipv4.addresses = [ {
# address = host.ipoib;
# prefixLength = 24;
# } ];
# };
interfaces.enp65s0np0 = {
useDHCP = false;
ipv4.addresses = [ {
address = host.gbe100;
prefixLength = 24;
} ];
};
# firewall.extraCommands =
# if host.name == "c1-1" then ''
# iptables -t nat -A POSTROUTING -d 10.255.244.0/24 -j MASQUERADE
# ''
# else "";
};
imports = [
hw
../cluster.nix
../mounts.nix
];
}
// compute;
};
in builtins.foldl' (a: n: a // mkCompute n) {} nodes

@@ -1,358 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
name = "ekman";
address = "10.255.241.100";
in
{
ekman = { config, pkgs, ... }: with pkgs; {
deployment.tags = [ "ekman" "login" ];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
home = false;
opt = false;
work = true;
data = true;
backup = true;
ceph = true;
};
};
features = {
host = {
name = "ekman";
address = "10.255.241.100";
};
myvnc.enable = false;
os = {
externalInterface = "enp33s0f0np0";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,sync,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,sync,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
hpc = {
slurm.server = false;
slurm.slurmrestd = false;
frontend = false;
login = true;
knem = false;
};
k8s = {
master.enable = false;
node.enable = true;
};
desktop.enable = false;
# server.enable = true;
monitoring = {
# server = {
# enable = false;
# scrapeHosts = [ "frontend" "nfs0" "nfs1" ] ++ (builtins.map (x: x.name) computeNodes);
# defaultAlertReceiver = {
# email_configs = [
# { to = "jonas.juselius@oceanbox.io"; }
# ];
# };
# pageAlertReceiver = {
# webhook_configs = [
# {
# url = "https://prometheus-msteams.k2.itpartner.no/ekman";
# http_config = {
# tls_config = { insecure_skip_verify = true; };
# };
# }
# ];
# };
# };
# webUI.enable = false;
# webUI.acmeEmail = "innovasjon@itpartner.no";
# webUI.allow = [
# "10.1.2.0/24"
# "172.19.254.0/24"
# "172.19.255.0/24"
# ];
infiniband-exporter = {
enable = true;
nameMap = ''
0x248a07030029d2fc "frontend"
0x248a07030029d104 "ekman"
0x5aa2e1fffe1edfca "fs-work"
0x1c34da0300787798 "fs-backup"
0xe8ebd3030024981e "c0-1"
0xe8ebd3030024a21a "c0-2"
0xe8ebd30300249a3a "c0-3"
0xe8ebd30300248b9e "c0-4"
0xe8ebd30300248b86 "c0-5"
0xe8ebd3030024998a "c0-6"
0xe8ebd30300248b8e "c0-7"
0xe8ebd3030024999e "c0-8"
0xe8ebd30300248fca "c0-9"
0xe8ebd3030024a216 "c0-10"
0xe8ebd30300248b96 "c0-11"
0xe8ebd30300248b9a "c0-12"
0xe8ebd303002495d2 "c0-13"
0xe8ebd303002495e2 "c0-14"
0xe8ebd30300248f42 "c0-15"
0xe8ebd303002495e6 "c0-16"
0xe8ebd3030024a2a2 "c0-17"
0xe8ebd3030024a2ae "c0-18"
'';
};
slurm-exporter = {
enable = true;
port = 6080;
};
};
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
# boot.kernelPackages = pkgs.linuxKernel.packages.linux_6_6;
services.flannel.iface = "enp33s0f3np3";
networking = {
useDHCP = false;
hostName = "ekman";
interfaces.enp33s0f3np3 = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.241.100";
prefixLength = 24;
} ];
# ipv4.routes = [
# {
# address = "10.255.244.0";
# prefixLength = 24;
# via = "10.255.241.99";
# }
# ];
};
interfaces."ibp65s0f0" = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.243.100";
prefixLength = 24;
} ];
};
interfaces."enp65s0f1np1" = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.244.100";
prefixLength = 24;
} ];
};
interfaces.enp33s0f0np0 = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.242.2";
prefixLength = 24;
} ];
ipv4.routes = [
# {
# address = "10.1.8.0";
# prefixLength = 24;
# via = "10.255.242.1";
# }
# {
# address = "10.1.30.0";
# prefixLength = 24;
# via = "10.255.242.1";
# }
];
};
defaultGateway = "10.255.242.1";
firewall = {
allowedTCPPorts = [ 4443 ];
extraCommands = ''
# needed for nodeport access on k1 and k2
# iptables -t nat -A POSTROUTING -s 10.255.241.0/24 ! -d 10.255.0.0/16 -j SNAT --to-source 10.255.242.2
iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
fileSystems = {
"/exports/home" = {
device = "/home";
options = [ "bind" ];
};
"/exports/opt/bin" = {
device = "/opt/bin";
options = [ "bind" ];
};
"/exports/opt/sif" = {
device = "/opt/sif";
options = [ "bind" ];
};
"/exports/opt/singularity" = {
device = "/opt/singularity";
options = [ "bind" ];
};
"/exports/nfs-provisioner" = {
device = "/vol/nfs-provisioner";
options = [ "bind" ];
};
"/frontend" = {
device = "/home";
options = [ "bind" ];
};
"/vol/local-storage/vol1" = {
device = "/vol/vol1";
options = [ "bind" ];
};
"/vol/local-storage/vol2" = {
device = "/vol/vol2";
options = [ "bind" ];
};
};
nix.extraOptions = ''
secret-key-files = /etc/nix/ekman.key
'';
# services.xserver = {
# enable = false;
# enableCtrlAltBackspace = true;
# layout = "us";
# xkbVariant = "altgr-intl";
# xkbOptions = "eurosign:e";
# displayManager = {
# gdm.enable = false;
# job.logToFile = true;
# };
# # desktopManager.xfce.enable = true;
# };
services.prometheus.alertmanager.configuration.global = {
smtp_smarthost = "smtpgw.itpartner.no";
# smtp_auth_username = "utvikling";
# smtp_auth_password = "S0m3rp0m@de#21!";
smtp_hello = "ekman.oceanbox.io";
smtp_from = "noreply@ekman.oceanbox.io";
};
# services.nginx = {
# virtualHosts = {
# "ds.matnoc.regnekraft.io" = {
# forceSSL = true;
# enableACME = true;
# serverAliases = [];
# locations."/" = {
# proxyPass = "http://localhost:9088";
# proxyWebsockets = false;
# extraConfig = ''
# allow 10.1.2.0/24;
# allow 172.19.254.0/24;
# allow 172.19.255.0/24;
# deny all;
# '';
# };
# };
# };
# };
# services.gitlab-runner = {
# enable = true;
# extraPackages = with pkgs; [
# singularity
# ];
# concurrent = 4;
# services = {
# sif = {
# registrationConfigFile = "/var/lib/secrets/gitlab-runner-registration";
# executor = "shell";
# tagList = [ "ekman" "sif" ];
# };
# };
# };
# security.sudo.extraConfig = ''
# gitlab-runner ALL=(ALL) NOPASSWD: /run/current-system/sw/bin/singularity
# '';
security.pam = {
services.sshd.googleAuthenticator.enable = true;
loginLimits = [
{
domain = "@users";
item = "rss";
type = "hard";
value = 16000000;
}
{
domain = "@users";
item = "cpu";
type = "hard";
value = 180;
}
];
};
system.activationScripts = {
home-permissions.text = ''
chmod 755 /home/olean
chmod 755 /home/frankgaa
chmod 755 /home/jonas
chmod 755 /home/mrtz
chmod 755 /home/avle
chmod 755 /home/stig
chmod 755 /home/bast
chmod 755 /home/simenlk
chmod 755 /work/kraken
'';
};
# ssh-rsa is deprecated, but putty/winscp users use it
services.openssh.extraConfig = ''
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
PubkeyAuthOptions verify-required
'';
environment.systemPackages = [];
virtualisation.docker.enable = pkgs.lib.mkForce true;
services.tailscale = {
enable = true;
authKeyFile = "/var/lib/secrets/tailscale.key";
useRoutingFeatures = "server"; # for exit-node usage
extraUpFlags = [
"--login-server=https://headscale.svc.oceanbox.io"
"--accept-dns"
"--advertise-exit-node"
"--advertise-routes=10.255.241.241.0/24"
"--advertise-tags=tag:ekman"
];
};
imports = [
./hardware-configuration.nix
../cluster.nix
../mounts.nix
../myvnc.nix
];
};
}

@@ -1,180 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
name = "fs-backup";
address = "10.255.241.80";
etcdCluster = import ../etcdCluster.nix;
in {
fs-backup = { config, pkgs, ... }: with pkgs; {
deployment.tags = [ "fs" "fs-backup" ];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
environment.systemPackages = with pkgs; [
rdma-core
hwloc
xfsprogs
];
boot.swraid = {
enable = true;
mdadmConf = ''
DEVICE partitions
ARRAY /dev/md/0 metadata=1.2 UUID=b743fdd4:5b339cc7:7c43f50f:3b81243e name=fs2:0
'';
};
systemd.services.restart-md0 = {
description = "restart /dev/md0";
wantedBy = [ "multi-user.target" ];
after = [ "sys-devices-virtual-block-md0.device" "-.mount" ];
before = [ "backup.mount" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
restart=0
${util-linux}/bin/lsblk -o MAJ:MIN -n /dev/md0 | grep -q "254:" || restart=1
if [ $restart = 1 ]; then
${mdadm}/bin/mdadm --stop /dev/md0
${mdadm}/bin/mdadm --assemble /dev/md0
sleep 1
fi
'';
};
cluster = {
k8sNode = true;
slurm = false;
mounts = {
rdma.enable = false;
automount.enable = true;
home = false;
opt = false;
work = false;
data = false;
backup = false;
ceph = false;
};
};
features.hpc.slurm.mungeUid = 996;
features = {
host = {
inherit address;
inherit name;
};
os = {
externalInterface = "eno1";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.244.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
k8s = {
enable = true;
node.enable = true;
master.enable = false;
inherit etcdCluster;
};
};
systemd.services.rc-local = {
description = "rc.local script";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
# if [ -e /sys/block/md126 ]; then
# echo "deadline" > /sys/block/md126/queue/scheduler
# # echo "4096" > /sys/block/md126/queue/nr_requests
# echo "4096" > /sys/block/md126/queue/read_ahead_kb
# echo "always" > /sys/kernel/mm/transparent_hugepage/enabled
# echo "always" > /sys/kernel/mm/transparent_hugepage/defrag
# fi
grep -q rdma /proc/fs/nfsd/portlist || echo "rdma 20049" > /proc/fs/nfsd/portlist
grep -q tcp /proc/fs/nfsd/portlist || echo "tcp 2049" > /proc/fs/nfsd/portlist
'';
};
boot.kernel.sysctl = {
"vm.dirty_background_ratio" = 5;
"vm.dirty_ratio" = 10;
"vm.vfs_cache_pressure" = 50;
"vm.min_free_kbytes" = 262144;
};
networking = {
hostName = name;
interfaces.eno1 = {
useDHCP = false;
ipv4.addresses = [ {
address = address;
prefixLength = 24;
} ];
ipv4.routes = [
{
address = "10.255.244.0";
prefixLength = 24;
via = "10.255.241.99";
}
];
};
interfaces.ibp59s0 = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.243.80";
prefixLength = 24;
} ];
};
firewall = {
allowedTCPPorts = [];
allowedUDPPorts = [];
extraCommands = ''
iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
services.rpcbind.enable = true;
fileSystems = {
"/exports/backup" = {
device = "/backup";
options = [ "bind" ];
};
"/exports/ekman" = {
device = "/backup/ekman-nfs";
options = [ "bind" ];
};
};
programs.singularity.enable = true;
imports = [
./hardware-configuration.nix
../cluster.nix
../mounts.nix
];
};
}

@@ -1,172 +0,0 @@
{ pkgs ? import <nixpkgs> {} }:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e9148dc1c30e02aae80cc52f68ceb37b772066f3.tar.gz";
# sha256 = "1ckzhh24mgz6jd1xhfgx0i9mijk6xjqxwsshnvq789xsavrmsc36";
# }) {};
# pkgs = import <nixpkgs> {};
etcdCluster = import ../etcdCluster.nix;
name = "fs-work";
address = "10.255.241.90";
in {
fs-work = { config, pkgs, ... }: with pkgs; {
deployment.tags = [ "fs" "fs-work" ];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
environment.systemPackages = with pkgs; [
rdma-core
hwloc
];
cluster = {
k8sNode = true;
slurm = false;
mounts = {
rdma.enable = true;
automount.enable = true;
home = true;
opt = false;
work = false;
data = false;
backup = false;
ceph = false;
};
};
features.hpc.slurm.mungeUid = 994;
features = {
host = {
inherit address;
inherit name;
};
os = {
externalInterface = "enp33s0f3np3";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.244.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
k8s = {
enable = true;
node.enable = true;
master.enable = false;
inherit etcdCluster;
};
};
systemd.services.rc-local = {
description = "rc.local script";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
# if [ -e /sys/block/md126 ]; then
# echo "deadline" > /sys/block/md126/queue/scheduler
# # echo "4096" > /sys/block/md126/queue/nr_requests
# echo "4096" > /sys/block/md126/queue/read_ahead_kb
# echo "always" > /sys/kernel/mm/transparent_hugepage/enabled
# echo "always" > /sys/kernel/mm/transparent_hugepage/defrag
# fi
grep -q rdma /proc/fs/nfsd/portlist || echo "rdma 20049" > /proc/fs/nfsd/portlist
grep -q tcp /proc/fs/nfsd/portlist || echo "tcp 2049" > /proc/fs/nfsd/portlist
'';
};
boot.kernel.sysctl = {
"vm.dirty_background_ratio" = 5;
"vm.dirty_ratio" = 10;
"vm.vfs_cache_pressure" = 50;
"vm.min_free_kbytes" = 262144;
};
networking = {
hostName = name;
interfaces.enp65s0f0np0 = {
useDHCP = false;
ipv4.addresses = [
{
address = address;
prefixLength = 24;
}
];
ipv4.routes = [
{
address = "10.255.242.0";
prefixLength = 24;
via = "10.255.241.100";
}
];
};
interfaces.enp1s0f1np1 = {
useDHCP = false;
ipv4.addresses = [
{
address = "10.255.244.90";
prefixLength = 24;
}
];
};
# interfaces."ibp65s0.7666" = {
# useDHCP = false;
# };
interfaces.ibp1s0f0 = {
useDHCP = false;
ipv4.addresses = [
{
address = "10.255.243.90";
prefixLength = 24;
}
];
};
firewall = {
allowedTCPPorts = [];
allowedUDPPorts = [];
extraCommands = ''
# iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
services.rpcbind.enable = true;
fileSystems = {
"/exports/work" = {
device = "/work";
options = [ "bind" ];
};
"/exports/opt" = {
device = "/opt";
options = [ "bind" ];
};
};
programs.singularity.enable = true;
imports = [
./hardware-configuration.nix
../cluster.nix
../mounts.nix
];
};
}

@@ -1,83 +0,0 @@
''
10.255.242.2 ekman-gw ekman-gw.compute.local ekman-gw.cluster.local
10.255.242.3 front-gw front-gw.compute.local front-gw.cluster.local
10.255.241.90 fs-work fs-work.compute.local
10.255.241.90 nfs1 nfs1.compute.local
10.255.241.90 fs1 fs1.compute.local
10.255.241.80 fs-backup fs-backup.compute.local
10.255.241.80 fs2 fs2.compute.local
10.255.241.100 ekman ekman.compute.local ekman.cluster.local
10.255.241.100 etcd0 etcd0.compute.local
10.255.241.80 etcd1 etcd1.compute.local
10.255.241.90 etcd2 etcd2.compute.local
10.255.241.99 frontend frontend.compute.local frontend.cluster.local
10.255.243.99 ibfrontend ibfrontend.compute.local ibfrontend.cluster.local
10.255.241.101 c0-1 c0-1.compute.local
10.255.241.102 c0-2 c0-2.compute.local
10.255.241.103 c0-3 c0-3.compute.local
10.255.241.104 c0-4 c0-4.compute.local
10.255.241.105 c0-5 c0-5.compute.local
10.255.241.106 c0-6 c0-6.compute.local
10.255.241.107 c0-7 c0-7.compute.local
10.255.241.108 c0-8 c0-8.compute.local
10.255.241.109 c0-9 c0-9.compute.local
10.255.241.110 c0-10 c0-10.compute.local
10.255.241.111 c0-11 c0-11.compute.local
10.255.241.112 c0-12 c0-12.compute.local
10.255.241.113 c0-13 c0-13.compute.local
10.255.241.114 c0-14 c0-14.compute.local
10.255.241.115 c0-15 c0-15.compute.local
10.255.241.116 c0-16 c0-16.compute.local
10.255.241.117 c0-17 c0-17.compute.local
10.255.241.118 c0-18 c0-18.compute.local
10.255.241.121 c1-1 c1-1.compute.local
10.255.241.122 c1-2 c1-2.compute.local
10.255.241.123 c1-3 c1-3.compute.local
10.255.241.124 c1-4 c1-4.compute.local
10.255.241.125 c1-5 c1-5.compute.local
10.255.241.126 c1-6 c1-6.compute.local
10.255.241.127 c1-7 c1-7.compute.local
10.255.241.128 c1-8 c1-8.compute.local
10.255.243.90 ibfs-work ibfs-work.compute.local
10.255.243.90 ibnfs1 ibnfs1.compute.local
10.255.243.90 ibfs1 ibfs1.compute.local
10.255.243.80 ibfs-backup ibfs-backup.compute.local
10.255.243.80 ibfs2 ibfs2.compute.local
10.255.243.100 ibekman ibekman.compute.local
10.255.243.100 ibetcd0 ibetcd0.compute.local
10.255.243.80 ibetcd1 ibetcd1.compute.local
10.255.243.90 ibetcd2 ibetcd2.compute.local
10.255.243.101 ib0-1 ib0-1.compute.local
10.255.243.102 ib0-2 ib0-2.compute.local
10.255.243.103 ib0-3 ib0-3.compute.local
10.255.243.104 ib0-4 ib0-4.compute.local
10.255.243.105 ib0-5 ib0-5.compute.local
10.255.243.106 ib0-6 ib0-6.compute.local
10.255.243.107 ib0-7 ib0-7.compute.local
10.255.243.108 ib0-8 ib0-8.compute.local
10.255.243.109 ib0-9 ib0-9.compute.local
10.255.243.110 ib0-10 ib0-10.compute.local
10.255.243.111 ib0-11 ib0-1.compute.local
10.255.243.112 ib0-12 ib0-12.compute.local
10.255.243.113 ib0-13 ib0-13.compute.local
10.255.243.114 ib0-14 ib0-14.compute.local
10.255.243.115 ib0-15 ib0-15.compute.local
10.255.243.116 ib0-16 ib0-16.compute.local
10.255.243.117 ib0-17 ib0-17.compute.local
10.255.243.118 ib0-18 ib0-18.compute.local
10.255.243.118 ib0-18 ib0-19.compute.local
10.255.243.121 ib1-1 ib1-1.compute.local
10.255.243.122 ib1-2 ib1-2.compute.local
10.255.243.123 ib1-3 ib1-3.compute.local
10.255.243.124 ib1-4 ib1-4.compute.local
10.255.243.125 ib1-5 ib1-5.compute.local
10.255.243.126 ib1-6 ib1-6.compute.local
10.255.243.127 ib1-7 ib1-7.compute.local
10.255.243.128 ib1-8 ib1-8.compute.local
''

@@ -1,2 +0,0 @@
(binary file contents omitted)

@@ -1,349 +0,0 @@
{ pkgs, ...}:
let
computeNodes =
import ./cluster/c0/nodes.nix ++
import ./cluster/c1/nodes.nix ++
[
rec {
idx = 100;
name = "ekman";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
pubkey = ./cluster/ekman/ekman.pub;
}
rec {
idx = 90;
name = "fs-work";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
pubkey = ./cluster/fs-work/fs-work.pub;
}
rec {
idx = 81;
name = "fs-backup";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
pubkey = ./cluster/fs-backup/fs-backup.pub;
}
];
etcdCluster = import ./cluster/etcdCluster.nix;
name = "frontend";
address = "10.255.241.99";
ipoib = "10.255.243.99";
in {
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
environment.systemPackages = with pkgs; [
rdma-core
hwloc
headscale
];
cluster = {
k8sNode = true;
compute = false;
slurm = true;
mounts = {
rdma.enable = true;
automount.enable = true;
home = true;
opt = true;
work = true;
data = true;
backup = true;
ceph = true;
};
};
features = {
desktop.enable = false;
cachix.enable = false;
host = {
inherit address;
inherit name;
};
myvnc.enable = false;
os = {
externalInterface = "eno1";
nfs.enable = false;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
hpc = {
slurm.server = true;
slurm.slurmrestd = false;
slurm.mungeUid = 996;
frontend = true;
};
k8s = {
master.enable = true;
node.enable = true;
nodes = computeNodes;
inherit etcdCluster;
};
monitoring = {
server = {
enable = false;
scrapeHosts = [
"frontend"
"ekman"
"fs-work"
"fs-backup"
] ++ (builtins.map (x: x.name) computeNodes);
defaultAlertReceiver = {
email_configs = [
{ to = "jonas.juselius@oceanbox.io"; }
];
};
pageAlertReceiver = {
webhook_configs = [
{
url = "https://prometheus-msteams.k2.itpartner.no/ekman";
http_config = {
tls_config = { insecure_skip_verify = true; };
};
}
];
};
};
webUI.enable = false;
webUI.acmeEmail = "innovasjon@itpartner.no";
webUI.allow = [
"10.1.2.0/24"
"172.19.254.0/24"
"172.19.255.0/24"
];
infiniband-exporter = {
enable = true;
nameMap = ''
0xe8ebd3030024a2c6 "ekman"
0x0c42a10300ddc4bc "frontend"
0xe8ebd3030024a2ae "fs-work"
0x1c34da0300787798 "fs-backup"
0xe8ebd3030024981e "c0-1"
0xe8ebd3030024a21a "c0-2"
0xe8ebd30300249a3a "c0-3"
0xe8ebd30300248b9e "c0-4"
0xe8ebd30300248b86 "c0-5"
0xe8ebd3030024998a "c0-6"
0xe8ebd30300248b8e "c0-7"
0xe8ebd3030024999e "c0-8"
0xe8ebd30300248fca "c0-9"
0xe8ebd3030024a216 "c0-10"
0xe8ebd30300248b96 "c0-11"
0xe8ebd30300248b9a "c0-12"
0xe8ebd303002495d2 "c0-13"
0xe8ebd303002495e2 "c0-14"
0xe8ebd30300248f42 "c0-15"
0xe8ebd303002495e6 "c0-16"
0x0c42a10300dbe7f4 "c1-1"
0x0c42a10300dbe7d8 "c1-2"
0x0c42a10300dbe800 "c1-3"
0x0c42a10300dbec80 "c1-4"
0x0c42a10300dbea50 "c1-5"
0x0c42a10300dbeb2c "c1-6"
0x0c42a10300dbe7fc "c1-7"
0x0c42a10300dbe5a0 "c1-8"
'';
};
slurm-exporter = {
enable = true;
port = 6080;
};
};
};
programs.singularity.enable = true;
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
services.kubernetes.apiserver.extraOpts = ''--oidc-client-id=9b6daef0-02fa-4574-8949-f7c1b5fccd15 --oidc-groups-claim=roles --oidc-issuer-url=https://login.microsoftonline.com/3f737008-e9a0-4485-9d27-40329d288089/v2.0'';
services.flannel.iface = "eno2";
networking = {
useDHCP = false;
hostName = name;
interfaces.eno1 = {
useDHCP = false;
ipv4.addresses = [ {
address = "10.255.242.3";
prefixLength = 24;
} ];
};
interfaces.eno2 = {
useDHCP = false;
ipv4.addresses = [
{
inherit address;
prefixLength = 24;
}
];
};
interfaces.ens2f1np1 = {
useDHCP = false;
ipv4.addresses = [
{
address = "10.255.244.99";
prefixLength = 24;
}
];
};
interfaces.ibs2f0 = {
useDHCP = false;
ipv4.addresses = [
{
address = ipoib;
prefixLength = 24;
}
];
};
defaultGateway = "10.255.242.1";
firewall = {
allowedTCPPorts = [ 4443 4725 ];
extraCommands = ''
# needed for nodeport access on k1 and k2
# iptables -t nat -A POSTROUTING -s 10.255.241.0/24 ! -d 10.255.0.0/16 -j SNAT --to-source 10.255.242.3
iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
# iptables -t nat -A POSTROUTING -s 100.64.0.0/24 -j MASQUERADE
# iptables -t nat -A POSTROUTING -d 10.255.244.0/24 -j MASQUERADE
# iptables -t nat -A POSTROUTING -s 10.255.244.0/24 -d 10.255.241.0/16 -j SNAT --to-source 10.255.241.99
# iptables -t nat -A POSTROUTING -s 10.255.244.0/24 -j SNAT --to-source 10.255.242.3
'';
};
};
fileSystems = {
"/exports/public" = {
device = "/srv/public";
options = [ "bind" ];
};
};
nix.extraOptions = ''
# secret-key-files = /etc/nix/ekman.key
'';
services.prometheus.alertmanager.configuration.global = {
smtp_smarthost = "smtpgw.itpartner.no";
# smtp_auth_username = "utvikling";
# smtp_auth_password = "S0m3rp0m@de#21!";
smtp_hello = "ekman.oceanbox.io";
smtp_from = "noreply@ekman.oceanbox.io";
};
security.pam = {
services.sshd.googleAuthenticator.enable = true;
loginLimits = [
{
domain = "@users";
item = "rss";
type = "hard";
value = 16000000;
}
{
domain = "@users";
item = "cpu";
type = "hard";
value = 180;
}
];
};
system.activationScripts = {
home-permissions.text = ''
chmod 755 /home/olean
chmod 755 /home/frankgaa
chmod 755 /home/jonas
chmod 755 /home/stig
chmod 755 /home/bast
chmod 755 /home/mrtz
chmod 755 /home/avle
chmod 755 /home/simenlk
chmod 755 /home/ole
'';
};
# Use nvd to get package diff before apply
system.activationScripts.system-diff = {
supportsDryActivation = true; # safe: only outputs to stdout
text = ''
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
if [ -e /run/current-system ]; then
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
fi
'';
};
# ssh-rsa is deprecated, but putty/winscp users use it
services.openssh.extraConfig = ''
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
PubkeyAuthOptions verify-required
'';
# boot.kernelPackages = pkgs.linuxKernel.packages.linux_6_1;
virtualisation.docker.enable = pkgs.lib.mkForce true;
# Configuration for the coordination server for a tailscale network run using headscale.
#
# We can set it up to provide several exit nodes through which traffic can be routed.
#
# Servers can join using this command:
# `tailscale up --login-server net.b0.itpartner.no --accept-dns=false --advertise-exit-node`
#
# with the following config:
#
# service.tailscale = {
# enable = true;
# useRoutingFeatures = "server"; # for exit-node usage
# };
#
# Clients can join using this command:
# `tailscale up --login-server net.b0.itpartner.no --accept-dns=false`
#
# services.headscale = {
# enable = true;
# address = "0.0.0.0";
# port = 4725; # hscl
# settings = import ./headscale/settings.nix;
# };
services.tailscale = {
enable = true;
authKeyFile = "/var/lib/secrets/tailscale.key";
useRoutingFeatures = "both"; # for exit-node usage
extraUpFlags = [
"--login-server=https://headscale.svc.oceanbox.io"
"--accept-dns=false"
"--advertise-exit-node"
];
};
imports = [
./hardware-configuration.nix
./cluster/cluster.nix
./cluster/mounts.nix
./cluster/myvnc.nix
];
}

dns.nix (new file, 27 lines)

@@ -0,0 +1,27 @@
{ pkgs, ... }:
let
zonefile = builtins.toFile "obx.zone" (builtins.readFile ./obx.zone);
in {
services.coredns = {
enable = true;
config = ''
. {
errors
log . {
class error
}
file ${zonefile} obx.
cache 30 {
disable success cluster.local
disable denial cluster.local
}
header {
response set ra
}
forward . 8.8.8.8 8.8.4.4
loop
reload
}
'';
};
}
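
This CoreDNS instance serves the obx. zone from the bundled obx.zone file and forwards everything else to 8.8.8.8/8.8.4.4, matching the commits that move the clusters from /etc/hosts and dnsmasq to coredns plus systemd-resolved. The client side of that change looks roughly like the sketch below; the nameserver address is an assumption, since the zone file itself is not part of this diff.

{
  services.resolved = {
    enable = true;
    # "fix: disable resolved dns stub globally"
    extraConfig = "DNSStubListener=no";
  };
  # point at the node running the CoreDNS config above (address assumed)
  networking.nameservers = [ "10.255.241.210" ];
  networking.search = [ "ekman.tos.obx" ];
}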

ekman/bin/adduser.sh (new executable file, 29 lines)

@@ -0,0 +1,29 @@
#!/usr/bin/env bash
id=$1
user=$2
name="$3"
grp="\ $user = { gid = "$id"; };"
read -d '' usr << EOF
\\\ $user = {\\\n\
description = "$name";\\\n\
home = "/home/$user";\\\n\
group = "$user";\\\n\
extraGroups = [\\\n\
"users"\\\n\
"docker"\\\n\
];\\\n\
uid = $id;\\\n\
isNormalUser = true;\\\n\
createHome = true;\\\n\
openssh.authorizedKeys.keys = [];\\\n\
};\\\n\
EOF
sed -i "
/# @grp@/i $grp
/# @usr@/i $usr
" stokes/users.nix

ekman/bin/initca.sh (new executable file, 18 lines)

@@ -0,0 +1,18 @@
#!/usr/bin/env bash
TOP="$(cd "$(dirname "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)/.."
if [ "x$1" = "x" ]; then
echo "usage: initca.sh {cluster}"
exit 1
fi
ca=$TOP/modules/initca.nix
cd $TOP/$1
echo "--- Preparing CA certificate"
nix-build -o ca $ca
echo "--- Safeguarding CA certificate"
nix-store --add-root $(pwd)/ca --indirect -r $(nix-instantiate --add-root $ca)

ekman/c0/default.nix (new file, 136 lines)

@@ -0,0 +1,136 @@
{
pkgs ? import <nixpkgs> { },
}:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
nodes = import ./nodes.nix;
has100GbE = [ "c0-18" ];
mkCompute =
host:
let
hw = ./hardware-configuration.d + "/${host.name}.nix";
in
{
"${host.name}" = {
deployment.tags = [
"compute"
"c0"
"cluster"
];
deployment.targetHost = host.address;
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
users = true;
opt = true;
work = true;
data = false;
ceph = false;
backup = false;
}
// (
if (builtins.elem host.name has100GbE) then
{
data = true;
ceph = true;
}
else
{ }
);
};
features = {
host = {
name = host.name;
address = host.address;
};
os.networkmanager.enable = false;
os.externalInterface = host.iface;
hpc.computeNode = true;
hpc.knem = true;
};
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
networking = {
useNetworkd = true;
hostName = host.name;
useDHCP = false;
};
# systemd.services.systemd-networkd-wait-online.enable = false;
systemd.network = {
wait-online.ignoredInterfaces = [ "ibp1s0" ];
# wait-online.enable = false;
networks = {
"40-${host.iface}" = {
matchConfig.Name = host.iface;
address = [ "${host.address}/24" ];
# networkConfig = {
# DNSDefaultRoute = true;
# };
routes = [
{ Gateway = "10.255.241.1"; }
{
Destination = "172.16.239.0/24";
Gateway = "10.255.241.210";
}
{
Destination = "10.255.242.0/24";
Gateway = "10.255.241.100";
}
];
};
"45-ibp1s0" = {
matchConfig.Name = "ibp1s0";
address = [ "${host.ipoib}/24" ];
};
}
// (
if (builtins.elem host.name has100GbE) then
{
"42-enp65s0f1np1" = {
DHCP = "no";
matchConfig.Name = "enp65s0f1np1 ";
address = [ "${host.gbe100}/24" ];
};
}
else
{ }
);
};
boot.kernelParams = [
"console=tty0"
"console=ttyS0,115200"
];
systemd.services."serial-getty@ttyS0" = {
enable = true;
wantedBy = [ "getty.target" ];
serviceConfig.Restart = "always";
};
imports = [
hw
../default.nix
../mounts.nix
./kernel.nix
];
};
};
in
builtins.foldl' (a: n: a // mkCompute n) { } nodes

View File

@@ -0,0 +1,12 @@
mv c0-8.nix c0-6.nix.tmp
mv c0-6.nix c0-7.nix.tmp
mv c0-7.nix c0-8.nix.tmp
mv c0-15.nix c0-10.nix.tmp
mv c0-14.nix c0-12.nix.tmp
mv c0-12.nix c0-14.nix.tmp
mv c0-10.nix c0-15.nix.tmp
for i in *.tmp; do
mv $i $(basename $i .tmp)
done

ekman/c0/kernel.nix Normal file

@@ -0,0 +1,58 @@
{ pkgs, lib, ... }:
let
kernel = pkgs.linuxPackages.kernel;
i40e = pkgs.stdenv.mkDerivation rec {
name = "i40e-${version}-${kernel.version}";
version = "2.13.10";
src = pkgs.fetchFromGitHub {
owner = "dmarion";
repo = "i40e";
rev = "7228a7c3b362c3170baa2f9a9c6870a900e78dbd";
sha256 = "087kvq9wrc1iw6vig8cqcx7cb6346wx8qxzb85c3n8638vq1vrxr";
};
hardeningDisable = [ "pic" ];
configurePhase = ''
cd src
kernel_version=${kernel.modDirVersion}
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' Makefile
sed -i -e 's|/lib/modules|${kernel.dev}/lib/modules|' common.mk
export makeFlags="BUILD_KERNEL=$kernel_version"
'';
installPhase = ''
install -v -D -m 644 i40e.ko "$out/lib/modules/$kernel_version/kernel/drivers/net/i40e/i40e2.ko"
'';
dontStrip = true;
enableParallelBuilding = true;
meta = {
description = "Linux kernel drivers for Intel Ethernet adapters and LOMs (LAN On Motherboard)";
homepage = "https://github.com/dmarion/i40e";
license = lib.licenses.gpl2;
};
};
in
{
# i40e2 = i40e;
boot.kernelPackages = pkgs.linuxPackagesFor (
pkgs.linux_5_10.override {
argsOverride = rec {
src = pkgs.fetchurl {
url = "mirror://kernel/linux/kernel/v5.x/linux-${version}.tar.xz";
sha256 = "1nzhl1y6avfl77fyqwjwy3qc6679gp92k0d3aarscrdydcml5yid";
};
version = "5.10.239";
modDirVersion = "5.10.239";
};
}
);
# boot.kernelPackages = pkgs.linuxKernel.packages.linux_5_10;
# overlay = self: super: {
# linuxPackages_5_4 = super.linuxPackages_5_4 // { inherit i40e; };
# };
}

ekman/c0/nodes.nix Normal file

@@ -0,0 +1,13 @@
with builtins;
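# One attribute set per compute node; for n = 1 this evaluates to
# { name = "c0-1"; idx = 101; address = "10.255.241.101";
#   ipoib = "10.255.243.101"; gbe100 = "10.255.244.101"; ... }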
let
nodes = genList (n: n + 1) 18; in
map (n: (
rec {
idx = 100 + n;
iface = if n > 16 then "enp33s0f3np3" else "enp33s0f0np0";
name = "c0-${toString n}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
gbe100 = "10.255.244.${toString idx}";
pubkey = ./. + "/ssh_host_key.d/c0-${toString n}.pub";
})) nodes

View File

@@ -0,0 +1,12 @@
mv c0-8.pub c0-6.pub.tmp
mv c0-6.pub c0-7.pub.tmp
mv c0-7.pub c0-8.pub.tmp
mv c0-15.pub c0-10.pub.tmp
mv c0-14.pub c0-12.pub.tmp
mv c0-12.pub c0-14.pub.tmp
mv c0-10.pub c0-15.pub.tmp
for i in *.tmp; do
mv $i $(basename $i .tmp)
done

ekman/c1/default.nix Normal file

@@ -0,0 +1,129 @@
{
pkgs ? import <nixpkgs> { },
}:
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
# pkgs = import <nixpkgs> {};
nodes = import ./nodes.nix;
mkCompute =
host:
let
hw = ./hardware-configuration.d + "/${host.name}.nix";
in
{
"${host.name}" = {
deployment.tags = [
"compute"
"c1"
"cluster"
];
deployment.targetHost = host.address;
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = false;
gbe100.enable = true;
automount.enable = true;
users = true;
opt = true;
work = true;
data = true;
ceph = true;
backup = false;
};
};
features = {
host = {
name = host.name;
address = host.address;
};
os.networkmanager.enable = false;
os.externalInterface = "eno33np0";
hpc.computeNode = true;
};
# services.udev.extraRules = ''
# KERNEL=="ibp1s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
networking = {
useNetworkd = true;
hostName = host.name;
useDHCP = false;
};
# systemd.services.systemd-networkd-wait-online.enable = false;
# systemd.network.wait-online.ignoredInterfaces = [ "ibp1s0" ];
systemd.network = {
# wait-online.enable = false;
networks = {
"40-${host.iface}" = {
DHCP = "no";
matchConfig.Name = host.iface;
address = [ "${host.address}/24" ];
networkConfig = {
DNSDefaultRoute = true;
};
routes = [
{ Gateway = "10.255.241.1"; }
{
Destination = "172.16.239.0/24";
Gateway = "10.255.241.210";
}
{
Destination = "10.255.242.0/24";
Gateway = "10.255.241.100";
}
];
};
"42-enp65s0np0" = {
DHCP = "no";
matchConfig.Name = "enp65s0np0 ";
address = [ "${host.gbe100}/24" ];
};
};
};
# boot.kernel.sysctl = {
# "net.ipv4.tcp_timestamps" = 0;
# "net.ipv4.tcp_sack" = 1;
# "net.core.netdev_max_backlog" = 250000;
# "net.core.rmem_max" = 4194304;
# "net.core.wmem_max" = 4194304;
# "net.core.rmem_default" = 4194304;
# "net.core.wmem_default" = 4194304;
# "net.core.optmem_max" = 4194304;
# "net.ipv4.tcp_rmem" = "4096 87380 4194304";
# "net.ipv4.tcp_wmem" = "4096 65536 4194304";
# "net.ipv4.tcp_low_latency" = 1;
# "net.ipv4.tcp_adv_win_scale" = 1;
# };
boot.kernelParams = [
"console=tty0"
"console=ttyS0,115200"
];
systemd.services."serial-getty@ttyS0" = {
enable = true;
wantedBy = [ "getty.target" ];
serviceConfig.Restart = "always";
};
imports = [
hw
../default.nix
../mounts.nix
];
};
};
in
builtins.foldl' (a: n: a // mkCompute n) { } nodes

View File

@@ -3,10 +3,11 @@ let nodes = genList (n: n + 1) 8; in
map (n: (
rec {
idx = 120 + n;
iface = "eno33np0";
name = "c1-${toString n}";
target = "10.255.241.${toString (idx + 100)}";
# target = "10.255.241.${toString (idx + 100)}";
address = "10.255.241.${toString idx}";
ipoib = "10.255.243.${toString idx}";
gbe100 = "10.255.244.${toString idx}";
pubkey = ./. + "/pubkeys/c1-${toString n}.pub";
pubkey = ./. + "/ssh_host_key.d/c1-${toString n}.pub";
})) nodes

View File

@@ -1,11 +1,14 @@
{ pkgs, lib, config, ... }:
{
pkgs,
lib,
config,
...
}:
with lib;
let
cfg = config.features.host;
computeNodes =
import ./c0/nodes.nix ++
import ./c1/nodes.nix;
computeNodes = import ./c0/nodes.nix ++ import ./c1/nodes.nix;
mkSANs = host: [
host.name
@@ -25,13 +28,32 @@ let
loader.systemd-boot.enable = true;
loader.efi.canTouchEfiVariables = true;
# kernelPackages = pkgs.linuxKernel.packages.linux_6_9;
kernelModules = [ "ib_umad" "ib_ipoib" "ceph" ];
kernelModules = [
"ib_umad"
"ib_ipoib"
"ceph"
];
# kernelParams = [
# "console=ttyS0,115200"
# "console=tty0"
# ];
};
services.resolved = {
enable = true;
dnssec = "false";
fallbackDns = [
"1.1.1.1"
"1.0.0.1"
];
# domains = [ "ekman.tos.obx" "~." ];
extraConfig = ''
# DNSStubListener conflicts with dnsmasq and kubernetes dns
DNSStubListener=no
MulticastDNS=no
LLMNR=no
'';
};
console = {
font = "Lat2-Terminus16";
keyMap = "us";
@@ -40,38 +62,19 @@ let
i18n = {
defaultLocale = "en_US.UTF-8";
extraLocaleSettings = {
LC_CTYPE="en_DK.UTF-8";
LC_TIME="en_DK.UTF-8";
LC_PAPER="en_DK.UTF-8";
LC_NAME="en_DK.UTF-8";
LC_ADDRESS="en_DK.UTF-8";
LC_TELEPHONE="en_DK.UTF-8";
LC_MEASUREMENT="en_DK.UTF-8";
LC_IDENTIFICATION="en_DK.UTF-8";
LC_CTYPE = "en_DK.UTF-8";
LC_TIME = "en_DK.UTF-8";
LC_PAPER = "en_DK.UTF-8";
LC_NAME = "en_DK.UTF-8";
LC_ADDRESS = "en_DK.UTF-8";
LC_TELEPHONE = "en_DK.UTF-8";
LC_MEASUREMENT = "en_DK.UTF-8";
LC_IDENTIFICATION = "en_DK.UTF-8";
};
};
time.timeZone = "Europe/Oslo";
programs.msmtp = {
enable = true;
accounts = {
default = {
auth = false;
tls = false;
tls_starttls = false;
port = 24;
from = "ekman@oceanbox.io";
host = "smtpgw.itpartner.no";
# user = "utvikling";
# password = "S0m3rp0m@de#21!";
};
};
defaults = {
aliases = "/etc/aliases";
};
};
environment.etc = {
"aliases" = {
text = ''
@@ -96,25 +99,28 @@ let
};
cachix.enable = false;
monitoring.nodeExporter.enable = false;
hpc.mft.enable = false; # Mellanox MFT
mft.enable = true; # Mellanox MFT
};
networking = {
domain = mkDefault "compute.local";
defaultGateway = mkDefault "10.255.241.1";
nameservers = mkDefault [ "8.8.8.8" ];
search = mkDefault [];
extraHosts = import ./hosts.nix;
useDHCP = false;
domain = "ekman.tos.obx";
nameservers = [
"10.255.241.210"
"10.255.241.99"
];
search = [ "ekman.tos.obx" ];
extraHosts = import ../hosts.nix + import ./hosts.nix;
firewall.extraCommands = ''
iptables -I INPUT -s 10.255.241.0/24 -j ACCEPT
iptables -I INPUT -s 10.255.243.0/24 -j ACCEPT
iptables -I INPUT -s 100.64.0.0/24 -j ACCEPT
'';
};
environment.variables = {};
environment.variables = { };
# systemd.services."serial-getty@ttyS0".enable = true;
# environment.etc."beegfs/connauthfile" = {
# source = ./connauthfile;
# mode = "0400";
@@ -133,60 +139,52 @@ let
};
system.activationScripts = {
kraken-permissions.text = ''
chmod 755 /work/kraken
'';
kraken-permissions.text = ''
chmod 755 /work/kraken
'';
};
};
slurm = {
features.hpc.slurm = {
enable = true;
client = true;
mungeKey = ./munge.key;
mungeUid = mkDefault 996; # hack
# pkey = "0x7666";
controlMachine = "frontend";
mailDomain = "oceanbox.io";
nodeName = [
"c0-[1-18] Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
"c1-[1-8] Sockets=1 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=100000 State=UNKNOWN"
"ekman Sockets=2 CoresPerSocket=64 ThreadsPerCore=2 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
"frontend Sockets=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=92000 TmpDisk=200000 State=UNKNOWN"
];
partitionName = [
"batch Nodes=c0-[1-18] Default=YES MaxTime=INFINITE State=UP"
"ekman Nodes=ekman MaxTime=1:00:00 State=UP"
"short Nodes=c1-[1-8] MaxTime=INFINITE State=UP"
"long Nodes=c1-[3-8] MaxTime=INFINITE State=UP"
"stats Nodes=c1-[7-8] MaxTime=INFINITE State=UP"
];
enable = true;
client = true;
clusterName = "ekman";
slurmctldHosts = [
"ekman-manage(10.255.241.99)"
];
dbdHost = "slurm-accounting";
mungeKey = ../munge.key;
jwtKey = ../jwt_hs256.key;
# slurmKey = ../slurm.key;
# pkey = "0x7666";
mailDomain = "oceanbox.io";
nodeName = [
"c0-[1-18] Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
"c1-[1-8] Sockets=1 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=100000 State=UNKNOWN"
"ekman Sockets=2 CoresPerSocket=64 ThreadsPerCore=1 RealMemory=256000 TmpDisk=500000 State=UNKNOWN"
"ekman-manage Sockets=2 CoresPerSocket=16 ThreadsPerCore=2 RealMemory=92000 TmpDisk=200000 State=UNKNOWN"
"fs-backup Sockets=2 CoresPerSocket=20 ThreadsPerCore=1 RealMemory=92000 TmpDisk=300000 State=UNKNOWN"
];
partitionName = [
"batch Nodes=c0-[1-17] Default=YES MaxTime=INFINITE State=UP"
"ekman Nodes=ekman MaxTime=1:00:00 State=UP"
"short Nodes=c1-[1-8],c0-18 MaxTime=INFINITE State=UP"
"long Nodes=c1-[2-8],c0-18 MaxTime=INFINITE State=UP"
"stats Nodes=c1-[7-8] MaxTime=INFINITE State=UP"
"test Nodes=fs-backup MaxTime=INFINITE State=UP"
];
};
};
compute = {
system.activationScripts = {
mkWorkDir.text = "mkdir -p /work";
mkWorkDir.text = "mkdir -p /work";
};
cluster.slurm = true;
features = {
hpc = {
enable = true;
beegfs = {
enable = false;
# beegfs = {
# work = {
# mgmtdHost = "ibbeegfs0";
# connAuthFile = "/etc/beegfs/connauthfile";
# client = {
# enable = false;
# mountPoint = "/work";
# };
# };
# };
};
};
};
};
@@ -200,32 +198,31 @@ let
initca = ./ca;
cidr = "10.100.0.0/16";
master = {
name = "frontend";
name = "ekman-manage";
address = "10.255.241.99";
# extraSANs = [
# "frontend.oceanbox.io"
# ];
};
ingressNodes = [
"ekman-manage.oceanbox.io"
"ekman.oceanbox.io"
];
fileserver = "fs-work";
charts = {
acme_email = "acme@oceanbox.io";
# grafana_smtp_user = "utvikling";
# grafana_smtp_password = "S0m3rp0m@de#21!";
};
};
};
system.activationScripts = {
copyCaKey.text = "cp ${./ca}/ca-key.pem /var/lib/kubernetes/secrets";
copyCaKey.text = "cp ${./ca}/ca-key.pem /var/lib/kubernetes/secrets";
};
# services.kubernetes.kubelet.extraSANs = mkSANs {
# name = cfg.name;
# address = cfg.address;
# };
services.kubernetes.kubelet.extraSANs = mkSANs {
name = cfg.name;
address = cfg.address;
};
};
shosts = {
@@ -236,63 +233,96 @@ let
text = ''
10.255.241.80
10.255.241.90
'' + builtins.foldl' (a: x: a + "${x.address}\n") "" computeNodes;
''
+ builtins.foldl' (a: x: a + "${x.address}\n") "" computeNodes;
};
programs.ssh.knownHosts = {
frontend = {
ekman-manage = {
hostNames = [
"frontend" "frontend.compute.local" "frontend.oceanbox.io" "10.255.241.99" "10.255.243.99"
"ekman-manage"
"ekman-manage.ekman.tos.obx"
"frontend.oceanbox.io"
"10.255.241.99"
"10.255.243.99"
];
publicKeyFile = ../frontend.pub;
publicKeyFile = ./manage/ssh_host_key.pub;
};
ekman = {
hostNames = [
"ekman" "ekman.compute.local" "ekman.oceanbox.io" "10.255.241.100" "10.255.243.100"
"ekman"
"ekman.ekman.tos.obx"
"ekman.oceanbox.io"
"10.255.241.100"
"10.255.243.100"
];
publicKeyFile = ./ekman/ekman.pub;
publicKeyFile = ./login/ssh_host_key.pub;
};
fs-work = {
hostNames = [
"fs-work" "fs-work.compute.local" "10.255.241.90" "10.255.243.90"
"fs-work"
"fs-work.ekman.tos.obx"
"10.255.241.90"
"10.255.243.90"
];
publicKeyFile = ./fs-work/fs-work.pub;
publicKeyFile = ./fs-work/ssh_host_key.pub;
};
fs-backup = {
hostNames = [
"fs-backup" "fs-backup.compute.local" "10.255.241.80" "10.255.243.80"
"fs-backup"
"fs-backup.ekman.tos.obx"
"10.255.241.80"
"10.255.243.80"
];
publicKeyFile = ./fs-backup/fs-backup.pub;
publicKeyFile = ./fs-backup/ssh_host_key.pub;
};
} // builtins.foldl' (a: x:
let n = toString x.idx;
in a // {
"${x.name}" = {
hostNames = [
"${x.name}"
"${x.name}.compute.local"
"10.255.241.${n}"
"10.255.243.${n}"
];
publicKeyFile = x.pubkey;
};
}) {} computeNodes;
}
// builtins.foldl' (
a: x:
let
n = toString x.idx;
in
a
// {
"${x.name}" = {
hostNames = [
"${x.name}"
"${x.name}.ekman.tos.obx"
"10.255.241.${n}"
"10.255.243.${n}"
];
publicKeyFile = x.pubkey;
};
}
) { } computeNodes;
environment.systemPackages = [
openssh-shosts
pkgs.inotify-tools
pkgs.ceph
pkgs.ceph-client
openssh-shosts
pkgs.inotify-tools
pkgs.ceph
pkgs.ceph-client
];
security.wrappers = {
ssh-keysign = {
source = "${openssh-shosts}/libexec/ssh-keysign";
owner = "root";
group = "root";
permissions = "u+rs,g+rx,o+rx";
};
ssh-keysign = {
source = "${openssh-shosts}/libexec/ssh-keysign";
owner = "root";
group = "root";
permissions = "u+rs,g+rx,o+rx";
};
};
# Use nvd to get package diff before apply
system.activationScripts.system-diff = {
supportsDryActivation = true; # safe: only outputs to stdout
text = ''
export PATH="${pkgs.lib.makeBinPath [ pkgs.nixVersions.latest ]}:$PATH"
if [ -e /run/current-system ]; then
${pkgs.lib.getExe pkgs.nvd} diff '/run/current-system' "$systemConfig" || true
fi
'';
};
};
openssh-shosts = pkgs.openssh.overrideAttrs (attrs: {
@@ -300,7 +330,8 @@ let
doCheck = false; # the tests take hours
});
in {
in
{
options.cluster = {
compute = mkEnableOption "Enable compute node configs";
};
@@ -324,7 +355,6 @@ in {
imports = [
../modules
../nixos
./users.nix
../users.nix
];
}

ekman/fs-backup/default.nix Normal file

@@ -0,0 +1,235 @@
{
pkgs ? import <nixpkgs> { },
}:
let
name = "fs-backup";
address = "10.255.241.80";
etcdCluster = import ../etcdCluster.nix;
in
{
fs-backup =
{ config, pkgs, ... }:
with pkgs;
{
deployment.tags = [
"fs"
"fs-backup"
];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
environment.systemPackages = with pkgs; [
rdma-core
hwloc
xfsprogs
];
cluster = {
k8sNode = true;
slurm = true;
mounts = {
rdma.enable = false;
automount.enable = true;
users = true;
opt = true;
work = true;
data = true;
ceph = true;
backup = false;
};
};
features = {
host = {
inherit address;
inherit name;
};
os = {
networkmanager.enable = false;
externalInterface = "eno1";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.244.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
k8s = {
enable = true;
node.enable = true;
master.enable = false;
inherit etcdCluster;
};
};
systemd.services.rc-local = {
description = "rc.local script";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
# if [ -e /sys/block/md126 ]; then
# echo "deadline" > /sys/block/md126/queue/scheduler
# # echo "4096" > /sys/block/md126/queue/nr_requests
# echo "4096" > /sys/block/md126/queue/read_ahead_kb
# echo "always" > /sys/kernel/mm/transparent_hugepage/enabled
# echo "always" > /sys/kernel/mm/transparent_hugepage/defrag
# fi
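# make sure nfsd listens on RDMA (20049) and plain TCP (2049)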
grep -q rdma /proc/fs/nfsd/portlist || echo "rdma 20049" > /proc/fs/nfsd/portlist
grep -q tcp /proc/fs/nfsd/portlist || echo "tcp 2049" > /proc/fs/nfsd/portlist
'';
};
boot.kernel.sysctl = {
"vm.dirty_background_ratio" = 5;
"vm.dirty_ratio" = 10;
"vm.vfs_cache_pressure" = 50;
"vm.min_free_kbytes" = 262144;
};
networking = {
useNetworkd = true;
useDHCP = false;
hostName = name;
firewall = {
allowedTCPPorts = [ ];
allowedUDPPorts = [ ];
extraCommands = ''
# iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
systemd.network = {
networks."40-eno1" = {
DHCP = "no";
matchConfig.Name = "eno1";
address = [ "${address}/24" ];
networkConfig = {
DNSDefaultRoute = true;
};
routes = [
{ Gateway = "10.255.241.1"; }
{
Destination = "10.255.242.0/24";
Gateway = "10.255.241.100";
}
{
Destination = "172.16.239.0/24";
Gateway = "10.255.241.210";
}
];
};
networks."40-enp59s0np0" = {
DHCP = "no";
matchConfig.Name = "enp59s0np0";
address = [ "10.255.244.80/24" ];
};
};
services.rpcbind.enable = true;
fileSystems = {
"/exports/backup" = {
device = "/backup";
options = [ "bind" ];
};
"/exports/ekman" = {
device = "/backup/ekman-nfs";
options = [ "bind" ];
};
};
programs.singularity.enable = true;
boot.swraid = {
enable = true;
mdadmConf = ''
DEVICE partitions
ARRAY /dev/md/0 metadata=1.2 UUID=b743fdd4:5b339cc7:7c43f50f:3b81243e name=fs2:0
'';
};
systemd.services.restart-md0 = {
description = "restart /dev/md0";
wantedBy = [ "multi-user.target" ];
after = [
"sys-devices-virtual-block-md0.device"
"-.mount"
];
before = [ "backup.mount" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
restart=0
${util-linux}/bin/lsblk -o MAJ:MIN -n /dev/md0 | grep -q "254:" || restart=1
if [ $restart = 1 ]; then
${mdadm}/bin/mdadm --stop /dev/md0
${mdadm}/bin/mdadm --assemble /dev/md0
sleep 1
fi
'';
};
#services.tailscale = {
# enable = true;
# authKeyFile = "/var/lib/secrets/tailscale.key";
# useRoutingFeatures = "both";
# extraUpFlags = [
# "--login-server=https://headscale.svc.oceanbox.io"
# "--accept-dns=true"
# "--accept-routes=true"
# "--snat-subnet-routes=true"
# "--advertise-routes=10.255.241.0/24"
# ];
#};
#services.networkd-dispatcher = {
# enable = true;
# rules = {
# "tailscale-router" = {
# onState = [ "routable" ];
# script = ''
# #!${pkgs.runtimeShell}
# ${pkgs.ethtool}/bin/ethtool -K eno1 rx-udp-gro-forwarding on
# ${pkgs.ethtool}/bin/ethtool -K eno1 rx-gro-list off
# ${pkgs.ethtool}/bin/ethtool -K eno1 tx-udp-segmentation on
# exit 0
# '';
# };
# };
#};
boot.kernelParams = [
"console=tty0"
"console=ttyS0,115200"
];
systemd.services."serial-getty@ttyS0" = {
enable = true;
wantedBy = [ "getty.target" ];
serviceConfig.Restart = "always";
};
imports = [
./hardware-configuration.nix
../default.nix
../mounts.nix
];
};
}

View File

@@ -4,25 +4,32 @@
{ config, lib, pkgs, modulesPath, ... }:
{
imports =
[ (modulesPath + "/installer/scan/not-detected.nix")
];
imports = [ (modulesPath + "/installer/scan/not-detected.nix") ];
boot.initrd.availableKernelModules = [ "ahci" "xhci_pci" "megaraid_sas" "mpt3sas" "usbhid" "usb_storage" "sd_mod" "sr_mod" ];
boot.initrd.availableKernelModules = [
"ahci"
"xhci_pci"
"megaraid_sas"
"mpt3sas"
"usbhid"
"usb_storage"
"sd_mod"
"sr_mod"
];
boot.initrd.kernelModules = [ "dm-snapshot" ];
boot.kernelModules = [ ];
boot.extraModulePackages = [ ];
fileSystems."/" =
{ device = "/dev/disk/by-uuid/19b7e607-b138-442a-9026-3ae1092046c9";
fsType = "ext4";
};
fileSystems."/" = {
device = "/dev/disk/by-uuid/19b7e607-b138-442a-9026-3ae1092046c9";
fsType = "ext4";
};
fileSystems."/backup" =
{ device = "/dev/vg1/data";
fsType = "xfs";
options = [ "ro" "noauto" ];
};
# fileSystems."/backup" =
# { device = "/dev/vg1/data";
# fsType = "xfs";
# options = [ "ro" "noauto" ];
# };
swapDevices = [ ];
@@ -37,5 +44,6 @@
# networking.interfaces.eno4.useDHCP = lib.mkDefault true;
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
hardware.cpu.intel.updateMicrocode = lib.mkDefault config.hardware.enableRedistributableFirmware;
hardware.cpu.intel.updateMicrocode =
lib.mkDefault config.hardware.enableRedistributableFirmware;
}

ekman/fs-work/default.nix Normal file

@@ -0,0 +1,186 @@
{
pkgs ? import <nixpkgs> { },
}:
let
name = "fs-work";
address = "10.255.241.90";
etcdCluster = import ../etcdCluster.nix;
in
{
fs-work =
{ config, pkgs, ... }:
with pkgs;
{
deployment.tags = [
"fs"
"fs-backup"
];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
environment.systemPackages = with pkgs; [
rdma-core
hwloc
xfsprogs
];
cluster = {
k8sNode = true;
slurm = false;
mounts = {
rdma.enable = true;
automount.enable = true;
users = true;
opt = true;
work = false;
data = true;
ceph = true;
backup = false;
};
};
features = {
host = {
inherit address;
inherit name;
};
os = {
networkmanager.enable = false;
externalInterface = "enp33s0f3np3";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.244.0/24(insecure,rw,async,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
k8s = {
enable = true;
node.enable = true;
master.enable = false;
inherit etcdCluster;
};
};
systemd.services.rc-local = {
description = "rc.local script";
wantedBy = [ "multi-user.target" ];
after = [ "network.target" ];
path = [ "/run/current-system/sw/" ];
serviceConfig = {
Type = "oneshot";
};
script = ''
# if [ -e /sys/block/md126 ]; then
# echo "deadline" > /sys/block/md126/queue/scheduler
# # echo "4096" > /sys/block/md126/queue/nr_requests
# echo "4096" > /sys/block/md126/queue/read_ahead_kb
# echo "always" > /sys/kernel/mm/transparent_hugepage/enabled
# echo "always" > /sys/kernel/mm/transparent_hugepage/defrag
# fi
grep -q rdma /proc/fs/nfsd/portlist || echo "rdma 20049" > /proc/fs/nfsd/portlist
grep -q tcp /proc/fs/nfsd/portlist || echo "tcp 2049" > /proc/fs/nfsd/portlist
'';
};
boot.kernel.sysctl = {
"vm.dirty_background_ratio" = 5;
"vm.dirty_ratio" = 10;
"vm.vfs_cache_pressure" = 50;
"vm.min_free_kbytes" = 262144;
};
networking = {
useNetworkd = true;
useDHCP = false;
hostName = name;
firewall = {
allowedTCPPorts = [ ];
allowedUDPPorts = [ ];
extraCommands = ''
# iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
systemd.network = {
networks."40-enp65s0f0np0" = {
DHCP = "no";
matchConfig.Name = "enp65s0f0np0";
address = [ "${address}/24" ];
networkConfig = {
DNSDefaultRoute = true;
};
routes = [
{ Gateway = "10.255.241.1"; }
{
Destination = "10.255.242.0/24";
Gateway = "10.255.241.100";
}
{
Destination = "172.16.239.0/24";
Gateway = "10.255.241.210";
}
];
};
networks."40-enp1s0f1np1" = {
DHCP = "no";
matchConfig.Name = "enp1s0f1np1";
address = [ "10.255.244.90/24" ];
};
networks."42-ibp1s0f0" = {
DHCP = "no";
matchConfig.Name = "ibp1s0f0 ";
address = [ "10.255.243.90/24" ];
};
};
services.rpcbind.enable = true;
fileSystems = {
"/exports/work" = {
device = "/work";
options = [ "bind" ];
};
"/exports/opt" = {
device = "/opt";
options = [ "bind" ];
};
};
programs.singularity.enable = true;
security.sudo.extraConfig = ''
%sif ALL=(ALL) NOPASSWD: /run/current-system/sw/bin/singularity
%admin ALL=(admin) NOPASSWD: ALL
'';
boot.kernelParams = [
"console=tty0"
"console=ttyS0,115200"
];
systemd.services."serial-getty@ttyS0" = {
enable = true;
wantedBy = [ "getty.target" ];
serviceConfig.Restart = "always";
};
imports = [
./hardware-configuration.nix
../default.nix
../mounts.nix
];
};
}

ekman/hive.nix Normal file

@@ -0,0 +1,24 @@
let
# Pin the deployment package-set to a specific version of nixpkgs
# pkgs = import (builtins.fetchTarball {
# url = "https://github.com/NixOS/nixpkgs/archive/e6377ff35544226392b49fa2cf05590f9f0c4b43.tar.gz";
# sha256 = "1fra9wwy5gvj5ibayqkzqpwdf715bggc0qbmrfch4fghwvl5m70l";
# }) {};
pkgs = import <nixpkgs> {};
ekman-manage = {
deployment = {
tags = [ "manage" "ekman" ];
allowLocalDeployment = true;
targetHost = null;
};
imports = [ ./manage ];
};
login = import ./login { inherit pkgs; };
c0 = import ./c0 { inherit pkgs; };
c1 = import ./c1 { inherit pkgs; };
fs-work = import ./fs-work { inherit pkgs; };
fs-backup = import ./fs-backup { inherit pkgs; };
in
{ inherit ekman-manage; frontend = ekman-manage; } // login // c0 // c1 // fs-work // fs-backup
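
The deployment attributes used throughout (deployment.targetHost, deployment.tags, allowLocalDeployment) follow the colmena hive format, so rolling out changes from this hive presumably looks something like the sketch below; the exact invocation and node selection depend on the local setup.

# deploy every node tagged "compute" (hypothetical invocation)
colmena apply --on @compute
# build and activate the manage node on the machine itself
colmena apply-local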

ekman/hosts.nix Normal file

@@ -0,0 +1,32 @@
''
10.255.241.99 ekman-manage
10.255.241.100 ekman-login
10.255.241.100 ekman
10.255.241.101 c0-1
10.255.241.102 c0-2
10.255.241.103 c0-3
10.255.241.104 c0-4
10.255.241.105 c0-5
10.255.241.106 c0-6
10.255.241.107 c0-7
10.255.241.108 c0-8
10.255.241.109 c0-9
10.255.241.110 c0-10
10.255.241.111 c0-11
10.255.241.112 c0-12
10.255.241.113 c0-13
10.255.241.114 c0-14
10.255.241.115 c0-15
10.255.241.116 c0-16
10.255.241.117 c0-17
10.255.241.118 c0-18
10.255.241.121 c1-1
10.255.241.122 c1-2
10.255.241.123 c1-3
10.255.241.124 c1-4
10.255.241.125 c1-5
10.255.241.126 c1-6
10.255.241.127 c1-7
10.255.241.128 c1-8
''

ekman/login/default.nix Normal file

@@ -0,0 +1,337 @@
{
pkgs ? import <nixpkgs> { },
}:
let
name = "ekman";
address = "10.255.241.100";
in
{
ekman-login =
{ config, pkgs, ... }:
with pkgs;
{
deployment.tags = [
"login"
"cluster"
];
deployment.targetHost = address;
system.autoUpgrade.enable = lib.mkForce false;
systemd.targets = {
sleep.enable = false;
suspend.enable = false;
hibernate.enable = false;
hybrid-sleep.enable = false;
};
cluster = {
compute = true;
k8sNode = true;
mounts = {
rdma.enable = true;
automount.enable = true;
users = false;
opt = false;
work = true;
data = true;
ceph = true;
backup = false;
};
};
features = {
host = {
inherit name;
inherit address;
};
myvnc.enable = false;
os = {
networkmanager.enable = false;
externalInterface = "enp33s0f0np0";
nfs.enable = true;
nfs.exports = ''
/exports 10.255.241.0/24(insecure,rw,sync,no_subtree_check,crossmnt,fsid=0,no_root_squash)
/exports 10.255.243.0/24(insecure,rw,sync,no_subtree_check,crossmnt,fsid=0,no_root_squash)
'';
};
hpc = {
slurm.server = false;
slurm.slurmrestd = false;
manageNode = false;
loginNode = true;
knem = false;
};
k8s = {
master.enable = false;
node.enable = true;
};
desktop.enable = false;
# server.enable = true;
monitoring = {
# server = {
# enable = false;
# scrapeHosts = [ "frontend" "nfs0" "nfs1" ] ++ (builtins.map (x: x.name) computeNodes);
# defaultAlertReceiver = {
# email_configs = [
# { to = "jonas.juselius@oceanbox.io"; }
# ];
# };
# pageAlertReceiver = {
# webhook_configs = [
# {
# url = "https://prometheus-msteams.k2.itpartner.no/ekman";
# http_config = {
# tls_config = { insecure_skip_verify = true; };
# };
# }
# ];
# };
# };
# webUI.enable = false;
# webUI.acmeEmail = "innovasjon@itpartner.no";
# webUI.allow = [
# "10.1.2.0/24"
# "172.19.254.0/24"
# "172.19.255.0/24"
# ];
};
};
# services.udev.extraRules = ''
# KERNEL=="ibp65s0", SUBSYSTEM=="net", ATTR{create_child}:="0x7666"
# '';
# boot.kernelPackages = pkgs.linuxKernel.packages.linux_6_6;
services.flannel.iface = "enp33s0f3np3";
networking = {
useNetworkd = true;
useDHCP = false;
hostName = name;
firewall = {
allowedTCPPorts = [ 6443 ];
extraCommands = ''
# needed for nodeport access on k1 and k2
# iptables -t nat -A POSTROUTING -s 10.255.241.0/24 ! -d 10.255.0.0/16 -j SNAT --to-source 10.255.242.2
iptables -t nat -A POSTROUTING -s 10.255.243.0/24 -j MASQUERADE
'';
};
};
systemd.network = {
networks = {
"40-enp33s0f0np0" = {
DHCP = "no";
matchConfig.Name = "enp33s0f0np0";
address = [ "10.255.242.2/24" ];
routes = [
{ Gateway = "10.255.242.1"; }
];
};
"40-enp33s0f3np3" = {
DHCP = "no";
matchConfig.Name = "enp33s0f3np3";
address = [ "${address}/24" ];
networkConfig = {
DNSDefaultRoute = true;
};
routes = [
{
Destination = "172.16.239.0/24";
Gateway = "10.255.241.210";
}
];
};
"41-enp65s0f1np1" = {
DHCP = "no";
matchConfig.Name = "enp65s0f1np1";
address = [ "10.255.244.100/24" ];
};
"45-ibp65s0f0" = {
DHCP = "no";
matchConfig.Name = "ibp65s0f0";
address = [ "10.255.243.100/24" ];
};
};
};
services.resolved = {
# DNS=[::1]:53
extraConfig = ''
DNSStubListener=no
'';
};
fileSystems = {
"/exports/users" = {
device = "/home";
options = [ "bind" ];
};
"/exports/opt/bin" = {
device = "/opt/bin";
options = [ "bind" ];
};
"/exports/opt/sif" = {
device = "/opt/sif";
options = [ "bind" ];
};
"/exports/nfs-provisioner" = {
device = "/vol/nfs-provisioner";
options = [ "bind" ];
};
"/users" = {
device = "/home";
options = [ "bind" ];
};
"/vol/local-storage/vol1" = {
device = "/vol/vol1";
options = [ "bind" ];
};
"/vol/local-storage/vol2" = {
device = "/vol/vol2";
options = [ "bind" ];
};
};
nix.extraOptions = ''
secret-key-files = /etc/nix/ekman.key
'';
# services.xserver = {
# enable = false;
# enableCtrlAltBackspace = true;
# layout = "us";
# xkbVariant = "altgr-intl";
# xkbOptions = "eurosign:e";
# displayManager = {
# gdm.enable = false;
# job.logToFile = true;
# };
# # desktopManager.xfce.enable = true;
# };
services.prometheus.alertmanager.configuration.global = {
smtp_smarthost = "smtpgw.itpartner.no";
# smtp_auth_username = "utvikling";
# smtp_auth_password = "S0m3rp0m@de#21!";
smtp_hello = "ekman.oceanbox.io";
smtp_from = "noreply@ekman.oceanbox.io";
};
# services.nginx = {
# virtualHosts = {
# "ds.matnoc.regnekraft.io" = {
# forceSSL = true;
# enableACME = true;
# serverAliases = [];
# locations."/" = {
# proxyPass = "http://localhost:9088";
# proxyWebsockets = false;
# extraConfig = ''
# allow 10.1.2.0/24;
# allow 172.19.254.0/24;
# allow 172.19.255.0/24;
# deny all;
# '';
# };
# };
# };
# };
# services.gitlab-runner = {
# enable = true;
# extraPackages = with pkgs; [
# singularity
# ];
# concurrent = 4;
# services = {
# sif = {
# registrationConfigFile = "/var/lib/secrets/gitlab-runner-registration";
# executor = "shell";
# tagList = [ "ekman" "sif" ];
# };
# };
# };
security.sudo.extraConfig = ''
%sif ALL=(ALL) NOPASSWD: /run/current-system/sw/bin/singularity
%admin ALL=(admin) NOPASSWD: ALL
# gitlab-runner ALL=(ALL) NOPASSWD: /run/current-system/sw/bin/singularity
'';
security.pam = {
services.sshd.googleAuthenticator.enable = true;
loginLimits = [
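# per-user hard limits: rss is in KB (~16 GB here), cpu is in minutes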
{
domain = "@users";
item = "rss";
type = "hard";
value = 16000000;
}
{
domain = "@users";
item = "cpu";
type = "hard";
value = 180;
}
];
};
system.activationScripts = {
home-permissions.text = ''
chmod 755 /home/olean
chmod 755 /home/frankgaa
chmod 755 /home/jonas
chmod 755 /home/mrtz
chmod 755 /home/avle
chmod 755 /home/stig
chmod 755 /home/bast
chmod 755 /home/simenlk
chmod 755 /work/kraken
'';
};
# ssh-rsa is deprecated, but putty/winscp users use it
services.openssh.extraConfig = ''
# pubkeyacceptedalgorithms ssh-rsa,ssh-ed25519-cert-v01@openssh.com,ecdsa-sha2-nistp256-cert-v01@openssh.com,ecdsa-sha2-nistp384-cert-v01@openssh.com,ecdsa-sha2-nistp521-cert-v01@openssh.com,sk-ssh-ed25519-cert-v01@openssh.com,sk-ecdsa-sha2-nistp256-cert-v01@openssh.com,rsa-sha2-512-cert-v01@openssh.com,rsa-sha2-256-cert-v01@openssh.com,ssh-ed25519,ecdsa-sha2-nistp256,ecdsa-sha2-nistp384,ecdsa-sha2-nistp521,sk-ssh-ed25519@openssh.com,sk-ecdsa-sha2-nistp256@openssh.com,rsa-sha2-512,rsa-sha2-256
PubkeyAuthOptions verify-required
'';
environment.systemPackages = [ ];
virtualisation.docker.enable = pkgs.lib.mkForce true;
services.tailscale = {
enable = true;
authKeyFile = "/var/lib/secrets/tailscale.key";
useRoutingFeatures = "client";
extraUpFlags = [
"--login-server=https://headscale.svc.oceanbox.io"
"--accept-dns=true"
"--advertise-tags=tag:hpc"
];
};
boot.kernelParams = [
"console=tty0"
"console=ttyS0,115200"
];
systemd.services."serial-getty@ttyS0" = {
enable = true;
wantedBy = [ "getty.target" ];
serviceConfig.Restart = "always";
};
imports = [
./hardware-configuration.nix
../default.nix
../mounts.nix
../myvnc.nix
];
};
}

Some files were not shown because too many files have changed in this diff.