From b22d29c4ff7b90229be7fb14355fcbd843eb211e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:32:07 +0100 Subject: [PATCH 01/17] minor(kueue): Add localQueue for prod/staging --- values/kueue/manifests/clusterQueue.yaml | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/values/kueue/manifests/clusterQueue.yaml b/values/kueue/manifests/clusterQueue.yaml index 716ce700..5eab7b8d 100644 --- a/values/kueue/manifests/clusterQueue.yaml +++ b/values/kueue/manifests/clusterQueue.yaml @@ -1,12 +1,12 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: - name: compute + name: compute # Just needs to exist, can be managed with taints/tolerations --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: sample-jobs + name: jobs spec: cohort: general namespaceSelector: {} # Accept workloads from any namespace @@ -19,3 +19,19 @@ spec: nominalQuota: '4' - name: "memory" nominalQuota: 8Gi +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + name: prod-queue + namespace: prod-sorcerer +spec: + clusterQueue: jobs +--- +apiVersion: kueue.x-k8s.io/v1beta1 +kind: LocalQueue +metadata: + name: staging-queue + namespace: staging-sorcerer +spec: + clusterQueue: jobs From dbf1e73f79fa330406f69d147f3a341d4521d5e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:39:39 +0100 Subject: [PATCH 02/17] fix(kueue): Enable metrics --- values/kueue/values/values.yaml | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 70bf2e1b..6e883cd2 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -12,8 +12,28 @@ controllerManager: frameworks: - batch/job - jobset.x-k8s.io/jobset +# For metrics +enableCertManager: true +enablePrometheus: true +metrics: + prometheusNamespace: prometheus + service + 
tlsConfig: + serverName: kueue-controller-manager-metrics-service.kueue-system.svc + ca: + secret: + name: kueue-metrics-server-cert + key: ca.crt + cert: + secret: + name: kueue-metrics-server-cert + key: tls.crt + keySecret: + name: kueue-metrics-server-cert + key: tls.keyMonitor: -enableKueueViz: true +# NOTE: Disable Visualization for now +enableKueueViz: false kueueViz: backend: env: From bb3586b7c5930e7e59f5a4b17afa4fd34c0c10d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:45:11 +0100 Subject: [PATCH 03/17] fix(kueue): Add ingress --- values/kueue/manifests/ing.yaml | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) create mode 100644 values/kueue/manifests/ing.yaml diff --git a/values/kueue/manifests/ing.yaml b/values/kueue/manifests/ing.yaml new file mode 100644 index 00000000..e6e59373 --- /dev/null +++ b/values/kueue/manifests/ing.yaml @@ -0,0 +1,28 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kueueviz-ingress + namespace: kueue-system +spec: + rules: + - host: kueue.{{ .Values.clusterConfig.domain }} + http: + paths: + - path: /api(/|$)(.*) + pathType: Prefix + backend: + service: + name: kueue-kueueviz-backend + port: + number: 8080 + - path: / + pathType: Prefix + backend: + service: + name: kueue-kueueviz-frontend + port: + number: 8080 + tls: + - hosts: + - kueue.{{ .Values.clusterConfig.domain }} + secretName: kueueviz-tls From de19337d2cd4ddfdedee09ab93c17a09b7a84ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:46:33 +0100 Subject: [PATCH 04/17] fix(headscale): Add kueue ing --- values/headscale/values/values.yaml | 2 ++ values/kueue/values/values.yaml | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/values/headscale/values/values.yaml b/values/headscale/values/values.yaml index ee4d1aac..981216b8 100644 --- a/values/headscale/values/values.yaml +++ b/values/headscale/values/values.yaml @@ -281,6 
+281,7 @@ configMaps: { "name": "sorcrerer.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, { "name": "plume.data.oceanbox.io", "type": "A", "value": "10.255.241.99" }, { "name": "slurm-agent.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, + { "name": "kueue.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, { "name": "slurm-agent.rossby.oceanbox.io", "type": "A", "value": "172.16.239.222" }, @@ -289,6 +290,7 @@ configMaps: { "name": "prometheus.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "alertmanager.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "slurm-agent.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, + { "name": "kueue.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "dashboard.ob-ceph.local", "type": "A", "value": "10.255.241.10" }, { "name": "grafana.ob-ceph.local", "type": "A", "value": "10.255.241.10" }, diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 6e883cd2..050f7fac 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -32,8 +32,8 @@ metrics: name: kueue-metrics-server-cert key: tls.keyMonitor: -# NOTE: Disable Visualization for now -enableKueueViz: false +# For Visualization +enableKueueViz: true kueueViz: backend: env: From a4e5901c76c39e3ef0fb5c95862f3cfd1c9050d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:49:19 +0100 Subject: [PATCH 05/17] fix(kueue): Correct ingress --- values/kueue/manifests/ing.yaml | 37 +++++++++++++++++++++++++++++++-- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/values/kueue/manifests/ing.yaml b/values/kueue/manifests/ing.yaml index e6e59373..4b4c288d 100644 --- a/values/kueue/manifests/ing.yaml +++ b/values/kueue/manifests/ing.yaml @@ -1,3 +1,4 @@ +{{- if eq .Values.clusterConfig.cluster "ekman"}} apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -5,7 +6,7 @@ metadata: namespace: 
kueue-system spec: rules: - - host: kueue.{{ .Values.clusterConfig.domain }} + - host: kueue.dev.tos.obx http: paths: - path: /api(/|$)(.*) @@ -24,5 +25,37 @@ spec: number: 8080 tls: - hosts: - - kueue.{{ .Values.clusterConfig.domain }} + - kueue.dev.tos.obx secretName: kueueviz-tls +{{-end}} +--- +{{- if eq .Values.clusterConfig.cluster "rossby"}} +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: kueueviz-ingress + namespace: kueue-system +spec: + rules: + - host: kueue.dev.vtn.obx + http: + paths: + - path: /api(/|$)(.*) + pathType: Prefix + backend: + service: + name: kueue-kueueviz-backend + port: + number: 8080 + - path: / + pathType: Prefix + backend: + service: + name: kueue-kueueviz-frontend + port: + number: 8080 + tls: + - hosts: + - kueue.dev.vtn.obx + secretName: kueueviz-tls +{{-end}} From 9f922a494df50b2a34d257495ebd5ff7cae2e5fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:53:35 +0100 Subject: [PATCH 06/17] fix(kueue): Correct yaml --- values/kueue/manifests/ing.yaml | 4 ++-- values/kueue/values/values.yaml | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/values/kueue/manifests/ing.yaml b/values/kueue/manifests/ing.yaml index 4b4c288d..9a6c8c71 100644 --- a/values/kueue/manifests/ing.yaml +++ b/values/kueue/manifests/ing.yaml @@ -27,7 +27,7 @@ spec: - hosts: - kueue.dev.tos.obx secretName: kueueviz-tls -{{-end}} +{{- end}} --- {{- if eq .Values.clusterConfig.cluster "rossby"}} apiVersion: networking.k8s.io/v1 @@ -58,4 +58,4 @@ spec: - hosts: - kueue.dev.vtn.obx secretName: kueueviz-tls -{{-end}} +{{- end}} diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 050f7fac..29675a94 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -12,12 +12,11 @@ controllerManager: frameworks: - batch/job - jobset.x-k8s.io/jobset -# For metrics enableCertManager: true enablePrometheus: true metrics: prometheusNamespace: 
prometheus - service + service: tlsConfig: serverName: kueue-controller-manager-metrics-service.kueue-system.svc ca: @@ -30,9 +29,7 @@ metrics: key: tls.crt keySecret: name: kueue-metrics-server-cert - key: tls.keyMonitor: - -# For Visualization + key: tls.keyMonitor enableKueueViz: true kueueViz: backend: From 3c9f2e4c4aad3611cb93250d0b3950ed94e5d47c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:55:29 +0100 Subject: [PATCH 07/17] fix(kueue): Use ca-issuer --- values/kueue/manifests/ing.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/values/kueue/manifests/ing.yaml b/values/kueue/manifests/ing.yaml index 9a6c8c71..1094ac3e 100644 --- a/values/kueue/manifests/ing.yaml +++ b/values/kueue/manifests/ing.yaml @@ -4,6 +4,8 @@ kind: Ingress metadata: name: kueueviz-ingress namespace: kueue-system + annotations: + cert-manager.io/cluster-issuer: ca-issuer spec: rules: - host: kueue.dev.tos.obx @@ -35,6 +37,8 @@ kind: Ingress metadata: name: kueueviz-ingress namespace: kueue-system + annotations: + cert-manager.io/cluster-issuer: ca-issuer spec: rules: - host: kueue.dev.vtn.obx From dc5fbb49ca6cd97e52d8102f28da44945b71b84b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 15:56:15 +0100 Subject: [PATCH 08/17] fix(hs): Use dev.x.obx --- values/headscale/values/values.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/values/headscale/values/values.yaml b/values/headscale/values/values.yaml index 981216b8..4013123a 100644 --- a/values/headscale/values/values.yaml +++ b/values/headscale/values/values.yaml @@ -281,7 +281,8 @@ configMaps: { "name": "sorcrerer.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, { "name": "plume.data.oceanbox.io", "type": "A", "value": "10.255.241.99" }, { "name": "slurm-agent.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, - { "name": "kueue.ekman.oceanbox.io", "type": "A", "value": "10.255.241.99" }, + + { 
"name": "kueue.dev.tos.obx", "type": "A", "value": "10.255.241.99" }, { "name": "slurm-agent.rossby.oceanbox.io", "type": "A", "value": "172.16.239.222" }, @@ -290,7 +291,8 @@ configMaps: { "name": "prometheus.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "alertmanager.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "slurm-agent.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, - { "name": "kueue.adm.vtn.obx", "type": "A", "value": "172.16.239.221" }, + + { "name": "kueue.dev.vtn.obx", "type": "A", "value": "172.16.239.221" }, { "name": "dashboard.ob-ceph.local", "type": "A", "value": "10.255.241.10" }, { "name": "grafana.ob-ceph.local", "type": "A", "value": "10.255.241.10" }, From dbb17345b61ab88a0b2e0c43c8234d2604466493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 16:02:56 +0100 Subject: [PATCH 09/17] fix(kueue): Disable internal certs --- values/kueue/values/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 29675a94..952d1d6b 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -12,6 +12,8 @@ controllerManager: frameworks: - batch/job - jobset.x-k8s.io/jobset + internalCertManagement: + enable: false enableCertManager: true enablePrometheus: true metrics: From 07cfd8013dd8fade776883d1dfcff51f85d5d7d4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 16:05:47 +0100 Subject: [PATCH 10/17] fix(kueue): I'm stupid --- values/kueue/values/values.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 952d1d6b..a0daae3d 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -18,7 +18,7 @@ enableCertManager: true enablePrometheus: true metrics: prometheusNamespace: prometheus - service: + serviceMonitor: tlsConfig: 
serverName: kueue-controller-manager-metrics-service.kueue-system.svc ca: @@ -31,7 +31,7 @@ metrics: key: tls.crt keySecret: name: kueue-metrics-server-cert - key: tls.keyMonitor + key: tls.key enableKueueViz: true kueueViz: backend: From fb7110204947d06a8213665439fb2b55fc37b81e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 16:08:14 +0100 Subject: [PATCH 11/17] fix(kueue): Check prom --- values/kueue/values/values.yaml | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index a0daae3d..4dae10d4 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -14,31 +14,15 @@ controllerManager: - jobset.x-k8s.io/jobset internalCertManagement: enable: false -enableCertManager: true enablePrometheus: true metrics: prometheusNamespace: prometheus serviceMonitor: tlsConfig: - serverName: kueue-controller-manager-metrics-service.kueue-system.svc - ca: - secret: - name: kueue-metrics-server-cert - key: ca.crt - cert: - secret: - name: kueue-metrics-server-cert - key: tls.crt - keySecret: - name: kueue-metrics-server-cert - key: tls.key + insecureSkipVerify: true enableKueueViz: true kueueViz: backend: env: - name: KUEUEVIZ_ALLOWED_ORIGINS value: "http://frontend.kueueviz.local" - frontend: - env: - - name: REACT_APP_WEBSOCKET_URL - value: "wss://kueue-kueueviz-backend" From 3d423a8111ec6db60578be314e73bb7d1a5df702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 16:09:06 +0100 Subject: [PATCH 12/17] fix(kueue): Disable internal --- values/kueue/values/values.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 4dae10d4..684cd4d2 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -12,8 +12,6 @@ controllerManager: frameworks: - batch/job - jobset.x-k8s.io/jobset - 
internalCertManagement: - enable: false enablePrometheus: true metrics: prometheusNamespace: prometheus From 0b634744daa582a27162cdef7f8e7e343664ea93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 12 Dec 2025 16:10:45 +0100 Subject: [PATCH 13/17] fix(kueue): Lets try again --- values/kueue/values/values.yaml | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 684cd4d2..810b948e 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -12,12 +12,26 @@ controllerManager: frameworks: - batch/job - jobset.x-k8s.io/jobset + internalCertManagement: + enable: false +enableCertManager: true enablePrometheus: true metrics: prometheusNamespace: prometheus serviceMonitor: tlsConfig: - insecureSkipVerify: true + serverName: kueue-controller-manager-metrics-service.kueue-system.svc + ca: + secret: + name: kueue-metrics-server-cert + key: ca.crt + cert: + secret: + name: kueue-metrics-server-cert + key: tls.crt + keySecret: + name: kueue-metrics-server-cert + key: tls.key enableKueueViz: true kueueViz: backend: From 083cd50d6ad9443e4425b274c6352b9ef13adc4c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Sat, 13 Dec 2025 13:23:59 +0100 Subject: [PATCH 14/17] fix(kueue): Undo certs --- values/kueue/values/values.yaml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index 810b948e..c9aeb7a6 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -13,25 +13,11 @@ controllerManager: - batch/job - jobset.x-k8s.io/jobset internalCertManagement: - enable: false -enableCertManager: true + enable: true +enableCertManager: false enablePrometheus: true metrics: prometheusNamespace: prometheus - serviceMonitor: - tlsConfig: - serverName: 
kueue-controller-manager-metrics-service.kueue-system.svc - ca: - secret: - name: kueue-metrics-server-cert - key: ca.crt - cert: - secret: - name: kueue-metrics-server-cert - key: tls.crt - keySecret: - name: kueue-metrics-server-cert - key: tls.key enableKueueViz: true kueueViz: backend: From 3ab4a94bb2b1953ac7cbe2320c7613a6f09e362a Mon Sep 17 00:00:00 2001 From: Radovan Bast Date: Sat, 13 Dec 2025 17:52:57 +0000 Subject: [PATCH 15/17] ci: makai --- values/makai/values/values-staging.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/values/makai/values/values-staging.yaml b/values/makai/values/values-staging.yaml index a0380467..9cd9e1a7 100644 --- a/values/makai/values/values-staging.yaml +++ b/values/makai/values/values-staging.yaml @@ -1,6 +1,6 @@ replicaCount: 1 image: - tag: "832fdf2d-debug" + tag: "65c45b31-debug" env: - name: APP_VERSION value: "0.0.0-staging" From f7e4b100e1d3b588a351e3b0f633bdd52215540a Mon Sep 17 00:00:00 2001 From: Radovan Bast Date: Sat, 13 Dec 2025 18:02:57 +0000 Subject: [PATCH 16/17] ci: makai --- values/makai/values/values-staging.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/values/makai/values/values-staging.yaml b/values/makai/values/values-staging.yaml index 9cd9e1a7..e38b5331 100644 --- a/values/makai/values/values-staging.yaml +++ b/values/makai/values/values-staging.yaml @@ -1,6 +1,6 @@ replicaCount: 1 image: - tag: "65c45b31-debug" + tag: "f6162f27-debug" env: - name: APP_VERSION value: "0.0.0-staging" From a3609c40728f611d55109fd38de2a18baaa382b7 Mon Sep 17 00:00:00 2001 From: Radovan Bast Date: Sun, 14 Dec 2025 10:14:15 +0000 Subject: [PATCH 17/17] ci: makai --- values/makai/values/values-staging.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/values/makai/values/values-staging.yaml b/values/makai/values/values-staging.yaml index e38b5331..35f72822 100644 --- a/values/makai/values/values-staging.yaml +++ b/values/makai/values/values-staging.yaml @@ -1,6 
+1,6 @@ replicaCount: 1 image: - tag: "f6162f27-debug" + tag: "52d23ede-debug" env: - name: APP_VERSION value: "0.0.0-staging"