From be7954d499f1e8fa56bca78d330717f61aef5394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Mon, 16 Mar 2026 14:52:43 +0100 Subject: [PATCH] feat: Add Kueue and JobSet to ekman --- helmfile.d/kueue.yaml.gotmpl | 2 +- values/jobset/manifests/jobset.yaml | 2 +- values/kueue/env-rossby.yaml.gotmpl | 3 - values/kueue/manifests/clusterQueue.yaml | 59 +++++++++------- values/kueue/manifests/ing.yaml | 89 ------------------------ values/kueue/values/values-ekman.yaml | 9 --- values/kueue/values/values-rossby.yaml | 9 --- values/kueue/values/values.yaml | 14 ++-- 8 files changed, 41 insertions(+), 146 deletions(-) delete mode 100644 values/kueue/env-rossby.yaml.gotmpl delete mode 100644 values/kueue/manifests/ing.yaml delete mode 100644 values/kueue/values/values-ekman.yaml delete mode 100644 values/kueue/values/values-rossby.yaml diff --git a/helmfile.d/kueue.yaml.gotmpl b/helmfile.d/kueue.yaml.gotmpl index 57f781b7..dcb9de72 100644 --- a/helmfile.d/kueue.yaml.gotmpl +++ b/helmfile.d/kueue.yaml.gotmpl @@ -8,7 +8,7 @@ releases: - name: kueue namespace: kueue-system chart: oci://registry.k8s.io/kueue/charts/kueue - version: 0.15.0 + version: 0.16.2 condition: kueue.enabled values: - ../values/kueue/values/values.yaml diff --git a/values/jobset/manifests/jobset.yaml b/values/jobset/manifests/jobset.yaml index 7addc471..3d5858e6 100644 --- a/values/jobset/manifests/jobset.yaml +++ b/values/jobset/manifests/jobset.yaml @@ -38,7 +38,7 @@ spec: - group: "" kind: Secret name: jobset-webhook-server-cert - namespace: default + namespace: jobset-system jsonPointers: - /data {{- end }} diff --git a/values/kueue/env-rossby.yaml.gotmpl b/values/kueue/env-rossby.yaml.gotmpl deleted file mode 100644 index 0e677a53..00000000 --- a/values/kueue/env-rossby.yaml.gotmpl +++ /dev/null @@ -1,3 +0,0 @@ -kueue: - enabled: true - autosync: false diff --git a/values/kueue/manifests/clusterQueue.yaml b/values/kueue/manifests/clusterQueue.yaml index 2c71bffe..d0d5b6e4 100644 --- a/values/kueue/manifests/clusterQueue.yaml +++ b/values/kueue/manifests/clusterQueue.yaml @@ -2,50 +2,55 @@ apiVersion: kueue.x-k8s.io/v1beta1 kind: ResourceFlavor metadata: name: compute # Just needs to exist, can be managed with tains/tolerations +spec: + nodeLabels: + node-role.kubernetes.io/compute: compute + topology.kubernetes.io/group: c1 # Only run on C1 for now --- apiVersion: kueue.x-k8s.io/v1beta1 kind: ClusterQueue metadata: - name: jobs + name: cluster-queue spec: cohort: general namespaceSelector: {} # Accept workloads from any namespace - preemption: - withinClusterQueue: "LowerPriority" # Allow higher priority to preempt lower + queueingStrategy: BestEffortFIFO + # preemption: + # withinClusterQueue: "LowerPriority" # Allow higher priority to preempt lower resourceGroups: - coveredResources: ["cpu", "memory"] # Cover both memory and cpu resources flavors: - name: compute resources: - name: "cpu" - nominalQuota: '4' + nominalQuota: '32' - name: "memory" - nominalQuota: 8Gi ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: LocalQueue -metadata: - name: prod-queue - namespace: prod-sorcerer -spec: - clusterQueue: jobs + nominalQuota: 64Gi +# --- +# apiVersion: kueue.x-k8s.io/v1beta1 +# kind: LocalQueue +# metadata: +# name: prod-queue +# namespace: prod-queue +# spec: +# clusterQueue: cluster-queue --- apiVersion: kueue.x-k8s.io/v1beta1 kind: LocalQueue metadata: name: staging-queue - namespace: staging-sorcerer + namespace: dev-queue spec: - clusterQueue: jobs ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: WorkloadPriorityClass -metadata: - name: "normal" -value: 100 ---- -apiVersion: kueue.x-k8s.io/v1beta1 -kind: WorkloadPriorityClass -metadata: - name: "high" -value: 200 # Higher value = higher priority + clusterQueue: cluster-queue +# --- +# apiVersion: kueue.x-k8s.io/v1beta1 +# kind: WorkloadPriorityClass +# metadata: +# name: "normal" +# value: 100 +# --- +# apiVersion: kueue.x-k8s.io/v1beta1 +# kind: WorkloadPriorityClass +# metadata: +# name: "high" +# value: 200 # Higher value = higher priority diff --git a/values/kueue/manifests/ing.yaml b/values/kueue/manifests/ing.yaml deleted file mode 100644 index a6ba0d3a..00000000 --- a/values/kueue/manifests/ing.yaml +++ /dev/null @@ -1,89 +0,0 @@ -{{- if eq .Values.clusterConfig.cluster "ekman"}} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: kueueviz-ingress - namespace: kueue-system - annotations: - cert-manager.io/cluster-issuer: ca-issuer - nginx.ingress.kubernetes.io/backend-protocol: HTTP - nginx.ingress.kubernetes.io/proxy-buffer-size: 128k - nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" - nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" - nginx.ingress.kubernetes.io/ssl-passthrough: "true" - nginx.ingress.kubernetes.io/ssl-redirect: "true" - nginx.ingress.kubernetes.io/websocket-services: kueue-kueueviz-backend -spec: - rules: - - host: kueue.dev.tos.obx - http: - paths: - - path: /ws - pathType: Prefix - backend: - service: - name: kueue-kueueviz-backend - port: - number: 8080 - - path: /api(/|$)(.*) - pathType: Prefix - backend: - service: - name: kueue-kueueviz-backend - port: - number: 8080 - - path: / - pathType: Prefix - backend: - service: - name: kueue-kueueviz-frontend - port: - number: 8080 - tls: - - hosts: - - kueue.dev.tos.obx - secretName: kueueviz-tls -{{- end}} ---- -{{- if eq .Values.clusterConfig.cluster "rossby"}} -apiVersion: networking.k8s.io/v1 -kind: Ingress -metadata: - name: kueueviz-ingress - namespace: kueue-system - annotations: - cert-manager.io/cluster-issuer: ca-issuer - nginx.ingress.kubernetes.io/websocket-services: kueue-kueueviz-backend - nginx.ingress.kubernetes.io/proxy-read-timeout: "3600" - nginx.ingress.kubernetes.io/proxy-send-timeout: "3600" -spec: - rules: - - host: kueue.dev.vtn.obx - http: - paths: - - path: /ws - pathType: Prefix - backend: - service: - name: kueue-kueueviz-backend - port: - number: 8080 - - path: /api(/|$)(.*) - pathType: Prefix - backend: - service: - name: kueue-kueueviz-backend - port: - number: 8080 - - path: / - pathType: Prefix - backend: - service: - name: kueue-kueueviz-frontend - port: - number: 8080 - tls: - - hosts: - - kueue.dev.vtn.obx - secretName: kueueviz-tls -{{- end}} diff --git a/values/kueue/values/values-ekman.yaml b/values/kueue/values/values-ekman.yaml deleted file mode 100644 index ae3c90ef..00000000 --- a/values/kueue/values/values-ekman.yaml +++ /dev/null @@ -1,9 +0,0 @@ -kueueViz: - backend: - env: - - name: KUEUEVIZ_ALLOWED_ORIGINS - value: "https://kueue.dev.tos.obx" - frontend: - env: - - name: REACT_APP_WEBSOCKET_URL - value: "wss://kueue.dev.tos.obx" diff --git a/values/kueue/values/values-rossby.yaml b/values/kueue/values/values-rossby.yaml deleted file mode 100644 index 31394214..00000000 --- a/values/kueue/values/values-rossby.yaml +++ /dev/null @@ -1,9 +0,0 @@ -kueueViz: - backend: - env: - - name: KUEUEVIZ_ALLOWED_ORIGINS - value: "https://kueue.dev.vtn.obx" - frontend: - env: - - name: REACT_APP_WEBSOCKET_URL - value: "wss://kueue.dev.vtn.obx" diff --git a/values/kueue/values/values.yaml b/values/kueue/values/values.yaml index f5c2d337..8a739e1a 100644 --- a/values/kueue/values/values.yaml +++ b/values/kueue/values/values.yaml @@ -1,9 +1,9 @@ controllerManager: - featureGates: - - name: TopologyAwareScheduling - enabled: true - - name: LocalQueueMetrics - enabled: true + # featureGates: + # - name: TopologyAwareScheduling + # enabled: true + # - name: LocalQueueMetrics + # enabled: true managerConfig: controllerManagerConfigYaml: | apiVersion: config.kueue.x-k8s.io/v1beta1 @@ -14,8 +14,8 @@ controllerManager: - jobset.x-k8s.io/jobset internalCertManagement: enable: false -enableCertManager: false +enableCertManager: true enablePrometheus: true metrics: prometheusNamespace: prometheus -enableKueueViz: true +enableKueueViz: false