From 54b50759e78654e8aaf790c6ac62a461fe3f3551 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Moritz=20J=C3=B6rg?= Date: Fri, 26 Sep 2025 13:34:45 +0200 Subject: [PATCH] fix: Persist slurm-web-agent --- values/slurm-web-agent/manifests/cm.yaml | 546 ++++++++++++++++++ .../slurm-web-agent/manifests/deployment.yaml | 72 +++ values/slurm-web-agent/manifests/ing.yaml | 27 + .../manifests/slurm-web-agent.yaml | 42 ++ values/slurm-web-agent/manifests/smon.yaml | 14 + values/slurm-web-agent/manifests/sva.yaml | 5 + values/slurm-web-agent/manifests/svc.yaml | 16 + 7 files changed, 722 insertions(+) create mode 100644 values/slurm-web-agent/manifests/cm.yaml create mode 100644 values/slurm-web-agent/manifests/deployment.yaml create mode 100644 values/slurm-web-agent/manifests/ing.yaml create mode 100644 values/slurm-web-agent/manifests/slurm-web-agent.yaml create mode 100644 values/slurm-web-agent/manifests/smon.yaml create mode 100644 values/slurm-web-agent/manifests/sva.yaml create mode 100644 values/slurm-web-agent/manifests/svc.yaml diff --git a/values/slurm-web-agent/manifests/cm.yaml b/values/slurm-web-agent/manifests/cm.yaml new file mode 100644 index 00000000..f7a88ea2 --- /dev/null +++ b/values/slurm-web-agent/manifests/cm.yaml @@ -0,0 +1,546 @@ +apiVersion: v1 +data: + policy.ini: | + # Slurm-web default vendor RBAC policy. DO NOT MODIFY THIS FILE! Create a file + # /etc/slurm-web/policy.conf with your custom rules and Slurm-web will ignore + # this file. Your modifications in this file will be overwritten and lost on + # software upgrade. 
+ + [roles] + # Enable anonymous role with basic views + anonymous + # All authenticated users have the user role + user=ALL + + [anonymous] + actions=view-stats,view-jobs,view-nodes + + [user] + actions=view-stats,view-jobs,view-nodes,view-partitions,view-qos,view-accounts,view-reservations,cache-view + agent.ini: | + [service] + cluster=ekman + + [slurmrestd] + uri=unix:/run/slurmrestd/hipster.socket + auth=local + + [policy] + definition=data/conf/policy.yml + vendor_roles=data/conf/policy.ini + + [racksdb] + enabled=no + + [jwt] + key=data/secrets/jwt.key + policy.yml: | + actions: + view-stats: | + View general cluster statistics in home dashboard and clusters list. + view-jobs: | + Get _Jobs_ entry in main menu and permission to view all users jobs in + queue and all jobs details. + view-nodes: | + Get _Resources_ entry in main menu and permission to view the list of + compute nodes in cluster with their status in Slurm. + view-partitions: | + Permission to filter jobs and nodes by partition. + view-qos: | + Get _QOS_ entry in main menu and permission to view the list of defined QOS. + view-accounts: | + Permission to filter jobs by account. + view-reservations: | + Get _Reservations_ entry in main menu and permission to view the list of + defined reservations. + cache-view: | + Access to cache information and metrics in settings panel. + agent.yml: | + # This file contains the configuration settings definition for Slurm-web agent + # in RFL ConfigurationLoader format. + service: + cluster: + type: str + required: true + doc: Name of cluster served by agent + ex: atlas + interface: + type: str + default: localhost + doc: Interface address to bind for incoming connections + port: + type: int + default: 5012 + doc: TCP port to listen for incoming connections + cors: + type: bool + default: false + doc: When true, Cross-Origin Resource Sharing (CORS) headers are enabled. 
+ debug: + type: bool + default: false + doc: Enable debug mode + log_flags: + type: list + content: str + default: + - ALL + choices: + - slurmweb + - rfl + - werkzeug + - urllib3 + - racksdb + - ALL + doc: List of log flags to enable. Special value `ALL` enables all log flags. + debug_flags: + type: list + content: str + default: + - slurmweb + choices: + - slurmweb + - rfl + - werkzeug + - urllib3 + - racksdb + - ALL + doc: | + List of debug flags to enable. Special value `ALL` enables all debug + flags. + + slurmrestd: + uri: + type: uri + default: unix:///run/slurmrestd/slurmrestd.socket + doc: | + URI to slurmrestd HTTP server. It can either be in the form + http://host:port for TCP/IP server or unix:///path/to/slurmrestd.socket + for Unix socket. + socket: + type: path + deprecated: + section: slurmrestd + parameter: uri + doc: Path to slurmrestd Unix socket. + auth: + type: str + choices: + - local + - jwt + default: jwt + doc: | + Authentication method with slurmrestd. + + The `jwt` authentication method is supported by both TCP/IP and Unix + sockets URIs. + + Note that `local` authentication method is only supported with Unix socket + URI and Slurm <= 24.11. With this method, Slurm-web agent must run with + the _slurm_ system user as well as `slurmrestd` service. Running + `slurmrestd` as _slurm_ system user is not possible with Slurm >= 25.05. + jwt_mode: + type: str + default: auto + choices: + - auto + - static + doc: | + Slurmrestd JWT authentication mode, either _auto_ or _static_. + + In _auto_ mode, Slurm-web agent generates tokens with the signature key + specified in `jwt_key`. The tokens have a limited lifespan as defined with + `jwt_lifespan`. Tokens are automatically renewed upon expiration. This is + the recommended mode. + + In _static_ mode, Slurm-web simply use the token provided with + `jwt_token`. + + This parameter is used only when `auth` is _jwt_. 
+ jwt_user: + type: str + default: slurm + doc: | + The user name used in HTTP headers with JWT authentication. + + This parameter is used only when `auth` is _jwt_. + jwt_lifespan: + type: int + default: 3600 + doc: | + Lifespan of JWT tokens generated by Slurm-web in seconds. The default + value is 1 hour. + + This parameter is used only when `auth` is _jwt_ and `jwt_mode` is _auto_. + jwt_key: + type: path + default: /var/lib/slurm-web/slurmrestd.key + doc: | + Path to private key shared with Slurm for JWT signature. The key is used + by Slurm-web to generate its token for authentication on slurmrestd in + _auto_ mode. It must be the same key as used in Slurm `AuthAltParameters` + so that Slurm services can validate JWT generated by Slurm-web. + + This parameter is used only when `auth` is _jwt_ and `jwt_mode` is + _auto_. + jwt_token: + type: password + doc: | + The static JSON Web Token (JWT) used in HTTP headers with JWT + authentication, typically generated with `scontrol token`. While this is + generally not a good practice, it is recommended to generate tokens with + infinite lifespan to avoid failures due to expired token. + + This parameter is used only when `auth` is _jwt_ and `jwt_mode` is + _static_. + version: + type: str + default: '0.0.41' + doc: | + Slurm REST API version. + + CAUTION: You SHOULD NOT change this parameter unless you really know what + you are doing. This parameter is more intented for Slurm-web developers + rather than end users. Slurm-web is officially tested and validated with + the default value only. 
+ + filters: + jobs: + type: list + content: str + default: + - account + - cpus + - gres_detail + - job_id + - job_state + - node_count + - nodes + - partition + - priority + - qos + - sockets_per_node + - state_reason + - tasks + - tres_per_job + - tres_per_node + - tres_per_socket + - tres_per_task + - user_name + doc: | + List of jobs fields selected in slurmrestd API when retrieving a list of + jobs, all other fields arefiltered out. + acctjob: + type: list + content: str + default: + - association + - comment + - derived_exit_code + - exit_code + - group + - name + - nodes + - partition + - priority + - qos + - script + - state + - steps + - submit_line + - time + - tres + - used_gres + - user + - wckey + - working_directory + doc: | + List of slurmdbd job fields selected in slurmrestd API when retrieving a + unique job, all other fields are filtered out. + ctldjob: + type: list + content: str + default: + - accrue_time + - batch_flag + - command + - cpus + - current_working_directory + - exclusive + - gres_detail + - last_sched_evaluation + - node_count + - partition + - sockets_per_node + - standard_error + - standard_input + - standard_output + - tasks + - tres_per_job + - tres_per_node + - tres_per_socket + - tres_per_task + - tres_req_str + doc: | + List of slurmctld job fields selected in slurmrestd API when retrieving a + unique job, all other fields are filtered out. + nodes: + type: list + content: str + default: + - name + - cpus + - sockets + - cores + - gres + - gres_used + - real_memory + - state + - reason + - partitions + - alloc_cpus + - alloc_idle_cpus + doc: | + List of nodes fields selected in slurmrestd API, all other fields are + filtered out. 
+ node: + type: list + content: str + default: + - name + - architecture + - operating_system + - boot_time + - last_busy + - cpus + - sockets + - cores + - threads + - real_memory + - gres + - gres_used + - state + - reason + - partitions + - alloc_cpus + - alloc_idle_cpus + - alloc_memory + doc: | + List of invidual node fields selected in slurmrestd API, all other fields + are filtered out. + partitions: + type: list + content: str + default: + - name + - node_sets + doc: | + List of partitions fields selected in slurmrestd API, all other fields are + filtered out. + qos: + type: list + content: str + default: + - name + - description + - priority + - flags + - limits + doc: | + List of qos fields selected in slurmrestd API, all other fields are + filtered out. + reservations: + type: list + content: str + default: + - name + - users + - accounts + - node_list + - node_count + - start_time + - end_time + - flags + doc: | + List of reservations fields selected in slurmrestd API, all other fields + are filtered out. + accounts: + type: list + content: str + default: + - name + doc: | + List of accounts fields selected in slurmrestd API, all other fields are + filtered out. + + policy: + definition: + type: path + default: /usr/share/slurm-web/conf/policy.yml + doc: Path to RBAC policy definition file with available actions + vendor_roles: + type: path + default: /usr/share/slurm-web/conf/policy.ini + doc: | + Path to default vendor RBAC policy definition file with roles and + permitted actions + roles: + type: path + default: /etc/slurm-web/policy.ini + doc: | + Path to site RBAC policy definition file with roles and permitted actions + + jwt: + key: + type: path + default: /var/lib/slurm-web/jwt.key + doc: Path to private key for Slurm-web internal JWT signature. 
+ algorithm: + type: str + choices: + # Full list available in PyJWT documentation: + # https://pyjwt.readthedocs.io/en/latest/algorithms.html + - HS256 + - HS384 + - HS512 + - ES256 + - ES256K + - ES384 + - ES512 + - RS256 + - RS384 + - RS512 + - PS256 + - PS384 + - PS512 + - EdDSA + default: HS256 + doc: Cryptographic algorithm used to sign JWT + audience: + type: str + default: slurm-web + doc: | + Audience defined in generated JWT and expected in JWT provided by clients + + racksdb: + enabled: + type: bool + default: true + doc: | + Control if RacksDB integration feature for advanced visualization of + resources is enabled. + # The default values in this section must be synchronized with RacksDB library + # defaults. + db: + type: path + default: /var/lib/racksdb + doc: Path to RacksDB database + schema: + type: path + default: /usr/share/racksdb/schemas/racksdb.yml + doc: Path to RacksDB database schema + extensions: + type: path + default: /etc/racksdb/extensions.yml + doc: Path to site-specific RacksDB schema extensions + drawings_schema: + type: path + default: /usr/share/racksdb/schemas/drawings.yml + doc: Path to RacksDB database schema + infrastructure: + type: str + doc: | + Name of the infrastructure for the cluster in RacksDB. By default, the + cluster name is used. + ex: atlas + tags: + type: list + content: str + default: + - compute + doc: List of tags applied to compute nodes in RacksDB database + + cache: + enabled: + type: bool + default: false + doc: Determine if caching is enabled + host: + type: str + default: localhost + doc: Hostname of Redis cache server + port: + type: int + default: 6379 + doc: TCP port of Redis cache server + password: + type: password + doc: | + Password to connect to protected Redis server. When this parameter is + not defined, Redis server is accessed without password. 
+ ex: SECR3T + version: + type: int + default: 1800 + doc: Expiration delay in seconds for Slurm version in cache + jobs: + type: int + default: 30 + doc: Expiration delay in seconds for jobs in cache + job: + type: int + default: 10 + doc: Expiration delay in seconds for invidual jobs in cache + nodes: + type: int + default: 30 + doc: Expiration delay in seconds for nodes in cache + node: + type: int + default: 10 + doc: Expiration delay in seconds for node in cache + partitions: + type: int + default: 60 + doc: Expiration delay in seconds for partitions in cache + qos: + type: int + default: 60 + doc: Expiration delay in seconds for QOS in cache + reservations: + type: int + default: 60 + doc: Expiration delay in seconds for reservations in cache + accounts: + type: int + default: 60 + doc: Expiration delay in seconds for accounts in cache + + metrics: + enabled: + type: bool + default: false + doc: | + Determine if metrics feature and integration with Prometheus (or + compatible) is enabled. + restrict: + type: list + content: network + default: + - 127.0.0.0/24 + - ::1/128 + doc: | + Restricted list of IP networks permitted to request metrics. + host: + type: uri + default: http://localhost:9090 + doc: | + URL of Prometheus server (or compatible) to requests metrics with PromQL. + job: + type: str + default: slurm + doc: Name of Prometheus job which scrapes Slurm-web metrics. 
+kind: ConfigMap
+metadata:
+  name: config
+  namespace: slurm-web
diff --git a/values/slurm-web-agent/manifests/deployment.yaml b/values/slurm-web-agent/manifests/deployment.yaml
new file mode 100644
index 00000000..1b2e74fa
--- /dev/null
+++ b/values/slurm-web-agent/manifests/deployment.yaml
@@ -0,0 +1,72 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: slurm-web-agent
+  namespace: slurm-web
+spec:
+  progressDeadlineSeconds: 600
+  replicas: 1
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: slurm-web-agent
+      app.kubernetes.io/name: slurm-web-agent
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/instance: slurm-web-agent
+        app.kubernetes.io/name: slurm-web-agent
+      namespace: slurm-web
+    spec:
+      containers:
+        - command:
+            - slurm-web-agent
+            - --conf-defs
+            - /data/conf/agent.yml
+            - --conf
+            - /data/conf/agent.ini
+          image: registry.gitlab.com/oceanbox/slurm-web/agent:d512c0c5-debug
+          imagePullPolicy: IfNotPresent
+          name: slurm-web-agent
+          terminationMessagePath: /dev/termination-log
+          ports:
+            - containerPort: 5012
+              name: http
+              protocol: TCP
+          readinessProbe:
+            failureThreshold: 3
+            httpGet:
+              path: /info
+              port: http
+              scheme: HTTP
+            periodSeconds: 10
+            successThreshold: 1
+            timeoutSeconds: 1
+          terminationMessagePolicy: File
+          volumeMounts:
+            - mountPath: /data/conf  # holds agent.ini, agent.yml, policy.ini, policy.yml
+              name: config
+            - mountPath: /data/secrets/jwt.key  # agent.ini: [jwt] key=data/secrets/jwt.key (assumes cwd is / - confirm)
+              subPath: key
+              name: jwt
+      dnsPolicy: ClusterFirst
+      serviceAccountName: slurm-web-agent
+      imagePullSecrets:
+        - name: gitlab-pull-secret
+      restartPolicy: Always
+      schedulerName: default-scheduler
+      terminationGracePeriodSeconds: 30
+      volumes:
+        - secret:
+            defaultMode: 420  # decimal 420 == octal 0644
+            secretName: jwt
+          name: jwt
+        - configMap:
+            defaultMode: 420  # decimal 420 == octal 0644
+            name: config
+          name: config
diff --git a/values/slurm-web-agent/manifests/ing.yaml b/values/slurm-web-agent/manifests/ing.yaml
new file mode 100644
index 00000000..0cf48d9a
---
/dev/null
+++ b/values/slurm-web-agent/manifests/ing.yaml
@@ -0,0 +1,27 @@
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  annotations:
+    cert-manager.io/cluster-issuer: letsencrypt-production
+    nginx.ingress.kubernetes.io/backend-protocol: HTTP
+    nginx.ingress.kubernetes.io/ssl-redirect: "true"
+    oceanbox.io/expose: internal
+  name: slurm-web-agent
+  namespace: slurm-web
+spec:
+  ingressClassName: nginx
+  rules:
+    - host: slurm-agent.ekman.oceanbox.io
+      http:
+        paths:
+          - backend:
+              service:
+                name: slurm-web-agent
+                port:
+                  name: http  # the Service exposes only "http" (5012); it has no port 80
+            path: /
+            pathType: ImplementationSpecific
+  tls:
+    - hosts:
+        - slurm-agent.ekman.oceanbox.io
+      secretName: agent-tls
diff --git a/values/slurm-web-agent/manifests/slurm-web-agent.yaml b/values/slurm-web-agent/manifests/slurm-web-agent.yaml
new file mode 100644
index 00000000..5305122b
--- /dev/null
+++ b/values/slurm-web-agent/manifests/slurm-web-agent.yaml
@@ -0,0 +1,42 @@
+{{- if .Values.clusterConfig.argo.enabled }}
+apiVersion: argoproj.io/v1alpha1
+kind: Application
+metadata:
+  name: slurm-web-agent
+  namespace: argocd
+  annotations:
+    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
+  finalizers:
+    - resources-finalizer.argocd.argoproj.io
+spec:
+  destination:
+    namespace: slurm-web
+    server: https://kubernetes.default.svc
+  sources:
+    - repoURL: {{ .Values.clusterConfig.manifests | quote }}  # quoted so an empty or odd value stays a string
+      targetRevision: HEAD
+      path: helmfile.d
+      plugin:
+        name: helmfile-cmp
+        env:
+          - name: CLUSTER_NAME
+            value: {{ .Values.clusterConfig.cluster | quote }}  # env values must be strings
+          - name: HELMFILE_ENVIRONMENT
+            value: default
+          - name: HELMFILE_FILE_PATH
+            value: slurm-web-agent.yaml.gotmpl
+  project: default
+  syncPolicy:
+    managedNamespaceMetadata:
+      labels:
+        component: sys
+    syncOptions:
+      - CreateNamespace=true
+      - ApplyOutOfSyncOnly=true
+      # - ServerSideApply=true
+    {{- if .Values.slurm_web_agent.autosync }}
+    automated:
+      prune: true
+      # selfHeal: false
+    {{- end }}
+{{- end }}
diff --git a/values/slurm-web-agent/manifests/smon.yaml
b/values/slurm-web-agent/manifests/smon.yaml
new file mode 100644
index 00000000..b77565cb
--- /dev/null
+++ b/values/slurm-web-agent/manifests/smon.yaml
@@ -0,0 +1,16 @@
+apiVersion: monitoring.coreos.com/v1
+kind: ServiceMonitor
+metadata:
+  name: slurm-web-agent
+  namespace: slurm-web
+spec:
+  endpoints:
+    # NOTE(review): assumes the agent serves /metrics on its HTTP port, and that
+    # agent.ini enables [metrics] and permits the Prometheus network - confirm.
+    - interval: 60s
+      port: http  # the Service defines no port named "metrics"
+      scrapeTimeout: 30s
+  selector:
+    matchLabels:
+      app.kubernetes.io/instance: slurm-web-agent
+      app.kubernetes.io/name: slurm-web-agent
diff --git a/values/slurm-web-agent/manifests/sva.yaml b/values/slurm-web-agent/manifests/sva.yaml
new file mode 100644
index 00000000..915dabea
--- /dev/null
+++ b/values/slurm-web-agent/manifests/sva.yaml
@@ -0,0 +1,5 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: slurm-web-agent
+  namespace: slurm-web
diff --git a/values/slurm-web-agent/manifests/svc.yaml b/values/slurm-web-agent/manifests/svc.yaml
new file mode 100644
index 00000000..b673f843
--- /dev/null
+++ b/values/slurm-web-agent/manifests/svc.yaml
@@ -0,0 +1,19 @@
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    app.kubernetes.io/instance: slurm-web-agent  # matched by the ServiceMonitor selector
+    app.kubernetes.io/name: slurm-web-agent
+  name: slurm-web-agent
+  namespace: slurm-web
+spec:
+  ports:
+    - name: http
+      port: 5012
+      protocol: TCP
+      targetPort: http
+  selector:
+    app.kubernetes.io/instance: slurm-web-agent
+    app.kubernetes.io/name: slurm-web-agent
+  sessionAffinity: None
+  type: ClusterIP