Merge branch 'main' of gitlab.com:oceanbox/manifests

This commit is contained in:
2025-09-26 14:54:28 +02:00
18 changed files with 910 additions and 0 deletions
+44
View File
@@ -0,0 +1,44 @@
# Helmfile for the Slinky slurm-operator: the upstream OCI chart plus a local
# "manifests" chart (presumably assembled by the helmify hook below).
bases:
  - ../envs/environments.yaml.gotmpl

repositories:
  # OCI registry serving the slurm-operator chart.
  - name: slurm-operator
    oci: true
    url: ghcr.io/slinkyproject/charts

commonLabels:
  tier: system

releases:
  # Upstream operator chart, post-rendered through the kustomizer wrapper with
  # the per-environment kustomize directory as its argument.
  - name: slurm-operator
    namespace: slinky
    chart: slurm-operator/slurm-operator
    version: 0.4.0
    condition: slurm_operator.enabled
    # Environment-specific values files are optional.
    missingFileHandler: Info
    values:
      - ../values/slurm-operator/values/slurm-operator.yaml.gotmpl
      - ../values/slurm-operator/values/slurm-operator-{{ .Environment.Name }}.yaml.gotmpl
    postRenderer: ../bin/kustomizer
    postRendererArgs:
      - ../values/slurm-operator/kustomize/{{ .Environment.Name }}

  # Local chart named "manifests"; gated by the same switch as the operator.
  - name: manifests
    namespace: slinky
    chart: manifests
    condition: slurm_operator.enabled
    # Cluster-specific env files are optional.
    missingFileHandler: Info
    values:
      - ../values/env.yaml
      - ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml
      - ../values/slurm-operator/env.yaml.gotmpl
      - ../values/slurm-operator/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl
    hooks:
      - events:
          - prepare
          - cleanup
        showlogs: true
        command: ../bin/helmify
        # The backtick raw strings make the first template pass emit the inner
        # expression literally, so it is evaluated per hook event. The first
        # argument becomes "build" on prepare and "clean" otherwise.
        args:
          - '{{`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}`}}'
          - '{{`{{ .Release.Chart }}`}}'
          - '{{`{{ .Environment.Name }}`}}'
          - ../values/slurm-operator/manifests
          - manifests
+38
View File
@@ -0,0 +1,38 @@
# Helmfile for the Slurm-web agent: an in-repo chart plus a local "manifests"
# chart (presumably assembled by the helmify hook below).
bases:
  - ../envs/environments.yaml.gotmpl

commonLabels:
  tier: oceanbox

releases:
  # In-repo agent chart, post-rendered through the kustomizer wrapper with the
  # per-environment kustomize directory as its argument.
  - name: slurm-web-agent
    namespace: slurm-web
    chart: ../charts/slurm-web-agent
    condition: slurm_web_agent.enabled
    # Environment-specific values files are optional.
    missingFileHandler: Info
    values:
      - ../values/slurm-web-agent/values/values.yaml
      - ../values/slurm-web-agent/values/values-{{ .Environment.Name }}.yaml
    postRenderer: ../bin/kustomizer
    postRendererArgs:
      - ../values/slurm-web-agent/kustomize/{{ .Environment.Name }}

  # Local chart named "manifests"; gated by the same switch as the agent.
  - name: manifests
    namespace: slurm-web
    chart: manifests
    condition: slurm_web_agent.enabled
    # Cluster-specific env files are optional.
    missingFileHandler: Info
    values:
      - ../values/env.yaml
      - ../values/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml
      - ../values/slurm-web-agent/env.yaml.gotmpl
      - ../values/slurm-web-agent/env-{{ requiredEnv "ARGOCD_ENV_CLUSTER_NAME" }}.yaml.gotmpl
    hooks:
      - events:
          - prepare
          - cleanup
        showlogs: true
        command: ../bin/helmify
        # The backtick raw strings make the first template pass emit the inner
        # expression literally, so it is evaluated per hook event. The first
        # argument becomes "build" on prepare and "clean" otherwise.
        args:
          - '{{`{{ if eq .Event.Name "prepare" }}build{{ else }}clean{{ end }}`}}'
          - '{{`{{ .Release.Chart }}`}}'
          - '{{`{{ .Environment.Name }}`}}'
          - ../values/slurm-web-agent/manifests
          - manifests
+7
View File
@@ -74,6 +74,8 @@ spec:
server: https://kubernetes.default.svc
- namespace: ncps
server: https://kubernetes.default.svc
- namespace: slinky
server: https://kubernetes.default.svc
sourceRepos:
- https://argoproj.github.io/argo-helm
- https://kubernetes-sigs.github.io/metrics-server/
@@ -100,3 +102,8 @@ spec:
- https://dapr.github.io/helm-charts
- https://charts.gabe565.com
- https://open-telemetry.github.io/opentelemetry-helm-charts
- https://ghcr.io/slinkyproject/charts/slurm-operator
- https://ghcr.io/slinkyproject/charts/slurm-operator-crds
- ghcr.io/slinkyproject/charts
- ghcr.io/slinkyproject/charts/slurm-operator
- ghcr.io/slinkyproject/charts/slurm-operator-crds
@@ -0,0 +1,3 @@
# slurm-operator toggles: `enabled` is the key the helmfile release conditions
# read; `autosync` decides whether the Argo CD Application gets automated sync.
slurm_operator:
  enabled: true
  autosync: false
+3
View File
@@ -0,0 +1,3 @@
# slurm-operator toggles: `enabled` is the key the helmfile release conditions
# read; `autosync` decides whether the Argo CD Application gets automated sync.
# The stack is switched off in this values file.
slurm_operator:
  enabled: false
  autosync: false
@@ -0,0 +1,18 @@
{{- if .Values.clusterConfig.cilium.enabled }}
# Only rendered on Cilium-enabled clusters. Allows pods of the slurm-operator
# instance to open TCP/6443 connections to the kube-apiserver entity.
apiVersion: cilium.io/v2
kind: CiliumNetworkPolicy
metadata:
  name: allow-api-server
  namespace: slinky
spec:
  # Pods this policy applies to.
  endpointSelector:
    matchLabels:
      app.kubernetes.io/instance: slurm-operator
  egress:
    - toEntities:
        - kube-apiserver
      toPorts:
        - ports:
            - port: "6443"
              protocol: TCP
{{- end }}
@@ -0,0 +1,66 @@
{{- if .Values.clusterConfig.argo.enabled }}
# Argo CD Application that renders helmfile.d/slurm-operator.yaml.gotmpl via
# the helmfile-cmp config management plugin and deploys it into "slinky".
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: slurm-operator
  namespace: argocd
spec:
  project: sys
  destination:
    namespace: slinky
    server: 'https://kubernetes.default.svc'
  sources:
    # Templated scalars are piped through `quote` so an empty, boolean-like or
    # numeric value still renders as a YAML string.
    - repoURL: {{ .Values.clusterConfig.manifests | quote }}
      targetRevision: HEAD
      path: helmfile.d
      plugin:
        name: helmfile-cmp
        env:
          # Argo CD hands plugin env entries to the plugin with an ARGOCD_ENV_
          # prefix; the helmfile reads this one as ARGOCD_ENV_CLUSTER_NAME.
          - name: CLUSTER_NAME
            value: {{ .Values.clusterConfig.cluster | quote }}
          - name: HELMFILE_ENVIRONMENT
            value: default
          - name: HELMFILE_FILE_PATH
            value: slurm-operator.yaml.gotmpl
  syncPolicy:
    managedNamespaceMetadata:
      labels:
        component: sys
    syncOptions:
      - CreateNamespace=true
      - ApplyOutOfSyncOnly=true
      - ServerSideApply=true
    {{- if .Values.slurm_operator.autosync }}
    # Automated sync is opt-in per environment through slurm_operator.autosync.
    automated:
      prune: true
      # selfHeal: false
    {{- end }}
---
# Argo CD Application installing the slurm-operator CRD chart from the OCI
# registry. Sync wave -1 orders it ahead of wave-0 resources when both are
# synced by the same parent application.
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: slurm-crd
  namespace: argocd
  annotations:
    argocd.argoproj.io/sync-wave: "-1"
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: sys
  destination:
    namespace: slinky
    server: 'https://kubernetes.default.svc'
  source:
    repoURL: 'ghcr.io/slinkyproject/charts'
    # NOTE(review): presumably must track the slurm-operator chart version
    # pinned in the helmfile release; confirm when bumping either one.
    targetRevision: '0.4.0'
    chart: slurm-operator-crds
  syncPolicy:
    managedNamespaceMetadata:
      labels:
        component: sys
    # CRDs are always auto-synced (no prune, no selfHeal).
    automated: {}
    syncOptions:
      - ServerSideApply=true
      - CreateNamespace=true
      - ApplyOutOfSyncOnly=true
{{- end }}
@@ -0,0 +1,2 @@
# Chart-managed CRDs are switched off here.
# NOTE(review): presumably because the separate slurm-crd Application installs
# the slurm-operator-crds chart - confirm.
crds:
  enabled: false
@@ -0,0 +1,3 @@
# slurm-web-agent toggles: `enabled` is the key the helmfile release conditions
# read; `autosync` decides whether the Argo CD Application gets automated sync.
slurm_web_agent:
  enabled: true
  autosync: false
+3
View File
@@ -0,0 +1,3 @@
# slurm-web-agent toggles: `enabled` is the key the helmfile release conditions
# read; `autosync` decides whether the Argo CD Application gets automated sync.
# The agent is switched off in this values file.
slurm_web_agent:
  enabled: false
  autosync: false
+546
View File
@@ -0,0 +1,546 @@
# ConfigMap "config" (namespace slurm-web) bundling the Slurm-web agent
# configuration files; each data key below is one file.
apiVersion: v1
data:
# Vendor default RBAC roles; agent.ini references this file as `vendor_roles`.
policy.ini: |
# Slurm-web default vendor RBAC policy. DO NOT MODIFY THIS FILE! Create a file
# /etc/slurm-web/policy.conf with your custom rules and Slurm-web will ignore
# this file. Your modifications in this file will be overwritten and lost on
# software upgrade.
[roles]
# Enable anonymous role with basic views
anonymous
# All authenticated users have the user role
user=ALL
[anonymous]
actions=view-stats,view-jobs,view-nodes
[user]
actions=view-stats,view-jobs,view-nodes,view-partitions,view-qos,view-accounts,view-reservations,cache-view
# Site configuration passed to `slurm-web-agent --conf`.
# - `interface` is set explicitly: the settings schema defaults it to
#   localhost, which the kubelet readiness probe and the Service cannot reach.
# - File paths are absolute so they do not depend on the container's working
#   directory; the Deployment mounts this ConfigMap at /data/conf.
# NOTE(review): `auth=local` needs the slurmrestd Unix socket inside the pod,
# but the Deployment shows no volume for /run/slurmrestd - confirm how the
# socket is provided.
agent.ini: |
  [service]
  cluster=ekman
  interface=0.0.0.0
  [slurmrestd]
  uri=unix:/run/slurmrestd/hipster.socket
  auth=local
  [policy]
  definition=/data/conf/policy.yml
  vendor_roles=/data/conf/policy.ini
  [racksdb]
  enabled=no
  [jwt]
  key=/data/secrets/jwt.key
# RBAC action catalogue; agent.ini references this file as the policy
# `definition`. Each key is an action name, each value its description.
policy.yml: |
actions:
view-stats: |
View general cluster statistics in home dashboard and clusters list.
view-jobs: |
Get _Jobs_ entry in main menu and permission to view all users jobs in
queue and all jobs details.
view-nodes: |
Get _Resources_ entry in main menu and permission to view the list of
compute nodes in cluster with their status in Slurm.
view-partitions: |
Permission to filter jobs and nodes by partition.
view-qos: |
Get _QOS_ entry in main menu and permission to view the list of defined QOS.
view-accounts: |
Permission to filter jobs by account.
view-reservations: |
Get _Reservations_ entry in main menu and permission to view the list of
defined reservations.
cache-view: |
Access to cache information and metrics in settings panel.
# Settings schema handed to the agent via `--conf-defs`: it declares every
# section/parameter with its type, default and doc string. Values not set in
# agent.ini fall back to the defaults declared here (for example
# service.interface defaults to localhost and service.port to 5012).
agent.yml: |
# This file contains the configuration settings definition for Slurm-web agent
# in RFL ConfigurationLoader format.
service:
cluster:
type: str
required: true
doc: Name of cluster served by agent
ex: atlas
interface:
type: str
default: localhost
doc: Interface address to bind for incoming connections
port:
type: int
default: 5012
doc: TCP port to listen for incoming connections
cors:
type: bool
default: false
doc: When true, Cross-Origin Resource Sharing (CORS) headers are enabled.
debug:
type: bool
default: false
doc: Enable debug mode
log_flags:
type: list
content: str
default:
- ALL
choices:
- slurmweb
- rfl
- werkzeug
- urllib3
- racksdb
- ALL
doc: List of log flags to enable. Special value `ALL` enables all log flags.
debug_flags:
type: list
content: str
default:
- slurmweb
choices:
- slurmweb
- rfl
- werkzeug
- urllib3
- racksdb
- ALL
doc: |
List of debug flags to enable. Special value `ALL` enables all debug
flags.
slurmrestd:
uri:
type: uri
default: unix:///run/slurmrestd/slurmrestd.socket
doc: |
URI to slurmrestd HTTP server. It can either be in the form
http://host:port for TCP/IP server or unix:///path/to/slurmrestd.socket
for Unix socket.
socket:
type: path
deprecated:
section: slurmrestd
parameter: uri
doc: Path to slurmrestd Unix socket.
auth:
type: str
choices:
- local
- jwt
default: jwt
doc: |
Authentication method with slurmrestd.
The `jwt` authentication method is supported by both TCP/IP and Unix
sockets URIs.
Note that `local` authentication method is only supported with Unix socket
URI and Slurm <= 24.11. With this method, Slurm-web agent must run with
the _slurm_ system user as well as `slurmrestd` service. Running
`slurmrestd` as _slurm_ system user is not possible with Slurm >= 25.05.
jwt_mode:
type: str
default: auto
choices:
- auto
- static
doc: |
Slurmrestd JWT authentication mode, either _auto_ or _static_.
In _auto_ mode, Slurm-web agent generates tokens with the signature key
specified in `jwt_key`. The tokens have a limited lifespan as defined with
`jwt_lifespan`. Tokens are automatically renewed upon expiration. This is
the recommended mode.
In _static_ mode, Slurm-web simply use the token provided with
`jwt_token`.
This parameter is used only when `auth` is _jwt_.
jwt_user:
type: str
default: slurm
doc: |
The user name used in HTTP headers with JWT authentication.
This parameter is used only when `auth` is _jwt_.
jwt_lifespan:
type: int
default: 3600
doc: |
Lifespan of JWT tokens generated by Slurm-web in seconds. The default
value is 1 hour.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is _auto_.
jwt_key:
type: path
default: /var/lib/slurm-web/slurmrestd.key
doc: |
Path to private key shared with Slurm for JWT signature. The key is used
by Slurm-web to generate its token for authentication on slurmrestd in
_auto_ mode. It must be the same key as used in Slurm `AuthAltParameters`
so that Slurm services can validate JWT generated by Slurm-web.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is
_auto_.
jwt_token:
type: password
doc: |
The static JSON Web Token (JWT) used in HTTP headers with JWT
authentication, typically generated with `scontrol token`. While this is
generally not a good practice, it is recommended to generate tokens with
infinite lifespan to avoid failures due to expired token.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is
_static_.
version:
type: str
default: '0.0.41'
doc: |
Slurm REST API version.
CAUTION: You SHOULD NOT change this parameter unless you really know what
you are doing. This parameter is more intented for Slurm-web developers
rather than end users. Slurm-web is officially tested and validated with
the default value only.
filters:
jobs:
type: list
content: str
default:
- account
- cpus
- gres_detail
- job_id
- job_state
- node_count
- nodes
- partition
- priority
- qos
- sockets_per_node
- state_reason
- tasks
- tres_per_job
- tres_per_node
- tres_per_socket
- tres_per_task
- user_name
doc: |
List of jobs fields selected in slurmrestd API when retrieving a list of
jobs, all other fields arefiltered out.
acctjob:
type: list
content: str
default:
- association
- comment
- derived_exit_code
- exit_code
- group
- name
- nodes
- partition
- priority
- qos
- script
- state
- steps
- submit_line
- time
- tres
- used_gres
- user
- wckey
- working_directory
doc: |
List of slurmdbd job fields selected in slurmrestd API when retrieving a
unique job, all other fields are filtered out.
ctldjob:
type: list
content: str
default:
- accrue_time
- batch_flag
- command
- cpus
- current_working_directory
- exclusive
- gres_detail
- last_sched_evaluation
- node_count
- partition
- sockets_per_node
- standard_error
- standard_input
- standard_output
- tasks
- tres_per_job
- tres_per_node
- tres_per_socket
- tres_per_task
- tres_req_str
doc: |
List of slurmctld job fields selected in slurmrestd API when retrieving a
unique job, all other fields are filtered out.
nodes:
type: list
content: str
default:
- name
- cpus
- sockets
- cores
- gres
- gres_used
- real_memory
- state
- reason
- partitions
- alloc_cpus
- alloc_idle_cpus
doc: |
List of nodes fields selected in slurmrestd API, all other fields are
filtered out.
node:
type: list
content: str
default:
- name
- architecture
- operating_system
- boot_time
- last_busy
- cpus
- sockets
- cores
- threads
- real_memory
- gres
- gres_used
- state
- reason
- partitions
- alloc_cpus
- alloc_idle_cpus
- alloc_memory
doc: |
List of invidual node fields selected in slurmrestd API, all other fields
are filtered out.
partitions:
type: list
content: str
default:
- name
- node_sets
doc: |
List of partitions fields selected in slurmrestd API, all other fields are
filtered out.
qos:
type: list
content: str
default:
- name
- description
- priority
- flags
- limits
doc: |
List of qos fields selected in slurmrestd API, all other fields are
filtered out.
reservations:
type: list
content: str
default:
- name
- users
- accounts
- node_list
- node_count
- start_time
- end_time
- flags
doc: |
List of reservations fields selected in slurmrestd API, all other fields
are filtered out.
accounts:
type: list
content: str
default:
- name
doc: |
List of accounts fields selected in slurmrestd API, all other fields are
filtered out.
policy:
definition:
type: path
default: /usr/share/slurm-web/conf/policy.yml
doc: Path to RBAC policy definition file with available actions
vendor_roles:
type: path
default: /usr/share/slurm-web/conf/policy.ini
doc: |
Path to default vendor RBAC policy definition file with roles and
permitted actions
roles:
type: path
default: /etc/slurm-web/policy.ini
doc: |
Path to site RBAC policy definition file with roles and permitted actions
jwt:
key:
type: path
default: /var/lib/slurm-web/jwt.key
doc: Path to private key for Slurm-web internal JWT signature.
algorithm:
type: str
choices:
# Full list available in PyJWT documentation:
# https://pyjwt.readthedocs.io/en/latest/algorithms.html
- HS256
- HS384
- HS512
- ES256
- ES256K
- ES384
- ES512
- RS256
- RS384
- RS512
- PS256
- PS384
- PS512
- EdDSA
default: HS256
doc: Cryptographic algorithm used to sign JWT
audience:
type: str
default: slurm-web
doc: |
Audience defined in generated JWT and expected in JWT provided by clients
racksdb:
enabled:
type: bool
default: true
doc: |
Control if RacksDB integration feature for advanced visualization of
resources is enabled.
# The default values in this section must be synchronized with RacksDB library
# defaults.
db:
type: path
default: /var/lib/racksdb
doc: Path to RacksDB database
schema:
type: path
default: /usr/share/racksdb/schemas/racksdb.yml
doc: Path to RacksDB database schema
extensions:
type: path
default: /etc/racksdb/extensions.yml
doc: Path to site-specific RacksDB schema extensions
drawings_schema:
type: path
default: /usr/share/racksdb/schemas/drawings.yml
doc: Path to RacksDB database schema
infrastructure:
type: str
doc: |
Name of the infrastructure for the cluster in RacksDB. By default, the
cluster name is used.
ex: atlas
tags:
type: list
content: str
default:
- compute
doc: List of tags applied to compute nodes in RacksDB database
cache:
enabled:
type: bool
default: false
doc: Determine if caching is enabled
host:
type: str
default: localhost
doc: Hostname of Redis cache server
port:
type: int
default: 6379
doc: TCP port of Redis cache server
password:
type: password
doc: |
Password to connect to protected Redis server. When this parameter is
not defined, Redis server is accessed without password.
ex: SECR3T
version:
type: int
default: 1800
doc: Expiration delay in seconds for Slurm version in cache
jobs:
type: int
default: 30
doc: Expiration delay in seconds for jobs in cache
job:
type: int
default: 10
doc: Expiration delay in seconds for invidual jobs in cache
nodes:
type: int
default: 30
doc: Expiration delay in seconds for nodes in cache
node:
type: int
default: 10
doc: Expiration delay in seconds for node in cache
partitions:
type: int
default: 60
doc: Expiration delay in seconds for partitions in cache
qos:
type: int
default: 60
doc: Expiration delay in seconds for QOS in cache
reservations:
type: int
default: 60
doc: Expiration delay in seconds for reservations in cache
accounts:
type: int
default: 60
doc: Expiration delay in seconds for accounts in cache
metrics:
enabled:
type: bool
default: false
doc: |
Determine if metrics feature and integration with Prometheus (or
compatible) is enabled.
restrict:
type: list
content: network
default:
- 127.0.0.0/24
- ::1/128
doc: |
Restricted list of IP networks permitted to request metrics.
host:
type: uri
default: http://localhost:9090
doc: |
URL of Prometheus server (or compatible) to requests metrics with PromQL.
job:
type: str
default: slurm
doc: Name of Prometheus job which scrapes Slurm-web metrics.
kind: ConfigMap
metadata:
name: config
namespace: slurm-web
@@ -0,0 +1,72 @@
# Single-replica Deployment running the Slurm-web agent. Configuration comes
# from the "config" ConfigMap (mounted at /data/conf) and the JWT signing key
# from the "jwt" Secret.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: slurm-web-agent
  namespace: slurm-web
spec:
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 10
  selector:
    matchLabels:
      app.kubernetes.io/instance: slurm-web-agent
      app.kubernetes.io/name: slurm-web-agent
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
  template:
    metadata:
      labels:
        app.kubernetes.io/instance: slurm-web-agent
        app.kubernetes.io/name: slurm-web-agent
      namespace: slurm-web
    spec:
      serviceAccountName: slurm-web-agent
      imagePullSecrets:
        - name: gitlab-pull-secret
      containers:
        - name: slurm-web-agent
          # NOTE(review): "-debug" image tag - confirm this is the build meant
          # to be deployed.
          image: registry.gitlab.com/oceanbox/slurm-web/agent:d512c0c5-debug
          imagePullPolicy: IfNotPresent
          # --conf-defs is the settings schema, --conf the site configuration;
          # both files come from the ConfigMap mounted at /data/conf.
          command:
            - slurm-web-agent
            - --conf-defs
            - /data/conf/agent.yml
            - --conf
            - /data/conf/agent.ini
          ports:
            - containerPort: 5012
              name: http
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /info
              port: http
              scheme: HTTP
            periodSeconds: 10
            successThreshold: 1
            timeoutSeconds: 1
          terminationMessagePath: /dev/termination-log
          terminationMessagePolicy: File
          volumeMounts:
            - mountPath: /data/conf
              name: config
            # agent.ini reads the JWT key from data/secrets/jwt.key, so the
            # Secret entry "key" is projected under that file name (it was
            # previously mounted as /data/secrets/key, which the agent never
            # opens).
            - mountPath: /data/secrets/jwt.key
              subPath: key
              name: jwt
      dnsPolicy: ClusterFirst
      restartPolicy: Always
      schedulerName: default-scheduler
      terminationGracePeriodSeconds: 30
      volumes:
        - name: jwt
          secret:
            # 420 decimal == 0644 octal.
            defaultMode: 420
            secretName: jwt
        - name: config
          configMap:
            defaultMode: 420
            name: config
+27
View File
@@ -0,0 +1,27 @@
# Internal HTTPS ingress for the Slurm-web agent; the certificate is issued by
# cert-manager (letsencrypt-production) into the "agent-tls" Secret.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: slurm-web-agent
  namespace: slurm-web
  annotations:
    cert-manager.io/cluster-issuer: letsencrypt-production
    nginx.ingress.kubernetes.io/backend-protocol: HTTP
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
    oceanbox.io/expose: internal
spec:
  ingressClassName: nginx
  rules:
    - host: slurm-agent.ekman.oceanbox.io
      http:
        paths:
          - path: /
            pathType: ImplementationSpecific
            backend:
              service:
                name: slurm-web-agent
                port:
                  # Must be a port the slurm-web-agent Service exposes: it
                  # publishes 5012 ("http"), not 80.
                  number: 5012
  tls:
    - hosts:
        - slurm-agent.ekman.oceanbox.io
      secretName: agent-tls
@@ -0,0 +1,42 @@
{{- if .Values.clusterConfig.argo.enabled }}
# Argo CD Application that renders helmfile.d/slurm-web-agent.yaml.gotmpl via
# the helmfile-cmp config management plugin and deploys it into "slurm-web".
apiVersion: argoproj.io/v1alpha1
kind: Application
metadata:
  name: slurm-web-agent
  namespace: argocd
  annotations:
    argocd.argoproj.io/sync-options: SkipDryRunOnMissingResource=true
  finalizers:
    - resources-finalizer.argocd.argoproj.io
spec:
  project: default
  destination:
    namespace: slurm-web
    server: https://kubernetes.default.svc
  sources:
    # Templated scalars are piped through `quote` so an empty, boolean-like or
    # numeric value still renders as a YAML string.
    - repoURL: {{ .Values.clusterConfig.manifests | quote }}
      targetRevision: HEAD
      path: helmfile.d
      plugin:
        name: helmfile-cmp
        env:
          # Argo CD hands plugin env entries to the plugin with an ARGOCD_ENV_
          # prefix; the helmfile reads this one as ARGOCD_ENV_CLUSTER_NAME.
          - name: CLUSTER_NAME
            value: {{ .Values.clusterConfig.cluster | quote }}
          - name: HELMFILE_ENVIRONMENT
            value: default
          - name: HELMFILE_FILE_PATH
            value: slurm-web-agent.yaml.gotmpl
  syncPolicy:
    managedNamespaceMetadata:
      labels:
        component: sys
    syncOptions:
      - CreateNamespace=true
      - ApplyOutOfSyncOnly=true
      # - ServerSideApply=true
    {{- if .Values.slurm_web_agent.autosync }}
    # Automated sync is opt-in per environment through slurm_web_agent.autosync.
    automated:
      prune: true
      # selfHeal: false
    {{- end }}
{{- end }}
@@ -0,0 +1,14 @@
# Prometheus Operator scrape configuration for the Slurm-web agent.
# NOTE(review): the agent settings schema ships with metrics disabled and
# restricted to loopback networks; the agent configuration presumably needs a
# [metrics] section before scrapes succeed - confirm.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: slurm-web-agent
  namespace: slurm-web
spec:
  selector:
    matchLabels:
      app.kubernetes.io/instance: slurm-web-agent
      app.kubernetes.io/name: slurm-web-agent
  endpoints:
    # `port` must name a port on the selected Service. The slurm-web-agent
    # Service only defines "http", so the former "metrics" matched nothing.
    - port: http
      interval: 60s
      scrapeTimeout: 30s
@@ -0,0 +1,5 @@
# Identity the slurm-web-agent Deployment runs under (its pod spec sets
# serviceAccountName to this name).
apiVersion: v1
kind: ServiceAccount
metadata:
  namespace: slurm-web
  name: slurm-web-agent
+16
View File
@@ -0,0 +1,16 @@
# ClusterIP Service fronting the Slurm-web agent pods on TCP/5012.
apiVersion: v1
kind: Service
metadata:
  name: slurm-web-agent
  namespace: slurm-web
  # The Service carries the same labels as its pods so that label-based
  # selectors (such as the ServiceMonitor's matchLabels, which select Services
  # rather than pods) can find it.
  labels:
    app.kubernetes.io/instance: slurm-web-agent
    app.kubernetes.io/name: slurm-web-agent
spec:
  type: ClusterIP
  sessionAffinity: None
  selector:
    app.kubernetes.io/instance: slurm-web-agent
    app.kubernetes.io/name: slurm-web-agent
  ports:
    # targetPort refers to the container port named "http" (5012).
    - name: http
      port: 5012
      protocol: TCP
      targetPort: http
@@ -0,0 +1 @@