fix: Persist slurm-web-agent

This commit is contained in:
2025-09-26 13:34:45 +02:00
parent 527f35ea01
commit 54b50759e7
7 changed files with 722 additions and 0 deletions
+546
View File
@@ -0,0 +1,546 @@
apiVersion: v1
data:
policy.ini: |
# Slurm-web default vendor RBAC policy. DO NOT MODIFY THIS FILE! Create a file
# /etc/slurm-web/policy.ini with your custom rules and Slurm-web will ignore
# this file. Your modifications in this file will be overwritten and lost on
# software upgrade.
[roles]
# Enable anonymous role with basic views
anonymous
# All authenticated users have the user role
user=ALL
# Actions permitted to the anonymous role
[anonymous]
actions=view-stats,view-jobs,view-nodes
# Actions permitted to the user role
[user]
actions=view-stats,view-jobs,view-nodes,view-partitions,view-qos,view-accounts,view-reservations,cache-view
agent.ini: |
# Site configuration of the Slurm-web agent for cluster `ekman`.
# Parameters not set here presumably fall back to the defaults declared in the
# agent.yml settings definition — verify against the configuration loader.
[service]
cluster=ekman
[slurmrestd]
# NOTE(review): the agent.yml definition documents `local` authentication as
# supported only with a Unix socket URI and Slurm <= 24.11 — confirm the Slurm
# version of this cluster.
uri=unix:/run/slurmrestd/hipster.socket
auth=local
[policy]
# NOTE(review): relative paths; presumably resolved against the working
# directory of the agent process — verify against the deployment.
definition=data/conf/policy.yml
vendor_roles=data/conf/policy.ini
[racksdb]
enabled=no
[jwt]
key=data/secrets/jwt.key
policy.yml: |
# Definition of the RBAC actions, each with its description. These action
# names are the ones listed in the `actions` entries of policy.ini.
actions:
view-stats: |
View general cluster statistics in home dashboard and clusters list.
view-jobs: |
Get _Jobs_ entry in main menu and permission to view all users jobs in
queue and all jobs details.
view-nodes: |
Get _Resources_ entry in main menu and permission to view the list of
compute nodes in cluster with their status in Slurm.
view-partitions: |
Permission to filter jobs and nodes by partition.
view-qos: |
Get _QOS_ entry in main menu and permission to view the list of defined QOS.
view-accounts: |
Permission to filter jobs by account.
view-reservations: |
Get _Reservations_ entry in main menu and permission to view the list of
defined reservations.
cache-view: |
Access to cache information and metrics in settings panel.
agent.yml: |
# This file contains the configuration settings definition for Slurm-web agent
# in RFL ConfigurationLoader format.
service:
# Settings of the agent service itself: served cluster name, listening
# interface and port, CORS, and log/debug flags. The agent.ini in this
# ConfigMap sets only `cluster` in this section.
cluster:
type: str
required: true
doc: Name of cluster served by agent
ex: atlas
interface:
type: str
default: localhost
doc: Interface address to bind for incoming connections
port:
type: int
default: 5012
doc: TCP port to listen for incoming connections
cors:
type: bool
default: false
doc: When true, Cross-Origin Resource Sharing (CORS) headers are enabled.
debug:
type: bool
default: false
doc: Enable debug mode
log_flags:
type: list
content: str
default:
- ALL
choices:
- slurmweb
- rfl
- werkzeug
- urllib3
- racksdb
- ALL
doc: List of log flags to enable. Special value `ALL` enables all log flags.
debug_flags:
type: list
content: str
default:
- slurmweb
choices:
- slurmweb
- rfl
- werkzeug
- urllib3
- racksdb
- ALL
doc: |
List of debug flags to enable. Special value `ALL` enables all debug
flags.
slurmrestd:
# Connection and authentication settings used by the agent to reach
# slurmrestd. The agent.ini in this ConfigMap sets `uri` and `auth`.
uri:
type: uri
default: unix:///run/slurmrestd/slurmrestd.socket
doc: |
URI to slurmrestd HTTP server. It can either be in the form
http://host:port for TCP/IP server or unix:///path/to/slurmrestd.socket
for Unix socket.
socket:
type: path
deprecated:
section: slurmrestd
parameter: uri
doc: Path to slurmrestd Unix socket.
auth:
type: str
choices:
- local
- jwt
default: jwt
doc: |
Authentication method with slurmrestd.
The `jwt` authentication method is supported by both TCP/IP and Unix
sockets URIs.
Note that `local` authentication method is only supported with Unix socket
URI and Slurm <= 24.11. With this method, Slurm-web agent must run with
the _slurm_ system user as well as `slurmrestd` service. Running
`slurmrestd` as _slurm_ system user is not possible with Slurm >= 25.05.
jwt_mode:
type: str
default: auto
choices:
- auto
- static
doc: |
Slurmrestd JWT authentication mode, either _auto_ or _static_.
In _auto_ mode, Slurm-web agent generates tokens with the signature key
specified in `jwt_key`. The tokens have a limited lifespan as defined with
`jwt_lifespan`. Tokens are automatically renewed upon expiration. This is
the recommended mode.
In _static_ mode, Slurm-web simply uses the token provided with
`jwt_token`.
This parameter is used only when `auth` is _jwt_.
jwt_user:
type: str
default: slurm
doc: |
The user name used in HTTP headers with JWT authentication.
This parameter is used only when `auth` is _jwt_.
jwt_lifespan:
type: int
default: 3600
doc: |
Lifespan of JWT tokens generated by Slurm-web in seconds. The default
value is 1 hour.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is _auto_.
jwt_key:
type: path
default: /var/lib/slurm-web/slurmrestd.key
doc: |
Path to private key shared with Slurm for JWT signature. The key is used
by Slurm-web to generate its token for authentication on slurmrestd in
_auto_ mode. It must be the same key as used in Slurm `AuthAltParameters`
so that Slurm services can validate JWT generated by Slurm-web.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is
_auto_.
jwt_token:
type: password
doc: |
The static JSON Web Token (JWT) used in HTTP headers with JWT
authentication, typically generated with `scontrol token`. While this is
generally not a good practice, it is recommended to generate tokens with
infinite lifespan to avoid failures due to expired token.
This parameter is used only when `auth` is _jwt_ and `jwt_mode` is
_static_.
version:
type: str
default: '0.0.41'
doc: |
Slurm REST API version.
CAUTION: You SHOULD NOT change this parameter unless you really know what
you are doing. This parameter is more intended for Slurm-web developers
rather than end users. Slurm-web is officially tested and validated with
the default value only.
filters:
# Per-resource lists of fields kept from slurmrestd API responses; per the
# doc strings below, all other fields are filtered out.
jobs:
type: list
content: str
default:
- account
- cpus
- gres_detail
- job_id
- job_state
- node_count
- nodes
- partition
- priority
- qos
- sockets_per_node
- state_reason
- tasks
- tres_per_job
- tres_per_node
- tres_per_socket
- tres_per_task
- user_name
doc: |
List of jobs fields selected in slurmrestd API when retrieving a list of
jobs, all other fields are filtered out.
acctjob:
type: list
content: str
default:
- association
- comment
- derived_exit_code
- exit_code
- group
- name
- nodes
- partition
- priority
- qos
- script
- state
- steps
- submit_line
- time
- tres
- used_gres
- user
- wckey
- working_directory
doc: |
List of slurmdbd job fields selected in slurmrestd API when retrieving a
unique job, all other fields are filtered out.
ctldjob:
type: list
content: str
default:
- accrue_time
- batch_flag
- command
- cpus
- current_working_directory
- exclusive
- gres_detail
- last_sched_evaluation
- node_count
- partition
- sockets_per_node
- standard_error
- standard_input
- standard_output
- tasks
- tres_per_job
- tres_per_node
- tres_per_socket
- tres_per_task
- tres_req_str
doc: |
List of slurmctld job fields selected in slurmrestd API when retrieving a
unique job, all other fields are filtered out.
nodes:
type: list
content: str
default:
- name
- cpus
- sockets
- cores
- gres
- gres_used
- real_memory
- state
- reason
- partitions
- alloc_cpus
- alloc_idle_cpus
doc: |
List of nodes fields selected in slurmrestd API, all other fields are
filtered out.
node:
type: list
content: str
default:
- name
- architecture
- operating_system
- boot_time
- last_busy
- cpus
- sockets
- cores
- threads
- real_memory
- gres
- gres_used
- state
- reason
- partitions
- alloc_cpus
- alloc_idle_cpus
- alloc_memory
doc: |
List of individual node fields selected in slurmrestd API, all other fields
are filtered out.
partitions:
type: list
content: str
default:
- name
- node_sets
doc: |
List of partitions fields selected in slurmrestd API, all other fields are
filtered out.
qos:
type: list
content: str
default:
- name
- description
- priority
- flags
- limits
doc: |
List of qos fields selected in slurmrestd API, all other fields are
filtered out.
reservations:
type: list
content: str
default:
- name
- users
- accounts
- node_list
- node_count
- start_time
- end_time
- flags
doc: |
List of reservations fields selected in slurmrestd API, all other fields
are filtered out.
accounts:
type: list
content: str
default:
- name
doc: |
List of accounts fields selected in slurmrestd API, all other fields are
filtered out.
policy:
# Paths to the RBAC policy files. The agent.ini in this ConfigMap sets
# `definition` and `vendor_roles`; it does not set `roles`.
definition:
type: path
default: /usr/share/slurm-web/conf/policy.yml
doc: Path to RBAC policy definition file with available actions
vendor_roles:
type: path
default: /usr/share/slurm-web/conf/policy.ini
doc: |
Path to default vendor RBAC policy definition file with roles and
permitted actions
roles:
type: path
default: /etc/slurm-web/policy.ini
doc: |
Path to site RBAC policy definition file with roles and permitted actions
jwt:
# Settings of the JWT used internally by Slurm-web: signing key, algorithm
# and audience. The agent.ini in this ConfigMap sets only `key`.
key:
type: path
default: /var/lib/slurm-web/jwt.key
doc: Path to private key for Slurm-web internal JWT signature.
algorithm:
type: str
choices:
# Full list available in PyJWT documentation:
# https://pyjwt.readthedocs.io/en/latest/algorithms.html
- HS256
- HS384
- HS512
- ES256
- ES256K
- ES384
- ES512
- RS256
- RS384
- RS512
- PS256
- PS384
- PS512
- EdDSA
default: HS256
doc: Cryptographic algorithm used to sign JWT
audience:
type: str
default: slurm-web
doc: |
Audience defined in generated JWT and expected in JWT provided by clients
racksdb:
# Settings of the RacksDB integration. The agent.ini in this ConfigMap
# disables it with `enabled=no`.
enabled:
type: bool
default: true
doc: |
Control if RacksDB integration feature for advanced visualization of
resources is enabled.
# The default values in this section must be synchronized with RacksDB library
# defaults.
db:
type: path
default: /var/lib/racksdb
doc: Path to RacksDB database
schema:
type: path
default: /usr/share/racksdb/schemas/racksdb.yml
doc: Path to RacksDB database schema
extensions:
type: path
default: /etc/racksdb/extensions.yml
doc: Path to site-specific RacksDB schema extensions
drawings_schema:
type: path
default: /usr/share/racksdb/schemas/drawings.yml
doc: Path to RacksDB drawings schema
infrastructure:
type: str
doc: |
Name of the infrastructure for the cluster in RacksDB. By default, the
cluster name is used.
ex: atlas
tags:
type: list
content: str
default:
- compute
doc: List of tags applied to compute nodes in RacksDB database
cache:
# Settings of the Redis cache: connection parameters followed by one
# expiration delay (in seconds) per kind of cached data.
enabled:
type: bool
default: false
doc: Determine if caching is enabled
host:
type: str
default: localhost
doc: Hostname of Redis cache server
port:
type: int
default: 6379
doc: TCP port of Redis cache server
password:
type: password
doc: |
Password to connect to protected Redis server. When this parameter is
not defined, Redis server is accessed without password.
ex: SECR3T
version:
type: int
default: 1800
doc: Expiration delay in seconds for Slurm version in cache
jobs:
type: int
default: 30
doc: Expiration delay in seconds for jobs in cache
job:
type: int
default: 10
doc: Expiration delay in seconds for individual jobs in cache
nodes:
type: int
default: 30
doc: Expiration delay in seconds for nodes in cache
node:
type: int
default: 10
doc: Expiration delay in seconds for node in cache
partitions:
type: int
default: 60
doc: Expiration delay in seconds for partitions in cache
qos:
type: int
default: 60
doc: Expiration delay in seconds for QOS in cache
reservations:
type: int
default: 60
doc: Expiration delay in seconds for reservations in cache
accounts:
type: int
default: 60
doc: Expiration delay in seconds for accounts in cache
metrics:
# Settings of the metrics feature and its integration with Prometheus (or
# compatible): activation, permitted client networks, server URL and job name.
enabled:
type: bool
default: false
doc: |
Determine if metrics feature and integration with Prometheus (or
compatible) is enabled.
restrict:
type: list
content: network
# NOTE(review): 127.0.0.0/24 covers 127.0.0.0-127.0.0.255 only, which is
# narrower than the whole 127.0.0.0/8 loopback range — confirm this is intended.
default:
- 127.0.0.0/24
- ::1/128
doc: |
Restricted list of IP networks permitted to request metrics.
host:
type: uri
default: http://localhost:9090
doc: |
URL of Prometheus server (or compatible) to request metrics with PromQL.
job:
type: str
default: slurm
doc: Name of Prometheus job which scrapes Slurm-web metrics.
# Resource identity: a ConfigMap named `config` in the `slurm-web` namespace.
kind: ConfigMap
metadata:
name: config
namespace: slurm-web