feat(sorcerer): Add raid-data pvc and cronjobs

This commit is contained in:
2026-04-27 15:45:51 +02:00
parent 1e188543ae
commit 0c46dff7bf
8 changed files with 512 additions and 5 deletions
+2 -3
View File
@@ -109,8 +109,6 @@ spec:
template:
spec:
restartPolicy: "OnFailure"
securityContext:
runAsUser: 5000
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -120,7 +118,8 @@ spec:
- -c
- |
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'
chown -R kraken /data/hdd/data/AROME
chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME
resources: {}
volumeMounts:
- name: data
+174
View File
@@ -0,0 +1,174 @@
# ConfigMap holding download.py; the "mur" CronJob in this file mounts it
# as a volume at /scripts and runs it with python3.
apiVersion: v1
kind: ConfigMap
metadata:
name: mur-script
namespace: cron
data:
download.py: |
import argparse
import os
import sys
import requests
from datetime import datetime
# Command-line interface. Parsing happens at module import time; main()
# reads the result through the module-level `args`.
parser = argparse.ArgumentParser(
    description="Download MUR SST files from NASA Earthdata",
)
parser.add_argument(
    "-sd", "--start_date",
    required=True,
    help="Start date (YYYY-MM-DD)",
)
parser.add_argument(
    "-ed", "--end_date",
    required=True,
    help="End date (YYYY-MM-DD)",
)
parser.add_argument(
    "-o", "--out_dir",
    default="MUR_SST_nc",
    help="Output directory",
)
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="Verbose output",
)
args = parser.parse_args()
def create_session():
    """Build the requests.Session that is reused for every file download.

    Returns:
        A session whose default headers carry a fixed User-Agent and an
        "identity" Accept-Encoding.
    """
    default_headers = {
        "User-Agent": "mur-sst-downloader",
        "Accept-Encoding": "identity",
    }
    http = requests.Session()
    http.headers.update(default_headers)
    return http
def get_download_urls(startdate, enddate, verbose=False):
    """Ask the CMR granule search for download URLs inside a date window.

    Args:
        startdate: Start of the window, "YYYY-MM-DD" (sent as 00:00:00Z).
        enddate: End of the window, "YYYY-MM-DD" (sent as 00:00:00Z).
        verbose: When true, print how many URLs were found.

    Returns:
        List of the "URL" values of every RelatedUrls entry whose "Type"
        is "GET DATA", in response order.

    Raises:
        Whatever raise_for_status() raises when CMR answers with an HTTP
        error status, and whatever requests raises on timeout.
    """
    url = (
        "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
        "?collection_concept_id=C1996881146-POCLOUD"
        f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
        "&pageSize=365"
    )
    # Bound the request: without a timeout a stalled connection would block
    # the job indefinitely. 60 s matches the per-file download timeout.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    data = r.json()
    urls = []
    for item in data["items"]:
        # A granule without a RelatedUrls section is skipped instead of
        # aborting the whole run with a KeyError.
        for link in item["umm"].get("RelatedUrls") or []:
            # Prefer direct HTTPS download links
            if link.get("Type") == "GET DATA":
                urls.append(link["URL"])
    if verbose:
        print(f"Found {len(urls)} files")
    return urls
def download_file(session, url, out_dir, verbose=False, retries=3):
    """Download one URL into out_dir unless the target file already exists.

    The response body is streamed into "<filename>.part" and renamed to
    its final name only after it has been written completely. A transfer
    that breaks half-way therefore never leaves a truncated file under
    the final name, which the existence check below would otherwise skip
    on every later run.

    Args:
        session: Object with a requests-style get() used for the transfer.
        url: File URL; its basename becomes the local file name.
        out_dir: Existing directory that receives the file.
        verbose: When true, print progress messages.
        retries: Maximum number of attempts.

    Returns:
        None. A failure on the last attempt is printed, not raised.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)
    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return
    partial_path = local_path + ".part"
    for attempt in range(retries):
        try:
            if verbose:
                print(f"Downloading ({attempt+1}/{retries}): {filename}")
            with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
                if r.status_code == 401:
                    raise Exception("Unauthorized (check .netrc credentials)")
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Publish the file under its real name only once it is complete.
            os.replace(partial_path, local_path)
            if verbose:
                print(f"Saved: {filename}")
            return
        except Exception as e:
            # Best-effort removal of the incomplete file; a cleanup problem
            # must not hide the error that is reported below.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            if attempt == retries - 1:
                print(f"Failed: {filename} -> {e}")
            else:
                if verbose:
                    print(f"Retrying {filename}... ({e})")
def validate_dates(start, end):
    """Exit with status 1 unless both dates are valid and correctly ordered.

    Args:
        start: Start date string, expected as YYYY-MM-DD.
        end: End date string, expected as YYYY-MM-DD.

    Returns:
        None when both strings parse and start is not after end.

    Raises:
        SystemExit: with code 1 after printing an error message when a
            date does not parse or start is later than end.
    """
    try:
        start_day = datetime.strptime(start, "%Y-%m-%d")
        end_day = datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        print("Error: Dates must be in YYYY-MM-DD format")
        sys.exit(1)
    # A reversed window cannot match anything; reject it before querying.
    if start_day > end_day:
        print("Error: Start date must not be after end date")
        sys.exit(1)
def main():
    """Validate the CLI dates, look up the file URLs and fetch each one."""
    first_day, last_day = args.start_date, args.end_date
    validate_dates(first_day, last_day)
    # os.makedirs(args.out_dir, exist_ok=True)
    # NOTE(review): directory creation is disabled above, so out_dir has to
    # exist before the script runs - confirm this is intended.
    http = create_session()
    for file_url in get_download_urls(first_day, last_day, args.verbose):
        download_file(http, file_url, args.out_dir, args.verbose)
    print(f"\nDone. Downloaded files to: {args.out_dir}")


if __name__ == "__main__":
    main()
---
# Daily job that runs /scripts/download.py (from the mur-script ConfigMap)
# for the last three days and then fixes ownership and group write access
# on the output directory.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mur
  namespace: cron
spec:
  schedule: "0 6 * * *"  # Everyday at 06:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              # The exit status of the download is kept in rc and returned
              # after the chown/chmod fix-up. Previously the container exited
              # with the status of the final chmod, so a failed download was
              # reported as success and never retried.
              command:
                - /bin/sh
                - -c
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                    python3 /scripts/download.py \
                      -sd $(date -d "3 days ago" +%Y-%m-%d) \
                      -ed $(date +%Y-%m-%d) \
                      -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                      -v
                    rc=$?
                    chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
                    chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc || rc=$?
                    exit "$rc"
                  '
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Single file from the secret, mounted where the script's
                # "check .netrc credentials" hint expects it for root.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
          securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                # Octal integer; must stay unquoted because the field is numeric.
                defaultMode: 0755
            - name: netrc
              secret:
                secretName: mur-netrc
+225
View File
@@ -0,0 +1,225 @@
# ConfigMap holding download.sh; the "nemo" CronJob in this file mounts it
# as a volume at /scripts and runs it with bash.
apiVersion: v1
kind: ConfigMap
metadata:
name: nemo-script
namespace: cron
data:
download.sh: |
#!/usr/bin/env bash
# this script downloads files from
# https://data.marine.copernicus.eu/product/NWSHELF_ANALYSISFORECAST_PHY_004_013
#
# Usage: download.sh START_DATE END_DATE [OUT_DIR]
#   START_DATE  first day to request (YYYY-MM-DD)
#   END_DATE    day at which to stop, exclusive (YYYY-MM-DD)
#   OUT_DIR     output directory, defaults to /data/hdd/data/NEMO/
#
# For every day in [START_DATE, END_DATE) one file per dataset listed in
# DATASETS is requested with "copernicusmarine subset".
#
# NOTE(review): callers that re-request overlapping days should check how
# "copernicusmarine subset" treats an output file that already exists
# - TODO confirm.
set -euf -o pipefail

if [[ $# -lt 2 ]]; then
  echo "Usage: $0 START_DATE END_DATE [OUT_DIR]" >&2
  exit 2
fi

START_DATE="$1"
END_DATE="$2"
OUT_DIR="${3:-/data/hdd/data/NEMO/}"

# "<label>|<dataset id>" pairs. The label only appears in the progress
# message; the dataset id is also the prefix of the output file name.
DATASETS=(
  "salt|cmems_mod_nws_phy-sal_anfc_1.5km-3D_PT1H-i"
  "currents|cmems_mod_nws_phy-cur_anfc_1.5km-3D_PT1H-i"
  "temperature|cmems_mod_nws_phy-tem_anfc_1.5km-3D_PT1H-i"
  "ssh|cmems_mod_nws_phy-ssh_anfc_1.5km-2D_PT15M-i"
)

current_date="$START_DATE"
# YYYY-MM-DD strings sort chronologically, so a string comparison suffices.
while [[ "$current_date" < "$END_DATE" ]]; do
  next_date=$(date -I -d "$current_date + 1 day")
  echo "Running subset for $current_date to $next_date"
  for entry in "${DATASETS[@]}"; do
    label="${entry%%|*}"
    dataset_id="${entry#*|}"
    outfile="${dataset_id}_${current_date}--${next_date}.nc"
    copernicusmarine subset \
      --dataset-id "$dataset_id" \
      -t "$current_date" \
      -T "$next_date" \
      -f "$outfile" \
      -o "$OUT_DIR"
    echo "Downloaded $label"
  done
  current_date="$next_date"
done
---
# Nix expressions for the "nemo" CronJob, which mounts this ConfigMap at
# /nix-overlay and starts `nix-shell /nix-overlay/shell.nix`.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nemo-nix
  namespace: cron
data:
  # Shell environment: nixpkgs pinned by URL and hash, extended with the
  # overlay from default.nix, providing python3 with copernicusmarine plus
  # coreutils and bash.
  shell.nix: |
    let
      nixpkgs = builtins.fetchTarball {
        url = "https://releases.nixos.org/nixos/25.11/nixos-25.11.9586.10e7ad5bbcb4/nixexprs.tar.xz";
        sha256 = "sha256-wjAIDqQxE+kWV2lbykQCcS+F0ArQwmN8iNw0kcj4iaA=";
      };
      pkgs = import nixpkgs { overlays = [ (import ./default.nix) ]; };
    in pkgs.mkShell {
      buildInputs = [
        (pkgs.python3.withPackages (ps: [ pkgs.copernicusmarine ]))
        pkgs.coreutils
        pkgs.bash
      ];
    }
  # Overlay adding arcosparse and copernicusmarine, built from the two
  # package files in this ConfigMap.
  default.nix: |
    final: prev: {
      arcosparse = prev.callPackage ./arcosparse.nix { };
      copernicusmarine = prev.callPackage ./copernicusmarine.nix {
        arcosparse = final.arcosparse;
      };
    }
  # copernicusmarine 2.2.2 fetched from PyPI and built with poetry-core.
  copernicusmarine.nix: |
    {
      fetchPypi,
      python3Packages,
      arcosparse,
    }:
    python3Packages.buildPythonPackage rec {
      pname = "copernicusmarine";
      version = "2.2.2";
      format = "pyproject";
      src = fetchPypi {
        inherit version;
        pname = "copernicusmarine";
        sha256 = "sha256-5T3iH4Hh08wIao2MMveb/bVnVz0pK0PoN4CRk811P0g=";
      };
      pythonRelaxDeps = true;
      nativeBuildInputs = [ python3Packages.poetry-core ];
      propagatedBuildInputs = with python3Packages; [
        boto3
        click
        dask
        h5netcdf
        arcosparse
        lxml
        numpy
        pydantic
        pystac
        requests
        semver
        setuptools
        tqdm
        xarray
        zarr
      ];
    }
  # arcosparse 0.4.2 fetched from PyPI and built with poetry-core.
  arcosparse.nix: |
    {
      fetchPypi,
      python3Packages,
    }:
    python3Packages.buildPythonPackage rec {
      pname = "arcosparse";
      version = "0.4.2";
      format = "pyproject";
      src = fetchPypi {
        inherit version;
        pname = "arcosparse";
        sha256 = "sha256-Z8NW+dsC3uXk101kr8tzsgjAoFb4KNdGkxyFkJ5UhFA=";
      };
      pythonRelaxDeps = true;
      nativeBuildInputs = [ python3Packages.poetry-core ];
      propagatedBuildInputs = with python3Packages; [
        pyarrow
        pandas
        pystac
        tqdm
        requests
      ];
    }
---
# Daily job that logs in to Copernicus Marine, runs /scripts/download.sh
# (from the nemo-script ConfigMap) for the last two days and then fixes
# ownership and group write access on /data/hdd/data/NEMO.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: nemo
  namespace: cron
spec:
  schedule: "0 13 * * *"  # Everyday at 13:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              # NOTE(review): a floating "latest" tag combined with
              # IfNotPresent means nodes may run different image versions
              # - consider pinning a tag.
              image: ghcr.io/lix-project/lix:latest
              imagePullPolicy: IfNotPresent
              # The exit status of download.sh is kept in rc and returned
              # after the chown/chmod fix-up. Previously the container exited
              # with the status of the final chmod, so a failed download was
              # reported as success and never retried.
              command:
                - /bin/sh
                - -c
                - |
                  nix-shell /nix-overlay/shell.nix \
                    --keep COPERNICUSMARINE_SERVICE_USERNAME \
                    --keep COPERNICUSMARINE_SERVICE_PASSWORD \
                    --run '
                      copernicusmarine login \
                        --username "$COPERNICUSMARINE_SERVICE_USERNAME" \
                        --password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
                        --force-overwrite
                      bash /scripts/download.sh \
                        $(date -d "2 days ago" +%Y-%m-%d) \
                        $(date +%Y-%m-%d)
                      rc=$?
                      chown -R 5000:5000 /data/hdd/data/NEMO
                      chmod -R g+w /data/hdd/data/NEMO || rc=$?
                      exit "$rc"
                    '
              env:
                - name: COPERNICUSMARINE_SERVICE_USERNAME
                  valueFrom:
                    secretKeyRef:
                      name: nemo-credentials
                      key: username
                - name: COPERNICUSMARINE_SERVICE_PASSWORD
                  valueFrom:
                    secretKeyRef:
                      name: nemo-credentials
                      key: password
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                - name: nix
                  mountPath: /nix-overlay
          securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: nemo-script
                # Octal integer; must stay unquoted because the field is numeric.
                defaultMode: 0755
            - name: nix
              configMap:
                name: nemo-nix
                defaultMode: 0644
+2 -2
View File
@@ -71,8 +71,6 @@ spec:
template:
spec:
restartPolicy: "OnFailure"
securityContext:
runAsUser: 5000
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -83,6 +81,8 @@ spec:
- |
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
bash /scripts/download.sh
chown -R 5000:5000 /data/hdd/data/norshelf
chmod -R g+w /data/hdd/data/norshelf
resources: {}
volumeMounts:
- name: data
+55
View File
@@ -0,0 +1,55 @@
# Settings file for the "nve" CronJob, which mounts the single key below
# at /app/appsettings.json.
apiVersion: v1
kind: ConfigMap
metadata:
  name: nve-config
  namespace: cron
data:
  appsettings.json: |
    {
      "NveUrl": "https://chartserver.nve.no/ShowData.aspx?req=getchart&ver=1.0",
      "DataDir": "/data/hdd/data/river-data"
    }
---
# Daily job that runs `riverrun data --download` and then fixes ownership
# and group write access on /data/hdd/data/river-data/Data.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: nve
  namespace: cron
spec:
  schedule: "0 8 * * *"  # Everyday at 08:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
              imagePullPolicy: IfNotPresent
              # The exit status of riverrun is kept in rc and returned after
              # the chown/chmod fix-up. Previously the container exited with
              # the status of the final chmod, so a failed download was
              # reported as success and never retried.
              command:
                - /bin/sh
                - -c
                - |
                  riverrun data --download --ndays 5000
                  rc=$?
                  chown -R 5000:5000 /data/hdd/data/river-data/Data
                  chmod -R g+w /data/hdd/data/river-data/Data || rc=$?
                  exit "$rc"
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                # Single key from the nve-config ConfigMap mounted as a file.
                - name: config
                  mountPath: /app/appsettings.json
                  subPath: appsettings.json
                  readOnly: true
          securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: config
              configMap:
                name: nve-config