fix(cron): Move backoff inside scripts

This commit is contained in:
2026-04-29 08:58:32 +02:00
parent 773504c908
commit 738112f980
6 changed files with 100 additions and 75 deletions
+27 -22
View File
@@ -6,7 +6,7 @@ metadata:
data:
download.py: |
import os
import time
import sys
from netCDF4 import Dataset
import re
from datetime import datetime, timedelta
@@ -81,26 +81,26 @@ data:
os.makedirs(outdir, exist_ok=True)
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
failed = False
for fname in fList:
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
if os.path.exists(savename):
print(f"Skipping {savename}, already exists")
continue
print(savename)
retries = 4
for attempt in range(retries):
try:
try:
try:
copy_thredds_file(fname, savename)
except:
alt_fname = re.sub("sfc", "2_5km", fname)
alt_fname = re.sub("ncml", "nc", alt_fname)
copy_thredds_file(alt_fname, savename)
break
except Exception as e:
if attempt == retries - 1:
print("File not found: " + fname)
else:
backoff = min(2 ** attempt * 5, 60)
print(f"Retrying in {backoff}s... ({e})")
time.sleep(backoff)
copy_thredds_file(fname, savename)
except:
alt_fname = re.sub("sfc", "2_5km", fname)
alt_fname = re.sub("ncml", "nc", alt_fname)
copy_thredds_file(alt_fname, savename)
except Exception as e:
print(f"File not found: {fname} ({e})")
failed = True
if failed:
sys.exit(1)
---
apiVersion: batch/v1
kind: CronJob
@@ -114,10 +114,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -126,9 +126,14 @@ spec:
- /bin/sh
- -c
- |
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'
chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME
if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then
chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {}
volumeMounts:
- name: data