fix(cron): Move retry backoff out of the Python download scripts and into the CronJob shell commands

This commit is contained in:
2026-04-29 08:58:32 +02:00
parent 773504c908
commit 738112f980
6 changed files with 100 additions and 75 deletions
+27 -22
View File
@@ -6,7 +6,7 @@ metadata:
data:
download.py: |
import os
import time
import sys
from netCDF4 import Dataset
import re
from datetime import datetime, timedelta
@@ -81,26 +81,26 @@ data:
os.makedirs(outdir, exist_ok=True)
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
failed = False
for fname in fList:
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
if os.path.exists(savename):
print(f"Skipping {savename}, already exists")
continue
print(savename)
retries = 4
for attempt in range(retries):
try:
try:
try:
copy_thredds_file(fname, savename)
except:
alt_fname = re.sub("sfc", "2_5km", fname)
alt_fname = re.sub("ncml", "nc", alt_fname)
copy_thredds_file(alt_fname, savename)
break
except Exception as e:
if attempt == retries - 1:
print("File not found: " + fname)
else:
backoff = min(2 ** attempt * 5, 60)
print(f"Retrying in {backoff}s... ({e})")
time.sleep(backoff)
copy_thredds_file(fname, savename)
except:
alt_fname = re.sub("sfc", "2_5km", fname)
alt_fname = re.sub("ncml", "nc", alt_fname)
copy_thredds_file(alt_fname, savename)
except Exception as e:
print(f"File not found: {fname} ({e})")
failed = True
if failed:
sys.exit(1)
---
apiVersion: batch/v1
kind: CronJob
@@ -114,10 +114,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -126,9 +126,14 @@ spec:
- /bin/sh
- -c
- |
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'
chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME
if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then
chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {}
volumeMounts:
- name: data
+33 -32
View File
@@ -8,7 +8,6 @@ data:
import argparse
import os
import sys
import time
import requests
from datetime import datetime
@@ -54,43 +53,37 @@ data:
return urls
def download_file(session, url, out_dir, verbose=False, retries=3):
def download_file(session, url, out_dir, verbose=False):
filename = os.path.basename(url)
local_path = os.path.join(out_dir, filename)
if os.path.exists(local_path):
if verbose:
print(f"Skipping existing: {filename}")
return
return True
for attempt in range(retries):
try:
if verbose:
print(f"Downloading ({attempt+1}/{retries}): {filename}")
if verbose:
print(f"Downloading: {filename}")
with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
if r.status_code == 401:
raise Exception("Unauthorized (check .netrc credentials)")
try:
with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
if r.status_code == 401:
raise Exception("Unauthorized (check .netrc credentials)")
r.raise_for_status()
r.raise_for_status()
with open(local_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
with open(local_path, "wb") as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
if verbose:
print(f"Saved: {filename}")
return
if verbose:
print(f"Saved: {filename}")
return True
except Exception as e:
if attempt == retries - 1:
print(f"Failed: {filename} -> {e}")
else:
backoff = min(2 ** attempt * 5, 60)
if verbose:
print(f"Retrying {filename} in {backoff}s... ({e})")
time.sleep(backoff)
except Exception as e:
print(f"Failed: {filename} -> {e}")
return False
def validate_dates(start, end):
@@ -111,9 +104,13 @@ data:
urls = get_download_urls(args.start_date, args.end_date, args.verbose)
failed = False
for url in urls:
download_file(session, url, args.out_dir, args.verbose)
if not download_file(session, url, args.out_dir, args.verbose):
failed = True
if failed:
sys.exit(1)
print(f"\nDone. Downloaded files to: {args.out_dir}")
@@ -132,10 +129,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -149,10 +146,14 @@ spec:
-sd $(date -d "3 days ago" +%Y-%m-%d) \
-ed $(date +%Y-%m-%d) \
-o /data/hdd/data/river-data/MUR/MUR_SST_nc \
-v
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
-v &&
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
'
' || {
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
}
resources: {}
volumeMounts:
- name: data
+10 -6
View File
@@ -181,10 +181,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: ghcr.io/lix-project/lix:latest
@@ -200,13 +200,17 @@ spec:
copernicusmarine login \
--username "$COPERNICUSMARINE_SERVICE_USERNAME" \
--password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
--force-overwrite
--force-overwrite &&
bash /scripts/download.sh \
$(date -d "2 days ago" +%Y-%m-%d) \
$(date +%Y-%m-%d)
chown -R 5000:5000 /data/hdd/data/NEMO
$(date +%Y-%m-%d) &&
chown -R 5000:5000 /data/hdd/data/NEMO &&
chmod -R g+w /data/hdd/data/NEMO
'
' || {
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
}
env:
- name: COPERNICUSMARINE_SERVICE_USERNAME
valueFrom:
+10 -5
View File
@@ -75,10 +75,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -88,9 +88,14 @@ spec:
- -c
- |
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel
bash /scripts/download.sh
chown -R 10000:10000 /data/hdd/data/norkyst
chmod -R g+w /data/hdd/data/norkyst
if bash /scripts/download.sh; then
chown -R 10000:10000 /data/hdd/data/norkyst
chmod -R g+w /data/hdd/data/norkyst
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {}
volumeMounts:
- name: data
+10 -5
View File
@@ -67,10 +67,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: juselius/busynix:1.1
@@ -80,9 +80,14 @@ spec:
- -c
- |
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
bash /scripts/download.sh
chown -R 5000:5000 /data/hdd/data/norshelf
chmod -R g+w /data/hdd/data/norshelf
if bash /scripts/download.sh; then
chown -R 5000:5000 /data/hdd/data/norshelf
chmod -R g+w /data/hdd/data/norshelf
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {}
volumeMounts:
- name: data
+10 -5
View File
@@ -22,10 +22,10 @@ spec:
failedJobsHistoryLimit: 3
jobTemplate:
spec:
backoffLimit: 10
backoffLimit: 3
template:
spec:
restartPolicy: "OnFailure"
restartPolicy: "Never"
containers:
- name: cronpod
image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
@@ -34,9 +34,14 @@ spec:
- /bin/sh
- -c
- |
riverrun data --download --ndays 5000
chown -R 5000:5000 /data/hdd/data/river-data/Data
chmod -R g+w /data/hdd/data/river-data/Data
if riverrun data --download --ndays 5000; then
chown -R 5000:5000 /data/hdd/data/river-data/Data
chmod -R g+w /data/hdd/data/river-data/Data
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {}
volumeMounts:
- name: data