fix(cron): Move backoff inside scripts

This commit is contained in:
2026-04-29 08:58:32 +02:00
parent 773504c908
commit 738112f980
6 changed files with 100 additions and 75 deletions
+27 -22
View File
@@ -6,7 +6,7 @@ metadata:
data: data:
download.py: | download.py: |
import os import os
import time import sys
from netCDF4 import Dataset from netCDF4 import Dataset
import re import re
from datetime import datetime, timedelta from datetime import datetime, timedelta
@@ -81,26 +81,26 @@ data:
os.makedirs(outdir, exist_ok=True) os.makedirs(outdir, exist_ok=True)
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d")) fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
failed = False
for fname in fList: for fname in fList:
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc") savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
if os.path.exists(savename):
print(f"Skipping {savename}, already exists")
continue
print(savename) print(savename)
retries = 4 try:
for attempt in range(retries):
try: try:
try: copy_thredds_file(fname, savename)
copy_thredds_file(fname, savename) except:
except: alt_fname = re.sub("sfc", "2_5km", fname)
alt_fname = re.sub("sfc", "2_5km", fname) alt_fname = re.sub("ncml", "nc", alt_fname)
alt_fname = re.sub("ncml", "nc", alt_fname) copy_thredds_file(alt_fname, savename)
copy_thredds_file(alt_fname, savename) except Exception as e:
break print(f"File not found: {fname} ({e})")
except Exception as e: failed = True
if attempt == retries - 1:
print("File not found: " + fname) if failed:
else: sys.exit(1)
backoff = min(2 ** attempt * 5, 60)
print(f"Retrying in {backoff}s... ({e})")
time.sleep(backoff)
--- ---
apiVersion: batch/v1 apiVersion: batch/v1
kind: CronJob kind: CronJob
@@ -114,10 +114,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: juselius/busynix:1.1 image: juselius/busynix:1.1
@@ -126,9 +126,14 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
- | - |
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py' if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then
chown -R 5000:5000 /data/hdd/data/AROME chown -R 5000:5000 /data/hdd/data/AROME
chmod -R g+w /data/hdd/data/AROME chmod -R g+w /data/hdd/data/AROME
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {} resources: {}
volumeMounts: volumeMounts:
- name: data - name: data
+33 -32
View File
@@ -8,7 +8,6 @@ data:
import argparse import argparse
import os import os
import sys import sys
import time
import requests import requests
from datetime import datetime from datetime import datetime
@@ -54,43 +53,37 @@ data:
return urls return urls
def download_file(session, url, out_dir, verbose=False):
    """Download ``url`` into ``out_dir`` and report whether the file is usable.

    The response body is streamed into a temporary ``<name>.part`` file that
    is renamed to its final name only after every chunk has been written.
    A failed or interrupted transfer therefore never leaves a truncated file
    under the final name. This matters because a file that already exists is
    skipped (and reported as success) on the next run.

    Args:
        session: Object with a requests-style ``get(url, stream=...,
            allow_redirects=..., timeout=...)`` whose result is a context
            manager exposing ``status_code``, ``raise_for_status()`` and
            ``iter_content(chunk_size=...)``.
        url: Address of the file; its basename becomes the local file name.
        out_dir: Directory the file is stored in.
        verbose: When true, print progress messages.

    Returns:
        True if the file already existed or was downloaded completely,
        False if the download failed (the reason is printed).
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)

    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return True

    if verbose:
        print(f"Downloading: {filename}")

    # Write to a sibling temp file so the final name only ever holds a
    # complete download.
    tmp_path = local_path + ".part"
    try:
        with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
            if r.status_code == 401:
                raise PermissionError("Unauthorized (check .netrc credentials)")

            r.raise_for_status()

            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Same-directory rename: the complete file appears in one step.
        os.replace(tmp_path, local_path)
    except Exception as e:
        # Best-effort cleanup of the partial file; the download failure
        # itself is what gets reported to the caller.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        print(f"Failed: {filename} -> {e}")
        return False

    if verbose:
        print(f"Saved: {filename}")
    return True
def validate_dates(start, end): def validate_dates(start, end):
@@ -111,9 +104,13 @@ data:
urls = get_download_urls(args.start_date, args.end_date, args.verbose) urls = get_download_urls(args.start_date, args.end_date, args.verbose)
failed = False
for url in urls: for url in urls:
download_file(session, url, args.out_dir, args.verbose) if not download_file(session, url, args.out_dir, args.verbose):
failed = True
if failed:
sys.exit(1)
print(f"\nDone. Downloaded files to: {args.out_dir}") print(f"\nDone. Downloaded files to: {args.out_dir}")
@@ -132,10 +129,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: juselius/busynix:1.1 image: juselius/busynix:1.1
@@ -149,10 +146,14 @@ spec:
-sd $(date -d "3 days ago" +%Y-%m-%d) \ -sd $(date -d "3 days ago" +%Y-%m-%d) \
-ed $(date +%Y-%m-%d) \ -ed $(date +%Y-%m-%d) \
-o /data/hdd/data/river-data/MUR/MUR_SST_nc \ -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
-v -v &&
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
' ' || {
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
}
resources: {} resources: {}
volumeMounts: volumeMounts:
- name: data - name: data
+10 -6
View File
@@ -181,10 +181,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: ghcr.io/lix-project/lix:latest image: ghcr.io/lix-project/lix:latest
@@ -200,13 +200,17 @@ spec:
copernicusmarine login \ copernicusmarine login \
--username "$COPERNICUSMARINE_SERVICE_USERNAME" \ --username "$COPERNICUSMARINE_SERVICE_USERNAME" \
--password "$COPERNICUSMARINE_SERVICE_PASSWORD" \ --password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
--force-overwrite --force-overwrite &&
bash /scripts/download.sh \ bash /scripts/download.sh \
$(date -d "2 days ago" +%Y-%m-%d) \ $(date -d "2 days ago" +%Y-%m-%d) \
$(date +%Y-%m-%d) $(date +%Y-%m-%d) &&
chown -R 5000:5000 /data/hdd/data/NEMO chown -R 5000:5000 /data/hdd/data/NEMO &&
chmod -R g+w /data/hdd/data/NEMO chmod -R g+w /data/hdd/data/NEMO
' ' || {
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
}
env: env:
- name: COPERNICUSMARINE_SERVICE_USERNAME - name: COPERNICUSMARINE_SERVICE_USERNAME
valueFrom: valueFrom:
+10 -5
View File
@@ -75,10 +75,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: juselius/busynix:1.1 image: juselius/busynix:1.1
@@ -88,9 +88,14 @@ spec:
- -c - -c
- | - |
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel
bash /scripts/download.sh if bash /scripts/download.sh; then
chown -R 10000:10000 /data/hdd/data/norkyst chown -R 10000:10000 /data/hdd/data/norkyst
chmod -R g+w /data/hdd/data/norkyst chmod -R g+w /data/hdd/data/norkyst
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {} resources: {}
volumeMounts: volumeMounts:
- name: data - name: data
+10 -5
View File
@@ -67,10 +67,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: juselius/busynix:1.1 image: juselius/busynix:1.1
@@ -80,9 +80,14 @@ spec:
- -c - -c
- | - |
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
bash /scripts/download.sh if bash /scripts/download.sh; then
chown -R 5000:5000 /data/hdd/data/norshelf chown -R 5000:5000 /data/hdd/data/norshelf
chmod -R g+w /data/hdd/data/norshelf chmod -R g+w /data/hdd/data/norshelf
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {} resources: {}
volumeMounts: volumeMounts:
- name: data - name: data
+10 -5
View File
@@ -22,10 +22,10 @@ spec:
failedJobsHistoryLimit: 3 failedJobsHistoryLimit: 3
jobTemplate: jobTemplate:
spec: spec:
backoffLimit: 10 backoffLimit: 3
template: template:
spec: spec:
restartPolicy: "OnFailure" restartPolicy: "Never"
containers: containers:
- name: cronpod - name: cronpod
image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
@@ -34,9 +34,14 @@ spec:
- /bin/sh - /bin/sh
- -c - -c
- | - |
riverrun data --download --ndays 5000 if riverrun data --download --ndays 5000; then
chown -R 5000:5000 /data/hdd/data/river-data/Data chown -R 5000:5000 /data/hdd/data/river-data/Data
chmod -R g+w /data/hdd/data/river-data/Data chmod -R g+w /data/hdd/data/river-data/Data
else
echo "Job failed, sleeping 30 minutes before retry..."
sleep 1800
exit 1
fi
resources: {} resources: {}
volumeMounts: volumeMounts:
- name: data - name: data