fix(cron): Move backoff inside scripts
This commit is contained in:
@@ -6,7 +6,7 @@ metadata:
|
||||
data:
|
||||
download.py: |
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
from netCDF4 import Dataset
|
||||
import re
|
||||
from datetime import datetime, timedelta
|
||||
@@ -81,26 +81,26 @@ data:
|
||||
os.makedirs(outdir, exist_ok=True)
|
||||
|
||||
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
|
||||
failed = False
|
||||
for fname in fList:
|
||||
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
|
||||
if os.path.exists(savename):
|
||||
print(f"Skipping {savename}, already exists")
|
||||
continue
|
||||
print(savename)
|
||||
retries = 4
|
||||
for attempt in range(retries):
|
||||
try:
|
||||
try:
|
||||
try:
|
||||
copy_thredds_file(fname, savename)
|
||||
except:
|
||||
alt_fname = re.sub("sfc", "2_5km", fname)
|
||||
alt_fname = re.sub("ncml", "nc", alt_fname)
|
||||
copy_thredds_file(alt_fname, savename)
|
||||
break
|
||||
except Exception as e:
|
||||
if attempt == retries - 1:
|
||||
print("File not found: " + fname)
|
||||
else:
|
||||
backoff = min(2 ** attempt * 5, 60)
|
||||
print(f"Retrying in {backoff}s... ({e})")
|
||||
time.sleep(backoff)
|
||||
copy_thredds_file(fname, savename)
|
||||
except:
|
||||
alt_fname = re.sub("sfc", "2_5km", fname)
|
||||
alt_fname = re.sub("ncml", "nc", alt_fname)
|
||||
copy_thredds_file(alt_fname, savename)
|
||||
except Exception as e:
|
||||
print(f"File not found: {fname} ({e})")
|
||||
failed = True
|
||||
|
||||
if failed:
|
||||
sys.exit(1)
|
||||
---
|
||||
apiVersion: batch/v1
|
||||
kind: CronJob
|
||||
@@ -114,10 +114,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: juselius/busynix:1.1
|
||||
@@ -126,9 +126,14 @@ spec:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'
|
||||
chown -R 5000:5000 /data/hdd/data/AROME
|
||||
chmod -R g+w /data/hdd/data/AROME
|
||||
if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then
|
||||
chown -R 5000:5000 /data/hdd/data/AROME
|
||||
chmod -R g+w /data/hdd/data/AROME
|
||||
else
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
fi
|
||||
resources: {}
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
@@ -8,7 +8,6 @@ data:
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import requests
|
||||
from datetime import datetime
|
||||
|
||||
@@ -54,43 +53,37 @@ data:
|
||||
return urls
|
||||
|
||||
|
||||
def download_file(session, url, out_dir, verbose=False):
    """Download ``url`` into ``out_dir`` and report whether the file is usable.

    The payload is streamed into a temporary ``<name>.part`` file and renamed
    to its final name only after the transfer has finished. An interrupted
    download therefore never leaves a truncated file under the final name,
    which a later run would otherwise skip as "existing".

    Args:
        session: Object with a requests-style ``get(url, stream=...,
            allow_redirects=..., timeout=...)`` method whose result is a
            context manager exposing ``status_code``, ``raise_for_status()``
            and ``iter_content(chunk_size=...)``.
        url: Address of the file; its basename becomes the local file name.
        out_dir: Directory the file is written to (it is not created here).
        verbose: When true, print progress messages.

    Returns:
        True if the file already existed or was downloaded completely,
        False if the download failed. Failures are printed, not raised, so
        the caller can continue with the remaining files.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)

    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return True

    if verbose:
        print(f"Downloading: {filename}")

    # Never write directly to local_path: the existence check above would
    # treat a half-written file from a failed run as a finished download.
    partial_path = local_path + ".part"

    try:
        with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
            if r.status_code == 401:
                raise RuntimeError("Unauthorized (check .netrc credentials)")

            r.raise_for_status()

            with open(partial_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the file under its final name only once it is complete.
        os.replace(partial_path, local_path)
    except Exception as e:
        # Deliberately broad: any failure for this file is reported and
        # turned into a False result instead of aborting the whole batch.
        try:
            os.remove(partial_path)
        except OSError:
            # Best-effort cleanup; the partial file may never have been created.
            pass
        print(f"Failed: {filename} -> {e}")
        return False

    if verbose:
        print(f"Saved: {filename}")
    return True
|
||||
|
||||
|
||||
def validate_dates(start, end):
|
||||
@@ -111,9 +104,13 @@ data:
|
||||
|
||||
urls = get_download_urls(args.start_date, args.end_date, args.verbose)
|
||||
|
||||
failed = False
|
||||
for url in urls:
|
||||
download_file(session, url, args.out_dir, args.verbose)
|
||||
if not download_file(session, url, args.out_dir, args.verbose):
|
||||
failed = True
|
||||
|
||||
if failed:
|
||||
sys.exit(1)
|
||||
print(f"\nDone. Downloaded files to: {args.out_dir}")
|
||||
|
||||
|
||||
@@ -132,10 +129,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: juselius/busynix:1.1
|
||||
@@ -149,10 +146,14 @@ spec:
|
||||
-sd $(date -d "3 days ago" +%Y-%m-%d) \
|
||||
-ed $(date +%Y-%m-%d) \
|
||||
-o /data/hdd/data/river-data/MUR/MUR_SST_nc \
|
||||
-v
|
||||
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
|
||||
-v &&
|
||||
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
|
||||
chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
|
||||
'
|
||||
' || {
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
}
|
||||
resources: {}
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
@@ -181,10 +181,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: ghcr.io/lix-project/lix:latest
|
||||
@@ -200,13 +200,17 @@ spec:
|
||||
copernicusmarine login \
|
||||
--username "$COPERNICUSMARINE_SERVICE_USERNAME" \
|
||||
--password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
|
||||
--force-overwrite
|
||||
--force-overwrite &&
|
||||
bash /scripts/download.sh \
|
||||
$(date -d "2 days ago" +%Y-%m-%d) \
|
||||
$(date +%Y-%m-%d)
|
||||
chown -R 5000:5000 /data/hdd/data/NEMO
|
||||
$(date +%Y-%m-%d) &&
|
||||
chown -R 5000:5000 /data/hdd/data/NEMO &&
|
||||
chmod -R g+w /data/hdd/data/NEMO
|
||||
'
|
||||
' || {
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
}
|
||||
env:
|
||||
- name: COPERNICUSMARINE_SERVICE_USERNAME
|
||||
valueFrom:
|
||||
|
||||
@@ -75,10 +75,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: juselius/busynix:1.1
|
||||
@@ -88,9 +88,14 @@ spec:
|
||||
- -c
|
||||
- |
|
||||
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel
|
||||
bash /scripts/download.sh
|
||||
chown -R 10000:10000 /data/hdd/data/norkyst
|
||||
chmod -R g+w /data/hdd/data/norkyst
|
||||
if bash /scripts/download.sh; then
|
||||
chown -R 10000:10000 /data/hdd/data/norkyst
|
||||
chmod -R g+w /data/hdd/data/norkyst
|
||||
else
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
fi
|
||||
resources: {}
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
@@ -67,10 +67,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: juselius/busynix:1.1
|
||||
@@ -80,9 +80,14 @@ spec:
|
||||
- -c
|
||||
- |
|
||||
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
|
||||
bash /scripts/download.sh
|
||||
chown -R 5000:5000 /data/hdd/data/norshelf
|
||||
chmod -R g+w /data/hdd/data/norshelf
|
||||
if bash /scripts/download.sh; then
|
||||
chown -R 5000:5000 /data/hdd/data/norshelf
|
||||
chmod -R g+w /data/hdd/data/norshelf
|
||||
else
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
fi
|
||||
resources: {}
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
@@ -22,10 +22,10 @@ spec:
|
||||
failedJobsHistoryLimit: 3
|
||||
jobTemplate:
|
||||
spec:
|
||||
backoffLimit: 10
|
||||
backoffLimit: 3
|
||||
template:
|
||||
spec:
|
||||
restartPolicy: "OnFailure"
|
||||
restartPolicy: "Never"
|
||||
containers:
|
||||
- name: cronpod
|
||||
image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
|
||||
@@ -34,9 +34,14 @@ spec:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
riverrun data --download --ndays 5000
|
||||
chown -R 5000:5000 /data/hdd/data/river-data/Data
|
||||
chmod -R g+w /data/hdd/data/river-data/Data
|
||||
if riverrun data --download --ndays 5000; then
|
||||
chown -R 5000:5000 /data/hdd/data/river-data/Data
|
||||
chmod -R g+w /data/hdd/data/river-data/Data
|
||||
else
|
||||
echo "Job failed, sleeping 30 minutes before retry..."
|
||||
sleep 1800
|
||||
exit 1
|
||||
fi
|
||||
resources: {}
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
Reference in New Issue
Block a user