fix(cron): Move backoff inside scripts
This commit is contained in:
@@ -6,7 +6,7 @@ metadata:
|
|||||||
data:
|
data:
|
||||||
download.py: |
|
download.py: |
|
||||||
import os
|
import os
|
||||||
import time
|
import sys
|
||||||
from netCDF4 import Dataset
|
from netCDF4 import Dataset
|
||||||
import re
|
import re
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
@@ -81,26 +81,26 @@ data:
|
|||||||
os.makedirs(outdir, exist_ok=True)
|
os.makedirs(outdir, exist_ok=True)
|
||||||
|
|
||||||
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
|
fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d"))
|
||||||
|
failed = False
|
||||||
for fname in fList:
|
for fname in fList:
|
||||||
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
|
savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc")
|
||||||
|
if os.path.exists(savename):
|
||||||
|
print(f"Skipping {savename}, already exists")
|
||||||
|
continue
|
||||||
print(savename)
|
print(savename)
|
||||||
retries = 4
|
try:
|
||||||
for attempt in range(retries):
|
|
||||||
try:
|
try:
|
||||||
try:
|
copy_thredds_file(fname, savename)
|
||||||
copy_thredds_file(fname, savename)
|
except:
|
||||||
except:
|
alt_fname = re.sub("sfc", "2_5km", fname)
|
||||||
alt_fname = re.sub("sfc", "2_5km", fname)
|
alt_fname = re.sub("ncml", "nc", alt_fname)
|
||||||
alt_fname = re.sub("ncml", "nc", alt_fname)
|
copy_thredds_file(alt_fname, savename)
|
||||||
copy_thredds_file(alt_fname, savename)
|
except Exception as e:
|
||||||
break
|
print(f"File not found: {fname} ({e})")
|
||||||
except Exception as e:
|
failed = True
|
||||||
if attempt == retries - 1:
|
|
||||||
print("File not found: " + fname)
|
if failed:
|
||||||
else:
|
sys.exit(1)
|
||||||
backoff = min(2 ** attempt * 5, 60)
|
|
||||||
print(f"Retrying in {backoff}s... ({e})")
|
|
||||||
time.sleep(backoff)
|
|
||||||
---
|
---
|
||||||
apiVersion: batch/v1
|
apiVersion: batch/v1
|
||||||
kind: CronJob
|
kind: CronJob
|
||||||
@@ -114,10 +114,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: juselius/busynix:1.1
|
image: juselius/busynix:1.1
|
||||||
@@ -126,9 +126,14 @@ spec:
|
|||||||
- /bin/sh
|
- /bin/sh
|
||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'
|
if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then
|
||||||
chown -R 5000:5000 /data/hdd/data/AROME
|
chown -R 5000:5000 /data/hdd/data/AROME
|
||||||
chmod -R g+w /data/hdd/data/AROME
|
chmod -R g+w /data/hdd/data/AROME
|
||||||
|
else
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
resources: {}
|
resources: {}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ data:
|
|||||||
import argparse
|
import argparse
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
import time
|
|
||||||
import requests
|
import requests
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
@@ -54,43 +53,37 @@ data:
|
|||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def download_file(session, url, out_dir, verbose=False, retries=3):
|
def download_file(session, url, out_dir, verbose=False):
|
||||||
filename = os.path.basename(url)
|
filename = os.path.basename(url)
|
||||||
local_path = os.path.join(out_dir, filename)
|
local_path = os.path.join(out_dir, filename)
|
||||||
|
|
||||||
if os.path.exists(local_path):
|
if os.path.exists(local_path):
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Skipping existing: {filename}")
|
print(f"Skipping existing: {filename}")
|
||||||
return
|
return True
|
||||||
|
|
||||||
for attempt in range(retries):
|
if verbose:
|
||||||
try:
|
print(f"Downloading: {filename}")
|
||||||
if verbose:
|
|
||||||
print(f"Downloading ({attempt+1}/{retries}): {filename}")
|
|
||||||
|
|
||||||
with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
|
try:
|
||||||
if r.status_code == 401:
|
with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
|
||||||
raise Exception("Unauthorized (check .netrc credentials)")
|
if r.status_code == 401:
|
||||||
|
raise Exception("Unauthorized (check .netrc credentials)")
|
||||||
|
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
|
||||||
with open(local_path, "wb") as f:
|
with open(local_path, "wb") as f:
|
||||||
for chunk in r.iter_content(chunk_size=8192):
|
for chunk in r.iter_content(chunk_size=8192):
|
||||||
if chunk:
|
if chunk:
|
||||||
f.write(chunk)
|
f.write(chunk)
|
||||||
|
|
||||||
if verbose:
|
if verbose:
|
||||||
print(f"Saved: {filename}")
|
print(f"Saved: {filename}")
|
||||||
return
|
return True
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
if attempt == retries - 1:
|
print(f"Failed: {filename} -> {e}")
|
||||||
print(f"Failed: {filename} -> {e}")
|
return False
|
||||||
else:
|
|
||||||
backoff = min(2 ** attempt * 5, 60)
|
|
||||||
if verbose:
|
|
||||||
print(f"Retrying {filename} in {backoff}s... ({e})")
|
|
||||||
time.sleep(backoff)
|
|
||||||
|
|
||||||
|
|
||||||
def validate_dates(start, end):
|
def validate_dates(start, end):
|
||||||
@@ -111,9 +104,13 @@ data:
|
|||||||
|
|
||||||
urls = get_download_urls(args.start_date, args.end_date, args.verbose)
|
urls = get_download_urls(args.start_date, args.end_date, args.verbose)
|
||||||
|
|
||||||
|
failed = False
|
||||||
for url in urls:
|
for url in urls:
|
||||||
download_file(session, url, args.out_dir, args.verbose)
|
if not download_file(session, url, args.out_dir, args.verbose):
|
||||||
|
failed = True
|
||||||
|
|
||||||
|
if failed:
|
||||||
|
sys.exit(1)
|
||||||
print(f"\nDone. Downloaded files to: {args.out_dir}")
|
print(f"\nDone. Downloaded files to: {args.out_dir}")
|
||||||
|
|
||||||
|
|
||||||
@@ -132,10 +129,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: juselius/busynix:1.1
|
image: juselius/busynix:1.1
|
||||||
@@ -149,10 +146,14 @@ spec:
|
|||||||
-sd $(date -d "3 days ago" +%Y-%m-%d) \
|
-sd $(date -d "3 days ago" +%Y-%m-%d) \
|
||||||
-ed $(date +%Y-%m-%d) \
|
-ed $(date +%Y-%m-%d) \
|
||||||
-o /data/hdd/data/river-data/MUR/MUR_SST_nc \
|
-o /data/hdd/data/river-data/MUR/MUR_SST_nc \
|
||||||
-v
|
-v &&
|
||||||
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
|
chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
|
||||||
chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
|
chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
|
||||||
'
|
' || {
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
resources: {}
|
resources: {}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
|
|||||||
@@ -181,10 +181,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: ghcr.io/lix-project/lix:latest
|
image: ghcr.io/lix-project/lix:latest
|
||||||
@@ -200,13 +200,17 @@ spec:
|
|||||||
copernicusmarine login \
|
copernicusmarine login \
|
||||||
--username "$COPERNICUSMARINE_SERVICE_USERNAME" \
|
--username "$COPERNICUSMARINE_SERVICE_USERNAME" \
|
||||||
--password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
|
--password "$COPERNICUSMARINE_SERVICE_PASSWORD" \
|
||||||
--force-overwrite
|
--force-overwrite &&
|
||||||
bash /scripts/download.sh \
|
bash /scripts/download.sh \
|
||||||
$(date -d "2 days ago" +%Y-%m-%d) \
|
$(date -d "2 days ago" +%Y-%m-%d) \
|
||||||
$(date +%Y-%m-%d)
|
$(date +%Y-%m-%d) &&
|
||||||
chown -R 5000:5000 /data/hdd/data/NEMO
|
chown -R 5000:5000 /data/hdd/data/NEMO &&
|
||||||
chmod -R g+w /data/hdd/data/NEMO
|
chmod -R g+w /data/hdd/data/NEMO
|
||||||
'
|
' || {
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
env:
|
env:
|
||||||
- name: COPERNICUSMARINE_SERVICE_USERNAME
|
- name: COPERNICUSMARINE_SERVICE_USERNAME
|
||||||
valueFrom:
|
valueFrom:
|
||||||
|
|||||||
@@ -75,10 +75,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: juselius/busynix:1.1
|
image: juselius/busynix:1.1
|
||||||
@@ -88,9 +88,14 @@ spec:
|
|||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel
|
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel
|
||||||
bash /scripts/download.sh
|
if bash /scripts/download.sh; then
|
||||||
chown -R 10000:10000 /data/hdd/data/norkyst
|
chown -R 10000:10000 /data/hdd/data/norkyst
|
||||||
chmod -R g+w /data/hdd/data/norkyst
|
chmod -R g+w /data/hdd/data/norkyst
|
||||||
|
else
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
resources: {}
|
resources: {}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
|
|||||||
@@ -67,10 +67,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: juselius/busynix:1.1
|
image: juselius/busynix:1.1
|
||||||
@@ -80,9 +80,14 @@ spec:
|
|||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
|
nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash
|
||||||
bash /scripts/download.sh
|
if bash /scripts/download.sh; then
|
||||||
chown -R 5000:5000 /data/hdd/data/norshelf
|
chown -R 5000:5000 /data/hdd/data/norshelf
|
||||||
chmod -R g+w /data/hdd/data/norshelf
|
chmod -R g+w /data/hdd/data/norshelf
|
||||||
|
else
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
resources: {}
|
resources: {}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
|
|||||||
@@ -22,10 +22,10 @@ spec:
|
|||||||
failedJobsHistoryLimit: 3
|
failedJobsHistoryLimit: 3
|
||||||
jobTemplate:
|
jobTemplate:
|
||||||
spec:
|
spec:
|
||||||
backoffLimit: 10
|
backoffLimit: 3
|
||||||
template:
|
template:
|
||||||
spec:
|
spec:
|
||||||
restartPolicy: "OnFailure"
|
restartPolicy: "Never"
|
||||||
containers:
|
containers:
|
||||||
- name: cronpod
|
- name: cronpod
|
||||||
image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
|
image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug
|
||||||
@@ -34,9 +34,14 @@ spec:
|
|||||||
- /bin/sh
|
- /bin/sh
|
||||||
- -c
|
- -c
|
||||||
- |
|
- |
|
||||||
riverrun data --download --ndays 5000
|
if riverrun data --download --ndays 5000; then
|
||||||
chown -R 5000:5000 /data/hdd/data/river-data/Data
|
chown -R 5000:5000 /data/hdd/data/river-data/Data
|
||||||
chmod -R g+w /data/hdd/data/river-data/Data
|
chmod -R g+w /data/hdd/data/river-data/Data
|
||||||
|
else
|
||||||
|
echo "Job failed, sleeping 30 minutes before retry..."
|
||||||
|
sleep 1800
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
resources: {}
|
resources: {}
|
||||||
volumeMounts:
|
volumeMounts:
|
||||||
- name: data
|
- name: data
|
||||||
|
|||||||
Reference in New Issue
Block a user