diff --git a/raw/ekman/cronjobs/arome/cron.yaml b/raw/ekman/cronjobs/arome/cron.yaml index 2cc0cd36..0a6c50fc 100644 --- a/raw/ekman/cronjobs/arome/cron.yaml +++ b/raw/ekman/cronjobs/arome/cron.yaml @@ -6,7 +6,7 @@ metadata: data: download.py: | import os - import time + import sys from netCDF4 import Dataset import re from datetime import datetime, timedelta @@ -81,26 +81,26 @@ data: os.makedirs(outdir, exist_ok=True) fList = generate_thredds_names("2026-04-24", datetime.today().strftime("%Y-%m-%d")) + failed = False for fname in fList: savename = os.path.join(outdir, fname.split("/")[-1].split(".")[0] + ".nc") + if os.path.exists(savename): + print(f"Skipping {savename}, already exists") + continue print(savename) - retries = 4 - for attempt in range(retries): + try: try: - try: - copy_thredds_file(fname, savename) - except: - alt_fname = re.sub("sfc", "2_5km", fname) - alt_fname = re.sub("ncml", "nc", alt_fname) - copy_thredds_file(alt_fname, savename) - break - except Exception as e: - if attempt == retries - 1: - print("File not found: " + fname) - else: - backoff = min(2 ** attempt * 5, 60) - print(f"Retrying in {backoff}s... ({e})") - time.sleep(backoff) + copy_thredds_file(fname, savename) + except: + alt_fname = re.sub("sfc", "2_5km", fname) + alt_fname = re.sub("ncml", "nc", alt_fname) + copy_thredds_file(alt_fname, savename) + except Exception as e: + print(f"File not found: {fname} ({e})") + failed = True + + if failed: + sys.exit(1) --- apiVersion: batch/v1 kind: CronJob @@ -114,10 +114,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: juselius/busynix:1.1 @@ -126,9 +126,14 @@ spec: - /bin/sh - -c - | - nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py' - chown -R 5000:5000 /data/hdd/data/AROME - chmod -R g+w /data/hdd/data/AROME + if nix-shell -p 'python3.withPackages(ps: [ps.netcdf4])' --run 'python3 /scripts/download.py'; then + chown -R 5000:5000 /data/hdd/data/AROME + chmod -R g+w /data/hdd/data/AROME + else + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + fi resources: {} volumeMounts: - name: data diff --git a/raw/ekman/cronjobs/mur/cron.yaml b/raw/ekman/cronjobs/mur/cron.yaml index 4b5750fd..48500f41 100644 --- a/raw/ekman/cronjobs/mur/cron.yaml +++ b/raw/ekman/cronjobs/mur/cron.yaml @@ -8,7 +8,6 @@ data: import argparse import os import sys - import time import requests from datetime import datetime @@ -54,43 +53,37 @@ data: return urls - def download_file(session, url, out_dir, verbose=False, retries=3): + def download_file(session, url, out_dir, verbose=False): filename = os.path.basename(url) local_path = os.path.join(out_dir, filename) if os.path.exists(local_path): if verbose: print(f"Skipping existing: {filename}") - return + return True - for attempt in range(retries): - try: - if verbose: - print(f"Downloading ({attempt+1}/{retries}): {filename}") + if verbose: + print(f"Downloading: {filename}") - with session.get(url, stream=True, allow_redirects=True, timeout=60) as r: - if r.status_code == 401: - raise Exception("Unauthorized (check .netrc credentials)") + try: + with session.get(url, stream=True, allow_redirects=True, timeout=60) as r: + if r.status_code == 401: + raise Exception("Unauthorized (check .netrc credentials)") - r.raise_for_status() + r.raise_for_status() - with open(local_path, "wb") as f: - for chunk in r.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) + with open(local_path, "wb") as f: + for chunk in r.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) - if verbose: - print(f"Saved: {filename}") - return + if verbose: + print(f"Saved: {filename}") + return True - except Exception as e: - if attempt == retries - 1: - print(f"Failed: {filename} -> {e}") - else: - backoff = min(2 ** attempt * 5, 60) - if verbose: - print(f"Retrying {filename} in {backoff}s... ({e})") - time.sleep(backoff) + except Exception as e: + print(f"Failed: {filename} -> {e}") + return False def validate_dates(start, end): @@ -111,9 +104,13 @@ data: urls = get_download_urls(args.start_date, args.end_date, args.verbose) + failed = False for url in urls: - download_file(session, url, args.out_dir, args.verbose) + if not download_file(session, url, args.out_dir, args.verbose): + failed = True + if failed: + sys.exit(1) print(f"\nDone. Downloaded files to: {args.out_dir}") @@ -132,10 +129,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: juselius/busynix:1.1 @@ -149,10 +146,14 @@ spec: -sd $(date -d "3 days ago" +%Y-%m-%d) \ -ed $(date +%Y-%m-%d) \ -o /data/hdd/data/river-data/MUR/MUR_SST_nc \ - -v - chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc + -v && + chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc && chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc - ' + ' || { + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + } resources: {} volumeMounts: - name: data diff --git a/raw/ekman/cronjobs/nemo/cron.yaml b/raw/ekman/cronjobs/nemo/cron.yaml index 6aa126ca..2190faf8 100644 --- a/raw/ekman/cronjobs/nemo/cron.yaml +++ b/raw/ekman/cronjobs/nemo/cron.yaml @@ -181,10 +181,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: ghcr.io/lix-project/lix:latest @@ -200,13 +200,17 @@ spec: copernicusmarine login \ --username "$COPERNICUSMARINE_SERVICE_USERNAME" \ --password "$COPERNICUSMARINE_SERVICE_PASSWORD" \ - --force-overwrite + --force-overwrite && bash /scripts/download.sh \ $(date -d "2 days ago" +%Y-%m-%d) \ - $(date +%Y-%m-%d) - chown -R 5000:5000 /data/hdd/data/NEMO + $(date +%Y-%m-%d) && + chown -R 5000:5000 /data/hdd/data/NEMO && chmod -R g+w /data/hdd/data/NEMO - ' + ' || { + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + } env: - name: COPERNICUSMARINE_SERVICE_USERNAME valueFrom: diff --git a/raw/ekman/cronjobs/norkyst/cron.yaml b/raw/ekman/cronjobs/norkyst/cron.yaml index 83313cca..154ddd61 100644 --- a/raw/ekman/cronjobs/norkyst/cron.yaml +++ b/raw/ekman/cronjobs/norkyst/cron.yaml @@ -75,10 +75,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: juselius/busynix:1.1 @@ -88,9 +88,14 @@ spec: - -c - | nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash nixpkgs.parallel - bash /scripts/download.sh - chown -R 10000:10000 /data/hdd/data/norkyst - chmod -R g+w /data/hdd/data/norkyst + if bash /scripts/download.sh; then + chown -R 10000:10000 /data/hdd/data/norkyst + chmod -R g+w /data/hdd/data/norkyst + else + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + fi resources: {} volumeMounts: - name: data diff --git a/raw/ekman/cronjobs/norshelf/cron.yaml b/raw/ekman/cronjobs/norshelf/cron.yaml index 94298f74..62c62396 100644 --- a/raw/ekman/cronjobs/norshelf/cron.yaml +++ b/raw/ekman/cronjobs/norshelf/cron.yaml @@ -67,10 +67,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: juselius/busynix:1.1 @@ -80,9 +80,14 @@ spec: - -c - | nix-env -iA nixpkgs.wget nixpkgs.coreutils nixpkgs.bash - bash /scripts/download.sh - chown -R 5000:5000 /data/hdd/data/norshelf - chmod -R g+w /data/hdd/data/norshelf + if bash /scripts/download.sh; then + chown -R 5000:5000 /data/hdd/data/norshelf + chmod -R g+w /data/hdd/data/norshelf + else + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + fi resources: {} volumeMounts: - name: data diff --git a/raw/ekman/cronjobs/nve/cron.yaml b/raw/ekman/cronjobs/nve/cron.yaml index 3761120c..eefdb472 100644 --- a/raw/ekman/cronjobs/nve/cron.yaml +++ b/raw/ekman/cronjobs/nve/cron.yaml @@ -22,10 +22,10 @@ spec: failedJobsHistoryLimit: 3 jobTemplate: spec: - backoffLimit: 10 + backoffLimit: 3 template: spec: - restartPolicy: "OnFailure" + restartPolicy: "Never" containers: - name: cronpod image: git.oceanbox.io/oceanbox/churn/riverrun:24a8bbbc-debug @@ -34,9 +34,14 @@ spec: - /bin/sh - -c - | - riverrun data --download --ndays 5000 - chown -R 5000:5000 /data/hdd/data/river-data/Data - chmod -R g+w /data/hdd/data/river-data/Data + if riverrun data --download --ndays 5000; then + chown -R 5000:5000 /data/hdd/data/river-data/Data + chmod -R g+w /data/hdd/data/river-data/Data + else + echo "Job failed, sleeping 30 minutes before retry..." + sleep 1800 + exit 1 + fi resources: {} volumeMounts: - name: data