# ConfigMap holding the MUR SST download script. The CronJob below mounts it
# at /scripts and runs it once a day.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mur-script
  namespace: cron
data:
  download.py: |
    """Download MUR SST granules from NASA Earthdata for a date range.

    Granule URLs are looked up through the CMR granule search API and each
    file is streamed into the output directory. Files that already exist
    locally are skipped, so the script can be re-run over overlapping date
    ranges.
    """
    import argparse
    import os
    import sys
    from datetime import datetime

    import requests

    # Seconds to wait on a stalled connection before giving up, so a hung
    # request cannot block the job forever.
    REQUEST_TIMEOUT = 60


    def parse_args():
        """Parse and return the command-line arguments."""
        parser = argparse.ArgumentParser(description="Download MUR SST files from NASA Earthdata")
        parser.add_argument("-sd", "--start_date", required=True, help="Start date (YYYY-MM-DD)")
        parser.add_argument("-ed", "--end_date", required=True, help="End date (YYYY-MM-DD)")
        parser.add_argument("-o", "--out_dir", default="MUR_SST_nc", help="Output directory")
        parser.add_argument("-v", "--verbose", action="store_true", help="Verbose output")
        return parser.parse_args()


    def create_session():
        """Return a requests session carrying the downloader's default headers.

        No credentials are configured here. When no explicit auth is given,
        requests looks up credentials for the target host in the user's
        .netrc file.
        """
        session = requests.Session()
        session.headers.update({
            "User-Agent": "mur-sst-downloader",
            "Accept-Encoding": "identity"
        })
        return session


    def get_download_urls(startdate, enddate, verbose=False):
        """Return the download URLs of all granules in the given date range.

        Args:
            startdate: First day of the range, formatted YYYY-MM-DD.
            enddate: Last day of the range, formatted YYYY-MM-DD. The query
                window ends at 00:00:00Z on this day.
            verbose: When true, print how many files were found.

        Returns:
            A list of URL strings, one per "GET DATA" link in the results.

        Raises:
            requests.RequestException: If the search request fails, times
                out, or returns an HTTP error status.
        """
        url = (
            "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
            f"?collection_concept_id=C1996881146-POCLOUD"
            f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
            "&pageSize=365"
        )

        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        data = r.json()

        urls = []
        for item in data.get("items", []):
            # A granule without any related URLs is skipped rather than
            # aborting the whole lookup with a KeyError.
            for link in item["umm"].get("RelatedUrls", []):
                # Prefer direct HTTPS download links
                if link.get("Type") == "GET DATA":
                    urls.append(link["URL"])

        if verbose:
            print(f"Found {len(urls)} files")

        return urls


    def download_file(session, url, out_dir, verbose=False):
        """Download one file into out_dir, skipping it if already present.

        The data is streamed into a temporary ".part" file that is renamed to
        its final name only after the transfer completes. An interrupted
        download therefore never leaves a truncated file under the final
        name, where a later run would mistake it for a finished download and
        skip it.

        Args:
            session: The requests session used for the transfer.
            url: URL of the file to download.
            out_dir: Existing directory that receives the file.
            verbose: When true, print progress messages.

        Returns:
            True if the file already existed or was downloaded successfully,
            False if the download failed.
        """
        filename = os.path.basename(url)
        local_path = os.path.join(out_dir, filename)
        tmp_path = local_path + ".part"

        if os.path.exists(local_path):
            if verbose:
                print(f"Skipping existing: {filename}")
            return True

        if verbose:
            print(f"Downloading: {filename}")

        try:
            with session.get(url, stream=True, allow_redirects=True, timeout=REQUEST_TIMEOUT) as r:
                if r.status_code == 401:
                    raise Exception("Unauthorized (check .netrc credentials)")
                r.raise_for_status()

                with open(tmp_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Publish the file under its final name only once it is complete.
            os.replace(tmp_path, local_path)
        except Exception as e:
            # Per-file boundary: report the failure and let the caller carry
            # on with the remaining files.
            print(f"Failed: {filename} -> {e}")
            try:
                os.remove(tmp_path)
            except OSError:
                pass
            return False

        if verbose:
            print(f"Saved: {filename}")
        return True


    def validate_dates(start, end):
        """Exit with status 1 unless both dates are valid and in order.

        Args:
            start: Start date string, expected as YYYY-MM-DD.
            end: End date string, expected as YYYY-MM-DD.
        """
        try:
            start_dt = datetime.strptime(start, "%Y-%m-%d")
            end_dt = datetime.strptime(end, "%Y-%m-%d")
        except ValueError:
            print("Error: Dates must be in YYYY-MM-DD format")
            sys.exit(1)

        if start_dt > end_dt:
            print("Error: Start date must not be after end date")
            sys.exit(1)


    def main():
        """Run the download and exit non-zero if any file failed."""
        args = parse_args()
        validate_dates(args.start_date, args.end_date)

        # The output directory is deliberately not created here.
        # NOTE(review): presumably so that a missing or unmounted volume fails
        # the job instead of silently writing somewhere else -- confirm.
        if not os.path.isdir(args.out_dir):
            print(f"Error: Output directory does not exist: {args.out_dir}")
            sys.exit(1)

        session = create_session()
        urls = get_download_urls(args.start_date, args.end_date, args.verbose)

        failed = False
        for url in urls:
            if not download_file(session, url, args.out_dir, args.verbose):
                failed = True

        if failed:
            sys.exit(1)

        print(f"\nDone. Downloaded files to: {args.out_dir}")


    if __name__ == "__main__":
        main()
---
# Daily job that downloads the last three days of MUR SST files onto the
# shared data volume and then fixes ownership and group write permission.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mur
  namespace: cron
spec:
  schedule: "0 6 * * *" # Everyday at 06:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 3
      template:
        spec:
          restartPolicy: "Never"
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              command:
                - /bin/sh
                - -c
                # If the download or the permission fix fails, wait 30 minutes
                # before exiting non-zero, so the next retry of the Job is
                # delayed instead of starting right away.
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                    python3 /scripts/download.py \
                      -sd $(date -d "3 days ago" +%Y-%m-%d) \
                      -ed $(date +%Y-%m-%d) \
                      -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                      -v &&
                    chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
                    chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
                  ' || {
                    echo "Job failed, sleeping 30 minutes before retry..."
                    sleep 1800
                    exit 1
                  }
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Earthdata credentials, mounted as root's .netrc.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
              securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                defaultMode: 0755
            - name: netrc
              secret:
                secretName: mur-netrc