# ConfigMap carrying the MUR SST download script. The `mur` CronJob in the
# next document mounts this ConfigMap at /scripts and runs
# /scripts/download.py from there.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mur-script
  namespace: cron
data:
  # Literal block scalar: every line indented below this key is the content of
  # the Python file, so YAML comments cannot be used inside it.
  download.py: |
    import argparse
    import os
    import sys
    import requests
    from datetime import datetime

    # Command-line interface. Parsing runs at module level, so `args` is a
    # module global that main() reads directly.
    parser = argparse.ArgumentParser(
        description="Download MUR SST files from NASA Earthdata",
    )
    # Both ends of the search window are mandatory.
    parser.add_argument(
        "-sd", "--start_date",
        required=True,
        help="Start date (YYYY-MM-DD)",
    )
    parser.add_argument(
        "-ed", "--end_date",
        required=True,
        help="End date (YYYY-MM-DD)",
    )
    # Optional settings: target directory and progress output.
    parser.add_argument(
        "-o", "--out_dir",
        default="MUR_SST_nc",
        help="Output directory",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Verbose output",
    )
    args = parser.parse_args()


    def create_session():
        """Build the HTTP session used for the file downloads.

        Returns:
            requests.Session: a new session whose headers include the
            "mur-sst-downloader" User-Agent and "Accept-Encoding: identity".
        """
        http = requests.Session()
        # Set the two headers individually; any other headers already present
        # on the new session are left untouched.
        http.headers["User-Agent"] = "mur-sst-downloader"
        # "identity" is the HTTP way of asking for an unencoded response body.
        http.headers["Accept-Encoding"] = "identity"
        return http


    def get_download_urls(startdate, enddate, verbose=False, timeout=60):
        """Ask NASA CMR for granules in a date window and collect their download links.

        Args:
            startdate: First day of the window, "YYYY-MM-DD". The search
                starts at 00:00:00Z of that day.
            enddate: Last day of the window, "YYYY-MM-DD". The search ends at
                00:00:00Z of that day.
            verbose: When true, print how many links were found.
            timeout: Seconds to wait on the CMR request before giving up.

        Returns:
            list[str]: The URL of every related link whose Type is exactly
            "GET DATA", in the order the response lists them. Granules without
            related URLs contribute nothing.

        Raises:
            requests.RequestException: If the request fails, times out, or CMR
                answers with an error status (via raise_for_status).
        """
        url = (
            "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
            "?collection_concept_id=C1996881146-POCLOUD"
            f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
            "&pageSize=365"
        )

        # Without a timeout a stalled connection would block this call (and
        # therefore the whole job) indefinitely.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        data = r.json()

        items = data.get("items", [])

        # Keep only direct download links; tolerate granules that carry no
        # "RelatedUrls" entry instead of aborting the whole run.
        urls = [
            link["URL"]
            for item in items
            for link in (item.get("umm", {}).get("RelatedUrls") or [])
            if link.get("Type") == "GET DATA" and link.get("URL")
        ]

        # Only one page is requested. If the response says more granules
        # matched than were returned, say so rather than silently dropping
        # the remainder.
        hits = data.get("hits")
        if isinstance(hits, int) and hits > len(items):
            print(
                f"Warning: CMR matched {hits} granules but returned only "
                f"{len(items)}; use a shorter date range"
            )

        if verbose:
            print(f"Found {len(urls)} files")

        return urls


    def download_file(session, url, out_dir, verbose=False):
        """Download one URL into out_dir unless the file is already there.

        The body is streamed into "<name>.part" and renamed to its final name
        only after the transfer finished. An interrupted download therefore
        never leaves a truncated file under the final name, which a later run
        would otherwise skip as "existing".

        Args:
            session: Object with a requests-style ``get`` whose response
                works as a context manager.
            url: Address of the file; its last path component becomes the
                local file name.
            out_dir: Directory to store the file in (not created here).
            verbose: When true, print progress messages.

        Returns:
            bool: True if the file already existed or was downloaded, False on
            any failure. Failures are printed, never raised.
        """
        filename = os.path.basename(url)
        if not filename:
            # A URL ending in "/" has no file name; joining it would make the
            # output directory itself look like an existing download.
            print(f"Failed: {url} -> URL has no file name")
            return False

        local_path = os.path.join(out_dir, filename)
        partial_path = local_path + ".part"

        if os.path.exists(local_path):
            if verbose:
                print(f"Skipping existing: {filename}")
            return True

        if verbose:
            print(f"Downloading: {filename}")

        try:
            with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
                if r.status_code == 401:
                    raise Exception("Unauthorized (check .netrc credentials)")

                r.raise_for_status()

                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Publish under the final name only once every byte is on disk.
            os.replace(partial_path, local_path)

            if verbose:
                print(f"Saved: {filename}")
            return True

        except Exception as e:
            print(f"Failed: {filename} -> {e}")
            # Best-effort cleanup of the incomplete temp file; it may not
            # exist if the failure happened before anything was written.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            return False


    def validate_dates(start, end):
        """Exit with status 1 unless both dates parse as YYYY-MM-DD.

        Args:
            start: Start date string.
            end: End date string.

        The dates are checked in order (start first); on the first one that
        does not parse, an error line is printed and the process exits.
        Nothing is returned when both are valid.
        """
        for candidate in (start, end):
            try:
                datetime.strptime(candidate, "%Y-%m-%d")
            except ValueError:
                print("Error: Dates must be in YYYY-MM-DD format")
                sys.exit(1)


    def main():
        """Run the download for the window given on the command line.

        Reads the module-level ``args``: validates the two dates, looks up the
        download URLs, and tries to fetch each one into ``args.out_dir``.
        Exits with status 1 if any download failed; otherwise prints a
        completion message.
        """
        validate_dates(args.start_date, args.end_date)

        # NOTE(review): directory creation is commented out, so out_dir has to
        # exist before the script runs -- confirm this is intended.
        # os.makedirs(args.out_dir, exist_ok=True)

        http = create_session()
        granule_urls = get_download_urls(args.start_date, args.end_date, args.verbose)

        # Attempt every URL even after a failure; the list is built in full
        # before the results are inspected.
        outcomes = [
            download_file(http, granule_url, args.out_dir, args.verbose)
            for granule_url in granule_urls
        ]

        if not all(outcomes):
            sys.exit(1)
        print(f"\nDone. Downloaded files to: {args.out_dir}")


    # Run the download only when this file is executed as a script, not when
    # it is imported.
    if __name__ == "__main__":
        main()
---
# CronJob that runs /scripts/download.py from the mur-script ConfigMap and
# stores the results on the ekman-data volume.
apiVersion: batch/v1
kind: CronJob
metadata:
  namespace: cron
  name: mur
spec:
  # Every day at 06:00; https://crontab.guru helps when editing the expression.
  schedule: "0 6 * * *"
  # Never start a new run while the previous one is still active.
  concurrencyPolicy: Forbid
  failedJobsHistoryLimit: 3
  successfulJobsHistoryLimit: 10
  jobTemplate:
    spec:
      backoffLimit: 3
      template:
        spec:
          restartPolicy: Never
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                # Octal notation: rwxr-xr-x.
                defaultMode: 0755
            - name: netrc
              secret:
                secretName: mur-netrc
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              resources: {}
              securityContext: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Only the .netrc key of the secret is mounted, as a single
                # read-only file at /root/.netrc.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
              # The script below runs inside a nix-shell providing Python 3
              # with `requests` plus coreutils. It downloads the window from
              # three days ago until today, then (only if the download
              # succeeded) hands the output tree to 5000:5000 and makes it
              # group-writable. On any failure the container sleeps 30
              # minutes and exits 1.
              command:
                - /bin/sh
                - -c
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                    python3 /scripts/download.py \
                      -sd $(date -d "3 days ago" +%Y-%m-%d) \
                      -ed $(date +%Y-%m-%d) \
                      -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                      -v &&
                    chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc &&
                    chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
                  ' || {
                    echo "Job failed, sleeping 30 minutes before retry..."
                    sleep 1800
                    exit 1
                  }