Files
manifests/raw/ekman/cronjobs/mur/cron.yaml
T
2026-04-29 08:20:03 +02:00

178 lines
5.4 KiB
YAML

---
# ConfigMap carrying the MUR SST download script. The `mur` CronJob in this
# file mounts it as a volume at /scripts and runs /scripts/download.py.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mur-script
  namespace: cron
data:
  # Everything below this key is the literal Python source; it must stay
  # indented deeper than the key itself to remain part of the block scalar.
  download.py: |
import argparse
import os
import sys
import time
import requests
from datetime import datetime
# Command-line interface. The arguments are parsed at module import time and
# stored in the module-level `args`, which main() reads.
parser = argparse.ArgumentParser(
    description="Download MUR SST files from NASA Earthdata"
)
parser.add_argument(
    "-sd", "--start_date",
    required=True,
    help="Start date (YYYY-MM-DD)",
)
parser.add_argument(
    "-ed", "--end_date",
    required=True,
    help="End date (YYYY-MM-DD)",
)
parser.add_argument(
    "-o", "--out_dir",
    default="MUR_SST_nc",
    help="Output directory",
)
parser.add_argument(
    "-v", "--verbose",
    action="store_true",
    help="Verbose output",
)
args = parser.parse_args()
def create_session():
    """Build the HTTP session shared by all granule downloads.

    Returns:
        A ``requests.Session`` whose default headers have been extended with
        this tool's ``User-Agent`` and ``Accept-Encoding: identity``.
    """
    default_headers = {
        "User-Agent": "mur-sst-downloader",
        "Accept-Encoding": "identity",
    }
    http = requests.Session()
    http.headers.update(default_headers)
    return http
def get_download_urls(startdate, enddate, verbose=False):
    """Ask NASA CMR for granules in a date window and collect their data URLs.

    The search covers collection ``C1996881146-POCLOUD`` from
    ``startdate`` 00:00:00Z to ``enddate`` 00:00:00Z and requests a single
    page of up to 365 results.

    Args:
        startdate: Start of the window, ``YYYY-MM-DD``.
        enddate: End of the window, ``YYYY-MM-DD``.
        verbose: When true, print how many URLs were found.

    Returns:
        list: The ``URL`` of every related link whose ``Type`` is exactly
        ``"GET DATA"``, in the order CMR listed them. Granules without any
        related URLs contribute nothing.

    Raises:
        requests.HTTPError: If CMR answers with an error status.
        requests.RequestException: On connection failure or timeout.
    """
    url = (
        "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
        "?collection_concept_id=C1996881146-POCLOUD"
        f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
        "&pageSize=365"
    )
    # Without a timeout an unresponsive CMR endpoint would block the job
    # forever; 60 s matches the timeout used for the file downloads.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    data = r.json()
    urls = []
    for item in data.get("items") or []:
        # Some granule records may lack RelatedUrls; treat that as "no links"
        # instead of aborting the whole run with a KeyError.
        related = (item.get("umm") or {}).get("RelatedUrls") or []
        for link in related:
            # Prefer direct HTTPS download links
            if link.get("Type") == "GET DATA" and link.get("URL"):
                urls.append(link["URL"])
    if verbose:
        print(f"Found {len(urls)} files")
    return urls
def download_file(session, url, out_dir, verbose=False, retries=3):
    """Download one URL into ``out_dir``, retrying with exponential backoff.

    The target name is the basename of ``url``. If a file with that name
    already exists in ``out_dir`` the download is skipped. Data is streamed
    into ``<name>.part`` and renamed to ``<name>`` only after the transfer
    finished, so the final path never holds a truncated file that a later run
    would mistake for a completed download.

    Args:
        session: HTTP session providing a requests-style ``get()``.
        url: Address of the file to fetch.
        out_dir: Existing directory to store the file in.
        verbose: When true, print progress messages.
        retries: Maximum number of attempts.

    Returns:
        None. Failures are not raised; after the last attempt a
        ``Failed: ...`` line is printed and the function returns.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)
    partial_path = local_path + ".part"
    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return
    for attempt in range(retries):
        try:
            if verbose:
                print(f"Downloading ({attempt+1}/{retries}): {filename}")
            with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
                if r.status_code == 401:
                    raise Exception("Unauthorized (check .netrc credentials)")
                r.raise_for_status()
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            # Publish the file under its real name only once it is complete.
            os.replace(partial_path, local_path)
            if verbose:
                print(f"Saved: {filename}")
            return
        except Exception as e:
            # Drop whatever was written so far; a leftover fragment must not
            # survive into the next attempt or the next run.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            if attempt == retries - 1:
                print(f"Failed: {filename} -> {e}")
            else:
                # 5 s, 10 s, 20 s, ... capped at one minute.
                backoff = min(2 ** attempt * 5, 60)
                if verbose:
                    print(f"Retrying {filename} in {backoff}s... ({e})")
                time.sleep(backoff)
def validate_dates(start, end):
    """Check the two CLI dates and terminate the process if they are unusable.

    Args:
        start: Start date string, expected as ``YYYY-MM-DD``.
        end: End date string, expected as ``YYYY-MM-DD``.

    Returns:
        None when both dates parse and ``start`` is not after ``end``.

    Raises:
        SystemExit: With exit code 1 after printing an error message, when a
        date does not match ``%Y-%m-%d`` or when the range is reversed.
    """
    try:
        start_day = datetime.strptime(start, "%Y-%m-%d")
        end_day = datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        print("Error: Dates must be in YYYY-MM-DD format")
        sys.exit(1)
    # A reversed window can only produce a failing or empty search, so reject
    # it here with a clear message instead of sending it to the server.
    if start_day > end_day:
        print("Error: Start date must not be after end date")
        sys.exit(1)
def main():
    """Script entry point.

    Validates the dates from the parsed command line, looks up the matching
    download URLs and fetches each of them into the output directory through
    a single shared session.
    """
    first_day, last_day = args.start_date, args.end_date
    chatty = args.verbose

    validate_dates(first_day, last_day)
    # Directory creation was left disabled in the original script.
    # NOTE(review): confirm the output directory is guaranteed to exist.
    # os.makedirs(args.out_dir, exist_ok=True)
    http = create_session()
    for granule_url in get_download_urls(first_day, last_day, chatty):
        download_file(http, granule_url, args.out_dir, chatty)
    print(f"\nDone. Downloaded files to: {args.out_dir}")


if __name__ == "__main__":
    main()
---
# CronJob that runs /scripts/download.py once a day for the window
# "3 days ago .. today", then hands the output tree to uid/gid 5000 and makes
# it group-writable.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mur
  namespace: cron
spec:
  schedule: "0 6 * * *"  # Everyday at 06:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              # The exit status of the Python script is saved before the
              # chown/chmod fix-up and returned afterwards. Otherwise the
              # container would exit with the status of `chmod`, and a crashed
              # download would be recorded as a successful Job.
              command:
                - /bin/sh
                - -c
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                    python3 /scripts/download.py \
                      -sd $(date -d "3 days ago" +%Y-%m-%d) \
                      -ed $(date +%Y-%m-%d) \
                      -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                      -v
                    status=$?
                    chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
                    chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
                    exit "$status"
                  '
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Single-file mount of the Earthdata credentials; the script's
                # 401 error message points at this .netrc.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
              # NOTE(review): the original nesting of this empty block
              # (container vs. pod level) could not be recovered; being empty
              # it has no effect at either level.
              securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                # Must stay an integer (octal in YAML); quoting it would turn
                # it into a string.
                defaultMode: 0755
            - name: netrc
              secret:
                secretName: mur-netrc