178 lines
5.4 KiB
YAML
178 lines
5.4 KiB
YAML
# ConfigMap that carries the MUR SST download script under the key
# "download.py".
apiVersion: v1
kind: ConfigMap
metadata:
  name: mur-script
  namespace: cron
data:
  # Literal block scalar: the Python source follows as the value of this key.
  download.py: |
|
|
import argparse
|
|
import os
|
|
import sys
|
|
import time
|
|
import requests
|
|
from datetime import datetime
|
|
|
|
# Command-line interface. Parsing runs at import time, so ``args`` is a
# module-level namespace that main() reads directly.
parser = argparse.ArgumentParser(
    description="Download MUR SST files from NASA Earthdata",
)
parser.add_argument(
    "-sd",
    "--start_date",
    required=True,
    help="Start date (YYYY-MM-DD)",
)
parser.add_argument(
    "-ed",
    "--end_date",
    required=True,
    help="End date (YYYY-MM-DD)",
)
parser.add_argument(
    "-o",
    "--out_dir",
    default="MUR_SST_nc",
    help="Output directory",
)
parser.add_argument(
    "-v",
    "--verbose",
    action="store_true",
    help="Verbose output",
)
args = parser.parse_args()
|
|
|
|
|
|
def create_session():
    """Build the HTTP session shared by all file downloads.

    Returns:
        A ``requests.Session`` whose default headers identify the client as
        ``mur-sst-downloader`` and request an unencoded response body.
    """
    default_headers = {
        "User-Agent": "mur-sst-downloader",
        # "identity" asks the server not to apply a content encoding.
        "Accept-Encoding": "identity",
    }
    http = requests.Session()
    http.headers.update(default_headers)
    return http
|
|
|
|
|
|
def get_download_urls(startdate, enddate, verbose=False):
    """Query NASA CMR for granules in a date window and return their data URLs.

    Args:
        startdate: Start of the search window, ``YYYY-MM-DD``.
        enddate: End of the search window, ``YYYY-MM-DD``.
        verbose: When true, print how many URLs were found.

    Returns:
        A list with the ``URL`` of every ``RelatedUrls`` entry whose ``Type``
        is ``"GET DATA"``, in the order CMR returned them.

    Raises:
        requests.HTTPError: CMR answered with an error status.
        requests.RequestException: The request failed or timed out.
    """
    # NOTE(review): a single page of at most 365 results is requested and no
    # further pages are fetched, so a window matching more granules than that
    # would be silently truncated.
    url = (
        "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
        "?collection_concept_id=C1996881146-POCLOUD"
        f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
        "&pageSize=365"
    )

    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would hang the whole job.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    data = r.json()

    urls = []
    for item in data.get("items") or []:
        # Granules without related URLs are skipped instead of raising KeyError.
        related = (item.get("umm") or {}).get("RelatedUrls") or []
        for link in related:
            if link.get("Type") == "GET DATA" and link.get("URL"):
                urls.append(link["URL"])

    if verbose:
        print(f"Found {len(urls)} files")

    return urls
|
|
|
|
|
|
def download_file(session, url, out_dir, verbose=False, retries=3):
    """Download ``url`` into ``out_dir``, retrying with exponential backoff.

    The body is streamed into a sibling ``.part`` file and moved onto the
    final path only after it has been written completely. An interrupted
    transfer therefore never leaves a truncated file under the final name,
    which later runs would mistake for a finished download and skip.

    Args:
        session: Object offering a ``requests.Session``-style ``get``.
        url: File URL; its basename becomes the local file name.
        out_dir: Directory the file is written into; it must already exist.
        verbose: When true, print progress messages.
        retries: Maximum number of attempts.

    Returns:
        True when the file is present locally afterwards (it already existed
        or was downloaded), False when every attempt failed.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)
    partial_path = local_path + ".part"

    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return True

    for attempt in range(retries):
        try:
            if verbose:
                print(f"Downloading ({attempt+1}/{retries}): {filename}")

            with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
                if r.status_code == 401:
                    raise RuntimeError("Unauthorized (check .netrc credentials)")

                r.raise_for_status()

                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Promote the finished file in one step; os.replace overwrites
            # atomically when source and target share a filesystem.
            os.replace(partial_path, local_path)

            if verbose:
                print(f"Saved: {filename}")
            return True

        # Deliberately broad: one bad file must not abort the remaining
        # downloads; the error is reported and the attempt retried.
        except Exception as e:
            # Drop whatever was written so no truncated data lingers.
            try:
                os.remove(partial_path)
            except OSError:
                pass

            if attempt == retries - 1:
                print(f"Failed: {filename} -> {e}")
            else:
                backoff = min(2 ** attempt * 5, 60)
                if verbose:
                    print(f"Retrying {filename} in {backoff}s... ({e})")
                time.sleep(backoff)

    return False
|
|
|
|
|
|
def validate_dates(start, end):
    """Exit with status 1 unless both dates parse and are in order.

    Args:
        start: Start date string, expected as ``YYYY-MM-DD``.
        end: End date string, expected as ``YYYY-MM-DD``.

    Returns:
        None when both dates are valid and ``start`` is not after ``end``.

    Raises:
        SystemExit: With code 1 after printing an error message, when either
            date does not match the format or ``start`` is later than ``end``.
    """
    try:
        first = datetime.strptime(start, "%Y-%m-%d")
        last = datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        print("Error: Dates must be in YYYY-MM-DD format")
        sys.exit(1)

    # Reject a reversed window here instead of letting it surface later as an
    # HTTP error from the search request.
    if first > last:
        print("Error: start date must not be after end date")
        sys.exit(1)
|
|
|
|
|
|
def main():
    """Validate the CLI dates, look up the granule URLs and download each one.

    Reads its settings from the module-level ``args`` namespace.
    """
    validate_dates(args.start_date, args.end_date)

    # download_file opens files inside out_dir and fails for every URL when
    # the directory is missing, so make sure it exists before starting.
    os.makedirs(args.out_dir, exist_ok=True)

    urls = get_download_urls(args.start_date, args.end_date, args.verbose)

    # The context manager closes the session's connections when done.
    with create_session() as session:
        for url in urls:
            download_file(session, url, args.out_dir, args.verbose)

    print(f"\nDone. Downloaded files to: {args.out_dir}")


if __name__ == "__main__":
    main()
|
|
---
|
|
# CronJob that runs /scripts/download.py (from the "mur-script" ConfigMap)
# for the last three days and then fixes ownership and permissions of the
# output directory.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mur
  namespace: cron
spec:
  schedule: "0 6 * * *"  # every day at 06:00; see https://crontab.guru
  concurrencyPolicy: "Forbid"  # never start a run while the previous one is active
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              command:
                - /bin/sh
                - -c
                # The commands inside --run are separated by newlines only, so
                # chown and chmod run regardless of the Python exit status.
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                    python3 /scripts/download.py \
                      -sd $(date -d "3 days ago" +%Y-%m-%d) \
                      -ed $(date +%Y-%m-%d) \
                      -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                      -v
                    chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
                    chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
                  '
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Only the .netrc key of the secret is mounted, as a single file.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
          # NOTE(review): indentation was lost in the source; securityContext is
          # assumed to be pod-level (it sits next to "volumes") - confirm.
          securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                defaultMode: 0755  # octal file mode for the mounted script
            - name: netrc
              secret:
                secretName: mur-netrc
|