feat(sorcerer): Add raid-data pvc and cronjobs
This commit is contained in:
@@ -0,0 +1,174 @@
|
||||
# ConfigMap "mur-script" in namespace "cron". Its single data key, download.py,
# holds the Python download script. The "mur" CronJob in this same manifest
# mounts this ConfigMap as the volume "script" at /scripts and runs
# python3 /scripts/download.py.
apiVersion: v1
|
||||
kind: ConfigMap
|
||||
metadata:
|
||||
name: mur-script
|
||||
namespace: cron
|
||||
data:
|
||||
# The block scalar that follows is the full text of the script.
download.py: |
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
from datetime import datetime
|
||||
|
||||
# Command-line interface. Parsing happens at module level, so `args` is
# available to main() as a module global.
parser = argparse.ArgumentParser(
    description="Download MUR SST files from NASA Earthdata",
)
parser.add_argument(
    "-sd",
    "--start_date",
    required=True,
    help="Start date (YYYY-MM-DD)",
)
parser.add_argument(
    "-ed",
    "--end_date",
    required=True,
    help="End date (YYYY-MM-DD)",
)
parser.add_argument(
    "-o",
    "--out_dir",
    default="MUR_SST_nc",
    help="Output directory",
)
parser.add_argument(
    "-v",
    "--verbose",
    action="store_true",
    help="Verbose output",
)
args = parser.parse_args()
|
||||
|
||||
|
||||
def create_session():
    """Return a ``requests.Session`` preconfigured for the file downloads.

    The session identifies itself as ``mur-sst-downloader`` and sends
    ``Accept-Encoding: identity`` with every request.
    """
    default_headers = {
        "User-Agent": "mur-sst-downloader",
        "Accept-Encoding": "identity",
    }
    http = requests.Session()
    http.headers.update(default_headers)
    return http
|
||||
|
||||
|
||||
def get_download_urls(startdate, enddate, verbose=False):
    """Query the CMR granule search and return the data download URLs.

    A single request is made for the collection ``C1996881146-POCLOUD``
    with a temporal window running from 00:00Z on ``startdate`` to 00:00Z
    on ``enddate``.

    Args:
        startdate: Start of the search window, formatted ``YYYY-MM-DD``.
        enddate: End of the search window, formatted ``YYYY-MM-DD``.
        verbose: When true, print how many URLs were found.

    Returns:
        A list with the ``URL`` of every ``RelatedUrls`` entry whose
        ``Type`` is ``"GET DATA"``, in response order.

    Raises:
        requests.RequestException: If the request fails, times out, or
            CMR answers with an HTTP error status.
    """
    # NOTE(review): only one page (pageSize=365) is requested; results beyond
    # the first page would not be returned. Confirm this is enough for the
    # date ranges used.
    url = (
        "https://cmr.earthdata.nasa.gov/search/granules.umm_json"
        "?collection_concept_id=C1996881146-POCLOUD"
        f"&temporal={startdate}T00:00:00Z,{enddate}T00:00:00Z"
        "&pageSize=365"
    )

    # Without a timeout a stalled connection would block the job forever.
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    data = r.json()

    urls = [
        link["URL"]
        for item in data["items"]
        # A granule record without RelatedUrls contributes no links instead
        # of aborting the whole run with a KeyError.
        for link in item["umm"].get("RelatedUrls", [])
        if link.get("Type") == "GET DATA"
    ]

    if verbose:
        print(f"Found {len(urls)} files")

    return urls
|
||||
|
||||
|
||||
def download_file(session, url, out_dir, verbose=False, retries=3):
    """Download ``url`` into ``out_dir``, retrying on failure.

    The response body is streamed to ``<filename>.part`` and only renamed to
    its final name once the transfer has completed. An interrupted transfer
    therefore never leaves a truncated file under the final name, which the
    "skip existing" check would otherwise treat as complete on later runs.

    Args:
        session: Object with a ``requests.Session``-style ``get`` method.
        url: Address of the file; its basename becomes the local filename.
        out_dir: Existing directory the file is written to.
        verbose: When true, print progress messages.
        retries: Maximum number of download attempts.

    Returns:
        None. Failures are not raised: after the last attempt the error is
        printed as ``Failed: <filename> -> <error>``.
    """
    filename = os.path.basename(url)
    local_path = os.path.join(out_dir, filename)
    partial_path = local_path + ".part"

    if os.path.exists(local_path):
        if verbose:
            print(f"Skipping existing: {filename}")
        return

    for attempt in range(retries):
        try:
            if verbose:
                print(f"Downloading ({attempt+1}/{retries}): {filename}")

            with session.get(url, stream=True, allow_redirects=True, timeout=60) as r:
                if r.status_code == 401:
                    raise Exception("Unauthorized (check .netrc credentials)")

                r.raise_for_status()

                # "wb" truncates, so a stale .part from an earlier run is
                # simply overwritten.
                with open(partial_path, "wb") as f:
                    for chunk in r.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)

            # Publish the file under its real name only once it is complete.
            os.replace(partial_path, local_path)

            if verbose:
                print(f"Saved: {filename}")
            return

        # Deliberately broad: this is the retry boundary, and any failure
        # (network, HTTP status, disk) should lead to another attempt.
        except Exception as e:
            try:
                os.remove(partial_path)
            except OSError:
                pass  # nothing was written, or it is already gone

            if attempt == retries - 1:
                print(f"Failed: {filename} -> {e}")
            else:
                if verbose:
                    print(f"Retrying {filename}... ({e})")
|
||||
|
||||
|
||||
def validate_dates(start, end):
    """Exit with status 1 unless both dates parse and are correctly ordered.

    Args:
        start: Start date string, expected as ``YYYY-MM-DD``.
        end: End date string, expected as ``YYYY-MM-DD``.

    Returns:
        None when both dates are valid and ``start`` is not after ``end``.

    Raises:
        SystemExit: With code 1 after printing an error message, if either
            date fails to parse or ``start`` is later than ``end``.
    """
    try:
        start_dt = datetime.strptime(start, "%Y-%m-%d")
        end_dt = datetime.strptime(end, "%Y-%m-%d")
    except ValueError:
        print("Error: Dates must be in YYYY-MM-DD format")
        sys.exit(1)

    # A reversed range is a caller mistake; report it here rather than
    # sending it on to the search request.
    if start_dt > end_dt:
        print("Error: Start date must not be after end date")
        sys.exit(1)
|
||||
|
||||
|
||||
def main():
    """Validate the requested dates, look up the file URLs, and fetch each one.

    All options come from the module-level ``args`` namespace.
    """
    first_day, last_day = args.start_date, args.end_date
    validate_dates(first_day, last_day)

    # NOTE: nothing in this script creates the output directory; the call
    # below is disabled.
    # os.makedirs(args.out_dir, exist_ok=True)

    target_dir = args.out_dir
    be_verbose = args.verbose

    http = create_session()
    for granule_url in get_download_urls(first_day, last_day, be_verbose):
        download_file(http, granule_url, target_dir, be_verbose)

    print(f"\nDone. Downloaded files to: {args.out_dir}")


if __name__ == "__main__":
    main()
|
||||
---
|
||||
# CronJob "mur": runs /scripts/download.py (from the mur-script ConfigMap)
# once a day for the window "3 days ago" .. today, writing into the
# ekman-data PVC mounted at /data, then fixes ownership and group write
# permission on the output directory.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mur
  namespace: cron
spec:
  schedule: "0 6 * * *" # Everyday at 06:00, use https://crontab.guru
  concurrencyPolicy: "Forbid"
  successfulJobsHistoryLimit: 10
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      backoffLimit: 10
      template:
        spec:
          restartPolicy: "OnFailure"
          containers:
            - name: cronpod
              image: juselius/busynix:1.1
              imagePullPolicy: IfNotPresent
              # The exit status of download.py is saved and returned after the
              # chown/chmod steps. Without this the container would exit with
              # the status of the final chmod, so a failed download would be
              # reported as success and backoffLimit / restartPolicy would
              # never apply.
              command:
                - /bin/sh
                - -c
                - |
                  nix-shell -p 'python3.withPackages(ps: [ps.requests])' coreutils --run '
                  python3 /scripts/download.py \
                  -sd $(date -d "3 days ago" +%Y-%m-%d) \
                  -ed $(date +%Y-%m-%d) \
                  -o /data/hdd/data/river-data/MUR/MUR_SST_nc \
                  -v
                  status=$?
                  chown -R 5000:5000 /data/hdd/data/river-data/MUR/MUR_SST_nc
                  chmod -R g+w /data/hdd/data/river-data/MUR/MUR_SST_nc
                  exit $status
                  '
              resources: {}
              volumeMounts:
                - name: data
                  mountPath: /data
                - name: script
                  mountPath: /scripts
                # Single file from the secret, mounted as /root/.netrc.
                - name: netrc
                  mountPath: /root/.netrc
                  subPath: .netrc
                  readOnly: true
          securityContext: {}
          volumes:
            - name: data
              persistentVolumeClaim:
                claimName: ekman-data
            - name: script
              configMap:
                name: mur-script
                defaultMode: 0755
            - name: netrc
              secret:
                secretName: mur-netrc
|
||||
Reference in New Issue
Block a user