From abd788ac36471182f4c56d5bd6d0c3264806b24f Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 15 Apr 2025 13:31:16 -0500 Subject: [PATCH 01/32] initial --- parm/config.orion.yaml | 7 + ush/python/pyobsforge/obsdb/amsr2_db.py | 91 ++++++++++++ ush/python/pyobsforge/obsdb/obsdb.py | 5 +- ush/python/pyobsforge/task/marine_prepobs.py | 23 +++ ush/python/pyobsforge/task/providers.py | 3 + .../pyobsforge/tests/test_amsr2_database.py | 132 ++++++++++++++++++ 6 files changed, 260 insertions(+), 1 deletion(-) create mode 100644 ush/python/pyobsforge/obsdb/amsr2_db.py create mode 100644 ush/python/pyobsforge/tests/test_amsr2_database.py diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index 98108e3b..ac36db3a 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -55,6 +55,13 @@ marinedump: min: -2.0 max: 3.0 error ratio: 1.0 + rads: + list: + - AMSR2-SEAICE-NH + - AMSR2-SEAICE-SH + qc config: + min: 0.0 + max: 1.0 WALLTIME_MARINE_DUMP: '00:10:00' TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py new file mode 100644 index 00000000..1ac3d1eb --- /dev/null +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -0,0 +1,91 @@ +import os +import glob +from datetime import datetime +from pyobsforge.obsdb import BaseDatabase + + +class Amsr2Database(BaseDatabase): + """Class to manage an observation file database for data assimilation.""" + + def __init__(self, db_name="amsr2.db", + dcom_dir="/lfs/h1/ops/prod/dcom/", + obs_dir="seaice/pda"): + base_dir = os.path.join(dcom_dir, '*', obs_dir) + super().__init__(db_name, base_dir) + + def create_database(self): + """ + Create the SQLite database and observation files table. + + This method initializes the database with a table named `obs_files` to store metadata + about observation files. The table contains the following columns: + + - `id`: A unique identifier for each record (auto-incremented primary key). + - `filename`: The full path to the observation file (must be unique). + - `obs_time`: The timestamp of the observation, extracted from the filename. + - `receipt_time`: The timestamp when the file was added to the `dcom` directory. + - `satellite`: The satellite from which the observation was collected (e.g., GW1). + + The table is created if it does not already exist. + """ + query = """ + CREATE TABLE IF NOT EXISTS obs_files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filename TEXT UNIQUE, + obs_time TIMESTAMP, + receipt_time TIMESTAMP, + satellite TEXT + ) + """ + self.execute_query(query) + +# def parse_filename(self, filename): +# """Extract metadata from AMSR2 SEAICE filenames.""" +# # Pattern: AMSR2-SEAICE-NH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc +# basename = os.path.basename(filename) +# parts = basename.split('_') +# try: +# if len(parts) >= 4 and parts[0] == "AMSR2-SEAICE-?H": +# satellite = parts[2] +# obs_time = datetime.strptime(parts[3][1:13], "%Y%m%d%H%M") +# receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + def parse_filename(self, filename): + basename = os.path.basename(filename) + parts = basename.split('_') + try: + if len(parts) >= 5 and parts[0].startswith("AMSR2-SEAICE"): + satellite = parts[2] # e.g., "GW1" + obs_time = datetime.strptime(parts[3][1:15], "%Y%m%d%H%M%S") # e.g., s202503160653240 + receipt_time_str = parts[5][1:15] # e.g., c202503160902250 + + if len(receipt_time_str) == 14: + receipt_time_str += "000000" # add microseconds if missing + + receipt_time = datetime.strptime(receipt_time_str, "%Y%m%d%H%M%S%f") + return filename, obs_time, receipt_time, satellite + except ValueError as e: + print(f"[DEBUG] Error parsing filename {filename}: {e}") + return None + + + def ingest_files(self): + """Scan the directory for new observation files and insert them into the database.""" + obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) + print(f"Found {len(obs_files)} new files to ingest") + + # Counter for successful ingestions + ingested_count = 0 + + for file in obs_files: + parsed_data = self.parse_filename(file) + if parsed_data: + query = """ + INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) + VALUES (?, ?, ?, ?) + """ + try: + self.insert_record(query, parsed_data) + ingested_count += 1 + except Exception as e: + print(f"Failed to insert record for {file}: {e}") + print(f"################################ Successfully ingested {ingested_count} files into the database.") diff --git a/ush/python/pyobsforge/obsdb/obsdb.py b/ush/python/pyobsforge/obsdb/obsdb.py index fa92d311..e1faac9a 100644 --- a/ush/python/pyobsforge/obsdb/obsdb.py +++ b/ush/python/pyobsforge/obsdb/obsdb.py @@ -107,7 +107,10 @@ def get_valid_files(self, if check_receipt in ["gdas", "gfs"]: query = "SELECT receipt_time FROM obs_files WHERE filename = ?" receipt_time = self.execute_query(query, (filename,))[0][0] - receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") + try: + receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") + except ValueError: + receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S") # Try parsing without microseconds if it fails if receipt_time <= window_end - timedelta(minutes=minutes_behind_realtime[check_receipt]): continue diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 7e184d14..231b25b1 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -37,6 +37,7 @@ def __init__(self, config: Dict[str, Any]) -> None: # Initialize the Providers self.ghrsst = ProviderConfig.from_task_config("ghrsst", self.task_config) self.rads = ProviderConfig.from_task_config("rads", self.task_config) + self.amsr2 = ProviderConfig.from_task_config("amsr2", self.task_config) # Initialize the list of processed ioda files # TODO: Does not work. This should be a list of gathered ioda files that are created @@ -50,6 +51,7 @@ def initialize(self) -> None: # Update the database with new files self.ghrsst.db.ingest_files() self.rads.db.ingest_files() + self.amsr2.db.ingest_files() @logit(logger) def execute(self) -> None: @@ -127,6 +129,27 @@ def process_obs_space(self, } result = self.rads.process_obs_space(**kwargs) return result + + # Process AMSR2 + if provider == "amsr2": + parts = obs_space.split("_") + instrument = parts[1].upper() + platform = parts[2].upper() + + # Process the observation space + kwargs = { + 'provider': provider, + 'obs_space': obs_space, + 'instrument': instrument, + 'platform': platform, + 'obs_type': "SEAICE", + 'output_file': output_file, + 'window_begin': self.task_config.window_begin, + 'window_end': self.task_config.window_end, + 'task_config': self.task_config + } + result = self.amsr2.process_obs_space(**kwargs) + return result else: logger.error(f"Provider {provider} not supported") diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 2dd61077..98aa810d 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -1,6 +1,7 @@ from logging import getLogger from pyobsforge.obsdb.ghrsst_db import GhrSstDatabase from pyobsforge.obsdb.rads_db import RADSDatabase +from pyobsforge.obsdb.amsr2_db import Amsr2Database from typing import Any from dataclasses import dataclass from wxflow import AttrDict @@ -59,6 +60,8 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide db = GhrSstDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="sst") elif provider_name == "rads": db = RADSDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wgrdbul/adt") + elif provider_name == "asmr2" + db = Amsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py new file mode 100644 index 00000000..340f45aa --- /dev/null +++ b/ush/python/pyobsforge/tests/test_amsr2_database.py @@ -0,0 +1,132 @@ +import os +import glob +import tempfile +import shutil +import sqlite3 +from datetime import datetime, timedelta + +import pytest + +from pyobsforge.obsdb.amsr2_db import Amsr2Database # Adjust as needed + + +@pytest.fixture +def temp_obs_dir(): + """Create a temp directory with mock AMSR2 NetCDF files.""" + base_dir = tempfile.mkdtemp() + sub_dir = os.path.join(base_dir, "some_subdir", "seaice/pda") + os.makedirs(sub_dir) + + # Desired datetime for file timestamps + mock_time = datetime(2025, 3, 16, 0, 0, 0).timestamp() + + # Create mock NetCDF files + filenames = [ + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", + "invalid_file.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc" + ] + for fname in filenames: + fname_tmp = os.path.join(sub_dir, fname) + with open(fname_tmp, "w") as f: + f.write("fake content") + os.utime(fname_tmp, (mock_time, mock_time)) # (access_time, modification_time) + + yield base_dir + shutil.rmtree(base_dir) + + +@pytest.fixture +def db(temp_obs_dir): + """Initialize test database.""" + db_path = os.path.join(temp_obs_dir, "amsr2_test.db") + database = Amsr2Database( + db_name=db_path, + dcom_dir=temp_obs_dir, + obs_dir="seaice/pda" + ) + return database + + +def test_create_database(db): + db.create_database() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='obs_files'") + assert cursor.fetchone() is not None + conn.close() + + +def test_parse_valid_filename(db): + print(glob.glob(os.path.join(db.base_dir, "*"))) + fname = "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc" + fname = glob.glob(os.path.join(db.base_dir, fname))[0] + parsed = db.parse_filename(fname) + creation_time = datetime.fromtimestamp(os.path.getctime(fname)) + + assert parsed is not None + assert parsed[0] == fname + assert parsed[1] == datetime(2025, 3, 16, 6, 53, 24) # Start time + # assert parsed[2] == creation_time + assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) + assert parsed[3] == "GW1" + + +def test_parse_invalid_filename(db): + assert db.parse_filename("junk.nc") is None + assert db.parse_filename("AMSR2-SEAICE-NH_v2r2_GW1_invalid.nc") is None + + +def test_ingest_files(db): + db.ingest_files() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM obs_files") + count = cursor.fetchone()[0] + conn.close() + assert count == 6, "Should ingest 3 valid AMSR2 files" + + +def test_get_valid_files(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'seaice/pda' + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + # instrument="AMSR2", + satellite="GW1") #, + # obs_type="SEAICE") + + # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + assert any("202503160514" in f for f in valid_files) + assert any("202503160653" in f for f in valid_files) + assert all("202503161326" not in f for f in valid_files) + assert len(valid_files) == 4 + + +def test_get_valid_files_receipt(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'seaice/pda' + + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + # instrument="AMSR2", + satellite="GW1", + # obs_type="SEAICE", + check_receipt='gfs') + + # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later + assert len(valid_files) == 2 From ce7a13980a903765c76af82cbd84e36ed84cae88 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 15 Apr 2025 15:39:57 -0500 Subject: [PATCH 02/32] initial config --- parm/config.orion.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index ac36db3a..174fca34 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -1,11 +1,11 @@ obsforge: PSLOT: obsforge - HOMEobsforge: /work/noaa/da/gvernier/runs/obsForge + HOMEobsforge: /work2/noaa/da/mchoi3/temp/obsForge SDATE: 202503141800 EDATE: 202503150000 - COMROOT: /work/noaa/da/gvernier/runs/test_obsforge/COMROOT + COMROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/COMROOT DCOMROOT: /work2/noaa/da/common/lfs/h1/ops/prod/dcom - DATAROOT: /work/noaa/da/gvernier/runs/test_obsforge/RUNDIRS + DATAROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/RUNDIRS SCHEDULER: slurm ACCOUNT: da-cpu QUEUE: debug @@ -55,7 +55,7 @@ marinedump: min: -2.0 max: 3.0 error ratio: 1.0 - rads: + amsr2: list: - AMSR2-SEAICE-NH - AMSR2-SEAICE-SH From 00aa670f24c33583da7fd670c4cfff83557596da Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 17 Apr 2025 08:34:15 -0500 Subject: [PATCH 03/32] dcom folder --- ush/python/pyobsforge/obsdb/rads_db.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/python/pyobsforge/obsdb/rads_db.py b/ush/python/pyobsforge/obsdb/rads_db.py index f4e97e96..9d33554a 100644 --- a/ush/python/pyobsforge/obsdb/rads_db.py +++ b/ush/python/pyobsforge/obsdb/rads_db.py @@ -9,7 +9,7 @@ class RADSDatabase(BaseDatabase): def __init__(self, db_name="rads.db", dcom_dir="/lfs/h1/ops/prod/dcom/", - obs_dir="sst"): + obs_dir="wgrdbul/adt"): base_dir = os.path.join(dcom_dir, '*', obs_dir) super().__init__(db_name, base_dir) From 2dd51a6dcbb685f717fb37a86a01592cbc0d1bcd Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 17 Apr 2025 12:11:03 -0500 Subject: [PATCH 04/32] add the sss database --- ush/python/pyobsforge/obsdb/smap_db.py | 98 ++++++++++++ ush/python/pyobsforge/obsdb/smos_db.py | 77 +++++++++ .../pyobsforge/tests/test_amsr2_database.py | 4 +- .../pyobsforge/tests/test_smap_database.py | 151 ++++++++++++++++++ .../pyobsforge/tests/test_smos_database.py | 151 ++++++++++++++++++ 5 files changed, 479 insertions(+), 2 deletions(-) create mode 100644 ush/python/pyobsforge/obsdb/smap_db.py create mode 100644 ush/python/pyobsforge/obsdb/smos_db.py create mode 100644 ush/python/pyobsforge/tests/test_smap_database.py create mode 100644 ush/python/pyobsforge/tests/test_smos_database.py diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py new file mode 100644 index 00000000..1f6fd845 --- /dev/null +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -0,0 +1,98 @@ +import os +import glob +from datetime import datetime +from pyobsforge.obsdb import BaseDatabase + + +class SmapDatabase(BaseDatabase): + """Class to manage an observation file database for data assimilation.""" + + def __init__(self, db_name="smap.db", + dcom_dir="/lfs/h1/ops/prod/dcom/", + obs_dir="wtxtbul/satSSS/SMAP"): + base_dir = os.path.join(dcom_dir, '*', obs_dir) + super().__init__(db_name, base_dir) + + def create_database(self): + """ + Create the SQLite database and observation files table. + + This method initializes the database with a table named `obs_files` to store metadata + about observation files. The table contains the following columns: + + - `id`: A unique identifier for each record (auto-incremented primary key). + - `filename`: The full path to the observation file (must be unique). + - `obs_time`: The timestamp of the observation, extracted from the filename. + - `receipt_time`: The timestamp when the file was added to the `dcom` directory. + - `satellite`: The satellite from which the observation was collected (e.g., GW1). + + The table is created if it does not already exist. + """ + query = """ + CREATE TABLE IF NOT EXISTS obs_files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filename TEXT UNIQUE, + obs_time TIMESTAMP, + receipt_time TIMESTAMP, + satellite TEXT + ) + """ + self.execute_query(query) + +# def parse_filename(self, filename): + # pattern: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 +# basename = os.path.basename(filename) +# parts = basename.split('_') +# try: +# if basename.startswith("SMAP_L2B_SSSi_NRT"): +# satellite = "SMAP" +# datetime_part = parts[-1].split('.')[0] # Get '20250315T011742' without '.h5' +# obs_time = datetime.strptime(datetime_part, "%Y%m%dT%H%M%S") + + # Use file creation time +# receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + +# return filename, obs_time, receipt_time, satellite +# except ValueError as e: +# print(f"[DEBUG] Error parsing filename {filename}: {e}") +# return None + def parse_filename(self, filename): + basename = os.path.basename(filename) + parts = basename.split('_') + try: + if basename.startswith("SMAP_L2B_SSS_NRT") and len(parts) >= 7: + satellite = "SMAP" + timestamp_with_ext = parts[6] # e.g., "20250316T001612.h5" + timestamp_str = os.path.splitext(timestamp_with_ext)[0] # remove .h5 +# obs_time_dt = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") +# obs_time = obs_time_dt.strftime("%Y%m%d%H%M%S") + obs_time = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, satellite + + except ValueError as e: + print(f"[DEBUG] Error parsing filename {filename}: {e}") + return None + + + def ingest_files(self): + """Scan the directory for new observation files and insert them into the database.""" + obs_files = glob.glob(os.path.join(self.base_dir, "*.h5")) + print(f"Found {len(obs_files)} new files to ingest") + + # Counter for successful ingestions + ingested_count = 0 + + for file in obs_files: + parsed_data = self.parse_filename(file) + if parsed_data: + query = """ + INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) + VALUES (?, ?, ?, ?) + """ + try: + self.insert_record(query, parsed_data) + ingested_count += 1 + except Exception as e: + print(f"Failed to insert record for {file}: {e}") + print(f"################################ Successfully ingested {ingested_count} files into the database.") diff --git a/ush/python/pyobsforge/obsdb/smos_db.py b/ush/python/pyobsforge/obsdb/smos_db.py new file mode 100644 index 00000000..3fb250aa --- /dev/null +++ b/ush/python/pyobsforge/obsdb/smos_db.py @@ -0,0 +1,77 @@ +import os +import glob +from datetime import datetime +from pyobsforge.obsdb import BaseDatabase + + +class SmosDatabase(BaseDatabase): + """Class to manage an observation file database for data assimilation.""" + + def __init__(self, db_name="smos.db", + dcom_dir="/lfs/h1/ops/prod/dcom/", + obs_dir="wtxtbul/satSSS/SMOS"): + base_dir = os.path.join(dcom_dir, '*', obs_dir) + super().__init__(db_name, base_dir) + + def create_database(self): + """ + Create the SQLite database and observation files table. + + This method initializes the database with a table named `obs_files` to store metadata + about observation files. The table contains the following columns: + + - `id`: A unique identifier for each record (auto-incremented primary key). + - `filename`: The full path to the observation file (must be unique). + - `obs_time`: The timestamp of the observation, extracted from the filename. + - `receipt_time`: The timestamp when the file was added to the `dcom` directory. + - `satellite`: The satellite from which the observation was collected (e.g., GW1). + + The table is created if it does not already exist. + """ + query = """ + CREATE TABLE IF NOT EXISTS obs_files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filename TEXT UNIQUE, + obs_time TIMESTAMP, + receipt_time TIMESTAMP, + satellite TEXT + ) + """ + self.execute_query(query) + + def parse_filename(self, filename): + basename = os.path.basename(filename) + parts = basename.split('_') + try: + if basename.startswith("SM_OPER_MIR_OSUDP") and len(parts) >= 6: + satellite = "SMOS" + start_time_str = parts[4] # e.g., "20250316T002309" + obs_time = datetime.strptime(start_time_str, "%Y%m%dT%H%M%S") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, satellite + + except ValueError as e: + print(f"[DEBUG] Error parsing filename {filename}: {e}") + return None + + def ingest_files(self): + """Scan the directory for new observation files and insert them into the database.""" + obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) + print(f"Found {len(obs_files)} new files to ingest") + + # Counter for successful ingestions + ingested_count = 0 + + for file in obs_files: + parsed_data = self.parse_filename(file) + if parsed_data: + query = """ + INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) + VALUES (?, ?, ?, ?) + """ + try: + self.insert_record(query, parsed_data) + ingested_count += 1 + except Exception as e: + print(f"Failed to insert record for {file}: {e}") + print(f"################################ Successfully ingested {ingested_count} files into the database.") diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py index 340f45aa..eec5f686 100644 --- a/ush/python/pyobsforge/tests/test_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_amsr2_database.py @@ -96,7 +96,7 @@ def test_get_valid_files(db): da_cycle = "20250316060000" window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice/pda' + dst_dir = 'seaice' # Test for AVHRRF_MB valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, @@ -117,7 +117,7 @@ def test_get_valid_files_receipt(db): da_cycle = "20250316060000" window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice/pda' + dst_dir = 'seaice' # Test for AVHRRF_MB valid_files = db.get_valid_files(window_begin=window_begin, diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py new file mode 100644 index 00000000..c7371b40 --- /dev/null +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -0,0 +1,151 @@ +import os +import glob +import tempfile +import shutil +import sqlite3 +from datetime import datetime, timedelta + +import pytest + +from pyobsforge.obsdb.smap_db import SmapDatabase # Adjust as needed + + +@pytest.fixture +def temp_obs_dir(): + """Create a temp directory with mock SMAP_SSS h5 files.""" + base_dir = tempfile.mkdtemp() + sub_dir = os.path.join(base_dir, "some_subdir", "wtxtbul/satSSS/SMAP") + os.makedirs(sub_dir) + + # Desired datetime for file timestamps + mock_time = datetime(2025, 3, 16, 6, 0, 0).timestamp() + + # Create mock NetCDF files + filenames = [ + "SMAP_L2B_SSS_NRT_54061_A_20250316T001612.h5", + "SMAP_L2B_SSS_NRT_54061_D_20250316T001612.h5", + "SMAP_L2B_SSS_NRT_54062_A_20250316T015440.h5", + "SMAP_L2B_SSS_NRT_54062_D_20250316T015440.h5", + "SMAP_L2B_SSS_NRT_54063_A_20250316T033308.h5", + "SMAP_L2B_SSS_NRT_54063_D_20250316T033308.h5", + "SMAP_L2B_SSS_NRT_54064_A_20250316T051136.h5", + "SMAP_L2B_SSS_NRT_54064_D_20250316T051136.h5", + "SMAP_L2B_SSS_NRT_54065_A_20250316T065004.h5", + "SMAP_L2B_SSS_NRT_54065_D_20250316T065004.h5", + "SMAP_L2B_SSS_NRT_54066_A_20250316T082832.h5", + "SMAP_L2B_SSS_NRT_54066_D_20250316T082832.h5", + "SMAP_L2B_SSS_NRT_54067_A_20250316T100700.h5", + "SMAP_L2B_SSS_NRT_54067_D_20250316T100700.h5", + "SMAP_L2B_SSS_NRT_54068_D_20250316T114527.h5", + "SMAP_L2B_SSS_NRT_54069_A_20250316T132356.h5", + "SMAP_L2B_SSS_NRT_54069_D_20250316T132356.h5", + "SMAP_L2B_SSS_NRT_54070_A_20250316T150223.h5", + "SMAP_L2B_SSS_NRT_54070_D_20250316T150223.h5", + "SMAP_L2B_SSS_NRT_54071_A_20250316T164051.h5", + "SMAP_L2B_SSS_NRT_54071_D_20250316T164051.h5", + "SMAP_L2B_SSS_NRT_54072_A_20250316T181918.h5", + "SMAP_L2B_SSS_NRT_54072_D_20250316T181918.h5", + "SMAP_L2B_SSS_NRT_54073_A_20250316T195746.h5", + "SMAP_L2B_SSS_NRT_54073_D_20250316T195746.h5", + "SMAP_L2B_SSS_NRT_54074_A_20250316T213615.h5", + "SMAP_L2B_SSS_NRT_54074_D_20250316T213615.h5", + "SMAP_L2B_SSS_NRT_54075_A_20250316T231442.h5" + ] + for fname in filenames: + fname_tmp = os.path.join(sub_dir, fname) + with open(fname_tmp, "w") as f: + f.write("fake content") + os.utime(fname_tmp, (mock_time, mock_time)) # (access_time, modification_time) + + yield base_dir + shutil.rmtree(base_dir) + + +@pytest.fixture +def db(temp_obs_dir): + """Initialize test database.""" + db_path = os.path.join(temp_obs_dir, "smap_test.db") + database = SmapDatabase( + db_name=db_path, + dcom_dir=temp_obs_dir, + obs_dir="wtxtbul/satSSS/SMAP" + ) + return database + + +def test_create_database(db): + db.create_database() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='obs_files'") + assert cursor.fetchone() is not None + conn.close() + + +def test_parse_valid_filename(db): + print(glob.glob(os.path.join(db.base_dir, "*"))) + fname = "SMAP_L2B_SSS_NRT_54065_A_20250316T065004.h5" + fname = glob.glob(os.path.join(db.base_dir, fname))[0] + parsed = db.parse_filename(fname) + creation_time = datetime.fromtimestamp(os.path.getctime(fname)) + + assert parsed is not None + assert parsed[0] == fname + assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) # Start time + assert parsed[2] == creation_time + # assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) + assert parsed[3] == "SMAP" + + +def test_parse_invalid_filename(db): + assert db.parse_filename("junk.nc") is None + assert db.parse_filename("SMAP_L2B_SSS_NRT_invalid.nc") is None + + +def test_ingest_files(db): + db.ingest_files() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM obs_files") + count = cursor.fetchone()[0] + conn.close() + assert count == 28, "Should ingest 28 valid SMAP files" + + +def test_get_valid_files(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'sss' + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + satellite="SMAP") + + print("Valid files in window:", valid_files) + + # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + assert any("20250316T0511" in f for f in valid_files) + assert any("20250316T0650" in f for f in valid_files) + assert all("20250316T1007" not in f for f in valid_files) + assert len(valid_files) == 8 + + +def test_get_valid_files_receipt(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'sss' + + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + satellite="SMAP", + check_receipt='gfs') + + # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later + assert len(valid_files) == 8 diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py new file mode 100644 index 00000000..34456a97 --- /dev/null +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -0,0 +1,151 @@ +import os +import glob +import tempfile +import shutil +import sqlite3 +from datetime import datetime, timedelta + +import pytest + +from pyobsforge.obsdb.smos_db import SmosDatabase # Adjust as needed + + +@pytest.fixture +def temp_obs_dir(): + """Create a temp directory with mock SMOS_SSS nc files.""" + base_dir = tempfile.mkdtemp() + sub_dir = os.path.join(base_dir, "some_subdir", "wtxtbul/satSSS/SMOS") + os.makedirs(sub_dir) + + # Desired datetime for file timestamps + mock_time = datetime(2025, 3, 16, 6, 0, 0).timestamp() + + # Create mock NetCDF files + filenames = [ + "SM_OPER_MIR_OSUDP2_20250316T002309_20250316T011621_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T011306_20250316T020624_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T020312_20250316T025626_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T025309_20250316T034629_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T034319_20250316T043631_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T043313_20250316T052634_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T052327_20250316T061636_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T061318_20250316T070637_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T070327_20250316T075640_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T075327_20250316T084642_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T084330_20250316T093645_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T093328_20250316T102647_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T102335_20250316T111649_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T111332_20250316T120652_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T120340_20250316T125654_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T125337_20250316T134656_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T134343_20250316T143658_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T143340_20250316T152700_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T152349_20250316T161703_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T161346_20250316T170705_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T170353_20250316T175708_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T175351_20250316T184710_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T184359_20250316T193713_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T193354_20250316T202714_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T202402_20250316T211716_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T211359_20250316T220719_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T220407_20250316T225721_700_001_1.nc", + "SM_OPER_MIR_OSUDP2_20250316T225404_20250316T234724_700_001_1.nc" + ] + for fname in filenames: + fname_tmp = os.path.join(sub_dir, fname) + with open(fname_tmp, "w") as f: + f.write("fake content") + os.utime(fname_tmp, (mock_time, mock_time)) # (access_time, modification_time) + + yield base_dir + shutil.rmtree(base_dir) + + +@pytest.fixture +def db(temp_obs_dir): + """Initialize test database.""" + db_path = os.path.join(temp_obs_dir, "smos_test.db") + database = SmosDatabase( + db_name=db_path, + dcom_dir=temp_obs_dir, + obs_dir="wtxtbul/satSSS/SMOS" + ) + return database + + +def test_create_database(db): + db.create_database() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='obs_files'") + assert cursor.fetchone() is not None + conn.close() + + +def test_parse_valid_filename(db): + print(glob.glob(os.path.join(db.base_dir, "*"))) + fname = "SM_OPER_MIR_OSUDP2_20250316T061318_20250316T070637_700_001_1.nc" + fname = glob.glob(os.path.join(db.base_dir, fname))[0] + parsed = db.parse_filename(fname) + creation_time = datetime.fromtimestamp(os.path.getctime(fname)) + + assert parsed is not None + assert parsed[0] == fname + assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time + assert parsed[2] == creation_time + # assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) + assert parsed[3] == "SMOS" + + +def test_parse_invalid_filename(db): + assert db.parse_filename("junk.nc") is None + assert db.parse_filename("SM_OPER_MIR_OSUDP2_invalid.nc") is None + + +def test_ingest_files(db): + db.ingest_files() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM obs_files") + count = cursor.fetchone()[0] + conn.close() + assert count == 28, "Should ingest 28 valid SMAP files" + + +def test_get_valid_files(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'sss' + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + satellite="SMOS") + + print("Valid files in window:", valid_files) + + # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + assert any("20250316T0523" in f for f in valid_files) + assert any("20250316T0613" in f for f in valid_files) + assert all("20250316T1023" not in f for f in valid_files) + assert len(valid_files) == 7 + + +def test_get_valid_files_receipt(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'sss' + + # Test for AVHRRF_MB + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + satellite="SMOS", + check_receipt='gfs') + + # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later + assert len(valid_files) == 7 From c44112ac5bde2693e8b1d9ca92adf0b9409137d8 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 17 Apr 2025 13:49:41 -0500 Subject: [PATCH 05/32] initial processing test --- parm/config.orion.yaml | 6 +++--- ush/python/pyobsforge/obsdb/amsr2_db.py | 10 ---------- ush/python/pyobsforge/task/marine_prepobs.py | 13 ++++++------- ush/python/pyobsforge/task/providers.py | 2 +- 4 files changed, 10 insertions(+), 21 deletions(-) diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index 174fca34..55b2425b 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -9,7 +9,7 @@ obsforge: SCHEDULER: slurm ACCOUNT: da-cpu QUEUE: debug - PARTITION: orion + PARTITION: hercules KEEPDATA: NO assim_freq: 6 @@ -57,8 +57,8 @@ marinedump: error ratio: 1.0 amsr2: list: - - AMSR2-SEAICE-NH - - AMSR2-SEAICE-SH + - seaice_gw1_nh + - seaice_gw1_sh qc config: min: 0.0 max: 1.0 diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py index 1ac3d1eb..3cd6684b 100644 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -39,16 +39,6 @@ def create_database(self): """ self.execute_query(query) -# def parse_filename(self, filename): -# """Extract metadata from AMSR2 SEAICE filenames.""" -# # Pattern: AMSR2-SEAICE-NH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc -# basename = os.path.basename(filename) -# parts = basename.split('_') -# try: -# if len(parts) >= 4 and parts[0] == "AMSR2-SEAICE-?H": -# satellite = parts[2] -# obs_time = datetime.strptime(parts[3][1:13], "%Y%m%d%H%M") -# receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) def parse_filename(self, filename): basename = os.path.basename(filename) parts = basename.split('_') diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 231b25b1..e2197ea2 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -133,16 +133,15 @@ def process_obs_space(self, # Process AMSR2 if provider == "amsr2": parts = obs_space.split("_") - instrument = parts[1].upper() - platform = parts[2].upper() - - # Process the observation space + # instrument = "AMSR2" + platform = parts[1].upper() # "GW1" + hemisphere = parts[2].upper() # "NH" or "SH" kwargs = { 'provider': provider, 'obs_space': obs_space, - 'instrument': instrument, - 'platform': platform, - 'obs_type': "SEAICE", + # AMSR2 does not need 'instrument' in the database query 'instrument': instrument, + 'platform': f"{platform}_{hemisphere}", # e.g., "GW1_NH" + 'obs_type': "seaice", 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 98aa810d..40fb057b 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -60,7 +60,7 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide db = GhrSstDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="sst") elif provider_name == "rads": db = RADSDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wgrdbul/adt") - elif provider_name == "asmr2" + elif provider_name == "amsr2": db = Amsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") From dc4393a87ec5c48476c2cd0e9b5cb6827ce32490 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 22 Apr 2025 13:53:39 -0500 Subject: [PATCH 06/32] 2nd checking --- ush/python/pyobsforge/obsdb/amsr2_db.py | 15 +++++-- ush/python/pyobsforge/obsdb/obsdb.py | 44 ++++++++++--------- ush/python/pyobsforge/obsdb/smap_db.py | 24 ++-------- ush/python/pyobsforge/obsdb/smos_db.py | 3 +- ush/python/pyobsforge/task/marine_prepobs.py | 2 +- ush/python/pyobsforge/task/providers.py | 33 +++++++++++--- .../pyobsforge/tests/test_amsr2_database.py | 42 +++++++++++++++--- .../pyobsforge/tests/test_smap_database.py | 7 ++- .../pyobsforge/tests/test_smos_database.py | 7 ++- 9 files changed, 108 insertions(+), 69 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py index 3cd6684b..55215010 100644 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -43,10 +43,16 @@ def parse_filename(self, filename): basename = os.path.basename(filename) parts = basename.split('_') try: - if len(parts) >= 5 and parts[0].startswith("AMSR2-SEAICE"): - satellite = parts[2] # e.g., "GW1" - obs_time = datetime.strptime(parts[3][1:15], "%Y%m%d%H%M%S") # e.g., s202503160653240 - receipt_time_str = parts[5][1:15] # e.g., c202503160902250 + if len(parts) >= 6 and parts[0].startswith("AMSR2-SEAICE"): + satellite = parts[2] # "GW1" + obs_time_str = parts[3][1:15] # s202503160653240 + receipt_time_str = parts[5][1:15] # c202503160902250 + # obs_type = parts[0].split('-')[2] # "SH", "NH" + if len(obs_time_str) == 15: + obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") + else: + obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S") + if len(receipt_time_str) == 14: receipt_time_str += "000000" # add microseconds if missing @@ -62,6 +68,7 @@ def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) print(f"Found {len(obs_files)} new files to ingest") + print(f"[DEBUG] Files found: {obs_files}") # Counter for successful ingestions ingested_count = 0 diff --git a/ush/python/pyobsforge/obsdb/obsdb.py b/ush/python/pyobsforge/obsdb/obsdb.py index e1faac9a..a36a9826 100644 --- a/ush/python/pyobsforge/obsdb/obsdb.py +++ b/ush/python/pyobsforge/obsdb/obsdb.py @@ -30,7 +30,7 @@ def get_connection(self): """Return the database connection.""" return self.connection - def parse_filename(self): + def parse_filename(self, filename): """Parse a filename and extract relevant metadata. Must be implemented by subclasses.""" raise NotImplementedError("Subclasses must implement parse_filename method") @@ -63,26 +63,14 @@ def get_valid_files(self, window_begin: datetime, window_end: datetime, dst_dir: str, - instrument: str = None, satellite: str = None, + instrument: str = None, obs_type: str = None, check_receipt: str = "none") -> list: """ - Retrieve and copy to dst_dir a list of observation files within a specified time window, possibly filtered by instrument, - satellite, and observation type. The check_receipt parameter can be 'gdas', 'gfs', or 'none'. If 'gdas' or - 'gfs' is specified, files are further filtered based on their receipt time to ensure they meet the - required delay criteria. - - :param window_begin: Start of the time window (datetime object). - :param window_end: End of the time window (datetime object). - :param dst_dir: Destination directory where valid files will be copied. - :param instrument: (Optional) Filter by instrument name. - :param satellite: (Optional) Filter by satellite name. - :param obs_type: (Optional) Filter by observation type. - :param check_receipt: (Optional) Specify receipt time check ('gdas', 'gfs', or 'none'). - :return: List of valid observation file paths in the destination directory. + Retrieve and copy to dst_dir a list of valid observation files within a specified time window. + Optionally filter by satellite, instrument, obs_type, and receipt time. """ - query = """ SELECT filename FROM obs_files WHERE obs_time BETWEEN ? AND ? @@ -90,20 +78,29 @@ def get_valid_files(self, minutes_behind_realtime = {'gdas': 160, 'gfs': 20} params = [window_begin, window_end] - if instrument: - query += " AND instrument = ?" - params.append(instrument) + # Optionally filter by satellite if provided if satellite: query += " AND satellite = ?" params.append(satellite) + + # Optionally filter by instrument if available and provided + if instrument: + query += " AND instrument = ?" + params.append(instrument) + + # Optionally filter by obs_type if available and provided if obs_type: query += " AND obs_type = ?" params.append(obs_type) + # Execute query to get relevant files results = self.execute_query(query, tuple(params)) valid_files = [] + for row in results: filename = row[0] + + # Optional receipt time filtering based on check_receipt parameter if check_receipt in ["gdas", "gfs"]: query = "SELECT receipt_time FROM obs_files WHERE filename = ?" receipt_time = self.execute_query(query, (filename,))[0][0] @@ -111,20 +108,25 @@ def get_valid_files(self, receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") except ValueError: receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S") # Try parsing without microseconds if it fails + + # Filter based on receipt time threshold if receipt_time <= window_end - timedelta(minutes=minutes_behind_realtime[check_receipt]): continue valid_files.append(filename) - # Copy files to the destination directory + # Copy valid files to the destination directory dst_files = [] if len(valid_files) > 0: - src_dst_obs_list = [] # list of [src_file, dst_file] + src_dst_obs_list = [] # List of [src_file, dst_file] for src_file in valid_files: dst_file = join(dst_dir, f"{basename(src_file)}") dst_files.append(dst_file) src_dst_obs_list.append([src_file, dst_file]) + + # Ensure the destination directory exists FileHandler({'mkdir': [dst_dir]}).sync() FileHandler({'copy': src_dst_obs_list}).sync() return dst_files + diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py index 1f6fd845..759672aa 100644 --- a/ush/python/pyobsforge/obsdb/smap_db.py +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -39,33 +39,15 @@ def create_database(self): """ self.execute_query(query) -# def parse_filename(self, filename): - # pattern: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 -# basename = os.path.basename(filename) -# parts = basename.split('_') -# try: -# if basename.startswith("SMAP_L2B_SSSi_NRT"): -# satellite = "SMAP" -# datetime_part = parts[-1].split('.')[0] # Get '20250315T011742' without '.h5' -# obs_time = datetime.strptime(datetime_part, "%Y%m%dT%H%M%S") - - # Use file creation time -# receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - -# return filename, obs_time, receipt_time, satellite -# except ValueError as e: -# print(f"[DEBUG] Error parsing filename {filename}: {e}") -# return None def parse_filename(self, filename): + # patten: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 basename = os.path.basename(filename) parts = basename.split('_') try: if basename.startswith("SMAP_L2B_SSS_NRT") and len(parts) >= 7: satellite = "SMAP" - timestamp_with_ext = parts[6] # e.g., "20250316T001612.h5" - timestamp_str = os.path.splitext(timestamp_with_ext)[0] # remove .h5 -# obs_time_dt = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") -# obs_time = obs_time_dt.strftime("%Y%m%d%H%M%S") + timestamp_with_ext = parts[6] + timestamp_str = os.path.splitext(timestamp_with_ext)[0] obs_time = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) return filename, obs_time, receipt_time, satellite diff --git a/ush/python/pyobsforge/obsdb/smos_db.py b/ush/python/pyobsforge/obsdb/smos_db.py index 3fb250aa..41ad43cd 100644 --- a/ush/python/pyobsforge/obsdb/smos_db.py +++ b/ush/python/pyobsforge/obsdb/smos_db.py @@ -40,12 +40,13 @@ def create_database(self): self.execute_query(query) def parse_filename(self, filename): + # patten: SM_OPER_MIR_OSUDP2_20250315T001156_20250315T010515_700_001_1.nc basename = os.path.basename(filename) parts = basename.split('_') try: if basename.startswith("SM_OPER_MIR_OSUDP") and len(parts) >= 6: satellite = "SMOS" - start_time_str = parts[4] # e.g., "20250316T002309" + start_time_str = parts[4] obs_time = datetime.strptime(start_time_str, "%Y%m%dT%H%M%S") receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) return filename, obs_time, receipt_time, satellite diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index e2197ea2..81890742 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -141,7 +141,7 @@ def process_obs_space(self, 'obs_space': obs_space, # AMSR2 does not need 'instrument' in the database query 'instrument': instrument, 'platform': f"{platform}_{hemisphere}", # e.g., "GW1_NH" - 'obs_type': "seaice", + # 'obs_type': "seaice", 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 40fb057b..23c88e12 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -95,13 +95,34 @@ def process_obs_space(self, **kwargs) -> None: window_end = kwargs.get('window_end') task_config = kwargs.get('task_config') + logger.debug(f"obs_type for provider {provider}: {obs_type}") + # Query the database for valid files - input_files = self.db.get_valid_files(window_begin=window_begin, - window_end=window_end, - dst_dir=obs_space, - instrument=instrument, - satellite=platform, - obs_type=obs_type) + #input_files = self.db.get_valid_files(window_begin=window_begin, + ## window_end=window_end, + # dst_dir=obs_space, + # instrument=instrument, + # satellite=platform, + # obs_type=obs_type) + + # Check if this database accepts obs_type (not all do) + db_accepts_obs_type = provider in ("ghrsst", "rads") # Add more if needed + + # Build kwargs dynamically + db_kwargs = { + "window_begin": window_begin, + "window_end": window_end, + "dst_dir": obs_space, + "instrument": instrument, + "satellite": platform, + } + + if db_accepts_obs_type and obs_type is not None: + db_kwargs["obs_type"] = obs_type + + # Now run the query + input_files = self.db.get_valid_files(**db_kwargs) + logger.info(f"number of valid files: {len(input_files)}") # Process the observations if the obs space is not empty diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py index eec5f686..d4a243d4 100644 --- a/ush/python/pyobsforge/tests/test_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_amsr2_database.py @@ -22,13 +22,37 @@ def temp_obs_dir(): # Create mock NetCDF files filenames = [ + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", "AMSR2-SEAICE-NH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", "AMSR2-SEAICE-NH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", - "invalid_file.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", "AMSR2-SEAICE-SH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", "AMSR2-SEAICE-SH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc" + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", + "invalid_file.nc" ] for fname in filenames: fname_tmp = os.path.join(sub_dir, fname) @@ -88,7 +112,7 @@ def test_ingest_files(db): cursor.execute("SELECT COUNT(*) FROM obs_files") count = cursor.fetchone()[0] conn.close() - assert count == 6, "Should ingest 3 valid AMSR2 files" + assert count == 30, "Should ingest 8 valid AMSR2 files" def test_get_valid_files(db): @@ -97,7 +121,7 @@ def test_get_valid_files(db): window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'seaice' - # Test for AVHRRF_MB + # Test for AMSR2 ICEC valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, @@ -109,7 +133,7 @@ def test_get_valid_files(db): assert any("202503160514" in f for f in valid_files) assert any("202503160653" in f for f in valid_files) assert all("202503161326" not in f for f in valid_files) - assert len(valid_files) == 4 + assert len(valid_files) == 8 def test_get_valid_files_receipt(db): @@ -119,7 +143,7 @@ def test_get_valid_files_receipt(db): window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'seaice' - # Test for AVHRRF_MB + # Test for AMSR2 ICEC valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, @@ -128,5 +152,9 @@ def test_get_valid_files_receipt(db): # obs_type="SEAICE", check_receipt='gfs') + print("Valid files found:", len(valid_files)) + for f in valid_files: + print(" -", f) + # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later - assert len(valid_files) == 2 + assert len(valid_files) == 4 diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py index c7371b40..1b67b3f8 100644 --- a/ush/python/pyobsforge/tests/test_smap_database.py +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -93,7 +93,6 @@ def test_parse_valid_filename(db): assert parsed[0] == fname assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) # Start time assert parsed[2] == creation_time - # assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) assert parsed[3] == "SMAP" @@ -118,7 +117,7 @@ def test_get_valid_files(db): window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'sss' - # Test for AVHRRF_MB + # Test for SMAP SSS valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, @@ -126,7 +125,7 @@ def test_get_valid_files(db): print("Valid files in window:", valid_files) - # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + # Files at 03:00 and 09:00 are within +/- 3h of 06:00 assert any("20250316T0511" in f for f in valid_files) assert any("20250316T0650" in f for f in valid_files) assert all("20250316T1007" not in f for f in valid_files) @@ -140,7 +139,7 @@ def test_get_valid_files_receipt(db): window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'sss' - # Test for AVHRRF_MB + # Test for SMAP SSS valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index 34456a97..c9a58dc6 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -93,7 +93,6 @@ def test_parse_valid_filename(db): assert parsed[0] == fname assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time assert parsed[2] == creation_time - # assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) assert parsed[3] == "SMOS" @@ -118,7 +117,7 @@ def test_get_valid_files(db): window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'sss' - # Test for AVHRRF_MB + # Test for SMOS SSS valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, @@ -126,7 +125,7 @@ def test_get_valid_files(db): print("Valid files in window:", valid_files) - # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + # Files at 03:00 and 09:00 are within +/- 3h of 06:00 assert any("20250316T0523" in f for f in valid_files) assert any("20250316T0613" in f for f in valid_files) assert all("20250316T1023" not in f for f in valid_files) @@ -140,7 +139,7 @@ def test_get_valid_files_receipt(db): window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) dst_dir = 'sss' - # Test for AVHRRF_MB + # Test for SMOS SSS valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, From 7fbfb18564f7f25f315241ad3eae5275be19ffa8 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 16:08:39 +0000 Subject: [PATCH 07/32] add the database --- ush/python/pyobsforge/obsdb/amsr2_db.py | 45 ++++++++++++------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py index 55215010..a6f9375d 100644 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -24,7 +24,9 @@ def create_database(self): - `filename`: The full path to the observation file (must be unique). - `obs_time`: The timestamp of the observation, extracted from the filename. - `receipt_time`: The timestamp when the file was added to the `dcom` directory. + - `instrument`: The instrument used to collect the observation (e.g., AMSR2). - `satellite`: The satellite from which the observation was collected (e.g., GW1). + - `obs_type`: The type of observation (e.g., SEAICE) The table is created if it does not already exist. """ @@ -34,32 +36,29 @@ def create_database(self): filename TEXT UNIQUE, obs_time TIMESTAMP, receipt_time TIMESTAMP, - satellite TEXT + instrument TEXT, + satellite TEXT, + obs_type TEXT ) """ self.execute_query(query) def parse_filename(self, filename): - basename = os.path.basename(filename) - parts = basename.split('_') + # Example filename: + # AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc + parts = os.path.basename(filename).replace('_', '-').split('-') try: - if len(parts) >= 6 and parts[0].startswith("AMSR2-SEAICE"): - satellite = parts[2] # "GW1" - obs_time_str = parts[3][1:15] # s202503160653240 - receipt_time_str = parts[5][1:15] # c202503160902250 - # obs_type = parts[0].split('-')[2] # "SH", "NH" - if len(obs_time_str) == 15: - obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") - else: - obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S") - - - if len(receipt_time_str) == 14: - receipt_time_str += "000000" # add microseconds if missing + if len(parts) >= 8 and parts[0] == 'AMSR2': + instrument = parts[0] + obs_type = parts[1] + satellite = parts[4] + obs_time_str = parts[5][1:16] # s202503140032240 to '202503140032240' + receipt_time_str = parts[7].split('.')[0][1:16] # c202503140245560 to '202503140245560' + obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") receipt_time = datetime.strptime(receipt_time_str, "%Y%m%d%H%M%S%f") - return filename, obs_time, receipt_time, satellite - except ValueError as e: + return filename, obs_time, receipt_time, instrument, satellite, obs_type + except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") return None @@ -67,8 +66,8 @@ def parse_filename(self, filename): def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) - print(f"Found {len(obs_files)} new files to ingest") - print(f"[DEBUG] Files found: {obs_files}") + print(f"[INFO] Found {len(obs_files)} new files to ingest") + print(f"[INFO] Files found: {obs_files}") # Counter for successful ingestions ingested_count = 0 @@ -77,12 +76,12 @@ def ingest_files(self): parsed_data = self.parse_filename(file) if parsed_data: query = """ - INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) - VALUES (?, ?, ?, ?) + INSERT INTO obs_files (filename, obs_time, receipt_time, instrument, satellite, obs_type) + VALUES (?, ?, ?, ?, ?, ?) """ try: self.insert_record(query, parsed_data) ingested_count += 1 except Exception as e: - print(f"Failed to insert record for {file}: {e}") + print(f"[DEBUG] Failed to insert record for {file}: {e}") print(f"################################ Successfully ingested {ingested_count} files into the database.") From a0628e6d3279179509b99f79be6e8824aa84aede Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 17:36:10 +0000 Subject: [PATCH 08/32] py-norm_1 --- ush/python/pyobsforge/obsdb/amsr2_db.py | 4 +- ush/python/pyobsforge/obsdb/obsdb.py | 46 +++++++++---------- .../pyobsforge/tests/test_amsr2_database.py | 27 ++++++----- 3 files changed, 40 insertions(+), 37 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py index a6f9375d..2d263cfe 100644 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -52,8 +52,8 @@ def parse_filename(self, filename): instrument = parts[0] obs_type = parts[1] satellite = parts[4] - obs_time_str = parts[5][1:16] # s202503140032240 to '202503140032240' - receipt_time_str = parts[7].split('.')[0][1:16] # c202503140245560 to '202503140245560' + obs_time_str = parts[5][1:16] + receipt_time_str = parts[7].split('.')[0][1:16] obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") receipt_time = datetime.strptime(receipt_time_str, "%Y%m%d%H%M%S%f") diff --git a/ush/python/pyobsforge/obsdb/obsdb.py b/ush/python/pyobsforge/obsdb/obsdb.py index a36a9826..2cb7e4ca 100644 --- a/ush/python/pyobsforge/obsdb/obsdb.py +++ b/ush/python/pyobsforge/obsdb/obsdb.py @@ -30,7 +30,7 @@ def get_connection(self): """Return the database connection.""" return self.connection - def parse_filename(self, filename): + def parse_filename(self): """Parse a filename and extract relevant metadata. Must be implemented by subclasses.""" raise NotImplementedError("Subclasses must implement parse_filename method") @@ -63,14 +63,26 @@ def get_valid_files(self, window_begin: datetime, window_end: datetime, dst_dir: str, - satellite: str = None, instrument: str = None, + satellite: str = None, obs_type: str = None, check_receipt: str = "none") -> list: """ - Retrieve and copy to dst_dir a list of valid observation files within a specified time window. - Optionally filter by satellite, instrument, obs_type, and receipt time. + Retrieve and copy to dst_dir a list of observation files within a specified time window, possibly filtered by instrument, + satellite, and observation type. The check_receipt parameter can be 'gdas', 'gfs', or 'none'. If 'gdas' or + 'gfs' is specified, files are further filtered based on their receipt time to ensure they meet the + required delay criteria. + + :param window_begin: Start of the time window (datetime object). + :param window_end: End of the time window (datetime object). + :param dst_dir: Destination directory where valid files will be copied. + :param instrument: (Optional) Filter by instrument name. + :param satellite: (Optional) Filter by satellite name. + :param obs_type: (Optional) Filter by observation type. + :param check_receipt: (Optional) Specify receipt time check ('gdas', 'gfs', or 'none'). + :return: List of valid observation file paths in the destination directory. """ + query = """ SELECT filename FROM obs_files WHERE obs_time BETWEEN ? AND ? @@ -78,55 +90,41 @@ def get_valid_files(self, minutes_behind_realtime = {'gdas': 160, 'gfs': 20} params = [window_begin, window_end] - # Optionally filter by satellite if provided - if satellite: - query += " AND satellite = ?" - params.append(satellite) - - # Optionally filter by instrument if available and provided if instrument: query += " AND instrument = ?" params.append(instrument) - - # Optionally filter by obs_type if available and provided + if satellite: + query += " AND satellite = ?" + params.append(satellite) if obs_type: query += " AND obs_type = ?" params.append(obs_type) - # Execute query to get relevant files results = self.execute_query(query, tuple(params)) valid_files = [] - for row in results: filename = row[0] - - # Optional receipt time filtering based on check_receipt parameter if check_receipt in ["gdas", "gfs"]: query = "SELECT receipt_time FROM obs_files WHERE filename = ?" receipt_time = self.execute_query(query, (filename,))[0][0] try: receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") except ValueError: - receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S") # Try parsing without microseconds if it fails - - # Filter based on receipt time threshold + receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S") if receipt_time <= window_end - timedelta(minutes=minutes_behind_realtime[check_receipt]): continue valid_files.append(filename) - # Copy valid files to the destination directory + # Copy files to the destination directory dst_files = [] if len(valid_files) > 0: - src_dst_obs_list = [] # List of [src_file, dst_file] + src_dst_obs_list = [] # list of [src_file, dst_file] for src_file in valid_files: dst_file = join(dst_dir, f"{basename(src_file)}") dst_files.append(dst_file) src_dst_obs_list.append([src_file, dst_file]) - - # Ensure the destination directory exists FileHandler({'mkdir': [dst_dir]}).sync() FileHandler({'copy': src_dst_obs_list}).sync() return dst_files - diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py index d4a243d4..eea6b9e7 100644 --- a/ush/python/pyobsforge/tests/test_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_amsr2_database.py @@ -90,14 +90,14 @@ def test_parse_valid_filename(db): fname = "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc" fname = glob.glob(os.path.join(db.base_dir, fname))[0] parsed = db.parse_filename(fname) - creation_time = datetime.fromtimestamp(os.path.getctime(fname)) - + assert parsed is not None assert parsed[0] == fname - assert parsed[1] == datetime(2025, 3, 16, 6, 53, 24) # Start time - # assert parsed[2] == creation_time + assert parsed[1] == datetime(2025, 3, 16, 6, 53, 24) assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) - assert parsed[3] == "GW1" + assert parsed[3] == "AMSR2" + assert parsed[4] == "GW1" + assert parsed[5] == "SEAICE" def test_parse_invalid_filename(db): @@ -112,7 +112,7 @@ def test_ingest_files(db): cursor.execute("SELECT COUNT(*) FROM obs_files") count = cursor.fetchone()[0] conn.close() - assert count == 30, "Should ingest 8 valid AMSR2 files" + assert count == 30, "Should ingest 30 valid AMSR2 files" def test_get_valid_files(db): @@ -125,14 +125,19 @@ def test_get_valid_files(db): valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, - # instrument="AMSR2", - satellite="GW1") #, - # obs_type="SEAICE") + instrument="AMSR2", + satellite="GW1", + obs_type="SEAICE") # Files at 10:00 and 12:00 are within +/- 3h of 00:00 assert any("202503160514" in f for f in valid_files) assert any("202503160653" in f for f in valid_files) assert all("202503161326" not in f for f in valid_files) + + print("Valid files found:", len(valid_files)) + for f in valid_files: + print(" -", f) + assert len(valid_files) == 8 @@ -147,9 +152,9 @@ def test_get_valid_files_receipt(db): valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, - # instrument="AMSR2", + instrument="AMSR2", satellite="GW1", - # obs_type="SEAICE", + obs_type="SEAICE", check_receipt='gfs') print("Valid files found:", len(valid_files)) From 0d6cc7df8bb6c95f0b75c8793bc7c5ea2fe07435 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 17:46:23 +0000 Subject: [PATCH 09/32] py-test_2 --- ush/python/pyobsforge/obsdb/amsr2_db.py | 1 - ush/python/pyobsforge/obsdb/smap_db.py | 1 - ush/python/pyobsforge/task/providers.py | 31 ++++--------------- .../pyobsforge/tests/test_amsr2_database.py | 1 - 4 files changed, 6 insertions(+), 28 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py index 2d263cfe..13b2de16 100644 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/amsr2_db.py @@ -62,7 +62,6 @@ def parse_filename(self, filename): print(f"[DEBUG] Error parsing filename {filename}: {e}") return None - def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py index 759672aa..36108884 100644 --- a/ush/python/pyobsforge/obsdb/smap_db.py +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -56,7 +56,6 @@ def parse_filename(self, filename): print(f"[DEBUG] Error parsing filename {filename}: {e}") return None - def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.h5")) diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 23c88e12..d8a45111 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -98,31 +98,12 @@ def process_obs_space(self, **kwargs) -> None: logger.debug(f"obs_type for provider {provider}: {obs_type}") # Query the database for valid files - #input_files = self.db.get_valid_files(window_begin=window_begin, - ## window_end=window_end, - # dst_dir=obs_space, - # instrument=instrument, - # satellite=platform, - # obs_type=obs_type) - - # Check if this database accepts obs_type (not all do) - db_accepts_obs_type = provider in ("ghrsst", "rads") # Add more if needed - - # Build kwargs dynamically - db_kwargs = { - "window_begin": window_begin, - "window_end": window_end, - "dst_dir": obs_space, - "instrument": instrument, - "satellite": platform, - } - - if db_accepts_obs_type and obs_type is not None: - db_kwargs["obs_type"] = obs_type - - # Now run the query - input_files = self.db.get_valid_files(**db_kwargs) - + input_files = self.db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=obs_space, + instrument=instrument, + satellite=platform, + obs_type=obs_type) logger.info(f"number of valid files: {len(input_files)}") # Process the observations if the obs space is not empty diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py index eea6b9e7..78c0d8fe 100644 --- a/ush/python/pyobsforge/tests/test_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_amsr2_database.py @@ -137,7 +137,6 @@ def test_get_valid_files(db): print("Valid files found:", len(valid_files)) for f in valid_files: print(" -", f) - assert len(valid_files) == 8 From b7b215bc68d83a91e9a6b5d0fd8dff1bbb9afafe Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 17:52:35 +0000 Subject: [PATCH 10/32] py-norm_3 --- ush/python/pyobsforge/tests/test_smap_database.py | 6 +++--- ush/python/pyobsforge/tests/test_smos_database.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py index 1b67b3f8..e30d05b5 100644 --- a/ush/python/pyobsforge/tests/test_smap_database.py +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -50,7 +50,7 @@ def temp_obs_dir(): "SMAP_L2B_SSS_NRT_54074_A_20250316T213615.h5", "SMAP_L2B_SSS_NRT_54074_D_20250316T213615.h5", "SMAP_L2B_SSS_NRT_54075_A_20250316T231442.h5" - ] + ] for fname in filenames: fname_tmp = os.path.join(sub_dir, fname) with open(fname_tmp, "w") as f: @@ -88,10 +88,10 @@ def test_parse_valid_filename(db): fname = glob.glob(os.path.join(db.base_dir, fname))[0] parsed = db.parse_filename(fname) creation_time = datetime.fromtimestamp(os.path.getctime(fname)) - + assert parsed is not None assert parsed[0] == fname - assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) # Start time + assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) assert parsed[2] == creation_time assert parsed[3] == "SMAP" diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index c9a58dc6..35c9abd7 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -50,7 +50,7 @@ def temp_obs_dir(): "SM_OPER_MIR_OSUDP2_20250316T211359_20250316T220719_700_001_1.nc", "SM_OPER_MIR_OSUDP2_20250316T220407_20250316T225721_700_001_1.nc", "SM_OPER_MIR_OSUDP2_20250316T225404_20250316T234724_700_001_1.nc" - ] + ] for fname in filenames: fname_tmp = os.path.join(sub_dir, fname) with open(fname_tmp, "w") as f: @@ -87,8 +87,8 @@ def test_parse_valid_filename(db): fname = "SM_OPER_MIR_OSUDP2_20250316T061318_20250316T070637_700_001_1.nc" fname = glob.glob(os.path.join(db.base_dir, fname))[0] parsed = db.parse_filename(fname) - creation_time = datetime.fromtimestamp(os.path.getctime(fname)) - + creation_time = datetime.fromtimestamp(os.path.getctime(fname) + assert parsed is not None assert parsed[0] == fname assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time From 547497f40e358656f8a73f0272c8f9bda93510df Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 18:08:44 +0000 Subject: [PATCH 11/32] py-norm_4 --- ush/python/pyobsforge/tests/test_smap_database.py | 2 +- ush/python/pyobsforge/tests/test_smos_database.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py index e30d05b5..77abaee2 100644 --- a/ush/python/pyobsforge/tests/test_smap_database.py +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -93,7 +93,7 @@ def test_parse_valid_filename(db): assert parsed[0] == fname assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) assert parsed[2] == creation_time - assert parsed[3] == "SMAP" + assert parsed[3] == "SMAP" def test_parse_invalid_filename(db): diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index 35c9abd7..859c492e 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -87,7 +87,7 @@ def test_parse_valid_filename(db): fname = "SM_OPER_MIR_OSUDP2_20250316T061318_20250316T070637_700_001_1.nc" fname = glob.glob(os.path.join(db.base_dir, fname))[0] parsed = db.parse_filename(fname) - creation_time = datetime.fromtimestamp(os.path.getctime(fname) + creation_time = datetime.fromtimestamp(os.path.getctime(fname)) assert parsed is not None assert parsed[0] == fname From 8af6dfa664ce9de0aabd71fd8bf75545e9fb20e4 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 18:11:31 +0000 Subject: [PATCH 12/32] py-norm_5 --- ush/python/pyobsforge/tests/test_smos_database.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index 859c492e..5ca5c912 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -93,9 +93,8 @@ def test_parse_valid_filename(db): assert parsed[0] == fname assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time assert parsed[2] == creation_time - assert parsed[3] == "SMOS" - - + assert parsed[3] == "SMOS" + def test_parse_invalid_filename(db): assert db.parse_filename("junk.nc") is None assert db.parse_filename("SM_OPER_MIR_OSUDP2_invalid.nc") is None From ae9a031b22a22d764e8bd20920294b14d1f77e98 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 18:13:28 +0000 Subject: [PATCH 13/32] py-norm_6 --- ush/python/pyobsforge/tests/test_smos_database.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index 5ca5c912..c72e74b9 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -94,7 +94,8 @@ def test_parse_valid_filename(db): assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time assert parsed[2] == creation_time assert parsed[3] == "SMOS" - + + def test_parse_invalid_filename(db): assert db.parse_filename("junk.nc") is None assert db.parse_filename("SM_OPER_MIR_OSUDP2_invalid.nc") is None From dbae9f29a4d8939431dfa21c2e386b856f733b38 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 18:33:56 +0000 Subject: [PATCH 14/32] update globing amsr2 --- ush/python/pyobsforge/task/marine_prepobs.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 81890742..34a6eb4d 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -133,15 +133,13 @@ def process_obs_space(self, # Process AMSR2 if provider == "amsr2": parts = obs_space.split("_") - # instrument = "AMSR2" - platform = parts[1].upper() # "GW1" - hemisphere = parts[2].upper() # "NH" or "SH" + platform = parts[1].upper() + hemisphere = parts[2].upper() kwargs = { 'provider': provider, 'obs_space': obs_space, - # AMSR2 does not need 'instrument' in the database query 'instrument': instrument, - 'platform': f"{platform}_{hemisphere}", # e.g., "GW1_NH" - # 'obs_type': "seaice", + 'platform': platform, + 'obs_type': "seaice", 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, From d0373ba62d132d9fe08608d370f5eee363d759fc Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 18:38:53 +0000 Subject: [PATCH 15/32] remove variable --- ush/python/pyobsforge/task/marine_prepobs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 34a6eb4d..9e516a12 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -134,7 +134,6 @@ def process_obs_space(self, if provider == "amsr2": parts = obs_space.split("_") platform = parts[1].upper() - hemisphere = parts[2].upper() kwargs = { 'provider': provider, 'obs_space': obs_space, From ba96dfbe3deba16cb3f209e88301304a2d41562f Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 19:52:42 +0000 Subject: [PATCH 16/32] address comments --- ush/python/pyobsforge/obsdb/amsr2_db.py | 86 --------- ush/python/pyobsforge/obsdb/obsdb.py | 5 +- ush/python/pyobsforge/task/marine_prepobs.py | 6 +- ush/python/pyobsforge/task/providers.py | 4 +- .../pyobsforge/tests/test_amsr2_database.py | 164 ------------------ 5 files changed, 6 insertions(+), 259 deletions(-) delete mode 100644 ush/python/pyobsforge/obsdb/amsr2_db.py delete mode 100644 ush/python/pyobsforge/tests/test_amsr2_database.py diff --git a/ush/python/pyobsforge/obsdb/amsr2_db.py b/ush/python/pyobsforge/obsdb/amsr2_db.py deleted file mode 100644 index 13b2de16..00000000 --- a/ush/python/pyobsforge/obsdb/amsr2_db.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -import glob -from datetime import datetime -from pyobsforge.obsdb import BaseDatabase - - -class Amsr2Database(BaseDatabase): - """Class to manage an observation file database for data assimilation.""" - - def __init__(self, db_name="amsr2.db", - dcom_dir="/lfs/h1/ops/prod/dcom/", - obs_dir="seaice/pda"): - base_dir = os.path.join(dcom_dir, '*', obs_dir) - super().__init__(db_name, base_dir) - - def create_database(self): - """ - Create the SQLite database and observation files table. - - This method initializes the database with a table named `obs_files` to store metadata - about observation files. The table contains the following columns: - - - `id`: A unique identifier for each record (auto-incremented primary key). - - `filename`: The full path to the observation file (must be unique). - - `obs_time`: The timestamp of the observation, extracted from the filename. - - `receipt_time`: The timestamp when the file was added to the `dcom` directory. - - `instrument`: The instrument used to collect the observation (e.g., AMSR2). - - `satellite`: The satellite from which the observation was collected (e.g., GW1). - - `obs_type`: The type of observation (e.g., SEAICE) - - The table is created if it does not already exist. - """ - query = """ - CREATE TABLE IF NOT EXISTS obs_files ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - filename TEXT UNIQUE, - obs_time TIMESTAMP, - receipt_time TIMESTAMP, - instrument TEXT, - satellite TEXT, - obs_type TEXT - ) - """ - self.execute_query(query) - - def parse_filename(self, filename): - # Example filename: - # AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc - parts = os.path.basename(filename).replace('_', '-').split('-') - try: - if len(parts) >= 8 and parts[0] == 'AMSR2': - instrument = parts[0] - obs_type = parts[1] - satellite = parts[4] - obs_time_str = parts[5][1:16] - receipt_time_str = parts[7].split('.')[0][1:16] - - obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") - receipt_time = datetime.strptime(receipt_time_str, "%Y%m%d%H%M%S%f") - return filename, obs_time, receipt_time, instrument, satellite, obs_type - except Exception as e: - print(f"[DEBUG] Error parsing filename {filename}: {e}") - return None - - def ingest_files(self): - """Scan the directory for new observation files and insert them into the database.""" - obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) - print(f"[INFO] Found {len(obs_files)} new files to ingest") - print(f"[INFO] Files found: {obs_files}") - - # Counter for successful ingestions - ingested_count = 0 - - for file in obs_files: - parsed_data = self.parse_filename(file) - if parsed_data: - query = """ - INSERT INTO obs_files (filename, obs_time, receipt_time, instrument, satellite, obs_type) - VALUES (?, ?, ?, ?, ?, ?) - """ - try: - self.insert_record(query, parsed_data) - ingested_count += 1 - except Exception as e: - print(f"[DEBUG] Failed to insert record for {file}: {e}") - print(f"################################ Successfully ingested {ingested_count} files into the database.") diff --git a/ush/python/pyobsforge/obsdb/obsdb.py b/ush/python/pyobsforge/obsdb/obsdb.py index 2cb7e4ca..fa92d311 100644 --- a/ush/python/pyobsforge/obsdb/obsdb.py +++ b/ush/python/pyobsforge/obsdb/obsdb.py @@ -107,10 +107,7 @@ def get_valid_files(self, if check_receipt in ["gdas", "gfs"]: query = "SELECT receipt_time FROM obs_files WHERE filename = ?" receipt_time = self.execute_query(query, (filename,))[0][0] - try: - receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") - except ValueError: - receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S") + receipt_time = datetime.strptime(receipt_time, "%Y-%m-%d %H:%M:%S.%f") if receipt_time <= window_end - timedelta(minutes=minutes_behind_realtime[check_receipt]): continue diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 9e516a12..65633cf2 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -37,7 +37,7 @@ def __init__(self, config: Dict[str, Any]) -> None: # Initialize the Providers self.ghrsst = ProviderConfig.from_task_config("ghrsst", self.task_config) self.rads = ProviderConfig.from_task_config("rads", self.task_config) - self.amsr2 = ProviderConfig.from_task_config("amsr2", self.task_config) + self.nesdis_amsr2 = ProviderConfig.from_task_config("nesdis_amsr2", self.task_config) # Initialize the list of processed ioda files # TODO: Does not work. This should be a list of gathered ioda files that are created @@ -51,7 +51,7 @@ def initialize(self) -> None: # Update the database with new files self.ghrsst.db.ingest_files() self.rads.db.ingest_files() - self.amsr2.db.ingest_files() + self.nesdis_amsr2.db.ingest_files() @logit(logger) def execute(self) -> None: @@ -131,7 +131,7 @@ def process_obs_space(self, return result # Process AMSR2 - if provider == "amsr2": + if provider == "nesdis_amsr2": parts = obs_space.split("_") platform = parts[1].upper() kwargs = { diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index d8a45111..7bb71487 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -1,7 +1,7 @@ from logging import getLogger from pyobsforge.obsdb.ghrsst_db import GhrSstDatabase from pyobsforge.obsdb.rads_db import RADSDatabase -from pyobsforge.obsdb.amsr2_db import Amsr2Database +from pyobsforge.obsdb.nesdisamsr2_db import NesdisAmsr2Database from typing import Any from dataclasses import dataclass from wxflow import AttrDict @@ -60,7 +60,7 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide db = GhrSstDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="sst") elif provider_name == "rads": db = RADSDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wgrdbul/adt") - elif provider_name == "amsr2": + elif provider_name == "nesdis_amsr2": db = Amsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") diff --git a/ush/python/pyobsforge/tests/test_amsr2_database.py b/ush/python/pyobsforge/tests/test_amsr2_database.py deleted file mode 100644 index 78c0d8fe..00000000 --- a/ush/python/pyobsforge/tests/test_amsr2_database.py +++ /dev/null @@ -1,164 +0,0 @@ -import os -import glob -import tempfile -import shutil -import sqlite3 -from datetime import datetime, timedelta - -import pytest - -from pyobsforge.obsdb.amsr2_db import Amsr2Database # Adjust as needed - - -@pytest.fixture -def temp_obs_dir(): - """Create a temp directory with mock AMSR2 NetCDF files.""" - base_dir = tempfile.mkdtemp() - sub_dir = os.path.join(base_dir, "some_subdir", "seaice/pda") - os.makedirs(sub_dir) - - # Desired datetime for file timestamps - mock_time = datetime(2025, 3, 16, 0, 0, 0).timestamp() - - # Create mock NetCDF files - filenames = [ - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", - "AMSR2-SEAICE-NH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", - "AMSR2-SEAICE-SH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", - "invalid_file.nc" - ] - for fname in filenames: - fname_tmp = os.path.join(sub_dir, fname) - with open(fname_tmp, "w") as f: - f.write("fake content") - os.utime(fname_tmp, (mock_time, mock_time)) # (access_time, modification_time) - - yield base_dir - shutil.rmtree(base_dir) - - -@pytest.fixture -def db(temp_obs_dir): - """Initialize test database.""" - db_path = os.path.join(temp_obs_dir, "amsr2_test.db") - database = Amsr2Database( - db_name=db_path, - dcom_dir=temp_obs_dir, - obs_dir="seaice/pda" - ) - return database - - -def test_create_database(db): - db.create_database() - conn = sqlite3.connect(db.db_name) - cursor = conn.cursor() - cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='obs_files'") - assert cursor.fetchone() is not None - conn.close() - - -def test_parse_valid_filename(db): - print(glob.glob(os.path.join(db.base_dir, "*"))) - fname = "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc" - fname = glob.glob(os.path.join(db.base_dir, fname))[0] - parsed = db.parse_filename(fname) - - assert parsed is not None - assert parsed[0] == fname - assert parsed[1] == datetime(2025, 3, 16, 6, 53, 24) - assert parsed[2] == datetime(2025, 3, 16, 9, 2, 25) - assert parsed[3] == "AMSR2" - assert parsed[4] == "GW1" - assert parsed[5] == "SEAICE" - - -def test_parse_invalid_filename(db): - assert db.parse_filename("junk.nc") is None - assert db.parse_filename("AMSR2-SEAICE-NH_v2r2_GW1_invalid.nc") is None - - -def test_ingest_files(db): - db.ingest_files() - conn = sqlite3.connect(db.db_name) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM obs_files") - count = cursor.fetchone()[0] - conn.close() - assert count == 30, "Should ingest 30 valid AMSR2 files" - - -def test_get_valid_files(db): - db.ingest_files() - da_cycle = "20250316060000" - window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) - window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice' - # Test for AMSR2 ICEC - valid_files = db.get_valid_files(window_begin=window_begin, - window_end=window_end, - dst_dir=dst_dir, - instrument="AMSR2", - satellite="GW1", - obs_type="SEAICE") - - # Files at 10:00 and 12:00 are within +/- 3h of 00:00 - assert any("202503160514" in f for f in valid_files) - assert any("202503160653" in f for f in valid_files) - assert all("202503161326" not in f for f in valid_files) - - print("Valid files found:", len(valid_files)) - for f in valid_files: - print(" -", f) - assert len(valid_files) == 8 - - -def test_get_valid_files_receipt(db): - db.ingest_files() - da_cycle = "20250316060000" - window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) - window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice' - - # Test for AMSR2 ICEC - valid_files = db.get_valid_files(window_begin=window_begin, - window_end=window_end, - dst_dir=dst_dir, - instrument="AMSR2", - satellite="GW1", - obs_type="SEAICE", - check_receipt='gfs') - - print("Valid files found:", len(valid_files)) - for f in valid_files: - print(" -", f) - - # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later - assert len(valid_files) == 4 From 281a351f34e5c517c628101bf1d062cd1a52eed6 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 19:56:21 +0000 Subject: [PATCH 17/32] correct provider name --- ush/python/pyobsforge/task/providers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 7bb71487..087a0850 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -1,7 +1,7 @@ from logging import getLogger from pyobsforge.obsdb.ghrsst_db import GhrSstDatabase from pyobsforge.obsdb.rads_db import RADSDatabase -from pyobsforge.obsdb.nesdisamsr2_db import NesdisAmsr2Database +from pyobsforge.obsdb.nesdis_amsr2_db import NesdisAmsr2Database from typing import Any from dataclasses import dataclass from wxflow import AttrDict @@ -61,7 +61,7 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide elif provider_name == "rads": db = RADSDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wgrdbul/adt") elif provider_name == "nesdis_amsr2": - db = Amsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") + db = NesdisAmsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") From ca36d10db3a61bf22a55471e67a62430ac7af9f0 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 23 Apr 2025 19:59:54 +0000 Subject: [PATCH 18/32] add name -nesdis- to seaice product --- .../pyobsforge/obsdb/nesdis_amsr2_db.py | 84 +++++++++ .../tests/test_nesdis_amsr2_database.py | 165 ++++++++++++++++++ 2 files changed, 249 insertions(+) create mode 100644 ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py create mode 100644 ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py new file mode 100644 index 00000000..5cdd705b --- /dev/null +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -0,0 +1,84 @@ +import os +import glob +from datetime import datetime +from pyobsforge.obsdb import BaseDatabase + + +class NesdisAmsr2Database(BaseDatabase): + """Class to manage an observation file database for data assimilation.""" + + def __init__(self, db_name="nesdis_amsr2.db", + dcom_dir="/lfs/h1/ops/prod/dcom/", + obs_dir="seaice/pda"): + base_dir = os.path.join(dcom_dir, '*', obs_dir) + super().__init__(db_name, base_dir) + + def create_database(self): + """ + Create the SQLite database and observation files table. + + This method initializes the database with a table named `obs_files` to store metadata + about observation files. The table contains the following columns: + + - `id`: A unique identifier for each record (auto-incremented primary key). + - `filename`: The full path to the observation file (must be unique). + - `obs_time`: The timestamp of the observation, extracted from the filename. + - `receipt_time`: The timestamp when the file was added to the `dcom` directory. + - `instrument`: The instrument used to collect the observation (e.g., AMSR2). + - `satellite`: The satellite from which the observation was collected (e.g., GW1). + - `obs_type`: The type of observation (e.g., SEAICE) + + The table is created if it does not already exist. + """ + query = """ + CREATE TABLE IF NOT EXISTS obs_files ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + filename TEXT UNIQUE, + obs_time TIMESTAMP, + receipt_time TIMESTAMP, + instrument TEXT, + satellite TEXT, + obs_type TEXT + ) + """ + self.execute_query(query) + + def parse_filename(self, filename): + # Example filename: + # AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc + parts = os.path.basename(filename).replace('_', '-').split('-') + try: + if len(parts) >= 8 and parts[0] == 'AMSR2': + instrument = parts[0] + obs_type = parts[1] + satellite = parts[4] + obs_time_str = parts[5][1:16] + obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, instrument, satellite, obs_type + except Exception as e: + print(f"[DEBUG] Error parsing filename {filename}: {e}") + return None + + def ingest_files(self): + """Scan the directory for new observation files and insert them into the database.""" + obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) + print(f"[INFO] Found {len(obs_files)} new files to ingest") + print(f"[INFO] Files found: {obs_files}") + + # Counter for successful ingestions + ingested_count = 0 + + for file in obs_files: + parsed_data = self.parse_filename(file) + if parsed_data: + query = """ + INSERT INTO obs_files (filename, obs_time, receipt_time, instrument, satellite, obs_type) + VALUES (?, ?, ?, ?, ?, ?) + """ + try: + self.insert_record(query, parsed_data) + ingested_count += 1 + except Exception as e: + print(f"[DEBUG] Failed to insert record for {file}: {e}") + print(f"################################ Successfully ingested {ingested_count} files into the database.") diff --git a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py new file mode 100644 index 00000000..34ac881b --- /dev/null +++ b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py @@ -0,0 +1,165 @@ +import os +import glob +import tempfile +import shutil +import sqlite3 +from datetime import datetime, timedelta + +import pytest + +from pyobsforge.obsdb.nesdis_amsr2_db import NesdisAmsr2Database # Adjust as needed + + +@pytest.fixture +def temp_obs_dir(): + """Create a temp directory with mock NESDIS AMSR2 NetCDF files.""" + base_dir = tempfile.mkdtemp() + sub_dir = os.path.join(base_dir, "some_subdir", "seaice/pda") + os.makedirs(sub_dir) + + # Desired datetime for file timestamps + mock_time = datetime(2025, 3, 16, 0, 0, 0).timestamp() + + # Create mock NetCDF files + filenames = [ + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", + "AMSR2-SEAICE-NH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160020240_e202503160159220_c202503160230450.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160159240_e202503160338230_c202503160410050.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160338250_e202503160514230_c202503160545510.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160514240_e202503160653220_c202503160725420.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503160829250_e202503161008230_c202503161121060.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161008240_e202503161147220_c202503161300120.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161147230_e202503161326230_c202503161357200.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161326240_e202503161502220_c202503161540340.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161502240_e202503161641220_c202503161715510.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161641230_e202503161820230_c202503161856520.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503161820240_e202503162002220_c202503162039030.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162002240_e202503162144230_c202503162217280.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162144250_e202503162323220_c202503162358480.nc", + "AMSR2-SEAICE-SH_v2r2_GW1_s202503162323240_e202503170102220_c202503170137120.nc", + "invalid_file.nc" + ] + for fname in filenames: + fname_tmp = os.path.join(sub_dir, fname) + with open(fname_tmp, "w") as f: + f.write("fake content") + os.utime(fname_tmp, (mock_time, mock_time)) # (access_time, modification_time) + + yield base_dir + shutil.rmtree(base_dir) + + +@pytest.fixture +def db(temp_obs_dir): + """Initialize test database.""" + db_path = os.path.join(temp_obs_dir, "amsr2_test.db") + database = NesdisAmsr2Database( + db_name=db_path, + dcom_dir=temp_obs_dir, + obs_dir="seaice/pda" + ) + return database + + +def test_create_database(db): + db.create_database() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='obs_files'") + assert cursor.fetchone() is not None + conn.close() + + +def test_parse_valid_filename(db): + print(glob.glob(os.path.join(db.base_dir, "*"))) + fname = "AMSR2-SEAICE-NH_v2r2_GW1_s202503160653240_e202503160829230_c202503160902250.nc" + fname = glob.glob(os.path.join(db.base_dir, fname))[0] + parsed = db.parse_filename(fname) + creation_time = datetime.fromtimestamp(os.path.getctime(fname)) + + assert parsed is not None + assert parsed[0] == fname + assert parsed[1] == datetime(2025, 3, 16, 6, 53, 24) + assert parsed[2] == creation_time + assert parsed[3] == "AMSR2" + assert parsed[4] == "GW1" + assert parsed[5] == "SEAICE" + + +def test_parse_invalid_filename(db): + assert db.parse_filename("junk.nc") is None + assert db.parse_filename("AMSR2-SEAICE-NH_v2r2_GW1_invalid.nc") is None + + +def test_ingest_files(db): + db.ingest_files() + conn = sqlite3.connect(db.db_name) + cursor = conn.cursor() + cursor.execute("SELECT COUNT(*) FROM obs_files") + count = cursor.fetchone()[0] + conn.close() + assert count == 30, "Should ingest 30 valid AMSR2 files" + + +def test_get_valid_files(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'seaice' + # Test for AMSR2 ICEC + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="SEAICE") + + # Files at 10:00 and 12:00 are within +/- 3h of 00:00 + assert any("202503160514" in f for f in valid_files) + assert any("202503160653" in f for f in valid_files) + assert all("202503161326" not in f for f in valid_files) + + print("Valid files found:", len(valid_files)) + for f in valid_files: + print(" -", f) + assert len(valid_files) == 8 + + +def test_get_valid_files_receipt(db): + db.ingest_files() + da_cycle = "20250316060000" + window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) + window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) + dst_dir = 'seaice' + + # Test for AMSR2 ICEC + valid_files = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="SEAICE", + check_receipt='gfs') + + print("Valid files found:", len(valid_files)) + for f in valid_files: + print(" -", f) + + # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later + assert len(valid_files) == 8 From 383f63ab430f98dfd56ebe198a4652d742848a74 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 24 Apr 2025 02:07:18 +0000 Subject: [PATCH 19/32] revise config --- parm/config.orion.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index 55b2425b..f509dcd7 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -55,7 +55,7 @@ marinedump: min: -2.0 max: 3.0 error ratio: 1.0 - amsr2: + nesdis_amsr2: list: - seaice_gw1_nh - seaice_gw1_sh From 6cb2318ff0d4ffd38352990e7f5cba0c7b19fa3e Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 24 Apr 2025 14:34:35 +0000 Subject: [PATCH 20/32] revise kwargs --- parm/config.hera.yaml | 68 ++++++++++++++++++++ ush/python/pyobsforge/task/marine_prepobs.py | 2 +- 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 parm/config.hera.yaml diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml new file mode 100644 index 00000000..de592bb0 --- /dev/null +++ b/parm/config.hera.yaml @@ -0,0 +1,68 @@ +obsforge: + PSLOT: obsforge + HOMEobsforge: /scratch1/NCEPDEV/da/Mindo.Choi/test/obsForge + SDATE: 202503141800 + EDATE: 202503150000 + COMROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/COMROOT + DCOMROOT: /scratch1/NCEPDEV/da/common/realtime_sample/lfs/h1/ops/prod/dcom + DATAROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/RUNDIRS + SCHEDULER: slurm + ACCOUNT: da-cpu + QUEUE: debug + PARTITION: hera + KEEPDATA: NO + assim_freq: 6 + +aoddump: + provider: VIIRSAOD + platforms: ['npp', 'n20', 'n21'] + thinning_threshold: 0 + channel: 4 + preqc: 2 + WALLTIME_AOD_DUMP: '00:10:00' + TASK_GEOM_AOD_DUMP: '1:ppn=1:tpp=1' + MEMORY_AOD_DUMP: 96GB + +marinedump: + providers: + ghrsst: + list: + - sst_viirs_n21_l3u + - sst_viirs_n20_l3u + - sst_viirs_npp_l3u + - sst_avhrrf_ma_l3u + - sst_avhrrf_mb_l3u + - sst_avhrrf_mc_l3u + - sst_ahi_h08_l3c + - sst_abi_g17_l3c + - sst_abi_g16_l3c + qc config: + min: -2.0 + max: 45.0 + stride: 15 + min number of obs: 10 + rads: + list: + - rads_adt_3a + - rads_adt_3b + - rads_adt_6a + - rads_adt_c2 + - rads_adt_j2 + - rads_adt_j3 + - rads_adt_sa + - rads_adt_sw + qc config: + min: -2.0 + max: 3.0 + error ratio: 1.0 + nesdis_amsr2: + list: + - seaice_gw1_nh + - seaice_gw1_sh + qc config: + min: 0.0 + max: 1.0 + + WALLTIME_MARINE_DUMP: '00:10:00' + TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' + MEMORY_MARINE_DUMP: 32GB diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 65633cf2..afdb091a 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -144,7 +144,7 @@ def process_obs_space(self, 'window_end': self.task_config.window_end, 'task_config': self.task_config } - result = self.amsr2.process_obs_space(**kwargs) + result = self.nesdis_amsr2.process_obs_space(**kwargs) return result else: logger.error(f"Provider {provider} not supported") From 4929d94e0619fbed0388c0dfdc2d7ad61032af32 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 24 Apr 2025 15:56:04 +0000 Subject: [PATCH 21/32] config, provider and obs_space --- parm/config.hera.yaml | 3 +-- parm/config.orion.yaml | 3 +-- parm/config.yaml | 2 +- ush/python/pyobsforge/task/marine_prepobs.py | 4 ++-- 4 files changed, 5 insertions(+), 7 deletions(-) diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml index de592bb0..e5ed2296 100644 --- a/parm/config.hera.yaml +++ b/parm/config.hera.yaml @@ -57,8 +57,7 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_gw1_nh - - seaice_gw1_sh + - seaice_gw1 qc config: min: 0.0 max: 1.0 diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index f509dcd7..b884f2c9 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -57,8 +57,7 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_gw1_nh - - seaice_gw1_sh + - seaice_gw1 qc config: min: 0.0 max: 1.0 diff --git a/parm/config.yaml b/parm/config.yaml index 1d8374ce..bcb9e07b 120000 --- a/parm/config.yaml +++ b/parm/config.yaml @@ -1 +1 @@ -config.orion.yaml \ No newline at end of file +config.hera.yaml \ No newline at end of file diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index afdb091a..7ba657a3 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -135,10 +135,10 @@ def process_obs_space(self, parts = obs_space.split("_") platform = parts[1].upper() kwargs = { - 'provider': provider, + 'provider': "amsr2", 'obs_space': obs_space, 'platform': platform, - 'obs_type': "seaice", + 'obs_type': "SEAICE", 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, From da77a0cfd2f787a27d2a47d040f2f24d99ad0675 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Thu, 24 Apr 2025 18:01:46 +0000 Subject: [PATCH 22/32] clean up and obs_type convention --- parm/config.hera.yaml | 2 +- parm/config.orion.yaml | 2 +- ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py | 2 -- ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml index e5ed2296..745300af 100644 --- a/parm/config.hera.yaml +++ b/parm/config.hera.yaml @@ -57,7 +57,7 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_gw1 + - seaice_amsr2_gw1_l2 qc config: min: 0.0 max: 1.0 diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index b884f2c9..f5cb9bbf 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -57,7 +57,7 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_gw1 + - seaice_amsr2_gw1_l2 qc config: min: 0.0 max: 1.0 diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py index 5cdd705b..7a7c6d8b 100644 --- a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -63,8 +63,6 @@ def parse_filename(self, filename): def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.nc")) - print(f"[INFO] Found {len(obs_files)} new files to ingest") - print(f"[INFO] Files found: {obs_files}") # Counter for successful ingestions ingested_count = 0 diff --git a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py index 34ac881b..97289c68 100644 --- a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py @@ -67,7 +67,7 @@ def temp_obs_dir(): @pytest.fixture def db(temp_obs_dir): """Initialize test database.""" - db_path = os.path.join(temp_obs_dir, "amsr2_test.db") + db_path = os.path.join(temp_obs_dir, "nesdis_amsr2_test.db") database = NesdisAmsr2Database( db_name=db_path, dcom_dir=temp_obs_dir, From b16a4ed89d584020850a984fb26ffadccd62bd43 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Mon, 28 Apr 2025 16:15:10 +0000 Subject: [PATCH 23/32] obs_space correction --- parm/config.hera.yaml | 3 +- .../pyobsforge/obsdb/nesdis_amsr2_db.py | 26 +++++++--- ush/python/pyobsforge/task/marine_prepobs.py | 44 ++++++++++++---- .../tests/test_nesdis_amsr2_database.py | 52 +++++++++++++------ 4 files changed, 92 insertions(+), 33 deletions(-) diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml index 745300af..52cef6f9 100644 --- a/parm/config.hera.yaml +++ b/parm/config.hera.yaml @@ -57,7 +57,8 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_amsr2_gw1_l2 + - icec_amsr2_north + - icec_amsr2_south qc config: min: 0.0 max: 1.0 diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py index 7a7c6d8b..576ee30e 100644 --- a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -46,15 +46,29 @@ def create_database(self): def parse_filename(self, filename): # Example filename: # AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc - parts = os.path.basename(filename).replace('_', '-').split('-') + parts = os.path.basename(filename).split('_') + # parts = os.path.basename(filename).replace('_', '-').split('-') try: - if len(parts) >= 8 and parts[0] == 'AMSR2': - instrument = parts[0] - obs_type = parts[1] - satellite = parts[4] - obs_time_str = parts[5][1:16] + if parts[0].startswith("AMSR2-SEAICE"): + # Extract hemisphere from the first hyphen-separated segment + name_parts = parts[0].split('-') + instrument = name_parts[0] + raw_obs_type = name_parts[1] + hemisphere = name_parts[2].lower() + + # Determine obs_type + if hemisphere == 'nh': + obs_type = 'icec_amsr2_north' + elif hemisphere == 'sh': + obs_type = 'icec_amsr2_south' + else: + raise ValueError(f"Unrecognized hemisphere in filename: {filename}") + + satellite = parts[2] + obs_time_str = parts[3][1:16] # sYYYYMMDDHHMMSSf obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, instrument, satellite, obs_type except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 7ba657a3..0bbb98e2 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -130,15 +130,26 @@ def process_obs_space(self, result = self.rads.process_obs_space(**kwargs) return result - # Process AMSR2 + # Process NESDIS_AMSR2 if provider == "nesdis_amsr2": parts = obs_space.split("_") - platform = parts[1].upper() + if obs_space.startswith("icec_amsr2_"): + platform = "GW1" + instrument = "AMSR2" + obs_type = "SEAICE" + satellite = "GW1" + else: + platform = parts[1].upper() + instrument = "AMSR2" + obs_type = "SEAICE" + satellite = "GW1" kwargs = { 'provider': "amsr2", 'obs_space': obs_space, 'platform': platform, - 'obs_type': "SEAICE", + 'instrument': instrument, + 'satellite': satellite, + 'obs_type': obs_space, 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, @@ -167,13 +178,26 @@ def finalize(self) -> None: obs_types = ['sst', 'adt', 'icec', 'sss'] src_dst_obs_list = [] # list of [src_file, dst_file] for obs_type in obs_types: - # Create the destination directory - comout_tmp = join(comout, obs_type) - FileHandler({'mkdir': [comout_tmp]}).sync() - - # Glob the ioda files - ioda_files = glob.glob(join(self.task_config['DATA'], - f"{self.task_config['PREFIX']}*{obs_type}_*.nc")) + if obs_type == 'icec': + # Special handling for icec + comout_tmp = join(comout, 'icec') + FileHandler({'mkdir': [comout_tmp]}).sync() + + # Find BOTH north and south files + ioda_files = [] + ioda_files += glob.glob(join(self.task_config['DATA'], + f"{self.task_config['PREFIX']}*icec_amsr2_north*.nc")) + ioda_files += glob.glob(join(self.task_config['DATA'], + f"{self.task_config['PREFIX']}*icec_amsr2_south*.nc")) + else: + # Standard handling for other obs types + # Create the destination directory + comout_tmp = join(comout, obs_type) + FileHandler({'mkdir': [comout_tmp]}).sync() + + # Glob the ioda files + ioda_files = glob.glob(join(self.task_config['DATA'], + f"{self.task_config['PREFIX']}*{obs_type}_*.nc")) for ioda_file in ioda_files: logger.info(f"ioda_file: {ioda_file}") src_file = ioda_file diff --git a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py index 97289c68..f16f9a75 100644 --- a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py @@ -98,7 +98,8 @@ def test_parse_valid_filename(db): assert parsed[2] == creation_time assert parsed[3] == "AMSR2" assert parsed[4] == "GW1" - assert parsed[5] == "SEAICE" + # assert parsed[5] == "SEAICE" + assert parsed[5] == "icec_amsr2_north" def test_parse_invalid_filename(db): @@ -121,14 +122,23 @@ def test_get_valid_files(db): da_cycle = "20250316060000" window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice' + dst_dir = 'icec' # Test for AMSR2 ICEC - valid_files = db.get_valid_files(window_begin=window_begin, - window_end=window_end, - dst_dir=dst_dir, - instrument="AMSR2", - satellite="GW1", - obs_type="SEAICE") + valid_files_north = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="icec_amsr2_north") + + valid_files_south = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="icec_amsr2_south") + + valid_files = valid_files_north + valid_files_south # Files at 10:00 and 12:00 are within +/- 3h of 00:00 assert any("202503160514" in f for f in valid_files) @@ -146,16 +156,26 @@ def test_get_valid_files_receipt(db): da_cycle = "20250316060000" window_begin = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") - timedelta(hours=3) window_end = datetime.strptime(da_cycle, "%Y%m%d%H%M%S") + timedelta(hours=3) - dst_dir = 'seaice' + dst_dir = 'icec' # Test for AMSR2 ICEC - valid_files = db.get_valid_files(window_begin=window_begin, - window_end=window_end, - dst_dir=dst_dir, - instrument="AMSR2", - satellite="GW1", - obs_type="SEAICE", - check_receipt='gfs') + valid_files_north = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="icec_amsr2_north", + check_receipt='gfs') + + valid_files_south = db.get_valid_files(window_begin=window_begin, + window_end=window_end, + dst_dir=dst_dir, + instrument="AMSR2", + satellite="GW1", + obs_type="icec_amsr2_south", + check_receipt='gfs') + + valid_files = valid_files_north + valid_files_south print("Valid files found:", len(valid_files)) for f in valid_files: From 223851dbb005a4efad125b41c23f12da5f46cc80 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Mon, 28 Apr 2025 16:23:06 +0000 Subject: [PATCH 24/32] fix pytest --- ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py | 3 +-- ush/python/pyobsforge/task/marine_prepobs.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py index 576ee30e..a03f08b6 100644 --- a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -53,10 +53,9 @@ def parse_filename(self, filename): # Extract hemisphere from the first hyphen-separated segment name_parts = parts[0].split('-') instrument = name_parts[0] - raw_obs_type = name_parts[1] hemisphere = name_parts[2].lower() - # Determine obs_type + # Determine obs_type if hemisphere == 'nh': obs_type = 'icec_amsr2_north' elif hemisphere == 'sh': diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 0bbb98e2..1ccabdaa 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -149,7 +149,7 @@ def process_obs_space(self, 'platform': platform, 'instrument': instrument, 'satellite': satellite, - 'obs_type': obs_space, + 'obs_type': obs_type, 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, From bc63ab1faafa4ccf79f149ff962669929e4fddf0 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Mon, 28 Apr 2025 18:03:22 +0000 Subject: [PATCH 25/32] marine icec kwargs --- ush/python/pyobsforge/task/marine_prepobs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 1ccabdaa..7bc68dff 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -136,12 +136,12 @@ def process_obs_space(self, if obs_space.startswith("icec_amsr2_"): platform = "GW1" instrument = "AMSR2" - obs_type = "SEAICE" + # obs_type = "SEAICE" satellite = "GW1" else: platform = parts[1].upper() instrument = "AMSR2" - obs_type = "SEAICE" + # obs_type = "SEAICE" satellite = "GW1" kwargs = { 'provider': "amsr2", @@ -149,7 +149,7 @@ def process_obs_space(self, 'platform': platform, 'instrument': instrument, 'satellite': satellite, - 'obs_type': obs_type, + 'obs_type': obs_space, 'output_file': output_file, 'window_begin': self.task_config.window_begin, 'window_end': self.task_config.window_end, From a2cb073097b867d7b8655a67cd70d0518cf67022 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 29 Apr 2025 15:44:25 +0000 Subject: [PATCH 26/32] address comments --- parm/config.hera.yaml | 6 +++ parm/config.orion.yaml | 5 +- .../pyobsforge/obsdb/nesdis_amsr2_db.py | 51 ++++++++++--------- ush/python/pyobsforge/obsdb/smap_db.py | 23 +++++++++ ush/python/pyobsforge/obsdb/smos_db.py | 22 +++++--- 5 files changed, 74 insertions(+), 33 deletions(-) diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml index 52cef6f9..c71e4739 100644 --- a/parm/config.hera.yaml +++ b/parm/config.hera.yaml @@ -62,6 +62,12 @@ marinedump: qc config: min: 0.0 max: 1.0 + smap: + list: + - sss_smap + qc config: + min: 0.1 + max: 40.0 WALLTIME_MARINE_DUMP: '00:10:00' TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index f5cb9bbf..6d10448c 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -9,7 +9,7 @@ obsforge: SCHEDULER: slurm ACCOUNT: da-cpu QUEUE: debug - PARTITION: hercules + PARTITION: orion KEEPDATA: NO assim_freq: 6 @@ -57,7 +57,8 @@ marinedump: error ratio: 1.0 nesdis_amsr2: list: - - seaice_amsr2_gw1_l2 + - icec_amsr2_north + - icec_amsr2_south qc config: min: 0.0 max: 1.0 diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py index a03f08b6..bc231932 100644 --- a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -44,31 +44,36 @@ def create_database(self): self.execute_query(query) def parse_filename(self, filename): - # Example filename: - # AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc + """Extract metadata from filenames matching the AMSR2-SEAICE pattern.""" + # Make sure the filename matches the expected pattern + # Pattern: AMSR2-SEAICE-NH_v2r2_GW1_s202503140032240_e202503140211220_c202503140245560.nc parts = os.path.basename(filename).split('_') - # parts = os.path.basename(filename).replace('_', '-').split('-') + + # Pre-check: Must be an AMSR2-SEAICE file + if not parts[0].startswith("AMSR2-SEAICE"): + print(f"[DEBUG] Skipping non AMSR2-SEAICE file: {filename}") + return None + try: - if parts[0].startswith("AMSR2-SEAICE"): - # Extract hemisphere from the first hyphen-separated segment - name_parts = parts[0].split('-') - instrument = name_parts[0] - hemisphere = name_parts[2].lower() - - # Determine obs_type - if hemisphere == 'nh': - obs_type = 'icec_amsr2_north' - elif hemisphere == 'sh': - obs_type = 'icec_amsr2_south' - else: - raise ValueError(f"Unrecognized hemisphere in filename: {filename}") - - satellite = parts[2] - obs_time_str = parts[3][1:16] # sYYYYMMDDHHMMSSf - obs_time = datetime.strptime(obs_time_str, "%Y%m%d%H%M%S%f") - receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - - return filename, obs_time, receipt_time, instrument, satellite, obs_type + # Extract hemisphere from the first hyphen-separated segment + name_parts = parts[0].split('-') + instrument = name_parts[0] + hemisphere = name_parts[2].lower() + + # Determine obs_type + if hemisphere == 'nh': + obs_type = 'icec_amsr2_north' + elif hemisphere == 'sh': + obs_type = 'icec_amsr2_south' + else: + print(f"[DEBUG] Unrecognized hemisphere in filename: {filename}") + return None + + satellite = parts[2] + obs_time = datetime.strptime(parts[3][1:16], "%Y%m%d%H%M%S%f") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, instrument, satellite, obs_type + except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") return None diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py index 36108884..efa464e4 100644 --- a/ush/python/pyobsforge/obsdb/smap_db.py +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -39,6 +39,29 @@ def create_database(self): """ self.execute_query(query) + def parse_filename(self, filename): + # Pattern: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 + basename = os.path.basename(filename) + parts = basename.split('_') + + # Pre-check: Must match SMAP_L2B_SSS_NRT structure + if not basename.startswith("SMAP_L2B_SSS_NRT") or len(parts) < 7: + print(f"[DEBUG] Skipping non-SMAP_L2B_SSS_NRT file: {filename}") + return None + + try: + satellite = "SMAP" + timestamp_with_ext = parts[6] + timestamp_str = os.path.splitext(timestamp_with_ext)[0] + obs_time = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, satellite + + except Exception as e: + print(f"[DEBUG] Error parsing filename {filename}: {e}") + return None + + def parse_filename(self, filename): # patten: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 basename = os.path.basename(filename) diff --git a/ush/python/pyobsforge/obsdb/smos_db.py b/ush/python/pyobsforge/obsdb/smos_db.py index 41ad43cd..dd4c3275 100644 --- a/ush/python/pyobsforge/obsdb/smos_db.py +++ b/ush/python/pyobsforge/obsdb/smos_db.py @@ -40,18 +40,24 @@ def create_database(self): self.execute_query(query) def parse_filename(self, filename): - # patten: SM_OPER_MIR_OSUDP2_20250315T001156_20250315T010515_700_001_1.nc + # Extract metadata from filenames matching the SMOS OSUDP2 pattern. + # Pattern: SM_OPER_MIR_OSUDP2_20250315T001156_20250315T010515_700_001_1.nc basename = os.path.basename(filename) parts = basename.split('_') + + # Pre-check: Must match expected prefix and structure + if not basename.startswith("SM_OPER_MIR_OSUDP") or len(parts) < 6: + print(f"[DEBUG] Skipping non-SMOS OSUDP2 file: {filename}") + return None + try: - if basename.startswith("SM_OPER_MIR_OSUDP") and len(parts) >= 6: - satellite = "SMOS" - start_time_str = parts[4] - obs_time = datetime.strptime(start_time_str, "%Y%m%dT%H%M%S") - receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - return filename, obs_time, receipt_time, satellite + satellite = "SMOS" + start_time_str = parts[4] + obs_time = datetime.strptime(start_time_str, "%Y%m%dT%H%M%S") + receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) + return filename, obs_time, receipt_time, satellite - except ValueError as e: + except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") return None From 4c10563fb937cea07e87f1cd68c29102eafa7740 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 29 Apr 2025 15:48:27 +0000 Subject: [PATCH 27/32] clean up --- ush/python/pyobsforge/obsdb/smap_db.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py index efa464e4..611c9bc8 100644 --- a/ush/python/pyobsforge/obsdb/smap_db.py +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -61,24 +61,6 @@ def parse_filename(self, filename): print(f"[DEBUG] Error parsing filename {filename}: {e}") return None - - def parse_filename(self, filename): - # patten: SMAP_L2B_SSS_NRT_54047_A_20250315T011742.h5 - basename = os.path.basename(filename) - parts = basename.split('_') - try: - if basename.startswith("SMAP_L2B_SSS_NRT") and len(parts) >= 7: - satellite = "SMAP" - timestamp_with_ext = parts[6] - timestamp_str = os.path.splitext(timestamp_with_ext)[0] - obs_time = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") - receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - return filename, obs_time, receipt_time, satellite - - except ValueError as e: - print(f"[DEBUG] Error parsing filename {filename}: {e}") - return None - def ingest_files(self): """Scan the directory for new observation files and insert them into the database.""" obs_files = glob.glob(os.path.join(self.base_dir, "*.h5")) From c6a117b1d6510bc6721e166caad5cb0c3d3c3a5e Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 29 Apr 2025 16:04:08 +0000 Subject: [PATCH 28/32] config yaml for hercules --- parm/config.hercules.yaml | 68 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 parm/config.hercules.yaml diff --git a/parm/config.hercules.yaml b/parm/config.hercules.yaml new file mode 100644 index 00000000..d43c301b --- /dev/null +++ b/parm/config.hercules.yaml @@ -0,0 +1,68 @@ +obsforge: + PSLOT: obsforge + HOMEobsforge: /work2/noaa/da/mchoi3/temp/obsForge + SDATE: 202503141800 + EDATE: 202503150000 + COMROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/COMROOT + DCOMROOT: /work2/noaa/da/common/lfs/h1/ops/prod/dcom + DATAROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/RUNDIRS + SCHEDULER: slurm + ACCOUNT: da-cpu + QUEUE: debug + PARTITION: hercules + KEEPDATA: NO + assim_freq: 6 + +aoddump: + provider: VIIRSAOD + platforms: ['npp', 'n20', 'n21'] + thinning_threshold: 0 + channel: 4 + preqc: 2 + WALLTIME_AOD_DUMP: '00:10:00' + TASK_GEOM_AOD_DUMP: '1:ppn=1:tpp=1' + MEMORY_AOD_DUMP: 96GB + +marinedump: + providers: + ghrsst: + list: + - sst_viirs_n21_l3u + - sst_viirs_n20_l3u + - sst_viirs_npp_l3u + - sst_avhrrf_ma_l3u + - sst_avhrrf_mb_l3u + - sst_avhrrf_mc_l3u + - sst_ahi_h08_l3c + - sst_abi_g17_l3c + - sst_abi_g16_l3c + qc config: + min: -2.0 + max: 45.0 + stride: 15 + min number of obs: 10 + rads: + list: + - rads_adt_3a + - rads_adt_3b + - rads_adt_6a + - rads_adt_c2 + - rads_adt_j2 + - rads_adt_j3 + - rads_adt_sa + - rads_adt_sw + qc config: + min: -2.0 + max: 3.0 + error ratio: 1.0 + nesdis_amsr2: + list: + - icec_amsr2_north + - icec_amsr2_south + qc config: + min: 0.0 + max: 1.0 + + WALLTIME_MARINE_DUMP: '00:10:00' + TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' + MEMORY_MARINE_DUMP: 32GB From 6050ede524f0ee6b7e741b5838a8bd12020a87de Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 29 Apr 2025 20:42:30 +0000 Subject: [PATCH 29/32] smap success --- parm/config.hera.yaml | 8 +- parm/config.hercules.yaml | 22 +++-- parm/config.orion.yaml | 22 +++-- .../pyobsforge/obsdb/nesdis_amsr2_db.py | 8 +- ush/python/pyobsforge/obsdb/smap_db.py | 10 ++- ush/python/pyobsforge/obsdb/smos_db.py | 10 ++- ush/python/pyobsforge/task/marine_prepobs.py | 86 ++++++++++++------- ush/python/pyobsforge/task/providers.py | 6 ++ .../tests/test_nesdis_amsr2_database.py | 4 +- .../pyobsforge/tests/test_smap_database.py | 3 + .../pyobsforge/tests/test_smos_database.py | 3 + 11 files changed, 126 insertions(+), 56 deletions(-) diff --git a/parm/config.hera.yaml b/parm/config.hera.yaml index c71e4739..379d057b 100644 --- a/parm/config.hera.yaml +++ b/parm/config.hera.yaml @@ -64,7 +64,13 @@ marinedump: max: 1.0 smap: list: - - sss_smap + - sss_smap_l2 + qc config: + min: 0.1 + max: 40.0 + smos: + list: + - sss_smos_l2 qc config: min: 0.1 max: 40.0 diff --git a/parm/config.hercules.yaml b/parm/config.hercules.yaml index d43c301b..379d057b 100644 --- a/parm/config.hercules.yaml +++ b/parm/config.hercules.yaml @@ -1,15 +1,15 @@ obsforge: PSLOT: obsforge - HOMEobsforge: /work2/noaa/da/mchoi3/temp/obsForge + HOMEobsforge: /scratch1/NCEPDEV/da/Mindo.Choi/test/obsForge SDATE: 202503141800 EDATE: 202503150000 - COMROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/COMROOT - DCOMROOT: /work2/noaa/da/common/lfs/h1/ops/prod/dcom - DATAROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/RUNDIRS + COMROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/COMROOT + DCOMROOT: /scratch1/NCEPDEV/da/common/realtime_sample/lfs/h1/ops/prod/dcom + DATAROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/RUNDIRS SCHEDULER: slurm ACCOUNT: da-cpu QUEUE: debug - PARTITION: hercules + PARTITION: hera KEEPDATA: NO assim_freq: 6 @@ -62,6 +62,18 @@ marinedump: qc config: min: 0.0 max: 1.0 + smap: + list: + - sss_smap_l2 + qc config: + min: 0.1 + max: 40.0 + smos: + list: + - sss_smos_l2 + qc config: + min: 0.1 + max: 40.0 WALLTIME_MARINE_DUMP: '00:10:00' TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' diff --git a/parm/config.orion.yaml b/parm/config.orion.yaml index 6d10448c..379d057b 100644 --- a/parm/config.orion.yaml +++ b/parm/config.orion.yaml @@ -1,15 +1,15 @@ obsforge: PSLOT: obsforge - HOMEobsforge: /work2/noaa/da/mchoi3/temp/obsForge + HOMEobsforge: /scratch1/NCEPDEV/da/Mindo.Choi/test/obsForge SDATE: 202503141800 EDATE: 202503150000 - COMROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/COMROOT - DCOMROOT: /work2/noaa/da/common/lfs/h1/ops/prod/dcom - DATAROOT: /work2/noaa/da/mchoi3/temp/test_obsForge/RUNDIRS + COMROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/COMROOT + DCOMROOT: /scratch1/NCEPDEV/da/common/realtime_sample/lfs/h1/ops/prod/dcom + DATAROOT: /scratch1/NCEPDEV/da/Mindo.Choi/test/test_obsForge/RUNDIRS SCHEDULER: slurm ACCOUNT: da-cpu QUEUE: debug - PARTITION: orion + PARTITION: hera KEEPDATA: NO assim_freq: 6 @@ -62,6 +62,18 @@ marinedump: qc config: min: 0.0 max: 1.0 + smap: + list: + - sss_smap_l2 + qc config: + min: 0.1 + max: 40.0 + smos: + list: + - sss_smos_l2 + qc config: + min: 0.1 + max: 40.0 WALLTIME_MARINE_DUMP: '00:10:00' TASK_GEOM_MARINE_DUMP: '1:ppn=20:tpp=2' diff --git a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py index bc231932..1debdd41 100644 --- a/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py +++ b/ush/python/pyobsforge/obsdb/nesdis_amsr2_db.py @@ -61,10 +61,10 @@ def parse_filename(self, filename): hemisphere = name_parts[2].lower() # Determine obs_type - if hemisphere == 'nh': - obs_type = 'icec_amsr2_north' - elif hemisphere == 'sh': - obs_type = 'icec_amsr2_south' + if hemisphere == "nh": + obs_type = "icec_amsr2_north" + elif hemisphere == "sh": + obs_type = "icec_amsr2_south" else: print(f"[DEBUG] Unrecognized hemisphere in filename: {filename}") return None diff --git a/ush/python/pyobsforge/obsdb/smap_db.py b/ush/python/pyobsforge/obsdb/smap_db.py index 611c9bc8..fbe115e4 100644 --- a/ush/python/pyobsforge/obsdb/smap_db.py +++ b/ush/python/pyobsforge/obsdb/smap_db.py @@ -34,7 +34,8 @@ def create_database(self): filename TEXT UNIQUE, obs_time TIMESTAMP, receipt_time TIMESTAMP, - satellite TEXT + satellite TEXT, + obs_type TEXT ) """ self.execute_query(query) @@ -51,11 +52,12 @@ def parse_filename(self, filename): try: satellite = "SMAP" + obs_type = "sss_smap_l2" timestamp_with_ext = parts[6] timestamp_str = os.path.splitext(timestamp_with_ext)[0] obs_time = datetime.strptime(timestamp_str, "%Y%m%dT%H%M%S") receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - return filename, obs_time, receipt_time, satellite + return filename, obs_time, receipt_time, satellite, obs_type except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") @@ -73,8 +75,8 @@ def ingest_files(self): parsed_data = self.parse_filename(file) if parsed_data: query = """ - INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) - VALUES (?, ?, ?, ?) + INSERT INTO obs_files (filename, obs_time, receipt_time, satellite, obs_type) + VALUES (?, ?, ?, ?, ?) """ try: self.insert_record(query, parsed_data) diff --git a/ush/python/pyobsforge/obsdb/smos_db.py b/ush/python/pyobsforge/obsdb/smos_db.py index dd4c3275..3ea89705 100644 --- a/ush/python/pyobsforge/obsdb/smos_db.py +++ b/ush/python/pyobsforge/obsdb/smos_db.py @@ -34,7 +34,8 @@ def create_database(self): filename TEXT UNIQUE, obs_time TIMESTAMP, receipt_time TIMESTAMP, - satellite TEXT + satellite TEXT, + obs_type TEXT ) """ self.execute_query(query) @@ -52,10 +53,11 @@ def parse_filename(self, filename): try: satellite = "SMOS" + obs_type = "sss_smos_l2" start_time_str = parts[4] obs_time = datetime.strptime(start_time_str, "%Y%m%dT%H%M%S") receipt_time = datetime.fromtimestamp(os.path.getctime(filename)) - return filename, obs_time, receipt_time, satellite + return filename, obs_time, receipt_time, satellite, obs_type except Exception as e: print(f"[DEBUG] Error parsing filename {filename}: {e}") @@ -73,8 +75,8 @@ def ingest_files(self): parsed_data = self.parse_filename(file) if parsed_data: query = """ - INSERT INTO obs_files (filename, obs_time, receipt_time, satellite) - VALUES (?, ?, ?, ?) + INSERT INTO obs_files (filename, obs_time, receipt_time, satellite, obs_type) + VALUES (?, ?, ?, ?, ?) """ try: self.insert_record(query, parsed_data) diff --git a/ush/python/pyobsforge/task/marine_prepobs.py b/ush/python/pyobsforge/task/marine_prepobs.py index 7bc68dff..10bcd2d9 100644 --- a/ush/python/pyobsforge/task/marine_prepobs.py +++ b/ush/python/pyobsforge/task/marine_prepobs.py @@ -38,6 +38,8 @@ def __init__(self, config: Dict[str, Any]) -> None: self.ghrsst = ProviderConfig.from_task_config("ghrsst", self.task_config) self.rads = ProviderConfig.from_task_config("rads", self.task_config) self.nesdis_amsr2 = ProviderConfig.from_task_config("nesdis_amsr2", self.task_config) + self.smap = ProviderConfig.from_task_config("smap", self.task_config) + self.smos = ProviderConfig.from_task_config("smos", self.task_config) # Initialize the list of processed ioda files # TODO: Does not work. This should be a list of gathered ioda files that are created @@ -52,6 +54,8 @@ def initialize(self) -> None: self.ghrsst.db.ingest_files() self.rads.db.ingest_files() self.nesdis_amsr2.db.ingest_files() + self.smap.db.ingest_files() + self.smos.db.ingest_files() @logit(logger) def execute(self) -> None: @@ -132,17 +136,10 @@ def process_obs_space(self, # Process NESDIS_AMSR2 if provider == "nesdis_amsr2": - parts = obs_space.split("_") - if obs_space.startswith("icec_amsr2_"): - platform = "GW1" - instrument = "AMSR2" - # obs_type = "SEAICE" - satellite = "GW1" - else: - platform = parts[1].upper() - instrument = "AMSR2" - # obs_type = "SEAICE" - satellite = "GW1" + # Only handling "icec_amsr2_" cases + platform = "GW1" + instrument = "AMSR2" + satellite = "GW1" kwargs = { 'provider': "amsr2", 'obs_space': obs_space, @@ -157,6 +154,46 @@ def process_obs_space(self, } result = self.nesdis_amsr2.process_obs_space(**kwargs) return result + + # Process SMAP + if provider == "smap": + platform = None + satellite = "SMAP" + instrument = None + kwargs = { + 'provider': provider, + 'obs_space': obs_space, + 'platform': platform, + 'instrument': instrument, + 'satellite': satellite, + 'obs_type': obs_space, + 'output_file': output_file, + 'window_begin': self.task_config.window_begin, + 'window_end': self.task_config.window_end, + 'task_config': self.task_config + } + result = self.smap.process_obs_space(**kwargs) + return result + + # Process SMOS SSS + if provider == "smos": + platform = None + satellite = "SMOS" + instrument = None + kwargs = { + 'provider': provider, + 'obs_space': obs_space, + 'platform': platform, + 'instrument': instrument, + 'satellite': satellite, + 'obs_type': obs_space, + 'output_file': output_file, + 'window_begin': self.task_config.window_begin, + 'window_end': self.task_config.window_end, + 'task_config': self.task_config + } + result = self.smos.process_obs_space(**kwargs) + return result else: logger.error(f"Provider {provider} not supported") @@ -178,26 +215,13 @@ def finalize(self) -> None: obs_types = ['sst', 'adt', 'icec', 'sss'] src_dst_obs_list = [] # list of [src_file, dst_file] for obs_type in obs_types: - if obs_type == 'icec': - # Special handling for icec - comout_tmp = join(comout, 'icec') - FileHandler({'mkdir': [comout_tmp]}).sync() - - # Find BOTH north and south files - ioda_files = [] - ioda_files += glob.glob(join(self.task_config['DATA'], - f"{self.task_config['PREFIX']}*icec_amsr2_north*.nc")) - ioda_files += glob.glob(join(self.task_config['DATA'], - f"{self.task_config['PREFIX']}*icec_amsr2_south*.nc")) - else: - # Standard handling for other obs types - # Create the destination directory - comout_tmp = join(comout, obs_type) - FileHandler({'mkdir': [comout_tmp]}).sync() - - # Glob the ioda files - ioda_files = glob.glob(join(self.task_config['DATA'], - f"{self.task_config['PREFIX']}*{obs_type}_*.nc")) + # Create the destination directory + comout_tmp = join(comout, obs_type) + FileHandler({'mkdir': [comout_tmp]}).sync() + + # Glob the ioda files + ioda_files = glob.glob(join(self.task_config['DATA'], + f"{self.task_config['PREFIX']}*{obs_type}_*.nc")) for ioda_file in ioda_files: logger.info(f"ioda_file: {ioda_file}") src_file = ioda_file diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 087a0850..ef8f9220 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -2,6 +2,8 @@ from pyobsforge.obsdb.ghrsst_db import GhrSstDatabase from pyobsforge.obsdb.rads_db import RADSDatabase from pyobsforge.obsdb.nesdis_amsr2_db import NesdisAmsr2Database +from pyobsforge.obsdb.smap_db import SmapDatabase +from pyobsforge.obsdb.smos_db import SmosDatabase from typing import Any from dataclasses import dataclass from wxflow import AttrDict @@ -62,6 +64,10 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide db = RADSDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wgrdbul/adt") elif provider_name == "nesdis_amsr2": db = NesdisAmsr2Database(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="seaice/pda") + elif provider_name == "smap": + db = SmapDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wtxtbul/satSSS/SMAP") + elif provider_name == "smos": + db = SmapDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wtxtbul/satSSS/SMOS") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") diff --git a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py index f16f9a75..ef93e971 100644 --- a/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py +++ b/ush/python/pyobsforge/tests/test_nesdis_amsr2_database.py @@ -165,7 +165,7 @@ def test_get_valid_files_receipt(db): instrument="AMSR2", satellite="GW1", obs_type="icec_amsr2_north", - check_receipt='gfs') + check_receipt="gfs") valid_files_south = db.get_valid_files(window_begin=window_begin, window_end=window_end, @@ -173,7 +173,7 @@ def test_get_valid_files_receipt(db): instrument="AMSR2", satellite="GW1", obs_type="icec_amsr2_south", - check_receipt='gfs') + check_receipt="gfs") valid_files = valid_files_north + valid_files_south diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py index 77abaee2..a96469dd 100644 --- a/ush/python/pyobsforge/tests/test_smap_database.py +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -94,6 +94,7 @@ def test_parse_valid_filename(db): assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) assert parsed[2] == creation_time assert parsed[3] == "SMAP" + assert parsed[4] == "sss_smap" def test_parse_invalid_filename(db): @@ -121,6 +122,7 @@ def test_get_valid_files(db): valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, + obs_type="sss_smap_l2", satellite="SMAP") print("Valid files in window:", valid_files) @@ -144,6 +146,7 @@ def test_get_valid_files_receipt(db): window_end=window_end, dst_dir=dst_dir, satellite="SMAP", + obs_type="sss_smap_l2", check_receipt='gfs') # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later diff --git a/ush/python/pyobsforge/tests/test_smos_database.py b/ush/python/pyobsforge/tests/test_smos_database.py index c72e74b9..20701e03 100644 --- a/ush/python/pyobsforge/tests/test_smos_database.py +++ b/ush/python/pyobsforge/tests/test_smos_database.py @@ -94,6 +94,7 @@ def test_parse_valid_filename(db): assert parsed[1] == datetime(2025, 3, 16, 6, 13, 18) # Start time assert parsed[2] == creation_time assert parsed[3] == "SMOS" + assert parsed[4] == "sss_smos_l2" def test_parse_invalid_filename(db): @@ -121,6 +122,7 @@ def test_get_valid_files(db): valid_files = db.get_valid_files(window_begin=window_begin, window_end=window_end, dst_dir=dst_dir, + obs_type="sss_smos_l2", satellite="SMOS") print("Valid files in window:", valid_files) @@ -144,6 +146,7 @@ def test_get_valid_files_receipt(db): window_end=window_end, dst_dir=dst_dir, satellite="SMOS", + obs_type="sss_smos_l2", check_receipt='gfs') # TODO (G): Giving up for now on trying to mock the receipt time, will revisit later From c8ceb9f6c88bbd442eaee5b484aea278b4cfe93d Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Tue, 29 Apr 2025 20:52:38 +0000 Subject: [PATCH 30/32] pytest --- ush/python/pyobsforge/task/providers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index ef8f9220..32899a80 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -67,7 +67,7 @@ def from_task_config(cls, provider_name: str, task_config: AttrDict) -> "Provide elif provider_name == "smap": db = SmapDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wtxtbul/satSSS/SMAP") elif provider_name == "smos": - db = SmapDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wtxtbul/satSSS/SMOS") + db = SmosDatabase(db_name=f"{provider_name}.db", dcom_dir=task_config.DCOMROOT, obs_dir="wtxtbul/satSSS/SMOS") else: raise NotImplementedError(f"DB setup for provider {provider_name} not yet implemented") From 539a129c7a469664bc4a5f531a45bfd4aedb7c0d Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 30 Apr 2025 14:56:26 +0000 Subject: [PATCH 31/32] fix pytest --- ush/python/pyobsforge/tests/test_smap_database.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ush/python/pyobsforge/tests/test_smap_database.py b/ush/python/pyobsforge/tests/test_smap_database.py index a96469dd..94bec114 100644 --- a/ush/python/pyobsforge/tests/test_smap_database.py +++ b/ush/python/pyobsforge/tests/test_smap_database.py @@ -94,7 +94,7 @@ def test_parse_valid_filename(db): assert parsed[1] == datetime(2025, 3, 16, 6, 50, 4) assert parsed[2] == creation_time assert parsed[3] == "SMAP" - assert parsed[4] == "sss_smap" + assert parsed[4] == "sss_smap_l2" def test_parse_invalid_filename(db): From 7f7c7fd4a58be721fff3be82fc0c8cbf9ed506b1 Mon Sep 17 00:00:00 2001 From: Mindo Choi Date: Wed, 30 Apr 2025 14:59:46 +0000 Subject: [PATCH 32/32] address copilot's review --- ush/python/pyobsforge/task/providers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/ush/python/pyobsforge/task/providers.py b/ush/python/pyobsforge/task/providers.py index 32899a80..a7c530cc 100644 --- a/ush/python/pyobsforge/task/providers.py +++ b/ush/python/pyobsforge/task/providers.py @@ -101,8 +101,6 @@ def process_obs_space(self, **kwargs) -> None: window_end = kwargs.get('window_end') task_config = kwargs.get('task_config') - logger.debug(f"obs_type for provider {provider}: {obs_type}") - # Query the database for valid files input_files = self.db.get_valid_files(window_begin=window_begin, window_end=window_end,