Skip to content

Commit

Permalink
Add Jasechko et al. 2024 WTD dataset subset (#162)
Browse files Browse the repository at this point in the history
* update tests to include additional sites caught in filter

* initial commit add jasechko dataset

* incorporate new dataset, add tests

* add data catalog entry

* add dataset description

* reconcile merge conflicts

* add catalog entry for new dataset

* update test to account for added dataset
amy-defnet authored Apr 19, 2024
1 parent 93cd9e0 commit 2bea890
Showing 10 changed files with 174 additions and 28 deletions.
19 changes: 19 additions & 0 deletions docs/dataset_text.yaml
Original file line number Diff line number Diff line change
@@ -162,3 +162,22 @@ datasets:
- Consolidating multiple concurrent data series
- The AmeriFlux data sometimes provides multiple concurrent observation series for the same variable for the same site. In these cases, we consolidate the multiple series into a single series following these prioritizations:
- We prioritize series without any suffix in the original variable names. If none is present, we prioritize variables with a "PI" suffix in the variable name, to indicate the data has been QA/QC reviewed by the tower team.
"jasechko_2024":
summary: >
A subset of the annual median water table depth measurements for sites included in `Jasechko et al. 2024 <https://www.nature.com/articles/s41586-023-06879-8>`_.
These sites are subset to only those within the `CONUS2 <https://hydroframe.org/parflow-conus2>`_ boundary, and only includes data that the authors were approved to publicly release.
processing_notes: >
Details on the initial data processing and collection are provided in `Jasechko et al. 2024 <https://www.nature.com/articles/s41586-023-06879-8>`_.
The data provided here was acquired from https://zenodo.org/records/10003697. Data was filtered to the CONUS2 domain and all sites
were also mapped to the Natural Earth state boundary shapefiles to include each site's state, where this mapping was unambiguous.
All sites were compared to existing USGS well records. If a site was within 0.001 degree latitude/longitude of an existing well, the
site is flagged as being a usgs_site.
Notes:
- The data included here is only a subset of the data in the original paper as not all data was made publicly available.
- The data included in this dataset are a static entry matching only what was in the publication. For up to date groundwater records refer to the USGS well dataset which is updated regularly with the most recent observations.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "hf_hydrodata"
version = "1.1.8"
version = "1.1.9"
description = "hydroframe tools and utilities"
authors = ["William M. Hasling", "Laura Condon", "Reed Maxwell", "George Artavanis", "Amy M. Johnson", "Amy C. Defnet"]
license = "MIT"
3 changes: 2 additions & 1 deletion src/hf_hydrodata/model/aggregation.csv
Original file line number Diff line number Diff line change
@@ -12,4 +12,5 @@ sod,Start of Day,Value and the start of the day
wy,Water Year,Aggregate by water year into files
site_id,SiteId,Aggregate by site_it into files
accumulated,Accumulated,Accumulated over time
sum_snow_adjusted,Snow Ajusted Total,Snow Adjusted Total
sum_snow_adjusted,Snow Ajusted Total,Snow Adjusted Total
median,Median,
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/data_catalog_entry.csv
Original file line number Diff line number Diff line change
@@ -399,3 +399,4 @@ id,dataset,file_type,variable,dataset_var,entry_start_date,entry_end_date,tempor
522,usgs_nwis,sql,water_table_depth,wtd,01/01/1801,,instantaneous,m,-,,,1,,,groundwater
523,conus1_domain,tiff,flow_direction,band_data,,,static,-,-,conus1,,1,/hydrodata/PFCLM/CONUS1_baseline/other_domain_files/flow_direction.tif,"The flow directions for every grid cell (1=down, 2=left, 3=up, 4=right)",
524,conus1_domain,pfb,flow_direction,band_data,,,static,-,-,conus1,,1,/hydrodata/PFCLM/CONUS1_baseline/other_domain_files/flow_direction.pfb,"The flow directions for every grid cell (1=down, 2=left, 3=up, 4=right)",
525,jasechko_2024,sql,water_table_depth,wtd,1801,2022,yearly,m,median,,,1,,,groundwater
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/dataset.csv
Original file line number Diff line number Diff line change
@@ -17,3 +17,4 @@ snotel,point_observations,Snow Telemetry,usda,,,,,point,UTC-00:00,,
scan,point_observations,Soil Climate Analysis Network,usda,,,,,point,UTC-00:00,,
ameriflux,point_observations,Ameriflux,ameriflux,,,,,point,UTC-00:00,,
noaa,forcing,NOAA Climate Data,noaa,,https://www.climate.gov/maps-data/dataset/daily-temperature-and-precipitation-reports-data-tables,6/1/22,,gridded,UTC-00:00,,
jasechko_2024,point_observations,"Jasechko et al. 2024 water table depth dataset, subset to the CONUS2 domain",jasechko_2024,10.1038/s41586-023-06879-8,https://zenodo.org/records/10003697,1801,2022,point,,,
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/datasource.csv
Original file line number Diff line number Diff line change
@@ -5,3 +5,4 @@ usda,United States Department of Agriculture
usgs,United State Geological Survey
ameriflux,Ameriflux
noaa,National Oceanic and Atospheric Administration
jasechko_2024,Jasechko et al. 2024
3 changes: 2 additions & 1 deletion src/hf_hydrodata/model/temporal_resolution.csv
Original file line number Diff line number Diff line change
@@ -7,4 +7,5 @@ static,static,Variables that do not change with time
monthly_clim,Monthly Climatology,Long term monthly mean values
annual_clim,Annual Climatology,Long term annual mean values
daily_of_week,Daily Over Week,Daily values aggregated over week
instantaneous,Instantaneous,Data values reported at a given point in time
instantaneous,Instantaneous,Data values reported at a given point in time
yearly,Yearly,Data values stored per calendar year
101 changes: 77 additions & 24 deletions src/hf_hydrodata/point.py
Original file line number Diff line number Diff line change
@@ -51,6 +51,7 @@
"well_attributes",
"snotel_station_attributes",
"flux_tower_attributes",
"jasechko_attributes",
]


@@ -67,14 +68,14 @@ def get_point_data(*args, **kwargs):
----------
dataset : str, required
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str, required
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Please see the documentation for allowable combinations with `variable`.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous', and
'yearly'. Please see the documentation for allowable combinations with `variable`.
aggregation : str, required
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -241,7 +242,7 @@ def get_point_data(*args, **kwargs):
if (var_id in (1, 2, 3, 4)) | (var_id in range(6, 25)):
data_df = _get_data_nc(site_list, var_id, *args, **kwargs)

elif var_id == 5:
elif var_id in (5, 25):
data_df = _get_data_sql(conn, site_list, var_id, *args, **kwargs)

conn.close()
@@ -257,13 +258,14 @@ def get_point_metadata(*args, **kwargs):
----------
dataset : str, required
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str, required
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
Please see the documentation for allowable combinations with `variable`.
aggregation : str, required
Additional information specifying the aggregation method for the variable to be returned.
@@ -404,7 +406,9 @@ class AS gagesii_class,
)
metadata_df = pd.merge(metadata_df, attributes_df, how="left", on="site_id")

if "groundwater well" in metadata_df["site_type"].unique():
if ("groundwater well" in metadata_df["site_type"].unique()) and (
options["dataset"] == "usgs_nwis"
):
attributes_df = pd.read_sql_query(
"""SELECT site_id, conus1_i, conus1_j, conus2_i, conus2_j,
nat_aqfr_cd AS usgs_nat_aqfr_cd,
@@ -440,6 +444,18 @@ class AS gagesii_class,
how="inner",
)

if ("groundwater well" in metadata_df["site_type"].unique()) and (
options["dataset"] == "jasechko_2024"
):
attributes_df = pd.read_sql_query(
"""SELECT site_id, conus1_i, conus1_j, conus2_i, conus2_j, usgs_site
FROM jasechko_attributes WHERE site_id IN (%s)"""
% ",".join("?" * len(site_ids)),
conn,
params=site_ids,
)
metadata_df = pd.merge(metadata_df, attributes_df, how="left", on="site_id")

if ("SNOTEL station" in metadata_df["site_type"].unique()) or (
"SCAN station" in metadata_df["site_type"].unique()
):
@@ -602,7 +618,13 @@ def get_site_variables(*args, **kwargs):
# Data source
if "dataset" in options and options["dataset"] is not None:
try:
assert options["dataset"] in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert options["dataset"] in [
"usgs_nwis",
"snotel",
"scan",
"ameriflux",
"jasechko_2024",
]
except:
raise ValueError(
f"dataset must be one of 'usgs_nwis', 'snotel', 'scan', 'ameriflux', 'jasechko_2024'. You provided {options['dataset']}"
@@ -614,6 +636,9 @@ def get_site_variables(*args, **kwargs):
elif options["dataset"] == "ameriflux":
dataset_query = """ AND agency == ?"""
param_list.append("AmeriFlux")
elif options["dataset"] == "jasechko_2024":
dataset_query = """ AND agency == ?"""
param_list.append("Jasechko_et_al_2024")
elif options["dataset"] == "snotel":
dataset_query = """ AND site_type == ?"""
param_list.append("SNOTEL station")
@@ -1049,18 +1074,18 @@ def _get_point_citations(dataset):
----------
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
Returns
-------
str
String containing overall attribution instructions for the provided dataset.
"""
try:
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux", "jasechko_2024"]
except:
raise ValueError(
f"Unexpected value of dataset, {dataset}. Supported values include 'usgs_nwis', 'snotel', 'scan', and 'ameriflux'"
f"Unexpected value of dataset, {dataset}. Supported values include 'usgs_nwis', 'snotel', 'scan', 'ameriflux', and 'jasechko_2024'"
)

if dataset == "usgs_nwis":
@@ -1094,6 +1119,9 @@ def _get_point_citations(dataset):
"Source: https://ameriflux.lbl.gov/data/data-policy/"
)

elif dataset == "jasechko_2024":
c = "Dataset DOI: 10.1038/s41586-023-06879-8"

return c


@@ -1151,13 +1179,14 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
----------
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -1182,7 +1211,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
options = kwargs

try:
assert temporal_resolution in ["daily", "hourly", "instantaneous"]
assert temporal_resolution in ["daily", "hourly", "instantaneous", "yearly"]
except:
raise ValueError(
f"Unexpected value for temporal_resolution, {temporal_resolution}. Please see the documentation for allowed values."
@@ -1211,6 +1240,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
try:
assert aggregation in [
"mean",
"median",
"instantaneous",
"-",
"sum",
@@ -1226,7 +1256,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
)

try:
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux", "jasechko_2024"]
except:
raise ValueError(
f"Unexpected value for dataset, {dataset} Please see the documentation for allowed values."
@@ -1254,13 +1284,14 @@ def _get_var_id(
The Connection object associated with the SQLite database to query from.
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous', and
'yearly'.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -1392,13 +1423,14 @@ def _get_sites(
query from.
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
Please see the documentation for allowable combinations with `variable`.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
@@ -1520,6 +1552,8 @@ def _get_sites(
tbl = "snotel_station_attributes"
elif dataset == "ameriflux":
tbl = "flux_tower_attributes"
elif dataset == "jasechko_2024":
tbl = "jasechko_attributes"

grid = options["grid"]
grid_bounds = options["grid_bounds"]
@@ -2029,7 +2063,14 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):
Stacked observations data for a single variable, filtered to only sites that
have the minimum number of observations specified.
"""
assert var_id == 5
assert var_id in (5, 25)
if var_id == 5:
tbl_name = "wtd_discrete_data"
var_names = "w.wtd, w.pumping_status"

elif var_id == 25:
tbl_name = "jasechko_wtd_data"
var_names = "w.wtd"

if len(args) > 0 and isinstance(args[0], dict):
options = args[0]
@@ -2045,6 +2086,18 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):
else:
min_num_obs = options["min_num_obs"]

# This is a yearly variable. For date filtering to work properly, only consider the
# year provided in the input arguments.
if var_id == 25:
if "date_start" in options and options["date_start"] is not None:
options["date_start"] = datetime.datetime.strptime(
options["date_start"], "%Y-%m-%d"
).year
if "date_end" in options and options["date_end"] is not None:
options["date_end"] = datetime.datetime.strptime(
options["date_end"], "%Y-%m-%d"
).year

if ("date_start" not in options) and ("date_end" not in options):
date_query = """"""
param_list = [min_num_obs]
@@ -2071,11 +2124,11 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):

# Filter on all spatial observations for the desired time range (if any)
query = (
"""
SELECT w.site_id, w.date, w.wtd, w.pumping_status
FROM wtd_discrete_data AS w
f"""
SELECT w.site_id, w.date, {var_names}
FROM {tbl_name} AS w
INNER JOIN (SELECT w.site_id, COUNT(*) AS num_obs
FROM wtd_discrete_data AS w
FROM {tbl_name} AS w
"""
+ date_query
+ """
3 changes: 2 additions & 1 deletion tests/hf_hydrodata/test_gridded.py
Original file line number Diff line number Diff line change
@@ -314,6 +314,7 @@ def test_files_exist():
"253",
"254",
"522",
"525",
]:
paths = gr.get_file_paths(
row,
@@ -1217,7 +1218,7 @@ def test_get_datasets():
"""Test get_datasets."""

datasets = hf.get_datasets()
assert len(datasets) == 18
assert len(datasets) == 19
assert datasets[0] == "CW3E"

datasets = hf.get_datasets(variable="air_temp")
68 changes: 68 additions & 0 deletions tests/hf_hydrodata/test_point.py
Original file line number Diff line number Diff line change
@@ -1657,5 +1657,73 @@ def test_fail_no_sites_get_site_variables():
assert str(exc.value) == "There are no sites within the provided grid_bounds."


def test_get_data_jasechko():
    """Test getting data for Jasechko dataset."""
    common_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "temporal_resolution": "yearly",
        "aggregation": "median",
        "site_ids": ["1000000106"],
    }

    # Full record for the site, no date filtering applied
    full_df = point.get_point_data(**common_kwargs)
    assert len(full_df) == 30

    # Same site restricted to an explicit date range
    ranged_df = point.get_point_data(
        date_start="2000-01-01",
        date_end="2002-01-01",
        **common_kwargs,
    )
    assert len(ranged_df) == 2


def test_get_metadata_jasechko():
    """Test getting metadata for Jasechko dataset."""
    base_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "temporal_resolution": "yearly",
        "aggregation": "median",
    }

    # Metadata for one explicitly requested site
    single_site_df = point.get_point_metadata(site_ids=["1000000106"], **base_kwargs)
    assert len(single_site_df) == 1
    assert "1000000106" in list(single_site_df["site_id"])

    # Metadata filtered by date range and state; jasechko attributes merged in
    co_sites_df = point.get_point_metadata(
        date_start="2021-01-01",
        date_end="2022-12-31",
        state="CO",
        **base_kwargs,
    )
    assert len(co_sites_df) == 281
    assert "usgs_site" in co_sites_df.columns


def test_get_citations_jasechko():
    """Test citations for jasechko_2024 dataset."""
    citation = point._get_point_citations("jasechko_2024")
    # The jasechko_2024 attribution is simply the dataset DOI
    assert citation == "Dataset DOI: 10.1038/s41586-023-06879-8"


def test_get_site_variables_jasechko():
    """Test for get_site_variables with jasechko dataset."""
    query_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "date_start": "2021-01-01",
        "date_end": "2022-12-31",
        "state": "CO",
    }
    # Expect one row per Colorado site/variable combination in range
    result_df = point.get_site_variables(**query_kwargs)
    assert len(result_df) == 281


# Allow running this test module directly (outside an explicit pytest invocation).
if __name__ == "__main__":
    pytest.main()

0 comments on commit 2bea890

Please sign in to comment.