Skip to content

Commit

Permalink
Add Jasechko et al. 2024 WTD dataset subset (#162)
Browse files Browse the repository at this point in the history
* update tests to include additional sites caught in filter

* initial commit add jasechko dataset

* incorporate new dataset, add tests

* add data catalog entry

* add dataset description

* reconcile merge conflicts

* add catalog entry for new dataset

* update test to account for added dataset
amy-defnet authored Apr 19, 2024
1 parent 93cd9e0 commit 2bea890
Showing 10 changed files with 174 additions and 28 deletions.
19 changes: 19 additions & 0 deletions docs/dataset_text.yaml
Original file line number Diff line number Diff line change
@@ -162,3 +162,22 @@ datasets:
- Consolidating multiple concurrent data series
- The AmeriFlux data sometimes provides multiple concurrent observation series for the same variable for the same site. In these cases, we consolidate the multiple series into a single series following these prioritizations:
- We prioritize series without any suffix in the original variable names. If none is present, we prioritize variables with a "PI" suffix in the variable name, to indicate the data has been QA/QC reviewed by the tower team.
"jasechko_2024":
summary: >
A subset of the annual median water table depth measurements for sites included in `Jasechko et al. 2024 <https://www.nature.com/articles/s41586-023-06879-8>`_.
These sites are subset to only those within the `CONUS2 <https://hydroframe.org/parflow-conus2>`_ boundary, and only includes data that the authors were approved to publicly release.
processing_notes: >
Details on the initial data processing and collection are provided in `Jasechko et al. 2024 <https://www.nature.com/articles/s41586-023-06879-8>`_.
The data provided here was acquired from https://zenodo.org/records/10003697. Data was filtered to the CONUS2 domain and all sites
were also mapped to the Natural Earth state boundary shapefiles to include each site's state, where this mapping was unambiguous.
All sites were compared to existing USGS well records. If a site was within 0.001 degree latitude/longitude of an existing well, the
site is flagged as being a usgs_site.
Notes:
- The data included here is only a subset of the data in the original paper as not all data was made publicly available.
- The data included in this dataset are a static entry matching only what was in the publication. For up to date groundwater records refer to the USGS well dataset which is updated regularly with the most recent observations.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "hf_hydrodata"
version = "1.1.8"
version = "1.1.9"
description = "hydroframe tools and utilities"
authors = ["William M. Hasling", "Laura Condon", "Reed Maxwell", "George Artavanis", "Amy M. Johnson", "Amy C. Defnet"]
license = "MIT"
3 changes: 2 additions & 1 deletion src/hf_hydrodata/model/aggregation.csv
Original file line number Diff line number Diff line change
@@ -12,4 +12,5 @@ sod,Start of Day,Value and the start of the day
wy,Water Year,Aggregate by water year into files
site_id,SiteId,Aggregate by site_it into files
accumulated,Accumulated,Accumulated over time
sum_snow_adjusted,Snow Ajusted Total,Snow Adjusted Total
sum_snow_adjusted,Snow Ajusted Total,Snow Adjusted Total
median,Median,
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/data_catalog_entry.csv
Original file line number Diff line number Diff line change
@@ -399,3 +399,4 @@ id,dataset,file_type,variable,dataset_var,entry_start_date,entry_end_date,tempor
522,usgs_nwis,sql,water_table_depth,wtd,01/01/1801,,instantaneous,m,-,,,1,,,groundwater
523,conus1_domain,tiff,flow_direction,band_data,,,static,-,-,conus1,,1,/hydrodata/PFCLM/CONUS1_baseline/other_domain_files/flow_direction.tif,"The flow directions for every grid cell (1=down, 2=left, 3=up, 4=right)",
524,conus1_domain,pfb,flow_direction,band_data,,,static,-,-,conus1,,1,/hydrodata/PFCLM/CONUS1_baseline/other_domain_files/flow_direction.pfb,"The flow directions for every grid cell (1=down, 2=left, 3=up, 4=right)",
525,jasechko_2024,sql,water_table_depth,wtd,1801,2022,yearly,m,median,,,1,,,groundwater
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/dataset.csv
Original file line number Diff line number Diff line change
@@ -17,3 +17,4 @@ snotel,point_observations,Snow Telemetry,usda,,,,,point,UTC-00:00,,
scan,point_observations,Soil Climate Analysis Network,usda,,,,,point,UTC-00:00,,
ameriflux,point_observations,Ameriflux,ameriflux,,,,,point,UTC-00:00,,
noaa,forcing,NOAA Climate Data,noaa,,https://www.climate.gov/maps-data/dataset/daily-temperature-and-precipitation-reports-data-tables,6/1/22,,gridded,UTC-00:00,,
jasechko_2024,point_observations,"Jasechko et al. 2024 water table depth dataset, subset to the CONUS2 domain",jasechko_2024,10.1038/s41586-023-06879-8,https://zenodo.org/records/10003697,1801,2022,point,,,
1 change: 1 addition & 0 deletions src/hf_hydrodata/model/datasource.csv
Original file line number Diff line number Diff line change
@@ -5,3 +5,4 @@ usda,United States Department of Agriculture
usgs,United State Geological Survey
ameriflux,Ameriflux
noaa,National Oceanic and Atospheric Administration
jasechko_2024,Jasechko et al. 2024
3 changes: 2 additions & 1 deletion src/hf_hydrodata/model/temporal_resolution.csv
Original file line number Diff line number Diff line change
@@ -7,4 +7,5 @@ static,static,Variables that do not change with time
monthly_clim,Monthly Climatology,Long term monthly mean values
annual_clim,Annual Climatology,Long term annual mean values
daily_of_week,Daily Over Week,Daily values aggregated over week
instantaneous,Instantaneous,Data values reported at a given point in time
instantaneous,Instantaneous,Data values reported at a given point in time
yearly,Yearly,Data values stored per calendar year
101 changes: 77 additions & 24 deletions src/hf_hydrodata/point.py
Original file line number Diff line number Diff line change
@@ -51,6 +51,7 @@
"well_attributes",
"snotel_station_attributes",
"flux_tower_attributes",
"jasechko_attributes",
]


@@ -67,14 +68,14 @@ def get_point_data(*args, **kwargs):
----------
dataset : str, required
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str, required
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Please see the documentation for allowable combinations with `variable`.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous', and
'yearly'. Please see the documentation for allowable combinations with `variable`.
aggregation : str, required
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -241,7 +242,7 @@ def get_point_data(*args, **kwargs):
if (var_id in (1, 2, 3, 4)) | (var_id in range(6, 25)):
data_df = _get_data_nc(site_list, var_id, *args, **kwargs)

elif var_id == 5:
elif var_id in (5, 25):
data_df = _get_data_sql(conn, site_list, var_id, *args, **kwargs)

conn.close()
@@ -257,13 +258,14 @@ def get_point_metadata(*args, **kwargs):
----------
dataset : str, required
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str, required
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
Please see the documentation for allowable combinations with `variable`.
aggregation : str, required
Additional information specifying the aggregation method for the variable to be returned.
@@ -404,7 +406,9 @@ class AS gagesii_class,
)
metadata_df = pd.merge(metadata_df, attributes_df, how="left", on="site_id")

if "groundwater well" in metadata_df["site_type"].unique():
if ("groundwater well" in metadata_df["site_type"].unique()) and (
options["dataset"] == "usgs_nwis"
):
attributes_df = pd.read_sql_query(
"""SELECT site_id, conus1_i, conus1_j, conus2_i, conus2_j,
nat_aqfr_cd AS usgs_nat_aqfr_cd,
@@ -440,6 +444,18 @@ class AS gagesii_class,
how="inner",
)

if ("groundwater well" in metadata_df["site_type"].unique()) and (
options["dataset"] == "jasechko_2024"
):
attributes_df = pd.read_sql_query(
"""SELECT site_id, conus1_i, conus1_j, conus2_i, conus2_j, usgs_site
FROM jasechko_attributes WHERE site_id IN (%s)"""
% ",".join("?" * len(site_ids)),
conn,
params=site_ids,
)
metadata_df = pd.merge(metadata_df, attributes_df, how="left", on="site_id")

if ("SNOTEL station" in metadata_df["site_type"].unique()) or (
"SCAN station" in metadata_df["site_type"].unique()
):
@@ -602,7 +618,13 @@ def get_site_variables(*args, **kwargs):
# Data source
if "dataset" in options and options["dataset"] is not None:
try:
assert options["dataset"] in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert options["dataset"] in [
"usgs_nwis",
"snotel",
"scan",
"ameriflux",
"jasechko_2024",
]
except:
raise ValueError(
f"dataset must be one of 'usgs_nwis', 'snotel', 'scan', 'ameriflux', 'jasechko_2024'. You provided {options['dataset']}"
@@ -614,6 +636,9 @@ def get_site_variables(*args, **kwargs):
elif options["dataset"] == "ameriflux":
dataset_query = """ AND agency == ?"""
param_list.append("AmeriFlux")
elif options["dataset"] == "jasechko_2024":
dataset_query = """ AND agency == ?"""
param_list.append("Jasechko_et_al_2024")
elif options["dataset"] == "snotel":
dataset_query = """ AND site_type == ?"""
param_list.append("SNOTEL station")
@@ -1049,18 +1074,18 @@ def _get_point_citations(dataset):
----------
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
Returns
-------
str
String containing overall attribution instructions for the provided dataset.
"""
try:
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux", "jasechko_2024"]
except:
raise ValueError(
f"Unexpected value of dataset, {dataset}. Supported values include 'usgs_nwis', 'snotel', 'scan', and 'ameriflux'"
f"Unexpected value of dataset, {dataset}. Supported values include 'usgs_nwis', 'snotel', 'scan', 'ameriflux', and 'jasechko_2024'"
)

if dataset == "usgs_nwis":
@@ -1094,6 +1119,9 @@ def _get_point_citations(dataset):
"Source: https://ameriflux.lbl.gov/data/data-policy/"
)

elif dataset == "jasechko_2024":
c = "Dataset DOI: 10.1038/s41586-023-06879-8"

return c


@@ -1151,13 +1179,14 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
----------
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -1182,7 +1211,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
options = kwargs

try:
assert temporal_resolution in ["daily", "hourly", "instantaneous"]
assert temporal_resolution in ["daily", "hourly", "instantaneous", "yearly"]
except:
raise ValueError(
f"Unexpected value for temporal_resolution, {temporal_resolution}. Please see the documentation for allowed values."
@@ -1211,6 +1240,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
try:
assert aggregation in [
"mean",
"median",
"instantaneous",
"-",
"sum",
@@ -1226,7 +1256,7 @@ def _check_inputs(dataset, variable, temporal_resolution, aggregation, *args, **
)

try:
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux"]
assert dataset in ["usgs_nwis", "snotel", "scan", "ameriflux", "jasechko_2024"]
except:
raise ValueError(
f"Unexpected value for dataset, {dataset} Please see the documentation for allowed values."
@@ -1254,13 +1284,14 @@ def _get_var_id(
The Connection object associated with the SQLite database to query from.
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous', and
'yearly'.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
Options include descriptors such as 'mean' and 'sum'. Please see the documentation
@@ -1392,13 +1423,14 @@ def _get_sites(
query from.
dataset : str
Source from which requested data originated. Currently supported: 'usgs_nwis', 'snotel',
'scan', 'ameriflux'.
'scan', 'ameriflux', 'jasechko_2024'.
variable : str, required
Description of type of data requested. Currently supported: 'streamflow', 'water_table_depth', 'swe',
'precipitation', 'air_temp', 'soil_moisture', 'latent_heat', 'sensible_heat',
'downward_shortwave', 'downward_longwave', 'vapor_pressure_deficit', 'wind_speed'.
temporal_resolution : str
Collection frequency of data requested. Currently supported: 'daily', 'hourly', and 'instantaneous'.
Collection frequency of data requested. Currently supported: 'daily', 'hourly', 'instantaneous',
and 'yearly'.
Please see the documentation for allowable combinations with `variable`.
aggregation : str
Additional information specifying the aggregation method for the variable to be returned.
@@ -1520,6 +1552,8 @@ def _get_sites(
tbl = "snotel_station_attributes"
elif dataset == "ameriflux":
tbl = "flux_tower_attributes"
elif dataset == "jasechko_2024":
tbl = "jasechko_attributes"

grid = options["grid"]
grid_bounds = options["grid_bounds"]
@@ -2029,7 +2063,14 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):
Stacked observations data for a single variable, filtered to only sites that
have the minimum number of observations specified.
"""
assert var_id == 5
assert var_id in (5, 25)
if var_id == 5:
tbl_name = "wtd_discrete_data"
var_names = "w.wtd, w.pumping_status"

elif var_id == 25:
tbl_name = "jasechko_wtd_data"
var_names = "w.wtd"

if len(args) > 0 and isinstance(args[0], dict):
options = args[0]
@@ -2045,6 +2086,18 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):
else:
min_num_obs = options["min_num_obs"]

# This is a yearly variable. For date filtering to work properly, only consider the
# year provided in the input arguments.
if var_id == 25:
if "date_start" in options and options["date_start"] is not None:
options["date_start"] = datetime.datetime.strptime(
options["date_start"], "%Y-%m-%d"
).year
if "date_end" in options and options["date_end"] is not None:
options["date_end"] = datetime.datetime.strptime(
options["date_end"], "%Y-%m-%d"
).year

if ("date_start" not in options) and ("date_end" not in options):
date_query = """"""
param_list = [min_num_obs]
@@ -2071,11 +2124,11 @@ def _get_data_sql(conn, site_list, var_id, *args, **kwargs):

# Filter on all spatial observations for the desired time range (if any)
query = (
"""
SELECT w.site_id, w.date, w.wtd, w.pumping_status
FROM wtd_discrete_data AS w
f"""
SELECT w.site_id, w.date, {var_names}
FROM {tbl_name} AS w
INNER JOIN (SELECT w.site_id, COUNT(*) AS num_obs
FROM wtd_discrete_data AS w
FROM {tbl_name} AS w
"""
+ date_query
+ """
3 changes: 2 additions & 1 deletion tests/hf_hydrodata/test_gridded.py
Original file line number Diff line number Diff line change
@@ -314,6 +314,7 @@ def test_files_exist():
"253",
"254",
"522",
"525",
]:
paths = gr.get_file_paths(
row,
@@ -1217,7 +1218,7 @@ def test_get_datasets():
"""Test get_datasets."""

datasets = hf.get_datasets()
assert len(datasets) == 18
assert len(datasets) == 19
assert datasets[0] == "CW3E"

datasets = hf.get_datasets(variable="air_temp")
68 changes: 68 additions & 0 deletions tests/hf_hydrodata/test_point.py
Original file line number Diff line number Diff line change
@@ -1657,5 +1657,73 @@ def test_fail_no_sites_get_site_variables():
assert str(exc.value) == "There are no sites within the provided grid_bounds."


def test_get_data_jasechko():
    """Test getting data for Jasechko dataset."""
    common_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "temporal_resolution": "yearly",
        "aggregation": "median",
        "site_ids": ["1000000106"],
    }

    # Full record for the site, no date filtering applied
    full_df = point.get_point_data(**common_kwargs)
    assert len(full_df) == 30

    # Same site restricted to an explicit date range
    ranged_df = point.get_point_data(
        date_start="2000-01-01",
        date_end="2002-01-01",
        **common_kwargs,
    )
    assert len(ranged_df) == 2


def test_get_metadata_jasechko():
    """Test getting metadata for Jasechko dataset."""
    base_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "temporal_resolution": "yearly",
        "aggregation": "median",
    }

    # Metadata for one explicitly requested site
    single_site_df = point.get_point_metadata(site_ids=["1000000106"], **base_kwargs)
    assert len(single_site_df) == 1
    assert "1000000106" in list(single_site_df["site_id"])

    # Metadata filtered by date range and state; jasechko attributes merged in
    co_sites_df = point.get_point_metadata(
        date_start="2021-01-01",
        date_end="2022-12-31",
        state="CO",
        **base_kwargs,
    )
    assert len(co_sites_df) == 281
    assert "usgs_site" in co_sites_df.columns


def test_get_citations_jasechko():
    """Test citations for jasechko_2024 dataset."""
    citation = point._get_point_citations("jasechko_2024")
    # The jasechko_2024 attribution is simply the dataset DOI
    assert citation == "Dataset DOI: 10.1038/s41586-023-06879-8"


def test_get_site_variables_jasechko():
    """Test for get_site_variables with jasechko dataset."""
    query_kwargs = {
        "dataset": "jasechko_2024",
        "variable": "water_table_depth",
        "date_start": "2021-01-01",
        "date_end": "2022-12-31",
        "state": "CO",
    }
    # Expect one row per Colorado site/variable combination in range
    result_df = point.get_site_variables(**query_kwargs)
    assert len(result_df) == 281


# Allow running this test module directly (outside an explicit pytest invocation).
if __name__ == "__main__":
    pytest.main()

0 comments on commit 2bea890

Please sign in to comment.