diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 0cec06fb3f..bb425bb602 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -76,3 +76,4 @@ cdf40d265cc82775607a1bf25f5f527bacc97405 ac03492012837799b7111607188acff9f739044a d858665d799690d73b56bcb961684382551193f4 c0c6da391ee359f2765439426f3a2a4593a95343 +598de2f05638286b3d99ac0ed120977cbc554c3d diff --git a/cime_config/testdefs/ExpectedTestFails.xml b/cime_config/testdefs/ExpectedTestFails.xml index d19bcf05a6..b00f3e720e 100644 --- a/cime_config/testdefs/ExpectedTestFails.xml +++ b/cime_config/testdefs/ExpectedTestFails.xml @@ -36,6 +36,27 @@ + + + FAIL + #3740 + + + + + + FAIL + #3740 + + + + + + FAIL + #3740 + + + FAIL diff --git a/python/ctsm/crop_calendars/generate_gdds.py b/python/ctsm/crop_calendars/generate_gdds.py index 7af82f9fa1..196d7a96da 100644 --- a/python/ctsm/crop_calendars/generate_gdds.py +++ b/python/ctsm/crop_calendars/generate_gdds.py @@ -21,6 +21,9 @@ from ctsm.ctsm_logging import log, error # pylint: disable=wrong-import-position import ctsm.crop_calendars.cropcal_module as cc # pylint: disable=wrong-import-position import ctsm.crop_calendars.generate_gdds_functions as gddfn # pylint: disable=wrong-import-position +from ctsm.crop_calendars.import_ds import ( # pylint: disable=wrong-import-position + get_files_in_time_slice, # pylint: disable=wrong-import-position +) # pylint: disable=wrong-import-position # Functions here were written with too many positional arguments. At some point that should be # fixed. For now, we'll just disable the warning. @@ -42,6 +45,72 @@ def _get_max_growing_season_lengths(max_season_length_from_hdates_file, paramfil return mxmats +def _get_history_yr_range(first_season, last_season): + """ + Get a range object that can be used for looping over all years we need to process timestamps + from. 
+ """ + # Data saved at the end of a year receive the timestamp of the END of the year's final timestep, + # which means it will actually be 00:00 of Jan. 1 of the next year. + first_history_yr = first_season + 1 + + # Same deal for the last history timestep, but we have to read an extra year in that case, + # because in some places the last growing season won't complete until the year after it was + # planted. + last_history_yr = last_season + 2 + + # last_history_yr + 1 because range() will iterate up to but not including the second value. + history_yr_range = range(first_history_yr, last_history_yr + 1) + + return history_yr_range + + +def _get_time_slice_list(first_season, last_season): + """ + Given the requested first and last seasons, get the list of time slices that the script should + look for. The assumptions here, as in import_and_process_1yr and as instructed in the docs, are + that the user (a) is saving instantaneous annual files and (b) started on Jan. 1. + """ + + # Input checks + if not all(isinstance(i, int) for i in [first_season, last_season]): + raise TypeError("_get_time_slice_list() arguments must be integers") + if first_season > last_season: + raise ValueError(f"first_season ({first_season}) > last_season ({last_season})") + + slice_list = [] + for history_yr in _get_history_yr_range(first_season, last_season): + slice_start = f"{history_yr}-01-01" + # Stop could probably be the same as start, since there should just be one value saved per + # year and that should get the Jan. 1 timestamp. + slice_stop = f"{history_yr}-12-31" + slice_list.append(slice(slice_start, slice_stop)) + + # We should be reading one more than the total number of years in [first_season, last_season]. + assert len(slice_list) == last_season - first_season + 2 + + return slice_list + + +def _get_file_lists(input_dir, time_slice_list, logger): + """ + For each time slice in a list, find the file(s) that need to be read to get all history + timesteps in the slice. 
Returns both h1i and h2i file lists. + """ + output_file_lists_list = [None, None] + for i, h in enumerate([1, 2]): + all_h_files = gddfn.find_inst_hist_files(input_dir, h=h, logger=logger) + h_file_lists = [] + for time_slice in time_slice_list: + try: + h_file_lists.append(get_files_in_time_slice(all_h_files, time_slice, logger=logger)) + except FileNotFoundError as e: + raise FileNotFoundError(f"No h{h} timesteps found in {time_slice}") from e + output_file_lists_list[i] = h_file_lists + h1_file_lists, h2_file_lists = tuple(output_file_lists_list) + return h1_file_lists, h2_file_lists + + def main( *, input_dir=None, @@ -126,6 +195,9 @@ def main( + "(years are +1 because of CTSM output naming)", ) + # This script uses pickle to save work in progress. In case of interruption, when the script + # is resumed, it will look for a pickle file. It will resume from the year after + # pickle_year, which is the last processed year in the pickle file. pickle_file = os.path.join(output_dir, f"{first_season}-{last_season}.pickle") h2_ds_file = os.path.join(output_dir, f"{first_season}-{last_season}.h2_ds.nc") if os.path.exists(pickle_file) and not no_pickle: @@ -162,10 +234,20 @@ def main( max_season_length_from_hdates_file, paramfile, max_season_length_cushion ) - h1_instantaneous = None - for yr_index, this_yr in enumerate(np.arange(first_season + 1, last_season + 3)): + # Get lists of history timesteps and files to read + time_slice_list = _get_time_slice_list(first_season, last_season) + h1_file_lists, h2_file_lists = _get_file_lists(input_dir, time_slice_list, logger) + + for yr_index, this_yr in enumerate(_get_history_yr_range(first_season, last_season)): + # If resuming from a pickled file, we continue until we reach a year that hasn't yet + # been processed. 
if this_yr <= pickle_year: continue + log(logger, f"netCDF year {this_yr}...") + + # Get h1 and h2 files to read for this year + h1_file_list = h1_file_lists[yr_index] # pylint: disable=unsubscriptable-object + h2_file_list = h2_file_lists[yr_index] # pylint: disable=unsubscriptable-object ( h2_ds, @@ -179,12 +261,10 @@ def main( incl_vegtypes_str, incl_patches1d_itype_veg, mxsowings, - h1_instantaneous, ) = gddfn.import_and_process_1yr( first_season, last_season, yr_index, - this_yr, sdates_rx, hdates_rx, gddaccum_yp_list, @@ -192,7 +272,6 @@ def main( skip_patches_for_isel_nan_lastyear, lastyear_active_patch_indices_list, incorrectly_daily, - input_dir, incl_vegtypes_str, h2_ds_file, mxmats, @@ -200,7 +279,8 @@ def main( skip_crops, outdir_figs, logger, - h1_instantaneous, + h1_file_list, + h2_file_list, ) log(logger, f" Saving pickle file ({pickle_file})...") diff --git a/python/ctsm/crop_calendars/generate_gdds_functions.py b/python/ctsm/crop_calendars/generate_gdds_functions.py index 4f6dd6b966..ed271bdd8d 100644 --- a/python/ctsm/crop_calendars/generate_gdds_functions.py +++ b/python/ctsm/crop_calendars/generate_gdds_functions.py @@ -10,7 +10,6 @@ import numpy as np import xarray as xr -from ctsm.utils import is_instantaneous from ctsm.ctsm_logging import log, error import ctsm.crop_calendars.cropcal_utils as utils import ctsm.crop_calendars.cropcal_module as cc @@ -290,7 +289,6 @@ def import_and_process_1yr( year_1, year_n, year_index, - this_year, sdates_rx, hdates_rx, gddaccum_yp_list, @@ -298,7 +296,6 @@ def import_and_process_1yr( skip_patches_for_isel_nan_last_year, last_year_active_patch_indices_list, incorrectly_daily, - indir, incl_vegtypes_str_in, h2_ds_file, mxmats, @@ -306,13 +303,13 @@ def import_and_process_1yr( skip_crops, outdir_figs, logger, - h1_instantaneous, + h1_filelist, + h2_filelist, ): """ Import one year of CLM output data for GDD generation """ save_figs = True - log(logger, f"netCDF year {this_year}...") # Without dask, this can 
take a LONG time at resolutions finer than 2-deg if importlib_util.find_spec("dask"): @@ -320,34 +317,17 @@ def import_and_process_1yr( else: chunks = None - # Get h1 file (list) - h1_pattern = os.path.join(indir, "*h1i.*.nc") - h1_filelist = glob.glob(h1_pattern) - if not h1_filelist: - h1_pattern = os.path.join(indir, "*h1i.*.nc.base") - h1_filelist = glob.glob(h1_pattern) - if not h1_filelist: - error(logger, "No files found matching pattern '*h1i.*.nc(.base)'") - # Get list of crops to include if skip_crops is not None: crops_to_read = [c for c in utils.define_mgdcrop_list_withgrasses() if c not in skip_crops] else: crops_to_read = utils.define_mgdcrop_list_withgrasses() - # Are h1 files instantaneous? - if h1_instantaneous is None: - h1_instantaneous = is_instantaneous(xr.open_dataset(h1_filelist[0])["time"]) - - if h1_instantaneous: - slice_year = this_year - else: - slice_year = this_year - 1 + # Read h1 file(s) dates_ds = import_ds( h1_filelist, my_vars=["SDATES", "HDATES"], my_vegtypes=crops_to_read, - time_slice=slice(f"{slice_year}-01-01", f"{slice_year}-12-31"), chunks=chunks, logger=logger, ) @@ -631,16 +611,8 @@ def import_and_process_1yr( log(logger, " Importing accumulated GDDs...") clm_gdd_var = "GDDACCUM" my_vars = [clm_gdd_var, "GDDHARV"] - patterns = [f"*h2i.{this_year-1}-01*.nc", f"*h2i.{this_year-1}-01*.nc.base"] - for pat in patterns: - pattern = os.path.join(indir, pat) - h2_files = glob.glob(pattern) - if h2_files: - break - if not h2_files: - error(logger, f"No files found matching patterns: {patterns}") h2_ds = import_ds( - h2_files, + h2_filelist, my_vars=my_vars, my_vegtypes=crops_to_read, chunks=chunks, @@ -892,10 +864,80 @@ def import_and_process_1yr( incl_vegtypes_str, incl_patches1d_itype_veg, mxsowings, - h1_instantaneous, ) +def find_inst_hist_files(indir, *, h, this_year=None, logger=None): + """ + Find all the instantaneous history files for a given tape number, optionally looking just for + one year in filename. 
+ + Args: + indir: Directory to search for history files + h: History tape number (must be an integer, e.g., 1 for h1, 2 for h2) + this_year: Optional year to filter files by. If provided, only files with dates starting + with "{this_year}-01" will be returned. If None, all files matching the + history tape number will be returned. + logger: Optional logger for error messages. If None, errors are raised without logging. + + Returns: + List of file paths matching the search criteria + + Raises: + TypeError: If h is not an integer (only checked when this_year is given) + FileNotFoundError: If no files matching the patterns are found + RuntimeError: If files from multiple case names are found (indicates mixed output from + different simulations, which is pathological) + + Notes: + - Searches for files matching patterns: "*h{h}i.*.nc" or "*h{h}i.*.nc.base" + - When this_year is specified, searches for: "*h{h}i.{this_year}-01*.nc" or + "*h{h}i.{this_year}-01*.nc.base" + - Prefers .nc files over .nc.base files (searches .nc pattern first) + - All returned files must be from the same case name (extracted from filename before + ".clm2.h#i.") + """ + if this_year is None: + patterns = [f"*h{h}i.*.nc", f"*h{h}i.*.nc.base"] + else: + if not isinstance(h, int): + err_msg = f"h ({h}) must be an integer, not {type(h)}" + err_type = TypeError + if logger: + error(logger, err_msg, error_type=err_type) + raise err_type(err_msg) + patterns = [f"*h{h}i.{this_year}-01*.nc", f"*h{h}i.{this_year}-01*.nc.base"] + for pat in patterns: + pattern = os.path.join(indir, pat) + file_list = glob.glob(pattern) + if file_list: + break + if not file_list: + err_msg = f"No files found matching patterns: {patterns}" + err_type = FileNotFoundError + if logger: + error(logger, err_msg, error_type=err_type) + raise err_type(err_msg) + + # Error if files found from multiple cases + case_names = set() + for file in file_list: + basename = os.path.basename(file) + # Extract case name (everything before .clm2.h#i.) 
+ parts = basename.split(".clm2.") + if len(parts) > 1: + case_name = parts[0] + case_names.add(case_name) + if len(case_names) > 1: + err_msg = f"Found files from multiple case names: {sorted(case_names)}" + err_type = RuntimeError + if logger: + error(logger, err_msg, error_type=err_type) + raise err_type(err_msg) + + return file_list + + def get_multicrop_maps(this_ds, these_vars, crop_fracs_yx, dummy_fill, gdd_units): # pylint: disable=missing-function-docstring # Get GDDs for these crops diff --git a/python/ctsm/crop_calendars/import_ds.py b/python/ctsm/crop_calendars/import_ds.py index 66a0ec9746..656d10985e 100644 --- a/python/ctsm/crop_calendars/import_ds.py +++ b/python/ctsm/crop_calendars/import_ds.py @@ -247,23 +247,7 @@ def import_ds( # elements through end-1 will be selected, but that seems not to be the case in the xarray # implementation. if time_slice: - new_filelist = [] - for file in sorted(filelist): - log(logger, f"Getting filetime from file: {file}") - filetime = xr.open_dataset(file).time - filetime_sel = utils.safer_timeslice(filetime, time_slice) - include_this_file = filetime_sel.size - if include_this_file: - log(logger, f"Including filetime : {filetime_sel['time'].values}") - new_filelist.append(file) - - # If you found some matching files, but then you find one that doesn't, stop going - # through the list. - elif new_filelist: - break - if not new_filelist: - raise RuntimeError(f"No files found in time_slice {time_slice}") - filelist = new_filelist + filelist = get_files_in_time_slice(filelist, time_slice, logger) # The xarray open_mfdataset() "preprocess" argument requires a function that takes exactly one # variable (an xarray.Dataset object). 
Wrapping mfdataset_preproc() in this lambda function @@ -324,3 +308,29 @@ def import_ds( log(logger, "End") return this_ds + + +def get_files_in_time_slice(filelist, time_slice, logger=None): + """ + For a given list of files, find the files that need to be read in order to get all history + timesteps in the slice. + """ + new_filelist = [] + for file in sorted(filelist): + if logger: + log(logger, f"Getting filetime from file: {file}") + filetime = xr.open_dataset(file).time + filetime_sel = utils.safer_timeslice(filetime, time_slice) + include_this_file = filetime_sel.size + if include_this_file: + if logger: + log(logger, f"Including filetime : {filetime_sel['time'].values}") + new_filelist.append(file) + + # If you found some matching files, but then you find one that doesn't, stop going + # through the list. + elif new_filelist: + break + if not new_filelist: + raise FileNotFoundError(f"No files found in time_slice {time_slice}") + return new_filelist diff --git a/python/ctsm/test/test_unit_generate_gdds.py b/python/ctsm/test/test_unit_generate_gdds.py index 4976097b7d..5c0219cd09 100755 --- a/python/ctsm/test/test_unit_generate_gdds.py +++ b/python/ctsm/test/test_unit_generate_gdds.py @@ -7,9 +7,14 @@ import unittest import os import argparse +import tempfile +import shutil +import logging +import re import numpy as np import xarray as xr +from cftime import DatetimeNoLeap from ctsm import unit_testing from ctsm.crop_calendars import generate_gdds as gg @@ -125,8 +130,10 @@ def test_generate_gdds_args_error_with_paramfile_and_nomxmat(self): gg._parse_args(args) def test_generate_gdds_args_error_with_nomxmat_and_cushion(self): - """Should error if both --max-season-length-cushion and --max-season-length-from-hdates-file - are given""" + """ + Should error if both --max-season-length-cushion and --max-season-length-from-hdates-file + are given + """ args = [ "--input-dir", self._input_dir, @@ -230,6 +237,49 @@ def 
test_generate_gdds_get_mxmats_cushionneg14(self): self.assertEqual(mxmats["miscanthus"], 210 - cushion) +class TestGetTimeSliceList(unittest.TestCase): + """Tests for _get_time_slice_list()""" + + def test_generate_gdds_get_time_slice_list(self): + """Test that _get_time_slice_list works with two different years""" + season_list = [1986, 1987] + result = gg._get_time_slice_list(season_list[0], season_list[-1]) + expected = [ + slice("1987-01-01", "1987-12-31"), + slice("1988-01-01", "1988-12-31"), + slice("1989-01-01", "1989-12-31"), + ] + assert result == expected + + def test_generate_gdds_get_time_slice_list_1yr(self): + """Test that _get_time_slice_list works with the same year""" + result = gg._get_time_slice_list(1987, 1987) + expected = [ + slice("1988-01-01", "1988-12-31"), + slice("1989-01-01", "1989-12-31"), + ] + assert result == expected + + def test_generate_gdds_get_time_slice_list_valueerror(self): + """Test that _get_time_slice_list raises ValueError if last < first""" + with self.assertRaisesRegex(ValueError, "first_season.* > last_season"): + gg._get_time_slice_list(1987, 1986) + + def test_generate_gdds_get_time_slice_list_typeerror_first(self): + """Test that _get_time_slice_list raises TypeError if not given integer first season""" + with self.assertRaisesRegex( + TypeError, r"_get_time_slice_list\(\) arguments must be integers" + ): + gg._get_time_slice_list(1986.3, 1987) + + def test_generate_gdds_get_time_slice_list_typeerror_last(self): + """Test that _get_time_slice_list raises TypeError if not given integer last season""" + with self.assertRaisesRegex( + TypeError, r"_get_time_slice_list\(\) arguments must be integers" + ): + gg._get_time_slice_list(1986, None) + + class TestCheckGridMatch(unittest.TestCase): """Tests check_grid_match()""" @@ -325,6 +375,373 @@ def test_check_grid_match_matchnans_falseshape_dada(self): self.assertIsNone(max_abs_diff) +class TestFindInstHistFiles(unittest.TestCase): + """Tests of find_inst_hist_files()""" 
+ + def setUp(self): + """ + Set up and change to temporary directory + """ + self.prev_dir = os.getcwd() + self.temp_dir = tempfile.mkdtemp() + os.chdir(self.temp_dir) + + def tearDown(self): + """ + Delete temporary directory + """ + os.chdir(self.prev_dir) + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def _create_test_file(self, filename): + """Helper to create an empty test file""" + filepath = os.path.join(self.temp_dir, filename) + with open(filepath, "a", encoding="utf-8"): + pass + return filepath + + def test_find_inst_hist_files_h1_no_year(self): + """Test finding h1 files without specifying year""" + # Create test files + file1 = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + file2 = self._create_test_file("test.clm2.h1i.2000-02-01-00000.nc") + file3 = self._create_test_file("test.clm2.h1i.2001-01-01-00000.nc") + + result = gf.find_inst_hist_files(self.temp_dir, h=1, this_year=None) + + # Should find all h1i files + self.assertEqual(len(result), 3) + self.assertIn(file1, result) + self.assertIn(file2, result) + self.assertIn(file3, result) + + def test_find_inst_hist_files_h2_no_year(self): + """Test finding h2 files without specifying year""" + # Create test files + file1 = self._create_test_file("test.clm2.h2i.2000-01-01-00000.nc") + file2 = self._create_test_file("test.clm2.h2i.2001-01-01-00000.nc") + # Create h1 file that should not be found + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + + result = gf.find_inst_hist_files(self.temp_dir, h=2, this_year=None) + + # Should find only h2i files + self.assertEqual(len(result), 2) + self.assertIn(file1, result) + self.assertIn(file2, result) + + def test_find_inst_hist_files_with_year(self): + """Test finding files for a specific year""" + # Create test files + file_2000 = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + file_2001 = self._create_test_file("test.clm2.h1i.2001-01-01-00000.nc") + file_2002 = 
self._create_test_file("test.clm2.h1i.2002-01-01-00000.nc") + + result = gf.find_inst_hist_files(self.temp_dir, h=1, this_year=2001) + + # Should find only 2001 file + self.assertEqual(len(result), 1) + self.assertIn(file_2001, result) + self.assertNotIn(file_2000, result) + self.assertNotIn(file_2002, result) + + def test_find_inst_hist_files_base_extension(self): + """Test finding files with .nc.base extension""" + # Create test files with .nc.base extension + file1 = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc.base") + file2 = self._create_test_file("test.clm2.h1i.2001-01-01-00000.nc.base") + + result = gf.find_inst_hist_files(self.temp_dir, h=1, this_year=None) + + # Should find .nc.base files + self.assertEqual(len(result), 2) + self.assertIn(file1, result) + self.assertIn(file2, result) + + def test_find_inst_hist_files_prefer_nc_over_base(self): + """Test that .nc files are preferred over .nc.base files""" + # Create both .nc and .nc.base files + file_nc = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + file_nc_base = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc.base") + + result = gf.find_inst_hist_files(self.temp_dir, h=1, this_year=None) + + # Should find .nc files first (pattern order preference) + self.assertIn(file_nc, result) + self.assertNotIn(file_nc_base, result) + + def test_find_inst_hist_files_multiple_months_same_year(self): + """Test finding multiple files from the same year""" + # Create multiple files from 2000 + file1 = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + file2 = self._create_test_file("test.clm2.h1i.2000-01-15-00000.nc") + file3 = self._create_test_file("test.clm2.h1i.2000-01-31-00000.nc") + # Create file from different year + self._create_test_file("test.clm2.h1i.2001-01-01-00000.nc") + + result = gf.find_inst_hist_files(self.temp_dir, h=1, this_year=2000) + + # Should find all January 2000 files + self.assertEqual(len(result), 3) + self.assertIn(file1, result) + 
self.assertIn(file2, result) + self.assertIn(file3, result) + + def test_find_inst_hist_files_no_files_found(self): + """Test error when no matching files are found""" + # Create a non-matching file + self._create_test_file("test.clm2.h0.2000-01-01-00000.nc") + + # Should raise a FileNotFoundError error + with self.assertRaisesRegex(FileNotFoundError, "No files found matching patterns"): + gf.find_inst_hist_files(self.temp_dir, h=1, this_year=None) + + def test_find_inst_hist_files_different_case_names(self): + """Test that RuntimeError is raised when files from different case names are found""" + # Create files with different case names + self._create_test_file("case1.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("case2.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("longcasename.clm2.h1i.2000-01-01-00000.nc") + + # Should raise RuntimeError due to multiple case names + with self.assertRaisesRegex(RuntimeError, "Found files from multiple case names"): + gf.find_inst_hist_files(self.temp_dir, h=1, this_year=2000) + + def test_find_inst_hist_files_different_case_names_with_logger(self): + """ + Test that RuntimeError is raised when files from different case names are found, with logger + """ + # Create a logger + logger = logging.getLogger("test_logger_case_names") + logger.setLevel(logging.DEBUG) + + # Create files with different case names + self._create_test_file("case1.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("case2.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("longcasename.clm2.h1i.2000-01-01-00000.nc") + + # Should raise RuntimeError due to multiple case names, even with logger + with self.assertRaisesRegex(RuntimeError, "Found files from multiple case names"): + gf.find_inst_hist_files(self.temp_dir, h=1, this_year=2000, logger=logger) + + def test_find_inst_hist_files_no_files_found_with_logger(self): + """Test error when no matching files are found, with logger""" + # Create a logger + logger = 
logging.getLogger("test_logger_no_files") + logger.setLevel(logging.DEBUG) + + # Create a non-matching file + self._create_test_file("test.clm2.h0.2000-01-01-00000.nc") + + # Should raise a FileNotFoundError even with logger + with self.assertRaisesRegex(FileNotFoundError, "No files found matching patterns"): + gf.find_inst_hist_files(self.temp_dir, h=1, this_year=None, logger=logger) + + def test_find_inst_hist_files_h_str_with_logger(self): + """Test that TypeError is raised when h is a string, with logger""" + # Create a logger + logger = logging.getLogger("test_logger_h_str") + logger.setLevel(logging.DEBUG) + + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + + with self.assertRaisesRegex(TypeError, "must be an integer, not"): + gf.find_inst_hist_files(self.temp_dir, h="1", this_year=2000, logger=logger) + + def test_find_inst_hist_files_h_float_with_logger(self): + """Test that TypeError is raised when h is a float, with logger""" + # Create a logger + logger = logging.getLogger("test_logger_h_float") + logger.setLevel(logging.DEBUG) + + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + + with self.assertRaisesRegex(TypeError, "must be an integer, not"): + gf.find_inst_hist_files(self.temp_dir, h=1.0, this_year=2000, logger=logger) + + +class TestGetFileLists(unittest.TestCase): + """Tests of _get_file_lists()""" + + def setUp(self): + """ + Set up and change to temporary directory + """ + self.prev_dir = os.getcwd() + self.temp_dir = tempfile.mkdtemp() + os.chdir(self.temp_dir) + + def tearDown(self): + """ + Delete temporary directory + """ + os.chdir(self.prev_dir) + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def _create_test_file(self, filename): + """Helper to create an empty test file with time coordinate""" + filepath = os.path.join(self.temp_dir, filename) + + # Extract date from filename using regex (format: *.h#i.YYYY-MM-DD-*.nc) + match = re.search(r"(\d{4})-(\d{2})-(\d{2})", filename) + if match: + year, month, 
day = match.groups() + time_val = DatetimeNoLeap(int(year), int(month), int(day), has_year_zero=True) + else: + raise ValueError(f"Could not extract date from filename: {filename}") + + # Create a simple dataset with time coordinate + time = xr.DataArray([time_val], dims=["time"], name="time") + ds = xr.Dataset({"time": time}) + ds.to_netcdf(filepath) + + return filepath + + def test_get_file_lists_single_year(self): + """Test _get_file_lists with a single year of data""" + # Create h1 and h2 files for 2000 + h1_file = self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + h2_file = self._create_test_file("test.clm2.h2i.2000-01-01-00000.nc") + + time_slice_list = [slice("2000-01-01", "2000-12-31")] + + h1_file_lists, h2_file_lists = gg._get_file_lists( + self.temp_dir, time_slice_list, logger=None + ) + + # Should have one list for each time slice + self.assertEqual(len(h1_file_lists), 1) + self.assertEqual(len(h2_file_lists), 1) + + # Check contents of file lists + # pylint: disable=unsubscriptable-object + self.assertEqual(len(h1_file_lists[0]), 1) + self.assertEqual(len(h2_file_lists[0]), 1) + self.assertEqual(h1_file_lists[0], [h1_file]) + self.assertEqual(h2_file_lists[0], [h2_file]) + + def test_get_file_lists_multiple_years(self): + """Test _get_file_lists with multiple years of data""" + # Create h1 and h2 files for 2000-2002 + h1_files = [] + h2_files = [] + for year in [2000, 2001, 2002]: + h1_files.append(self._create_test_file(f"test.clm2.h1i.{year}-01-01-00000.nc")) + h2_files.append(self._create_test_file(f"test.clm2.h2i.{year}-01-01-00000.nc")) + + time_slice_list = [ + slice("2000-01-01", "2000-12-31"), + slice("2001-01-01", "2001-12-31"), + slice("2002-01-01", "2002-12-31"), + ] + + h1_file_lists, h2_file_lists = gg._get_file_lists( + self.temp_dir, time_slice_list, logger=None + ) + + # Should have one list for each time slice + self.assertEqual(len(h1_file_lists), 3) + self.assertEqual(len(h2_file_lists), 3) + + # Check contents of file 
lists + # pylint: disable=unsubscriptable-object + for i in range(3): + self.assertEqual(len(h1_file_lists[i]), 1) + self.assertEqual(len(h2_file_lists[i]), 1) + self.assertEqual(h1_file_lists[i], [h1_files[i]]) + self.assertEqual(h2_file_lists[i], [h2_files[i]]) + + def test_get_file_lists_multiple_files_per_slice(self): + """Test _get_file_lists when multiple files fall within a time slice""" + # Create multiple h1 and h2 files for 2000 + h1_files = [] + h2_files = [] + for month in ["01", "06", "12"]: + h1_files.append(self._create_test_file(f"test.clm2.h1i.2000-{month}-01-00000.nc")) + h2_files.append(self._create_test_file(f"test.clm2.h2i.2000-{month}-01-00000.nc")) + + time_slice_list = [slice("2000-01-01", "2000-12-31")] + + h1_file_lists, h2_file_lists = gg._get_file_lists( + self.temp_dir, time_slice_list, logger=None + ) + + # Should have one list for the time slice + self.assertEqual(len(h1_file_lists), 1) + self.assertEqual(len(h2_file_lists), 1) + + # Check contents of file lists (should be sorted) + # pylint: disable=unsubscriptable-object + self.assertEqual(len(h1_file_lists[0]), 3) + self.assertEqual(len(h2_file_lists[0]), 3) + self.assertEqual(h1_file_lists[0], sorted(h1_files)) + self.assertEqual(h2_file_lists[0], sorted(h2_files)) + + def test_get_file_lists_no_h1_files(self): + """Test _get_file_lists when h1 files are missing""" + # Create only h2 files + self._create_test_file("test.clm2.h2i.2000-01-01-00000.nc") + + time_slice_list = [slice("2000-01-01", "2000-12-31")] + + # Should raise FileNotFoundError when h1 files are not found + with self.assertRaisesRegex(FileNotFoundError, "No files found matching patterns"): + gg._get_file_lists(self.temp_dir, time_slice_list, logger=None) + + def test_get_file_lists_no_h2_files(self): + """Test _get_file_lists when h2 files are missing""" + # Create only h1 files + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + + time_slice_list = [slice("2000-01-01", "2000-12-31")] + + # Should raise 
FileNotFoundError when h2 files are not found + with self.assertRaisesRegex(FileNotFoundError, "No files found matching patterns"): + gg._get_file_lists(self.temp_dir, time_slice_list, logger=None) + + def test_get_file_lists_h1_outside_time_slice(self): + """Test _get_file_lists when h1 files exist but have no timesteps in the slice""" + # Create h1 files for 2000 and h2 files for 2001 + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("test.clm2.h2i.2001-01-01-00000.nc") + + # Request time slice for 2001 (h1 files exist but are outside the slice) + time_slice_list = [slice("2001-01-01", "2001-12-31")] + + # Should raise FileNotFoundError when h1 files have no timesteps in slice + with self.assertRaisesRegex(FileNotFoundError, "No h1 timesteps found in"): + gg._get_file_lists(self.temp_dir, time_slice_list, logger=None) + + def test_get_file_lists_h2_outside_time_slice(self): + """Test _get_file_lists when h2 files exist but have no timesteps in the slice""" + # Create h1 files for 2001 and h2 files for 2000 + self._create_test_file("test.clm2.h1i.2001-01-01-00000.nc") + self._create_test_file("test.clm2.h2i.2000-01-01-00000.nc") + + # Request time slice for 2001 (h2 files exist but are outside the slice) + time_slice_list = [slice("2001-01-01", "2001-12-31")] + + # Should raise FileNotFoundError when h2 files have no timesteps in slice + with self.assertRaisesRegex(FileNotFoundError, "No h2 timesteps found in"): + gg._get_file_lists(self.temp_dir, time_slice_list, logger=None) + + def test_get_file_lists_partial_overlap(self): + """Test _get_file_lists when some time slices have files and others don't""" + # Create h1 and h2 files for 2000 only + self._create_test_file("test.clm2.h1i.2000-01-01-00000.nc") + self._create_test_file("test.clm2.h2i.2000-01-01-00000.nc") + + # Request time slices for 2000 and 2001 + time_slice_list = [ + slice("2000-01-01", "2000-12-31"), + slice("2001-01-01", "2001-12-31"), + ] + + # Should raise 
FileNotFoundError when second time slice has no files + with self.assertRaisesRegex(FileNotFoundError, "No h1 timesteps found in"): + gg._get_file_lists(self.temp_dir, time_slice_list, logger=None) + + if __name__ == "__main__": unit_testing.setup_for_tests() unittest.main() diff --git a/python/ctsm/test/test_unit_import_ds.py b/python/ctsm/test/test_unit_import_ds.py new file mode 100755 index 0000000000..89349be9d4 --- /dev/null +++ b/python/ctsm/test/test_unit_import_ds.py @@ -0,0 +1,354 @@ +#!/usr/bin/env python3 + +""" +Unit tests for import_ds.py +""" + +import unittest +import os +import tempfile +import shutil + +import xarray as xr +from cftime import DatetimeNoLeap + +from ctsm import unit_testing +from ctsm.crop_calendars import import_ds + +# Allow test names that pylint doesn't like; otherwise hard to make them +# readable +# pylint: disable=invalid-name + +# pylint: disable=protected-access + + +def _make_timestep(str_in): + """ + Because of float imprecision, microseconds should be specified like: + 1,8 instead of 1.000008 + and: + 1,800000 instead of 1.8 + """ + h = minute = s = us = 0 + + str_in_split = str_in.split(" ") + y, month, d = str_in_split[0].split("-") + if len(str_in_split) > 1: + h, minute, s = str_in_split[1].split(":") + if "," in s: + s, us = s.split(",") + inputs = [int(x) for x in [y, month, d, h, minute, s, us]] + return DatetimeNoLeap(*inputs, has_year_zero=True) + + +class TestMakeTimestep(unittest.TestCase): + """Test this test module's _make_timestep() function""" + + def test_make_timestep_ymd(self): + """Test with YYYY-MM-DD""" + self.assertEqual( + _make_timestep("1987-07-24"), + DatetimeNoLeap(1987, 7, 24, 0, 0, 0, 0, has_year_zero=True), + ) + + def test_make_timestep_hms(self): + """Test with YYYY-MM-DD hh:mm:ss""" + self.assertEqual( + _make_timestep("1987-07-24 09:25:07"), + DatetimeNoLeap(1987, 7, 24, 9, 25, 7, 0, has_year_zero=True), + ) + + def test_make_timestep_microsec_leadzeros(self): + """Test with 
microseconds with leading zeros""" + self.assertEqual( + _make_timestep("1987-07-24 09:25:07,8"), + DatetimeNoLeap(1987, 7, 24, 9, 25, 7, 8, has_year_zero=True), + ) + + def test_make_timestep_microsec_noleadzeros(self): + """Test with microseconds without leading zeros""" + self.assertEqual( + _make_timestep("1987-07-24 09:25:07,800000"), + DatetimeNoLeap(1987, 7, 24, 9, 25, 7, 800000, has_year_zero=True), + ) + + +class TestGetFilesInTimeSlice(unittest.TestCase): + """Tests of get_files_in_time_slice()""" + + def setUp(self): + """ + Set up and change to temporary directory + """ + self.prev_dir = os.getcwd() + self.temp_dir = tempfile.mkdtemp() + os.chdir(self.temp_dir) + + def tearDown(self): + """ + Delete temporary directory and any files within + """ + os.chdir(self.prev_dir) + shutil.rmtree(self.temp_dir, ignore_errors=True) + + def _create_annual_test_files(self, years): + """ + Helper method to create test files with one timestep per file (annual) + + Args: + years: List of years to create files for + + Returns: + List of filenames created + """ + filelist = [] + for year in years: + filename = os.path.join(self.temp_dir, f"test_{year}.nc") + filelist.append(filename) + + # Create a simple dataset with one time step + time = xr.DataArray( + [_make_timestep(f"{year}-01-01")], + dims=["time"], + name="time", + ) + ds = xr.Dataset({"time": time}) + ds.to_netcdf(filename) + + return filelist + + def _create_monthly_test_files(self, year_month_list): + """ + Helper method to create test files with multiple timesteps per file (monthly) + + Args: + year_month_list: List of tuples (year, list_of_months) where each file + contains multiple monthly timesteps + + Returns: + List of filenames created + """ + filelist = [] + for year, months in year_month_list: + filename = os.path.join(self.temp_dir, f"test_{year}.nc") + filelist.append(filename) + + # Create a dataset with multiple monthly time steps + timesteps = [_make_timestep(f"{year}-{month:02d}-15 12:00:00") 
for month in months] + time = xr.DataArray(timesteps, dims=["time"], name="time") + ds = xr.Dataset({"time": time}) + ds.to_netcdf(filename) + + return filelist + + def _create_daily_test_file(self, year, month, days, *, hour=0, minute=0, second=0): + """ + Helper method to create a test file with daily timesteps + + Args: + year: Year for the file + month: Month for the file + days: List of days to include + hour: Hour of day (default 0) + minute: Minute of hour (default 0) + second: Second of minute (default 0) + + Returns: + Filename created + """ + filename = os.path.join(self.temp_dir, f"test_{year}_{month:02d}.nc") + + # Create a dataset with daily time steps + time_str = f"{hour:02d}:{minute:02d}:{second:02d}" if hour or minute or second else "" + if time_str: + timesteps = [_make_timestep(f"{year}-{month:02d}-{day:02d} {time_str}") for day in days] + else: + timesteps = [_make_timestep(f"{year}-{month:02d}-{day:02d}") for day in days] + time = xr.DataArray(timesteps, dims=["time"], name="time") + ds = xr.Dataset({"time": time}) + ds.to_netcdf(filename) + + return filename + + def test_get_files_in_time_slice_middle(self): + """Test get_files_in_time_slice with a slice in the middle of the range""" + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice(_make_timestep("2001-01-01"), _make_timestep("2003-01-01")) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2001.nc", "test_2002.nc", "test_2003.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_middle_slice_just_strings(self): + """ + As test_get_files_in_time_slice_middle, but with the slice containing strings instead of + actual cftime timestamps + """ + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice("2001-01-01", "2003-01-01") + result = 
import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2001.nc", "test_2002.nc", "test_2003.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_from_beginning(self): + """Test get_files_in_time_slice with unbounded start (from beginning)""" + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice(None, _make_timestep("2001-01-01")) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2000.nc", "test_2001.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_to_end(self): + """Test get_files_in_time_slice with unbounded end (to the end)""" + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice(_make_timestep("2003-01-01"), None) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2003.nc", "test_2004.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_all_files(self): + """Test get_files_in_time_slice with unbounded slice (all files)""" + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice(None, None) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + # For this test, compare full paths since expected is also full paths + self.assertEqual(result, filelist) + + def test_get_files_in_time_slice_no_match(self): + """Test get_files_in_time_slice with no matching files (should raise FileNotFoundError)""" + years = [2000, 2001, 2002, 2003, 2004] + filelist = self._create_annual_test_files(years) + + time_slice = slice(_make_timestep("2010-01-01"), _make_timestep("2011-01-01")) + with 
self.assertRaises(FileNotFoundError): + import_ds.get_files_in_time_slice(filelist, time_slice) + + def test_get_files_in_time_slice_monthly_multiple_per_file(self): + """Test get_files_in_time_slice with monthly data, multiple timesteps per file""" + # Create files with monthly data: each file has 12 months + year_month_list = [ + (2000, list(range(1, 13))), # Jan-Dec 2000 + (2001, list(range(1, 13))), # Jan-Dec 2001 + (2002, list(range(1, 13))), # Jan-Dec 2002 + ] + filelist = self._create_monthly_test_files(year_month_list) + + # Select from mid-2000 to mid-2001 + time_slice = slice(_make_timestep("2000-06-01"), _make_timestep("2001-08-01")) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2000.nc", "test_2001.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_monthly_partial_overlap(self): + """Test get_files_in_time_slice with monthly data selecting partial year""" + year_month_list = [ + (2000, list(range(1, 13))), + (2001, list(range(1, 13))), + (2002, list(range(1, 13))), + ] + filelist = self._create_monthly_test_files(year_month_list) + + # Select only within 2001 + time_slice = slice(_make_timestep("2001-03-01"), _make_timestep("2001-09-01")) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2001.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_with_hours_minutes_seconds(self): + """Test get_files_in_time_slice with timesteps including hours, minutes, seconds""" + # Create daily files with specific times + filelist = [] + filelist.append( + self._create_daily_test_file(2000, 6, list(range(1, 31)), hour=6, minute=30, second=15) + ) + filelist.append( + self._create_daily_test_file(2000, 7, list(range(1, 32)), hour=6, minute=30, second=15) + ) + filelist.append( + 
self._create_daily_test_file(2000, 8, list(range(1, 32)), hour=6, minute=30, second=15) + ) + + # Select from mid-June to mid-July with specific time + time_slice = slice( + _make_timestep("2000-06-15 06:30:15"), + _make_timestep("2000-07-20 06:30:15"), + ) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_2000_06.nc", "test_2000_07.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_mixed_frequencies(self): + """Test get_files_in_time_slice with files containing different numbers of timesteps""" + filelist = [] + # File 1: Single timestep (annual) + filename1 = os.path.join(self.temp_dir, "test_1999.nc") + time1 = xr.DataArray([_make_timestep("1999-07-01 00:00:00")], dims=["time"], name="time") + xr.Dataset({"time": time1}).to_netcdf(filename1) + filelist.append(filename1) + + # File 2: Monthly timesteps + filename2 = os.path.join(self.temp_dir, "test_2000.nc") + timesteps2 = [_make_timestep(f"2000-{m:02d}-15 12:00:00") for m in range(1, 13)] + time2 = xr.DataArray(timesteps2, dims=["time"], name="time") + xr.Dataset({"time": time2}).to_netcdf(filename2) + filelist.append(filename2) + + # File 3: Daily timesteps for one month + filelist.append( + self._create_daily_test_file(2001, 1, list(range(1, 32)), hour=3, minute=0, second=0) + ) + + # Select from late 1999 to mid-2000 + time_slice = slice(_make_timestep("1999-06-01"), _make_timestep("2000-08-01")) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + expected = ["test_1999.nc", "test_2000.nc"] + self.assertEqual(result_basenames, expected) + + def test_get_files_in_time_slice_exact_boundary_match(self): + """Test get_files_in_time_slice with exact timestamp boundary matches at file edges""" + filelist = [] + filelist.append( + self._create_daily_test_file( + 2000, 12, list(range(1, 32)), hour=23, 
minute=59, second=59 + ) + ) + filelist.append( + self._create_daily_test_file(2001, 1, list(range(1, 32)), hour=23, minute=59, second=59) + ) + filelist.append( + self._create_daily_test_file(2001, 2, list(range(1, 29)), hour=23, minute=59, second=59) + ) + + # Select from last timestep of December file to first timestep of February file + # This tests exact boundary matching at file edges + time_slice = slice( + _make_timestep("2000-12-31 23:59:59"), # Last timestep in first file + _make_timestep("2001-02-01 23:59:59"), # First timestep in third file + ) + result = import_ds.get_files_in_time_slice(filelist, time_slice) + result_basenames = [os.path.basename(f) for f in result] + # Should include all three files since boundaries match exactly + expected = ["test_2000_12.nc", "test_2001_01.nc", "test_2001_02.nc"] + self.assertEqual(result_basenames, expected) + + +if __name__ == "__main__": + unit_testing.setup_for_tests() + unittest.main()