diff --git a/esmvalcore/esgf/_download.py b/esmvalcore/esgf/_download.py index a499879d6b..6749c59aaa 100644 --- a/esmvalcore/esgf/_download.py +++ b/esmvalcore/esgf/_download.py @@ -8,6 +8,7 @@ import logging import os import random +import re import shutil from pathlib import Path from statistics import median @@ -178,14 +179,16 @@ class ESGFFile: Attributes ---------- - urls : :class:`list` of :class:`str` - The URLs where the file can be downloaded. dataset : str The name of the dataset that the file is part of. + facets : dict[str,str] + Facets describing the file. name : str The name of the file. size : int The size of the file in bytes. + urls : list[str] + The URLs where the file can be downloaded. """ def __init__(self, results): @@ -193,6 +196,7 @@ def __init__(self, results): self.name = str(Path(results[0].filename).with_suffix('.nc')) self.size = results[0].size self.dataset = self._get_dataset_id(results) + self.facets = self._get_facets(results) self.urls = [] self._checksums = [] for result in results: @@ -225,10 +229,64 @@ def same_file(result): logger.debug( "Ignoring file(s) %s containing wrong variable '%s' in" " found in search for variable '%s'", file.urls, variable, - facets['variable']) + facets.get('variable', facets.get('variable_id', '?'))) return files + @staticmethod + def _get_facets(results): + """Read the facets from the `dataset_id`.""" + # This reads the facets from the dataset_id because the facets + # provided by ESGF are unreliable. + # + # Example dataset_id_template_ values: + # CMIP3: '%(project)s.%(institute)s.%(model)s.%(experiment)s. + # %(time_frequency)s.%(realm)s.%(ensemble)s.%(variable)s' + # CMIP5: 'cmip5.%(product)s.%(valid_institute)s.%(model)s. + # %(experiment)s.%(time_frequency)s.%(realm)s.%(cmor_table)s. + # %(ensemble)s' + # CMIP6: '%(mip_era)s.%(activity_drs)s.%(institution_id)s. + # %(source_id)s.%(experiment_id)s.%(member_id)s.%(table_id)s. + # %(variable_id)s.%(grid_label)s' + # CORDEX: 'cordex.%(product)s.%(domain)s.%(institute)s. + # %(driving_model)s.%(experiment)s.%(ensemble)s.%(rcm_name)s. + # %(rcm_version)s.%(time_frequency)s.%(variable)s' + # obs4MIPs: '%(project)s.%(institute)s.%(source_id)s.%(realm)s. 
+ # %(time_frequency)s' + project = results[0].json['project'][0] + + # Read the keys from `dataset_id_template_` and translate to our keys + template = results[0].json['dataset_id_template_'][0] + keys = re.findall(r"%\((.*?)\)s", template) + reverse_facet_map = {v: k for k, v in FACETS[project].items()} + reverse_facet_map['mip_era'] = 'project' # CMIP6 oddity + reverse_facet_map['variable_id'] = 'short_name' # CMIP6 oddity + reverse_facet_map['valid_institute'] = 'institute' # CMIP5 oddity + keys = [reverse_facet_map.get(k, k) for k in keys] + keys.append('version') + if keys[0] == 'project': + # The project is sometimes hardcoded all lowercase in the template + keys = keys[1:] + + # Read values from dataset_id + # Pick the first dataset_id if there are differences in case + dataset_id = sorted(r.json['dataset_id'].split('|')[0] + for r in results)[0] + values = dataset_id.split('.')[1:] + + # Compose facets + facets = { + 'project': project, + } + for idx, key in enumerate(keys): + facets[key] = values[idx] + # The dataset_id does not contain the short_name for all projects, + # so get it from the filename if needed: + if 'short_name' not in facets: + facets['short_name'] = results[0].json['title'].split('_')[0] + + return facets + @staticmethod def _get_dataset_id(results): """Simplify dataset_id so it is always composed of the same facets.""" @@ -256,7 +314,10 @@ def __repr__(self): def __eq__(self, other): """Compare `self` to `other`.""" - return (self.dataset, self.name) == (other.dataset, other.name) + return ( + isinstance(other, self.__class__) + and (self.dataset, self.name) == (other.dataset, other.name) + ) def __lt__(self, other): """Compare `self` to `other`.""" diff --git a/esmvalcore/esgf/_search.py b/esmvalcore/esgf/_search.py index 7807e25eaa..7e3949596f 100644 --- a/esmvalcore/esgf/_search.py +++ b/esmvalcore/esgf/_search.py @@ -26,6 +26,9 @@ def get_esgf_facets(variable): for our_name, esgf_name in FACETS[project].items(): if our_name in variable: values = variable[our_name] + if values == '*': + # Wildcards can be specified on ESGF by omitting the facet + continue if isinstance(values, (tuple, list)): values = list(values) @@ -42,7 +45,7 @@ def get_esgf_facets(variable): return facets -def select_latest_versions(files): +def select_latest_versions(files, versions): """Select only the latest version of files.""" result = [] @@ -52,14 +55,24 @@ def same_file(file): dataset = file.dataset.rsplit('.', 1)[0] return (dataset, file.name) + if isinstance(versions, str): + versions = (versions, ) + files = sorted(files, key=same_file) - for _, versions in itertools.groupby(files, key=same_file): - versions = sorted(versions, reverse=True) - latest_version = versions[0] + for _, group in itertools.groupby(files, key=same_file): + group = sorted(group, reverse=True) + if versions: + selection = [f for f in group if f.facets['version'] in versions] + if not selection: + raise FileNotFoundError( + f"Requested versions {', '.join(versions)} of file not " + f"found. 
Available files: {group}") + group = selection + latest_version = group[0] result.append(latest_version) - if len(versions) > 1: + if len(group) > 1: logger.debug("Only using the latest version %s, not %s", - latest_version, versions[1:]) + latest_version, group[1:]) return result @@ -100,7 +113,6 @@ def _search_index_nodes(facets): context = connection.new_context( pyesgf.search.context.FileSearchContext, **facets, - latest=True, ) logger.debug("Searching %s for datasets using facets=%s", url, facets) try: @@ -139,8 +151,6 @@ def esgf_search_files(facets): files = ESGFFile._from_results(results, facets) - files = select_latest_versions(files) - msg = 'none' if not files else '\n' + '\n'.join(str(f) for f in files) logger.debug("Found the following files matching facets %s: %s", facets, msg) @@ -150,6 +160,10 @@ def esgf_search_files(facets): def select_by_time(files, timerange): """Select files containing data between a timerange.""" + if '*' in timerange: + # TODO: support * combined with a period + return files + selection = [] for file in files: @@ -180,9 +194,11 @@ def find_files(*, project, short_name, dataset, **facets): The name of the variable. dataset : str The name of the dataset. - **facets: - Any other search facets. Values can be strings, list of strings, or - 'start_year' and 'end_year' with values of type :obj:`int`. + **facets : typing.Union[str, list[str]] + Any other search facets. The special value ``'*'`` will match anything. + If no ``version`` facet is specified, the function returns only the + latest version of each file, while other omitted facets will default + to ``'*'``. Examples -------- @@ -236,13 +252,12 @@ def find_files(*, project, short_name, dataset, **facets): ... ensemble='r1i1p1', ... domain='EUR-11', ... driver='MPI-M-MPI-ESM-LR', - ... start_year=1990, - ... end_year=2000, + ... timerange='1990/2000', ... ) # doctest: +SKIP [ESGFFile:cordex/output/EUR-11/CLMcom-ETH/MPI-M-MPI-ESM-LR/historical/r1i1p1/COSMO-crCLIM-v1-1/v1/mon/tas/v20191219/tas_EUR-11_MPI-M-MPI-ESM-LR_historical_r1i1p1_CLMcom-ETH-COSMO-crCLIM-v1-1_v1_mon_198101-199012.nc, - ESGFFile:cordex/output/EUR-11/CLMcom-ETH/MPI-M-MPI-ESM-LR/historical/r1i1p1/COSMO-crCLIM-v1-1/v1/mon/tas/v20191219/tas_EUR-11_MPI-M-MPI-ESM-LR_historical_r1i1p1_CLMcom-ETH-COSMO-crCLIM-v1-1_v1_mon_199101-200012.nc] + ESGFFile:cordex/output/EUR-11/CLMcom-ETH/MPI-M-MPI-ESM-LR/historical/r1i1p1/COSMO-crCLIM-v1-1/v1/mon/tas/v20191219/tas_EUR-11_MPI-M-MPI-ESM-LR_historical_r1i1p1_CLMcom-ETH-COSMO-crCLIM-v1-1_v1_mon_199101-200012.nc] - Search for a obs4MIPs dataset: + Search for an obs4MIPs dataset: >>> find_files( ... project='obs4MIPs', @@ -252,6 +267,48 @@ def find_files(*, project, short_name, dataset, **facets): ... ) # doctest: +SKIP [ESGFFile:obs4MIPs/NASA-LaRC/CERES-EBAF/atmos/mon/v20160610/rsutcs_CERES-EBAF_L3B_Ed2-8_200003-201404.nc] + Search for any ensemble member: + + >>> find_files( + ... project='CMIP6', + ... mip='Amon', + ... short_name='tas', + ... dataset='BCC-CSM2-MR', + ... exp='historical', + ... ensemble='*', + ... 
) # doctest: +SKIP + [ESGFFile:CMIP6/CMIP/BCC/BCC-CSM2-MR/historical/r1i1p1f1/Amon/tas/gn/v20181126/tas_Amon_BCC-CSM2-MR_historical_r1i1p1f1_gn_185001-201412.nc, + ESGFFile:CMIP6/CMIP/BCC/BCC-CSM2-MR/historical/r2i1p1f1/Amon/tas/gn/v20181115/tas_Amon_BCC-CSM2-MR_historical_r2i1p1f1_gn_185001-201412.nc, + ESGFFile:CMIP6/CMIP/BCC/BCC-CSM2-MR/historical/r3i1p1f1/Amon/tas/gn/v20181119/tas_Amon_BCC-CSM2-MR_historical_r3i1p1f1_gn_185001-201412.nc] + + Search for all available versions of a file: + + >>> find_files( + ... project='CMIP5', + ... mip='Amon', + ... short_name='tas', + ... dataset='CCSM4', + ... exp='historical', + ... ensemble='r1i1p1', + ... version='*', + ... ) # doctest: +SKIP + [ESGFFile:cmip5/output1/NCAR/CCSM4/historical/mon/atmos/Amon/r1i1p1/v20121031/tas_Amon_CCSM4_historical_r1i1p1_185001-200512.nc, + ESGFFile:cmip5/output1/NCAR/CCSM4/historical/mon/atmos/Amon/r1i1p1/v20130425/tas_Amon_CCSM4_historical_r1i1p1_185001-200512.nc, + ESGFFile:cmip5/output1/NCAR/CCSM4/historical/mon/atmos/Amon/r1i1p1/v20160829/tas_Amon_CCSM4_historical_r1i1p1_185001-200512.nc] + + Search for a specific version of a file: + + >>> find_files( + ... project='CMIP5', + ... mip='Amon', + ... short_name='tas', + ... dataset='CCSM4', + ... exp='historical', + ... ensemble='r1i1p1', + ... version='v20130425', + ... ) # doctest: +SKIP + [ESGFFile:cmip5/output1/NCAR/CCSM4/historical/mon/atmos/Amon/r1i1p1/v20130425/tas_Amon_CCSM4_historical_r1i1p1_185001-200512.nc] + Returns ------- :obj:`list` of :obj:`ESGFFile` @@ -287,10 +344,12 @@ def cached_search(**facets): """ esgf_facets = get_esgf_facets(facets) files = esgf_search_files(esgf_facets) + + if 'version' not in facets or facets['version'] != '*': + files = select_latest_versions(files, facets.get('version')) + _get_timerange_from_years(facets) - filter_timerange = (facets.get('frequency', '') != 'fx' - and 'timerange' in facets) - if filter_timerange: + if 'timerange' in facets: files = select_by_time(files, facets['timerange']) logger.debug("Selected files:\n%s", '\n'.join(str(f) for f in files)) diff --git a/esmvalcore/esgf/facets.py b/esmvalcore/esgf/facets.py index 9a071b6c97..dde658d76a 100644 --- a/esmvalcore/esgf/facets.py +++ b/esmvalcore/esgf/facets.py @@ -16,14 +16,17 @@ 'dataset': 'model', 'ensemble': 'ensemble', 'exp': 'experiment', + 'frequency': 'time_frequency', 'mip': 'cmor_table', 'product': 'product', 'short_name': 'variable', }, 'CMIP6': { + 'activity': 'activity_drs', 'dataset': 'source_id', - 'ensemble': 'variant_label', + 'ensemble': 'member_id', 'exp': 'experiment_id', + 'institute': 'institution_id', 'grid': 'grid_label', 'mip': 'table_id', 'short_name': 'variable', diff --git a/tests/integration/esgf/test_search_download.py b/tests/integration/esgf/test_search_download.py index 32650c459b..171d1dbdda 100644 --- a/tests/integration/esgf/test_search_download.py +++ b/tests/integration/esgf/test_search_download.py @@ -15,6 +15,7 @@ 'frequency': 'mon', 'project': 'CMIP3', 'short_name': 'tas', + 'version': 'v1', }, { 'dataset': 'inmcm4', 'ensemble': 'r1i1p1', @@ -22,6 +23,7 @@ 'mip': 'Amon', 'project': 'CMIP5', 'short_name': 'tas', + 'version': 'v20130207', }, { 'dataset': 'FIO-ESM', 'ensemble': 'r1i1p1', @@ -36,8 +38,7 @@ 'mip': 'Amon', 'project': 'CMIP5', 'short_name': 'tas', - 'start_year': 2080, - 'end_year': 2100, + 'timerange': '2080/2100', }, { 'dataset': 'EC-EARTH', 'ensemble': 'r1i1p1', @@ -45,7 +46,7 @@ 'mip': 'Amon', 'project': 'CMIP5', 'short_name': 'tas', - 'start_year': 1990, + 'start_year': 1990, # test legacy way of 
specifying timerange 'end_year': 1999, }, { 'dataset': 'AWI-ESM-1-1-LR', @@ -55,8 +56,8 @@ 'mip': 'Amon', 'project': 'CMIP6', 'short_name': 'tas', - 'start_year': 2000, - 'end_year': 2001, + 'timerange': '2000/2001', + 'version': 'v20200212', }, { 'dataset': 'RACMO22E', 'driver': 'MOHC-HadGEM2-ES', @@ -66,13 +67,14 @@ 'frequency': 'mon', 'project': 'CORDEX', 'short_name': 'tas', - 'start_year': 1950, - 'end_year': 1952, + 'timerange': '1950/1952', + 'version': 'v20160620', }, { 'dataset': 'CERES-EBAF', 'frequency': 'mon', 'project': 'obs4MIPs', 'short_name': 'rsutcs', + 'version': 'v20160610', }] @@ -84,7 +86,6 @@ def search(self, **kwargs): class MockConnection: def new_context(self, *args, **kwargs): - assert kwargs.pop('latest') assert kwargs == facets return MockFileSearchContext() @@ -209,7 +210,7 @@ def test_real_search_many(): ], [ 'cmip5.output1.ICHEC.EC-EARTH.historical.mon.atmos.Amon.r1i1p1' - '.v20121115', + '.v20131231', ], [ 'CMIP6.CMIP.AWI.AWI-ESM-1-1-LR.historical.r1i1p1f1.Amon.tas.gn' @@ -238,6 +239,15 @@ def test_real_search_many(): print(found_datasets) print(datasets) assert found_datasets == datasets + print(result[0].facets) + for file in result: + for key, value in variable.items(): + if key in ('start_year', 'end_year', 'timerange'): + continue + if isinstance(value, list): + assert file.facets.get(key) in value + else: + assert file.facets.get(key) == value @pytest.mark.skip(reason="This will actually download the data") diff --git a/tests/integration/test_recipe_checks.py b/tests/integration/test_recipe_checks.py index ef441e1969..065a6e1301 100644 --- a/tests/integration/test_recipe_checks.py +++ b/tests/integration/test_recipe_checks.py @@ -179,7 +179,8 @@ def test_data_availability_nonexistent(tmp_path): } result = pyesgf.search.results.FileResult( json={ - 'dataset_id': 'ABC', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 10, 'title': 'tas_1990-1992.nc', diff --git a/tests/unit/esgf/test_download.py b/tests/unit/esgf/test_download.py index bc48effbfc..ae571769b0 100644 --- a/tests/unit/esgf/test_download.py +++ b/tests/unit/esgf/test_download.py @@ -212,7 +212,8 @@ def test_init(): url = f'http://something.org/ABC/v1/{filename}' result = FileResult( json={ - 'dataset_id': 'ABC.v1|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 10, 'source_id': ['ABC'], @@ -229,9 +230,15 @@ def test_init(): assert file.size == 10 assert file.urls == [url] assert file._checksums == [('MD5', 'abc')] - txt = f"ESGFFile:ABC/v1/{filename} on hosts ['something.org']" + assert file.facets == { + 'dataset': 'ABC', + 'project': 'CMIP6', + 'short_name': 'tas', + 'version': 'v1', + } + txt = f"ESGFFile:CMIP6/ABC/v1/{filename} on hosts ['something.org']" assert repr(file) == txt - assert hash(file) == hash(('ABC.v1', filename)) + assert hash(file) == hash(('CMIP6.ABC.v1', filename)) def test_from_results(): @@ -246,7 +253,8 @@ def test_from_results(): url = f'http://something.org/ABC/v1/{filename}' result = FileResult( json={ - 'dataset_id': f'ABC{i}.v1|something.org', + 'dataset_id': f'CMIP6.ABC{i}.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 10, 'source_id': [f'ABC{i}'], @@ -262,7 +270,8 @@ def test_from_results(): results.append( FileResult( json={ - 'dataset_id': f'ABC{i}.v1|something.org', + 'dataset_id': f'CMIP6.ABC{i}.v1|something.org', + 
'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 10, 'source_id': [f'ABC{i}'], @@ -281,7 +290,8 @@ def test_sorting(): result1 = FileResult( json={ - 'dataset_id': 'ABC.v1|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 1, 'title': 'abc_2000-2001.nc', @@ -290,7 +300,8 @@ def test_sorting(): ) result2 = FileResult( json={ - 'dataset_id': 'ABC.v1|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 1, 'title': 'abc_2001-2002.nc', @@ -310,10 +321,10 @@ def test_sorting(): def test_local_file(): local_path = '/path/to/somewhere' filename = 'abc_2000-2001.nc' - dataset = 'CMIP6.ABC.v1' result = FileResult( json={ - 'dataset_id': f'{dataset}|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 10, 'source_id': ['ABC'], @@ -323,6 +334,7 @@ def test_local_file(): ) file = _download.ESGFFile([result]) + print(file.dataset) reference_path = Path(local_path) / 'CMIP6' / 'ABC' / 'v1' / filename assert file.local_file(local_path) == reference_path @@ -344,10 +356,16 @@ def test_merge_datasets(): dataset1 = ('cmip5.output1.FIO.fio-esm.historical.' 'mon.atmos.Amon.r1i1p1.v20121010') + cmip5_template = ( + 'cmip5.%(product)s.%(valid_institute)s.%(model)s.' + '%(experiment)s.%(time_frequency)s.%(realm)s.%(cmor_table)s.' + '%(ensemble)s') + results = [ FileResult( { 'dataset_id': dataset0 + '|esgf2.dkrz.de', + 'dataset_id_template_': [cmip5_template], 'project': ['CMIP5'], 'size': 200, 'title': filename, @@ -358,6 +376,7 @@ def test_merge_datasets(): FileResult( { 'dataset_id': dataset1 + '|aims3.llnl.gov', + 'dataset_id_template_': [cmip5_template], 'project': ['CMIP5'], 'size': 200, 'title': filename, @@ -397,11 +416,11 @@ def test_single_download(mocker, tmp_path, checksum): dest_folder = tmp_path filename = 'abc_2000-2001.nc' - dataset = 'CMIP6.ABC.v1' url = f'http://something.org/CMIP6/ABC/v1/{filename}' json = { - 'dataset_id': f'{dataset}|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 12, 'source_id': ['ABC'], @@ -449,7 +468,8 @@ def test_download_skip_existing(tmp_path, caplog): dest_folder = tmp_path json = { - 'dataset_id': f'{dataset}|something.org', + 'dataset_id': f'CMIP6.{dataset}.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 12, 'title': filename, @@ -458,7 +478,7 @@ def test_download_skip_existing(tmp_path, caplog): # Create local file local_file = file.local_file(dest_folder) - local_file.parent.mkdir() + local_file.parent.mkdir(parents=True) local_file.touch() caplog.set_level(logging.DEBUG) @@ -488,7 +508,8 @@ def test_single_download_fail(mocker, tmp_path): url = f'http://something.org/CMIP6/ABC/v1/{filename}' json = { - 'dataset_id': f'{dataset}|something.org', + 'dataset_id': f'CMIP6.{dataset}.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 12, 'title': filename, @@ -506,7 +527,8 @@ def test_get_download_message(): result1 = FileResult( json={ - 'dataset_id': 'ABC.v1|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 4 * 10**9, 'title': 
'abc_1850-1900.nc', @@ -516,7 +538,8 @@ def test_get_download_message(): ) result2 = FileResult( json={ - 'dataset_id': 'ABC.v1|something.org', + 'dataset_id': 'CMIP6.ABC.v1|something.org', + 'dataset_id_template_': ["%(mip_era)s.%(source_id)s"], 'project': ['CMIP6'], 'size': 6 * 10**9, 'title': 'abc_1900-1950.nc', @@ -529,8 +552,8 @@ def test_get_download_message(): expected = textwrap.dedent(""" Will download 10 GB Will download the following files: - 4 GB\tESGFFile:ABC/v1/abc_1850-1900.nc on hosts ['xyz.org'] - 6 GB\tESGFFile:ABC/v1/abc_1900-1950.nc on hosts ['abc.com'] + 4 GB\tESGFFile:CMIP6/ABC/v1/abc_1850-1900.nc on hosts ['xyz.org'] + 6 GB\tESGFFile:CMIP6/ABC/v1/abc_1900-1950.nc on hosts ['abc.com'] Downloading 10 GB.. """).strip() assert msg == expected diff --git a/tests/unit/esgf/test_search.py b/tests/unit/esgf/test_search.py index 136b0d43c1..7699995619 100644 --- a/tests/unit/esgf/test_search.py +++ b/tests/unit/esgf/test_search.py @@ -55,6 +55,12 @@ 'project': 'obs4MIPs', 'short_name': 'rsutcs', }, + { + 'dataset': 'CERES-EBAF', + 'frequency': '*', + 'project': 'obs4MIPs', + 'short_name': 'rsutcs', + }, ) ESGF_FACETS = ( @@ -77,7 +83,7 @@ { 'project': 'CMIP6', 'source_id': 'AWI-ESM-1-1-LR', - 'variant_label': 'r1i1p1f1', + 'member_id': 'r1i1p1f1', 'experiment_id': 'historical', 'grid_label': 'gn', 'table_id': 'Amon', @@ -99,6 +105,11 @@ 'time_frequency': 'mon', 'variable': 'rsutcs', }, + { + 'project': 'obs4MIPs', + 'source_id': 'CERES-EBAF', + 'variable': 'rsutcs', + }, ) @@ -147,6 +158,11 @@ def test_esgf_search_files(mocker): # Set up some fake FileResults dataset_id = ('cmip5.output1.INM.inmcm4.historical' '.mon.atmos.Amon.r1i1p1.v20130207') + dataset_id_template = ( + 'cmip5.%(product)s.%(valid_institute)s.%(model)s.' + '%(experiment)s.%(time_frequency)s.%(realm)s.%(cmor_table)s.' 
+ '%(ensemble)s' + ) filename0 = 'tas_Amon_inmcm4_historical_r1i1p1_185001-189912.nc' filename1 = 'tas_Amon_inmcm4_historical_r1i1p1_190001-200512.nc' @@ -165,6 +181,7 @@ def test_esgf_search_files(mocker): 'checksum': ['123'], 'checksum_type': ['SHA256'], 'dataset_id': dataset_id + '|aims3.llnl.gov', + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP5'], 'size': 100, 'title': filename0, @@ -176,6 +193,7 @@ def test_esgf_search_files(mocker): file_aims1 = FileResult( { 'dataset_id': dataset_id + '|aims3.llnl.gov', + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP5'], 'size': 200, 'title': filename1, @@ -189,6 +207,7 @@ def test_esgf_search_files(mocker): 'checksum': ['456'], 'checksum_type': ['MD5'], 'dataset_id': dataset_id + '|esgf2.dkrz.de', + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP5'], 'size': 100, 'title': filename0, @@ -213,7 +232,6 @@ def test_esgf_search_files(mocker): connection.new_context.assert_called_with( pyesgf.search.context.FileSearchContext, **facets, - latest=True, ) context.search.assert_called_with( batch_size=500, @@ -279,9 +297,30 @@ def test_esgf_search_fails(mocker): assert str(excinfo.value) == error_message -def test_select_by_time(): +def test_select_latest_versions_filenotfound(mocker): + """Test `select_latest_versions` raises FileNotFoundError.""" + file = mocker.create_autospec(ESGFFile, instance=True) + file.name = 'ta.nc' + file.dataset = 'CMIP6.MODEL.v1' + file.facets = {'version': 'v1'} + file.__repr__ = lambda _: 'ESGFFile:CMIP6/MODEL/v1/ta.nc' + with pytest.raises(FileNotFoundError): + _search.select_latest_versions(files=[file], versions='v2') + + +@pytest.mark.parametrize('timerange,selection', [ + ('1851/1852', slice(1, 3)), + ('1851/P1Y', slice(1, 3)), + ('*', slice(None)), +]) +def test_select_by_time(timerange, selection): dataset_id = ('CMIP6.CMIP.AWI.AWI-ESM-1-1-LR.historical' '.r1i1p1f1.Amon.tas.gn.v20200212') + dataset_id_template = ( + '%(mip_era)s.%(activity_drs)s.%(institution_id)s.' + '%(source_id)s.%(experiment_id)s.%(member_id)s.%(table_id)s.' + '%(variable_id)s.%(grid_label)s' + ) filenames = [ 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185001-185012.nc', 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185101-185112.nc', @@ -293,6 +332,7 @@ def test_select_by_time(): json={ 'title': filename, 'dataset_id': dataset_id + '|xyz.com', + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP5'], 'size': 100, }, @@ -301,20 +341,25 @@ def test_select_by_time(): ] files = [ESGFFile([r]) for r in results] - result = _search.select_by_time(files, '1851/1852') - reference = files[1:3] + result = _search.select_by_time(files, timerange) + reference = files[selection] assert sorted(result) == sorted(reference) def test_select_by_time_nodate(): dataset_id = ( 'cmip3.MIROC.miroc3_2_hires.historical.mon.atmos.run1.tas.v1') + dataset_id_template = ( + '%(project)s.%(institute)s.%(model)s.%(experiment)s.' 
+ '%(time_frequency)s.%(realm)s.%(ensemble)s.%(variable)s' + ) filenames = ['tas_A1.nc'] results = [ FileResult( json={ 'title': filename, 'dataset_id': dataset_id + '|xyz.com', + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP5'], 'size': 100, }, @@ -327,34 +372,6 @@ def test_select_by_time_nodate(): assert result == files -def test_select_by_time_period(): - - dataset_id = ('CMIP6.CMIP.AWI.AWI-ESM-1-1-LR.historical' - '.r1i1p1f1.Amon.tas.gn.v20200212') - filenames = [ - 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185001-185012.nc', - 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185101-185112.nc', - 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185201-185212.nc', - 'tas_Amon_AWI-ESM-1-1-LR_historical_r1i1p1f1_gn_185301-185312.nc', - ] - results = [ - FileResult( - json={ - 'title': filename, - 'dataset_id': dataset_id + '|xyz.com', - 'project': ['CMIP5'], - 'size': 100, - }, - context=None, - ) for filename in filenames - ] - files = [ESGFFile([r]) for r in results] - - result = _search.select_by_time(files, '1851/P1Y') - reference = files[1:3] - assert sorted(result) == sorted(reference) - - def test_search_unknown_project(): project = 'Unknown' msg = (f"Unable to download from ESGF, because project {project} is not on" diff --git a/tests/unit/test_recipe.py b/tests/unit/test_recipe.py index 2b1f9e0ad7..d74f570370 100644 --- a/tests/unit/test_recipe.py +++ b/tests/unit/test_recipe.py @@ -178,12 +178,20 @@ def test_resume_preprocessor_tasks(mocker, tmp_path): def create_esgf_search_results(): """Prepare some fake ESGF search results.""" + dataset_id = ( + 'CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical.r1i1p1f1' + '.Amon.tas.gr.v20200310|esgf-data1.llnl.gov' + ) + dataset_id_template = ( + '%(mip_era)s.%(activity_drs)s.%(institution_id)s.' + '%(source_id)s.%(experiment_id)s.%(member_id)s.%(table_id)s.' + '%(variable_id)s.%(grid_label)s' + ) file0 = ESGFFile([ pyesgf.search.results.FileResult( json={ - 'dataset_id': - 'CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical.r1i1p1f1' - '.Amon.tas.gr.v20200310|esgf-data1.llnl.gov', + 'dataset_id': dataset_id, + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP6'], 'size': 4745571, @@ -204,9 +212,8 @@ def create_esgf_search_results(): file1 = ESGFFile([ pyesgf.search.results.FileResult( { - 'dataset_id': - 'CMIP6.CMIP.EC-Earth-Consortium.EC-Earth3.historical.r1i1p1f1' - '.Amon.tas.gr.v20200310|esgf-data1.llnl.gov', + 'dataset_id': dataset_id, + 'dataset_id_template_': [dataset_id_template], 'project': ['CMIP6'], 'size': 4740192,
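
# --- Illustrative sketch (not part of the patch above) ---
# A minimal, hypothetical example of the facet-parsing idea introduced in
# ESGFFile._get_facets: facet keys are read from the ESGF
# ``dataset_id_template_`` and matched positionally against the dot-separated
# ``dataset_id``. The template, dataset_id, and resulting facet values below
# are made up for illustration; the real method additionally translates ESGF
# facet names to ESMValCore names via FACETS and handles project-specific
# oddities (CMIP5 ``valid_institute``, CMIP6 ``mip_era``/``variable_id``).
import re

template = "%(mip_era)s.%(activity_drs)s.%(source_id)s"
dataset_id = "CMIP6.CMIP.ABC.v20200310|esgf.example.org"

# Extract the facet keys from the template and append the trailing version.
keys = re.findall(r"%\((.*?)\)s", template) + ["version"]
# Drop the data node suffix and split the dataset_id into its facet values.
values = dataset_id.split("|")[0].split(".")

facets = dict(zip(keys, values))
print(facets)
# {'mip_era': 'CMIP6', 'activity_drs': 'CMIP', 'source_id': 'ABC', 'version': 'v20200310'}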