diff --git a/doc/recipe/overview.rst b/doc/recipe/overview.rst index 7798878620..e49abab13e 100644 --- a/doc/recipe/overview.rst +++ b/doc/recipe/overview.rst @@ -82,6 +82,8 @@ data specifications: ``RCP8.5``) - mip (for CMIP data, key ``mip``, value e.g. ``Amon``, ``Omon``, ``LImon``) - ensemble member (key ``ensemble``, value e.g. ``r1i1p1``, ``r1i1p1f1``) +- sub-experiment id (key ``sub_experiment``, value e.g. ``s2000``, ``s(2000:2002)``, + for DCPP data only) - time range (e.g. key-value ``start_year: 1982``, ``end_year: 1990``. Please note that `yaml`_ interprets numbers with a leading ``0`` as octal numbers, so we recommend to avoid them. For example, use ``128`` to specify the year @@ -97,6 +99,7 @@ For example, a datasets section could be: - {dataset: CanESM2, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004} - {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, start_year: 2001, end_year: 2004, grid: gn} - {dataset: EC-EARTH3, alias: custom_alias, project: CMIP6, exp: historical, ensemble: r1i1p1f1, start_year: 2001, end_year: 2004, grid: gn} + - {dataset: HadGEM3-GC31-MM, alias: custom_alias, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s2000, grid: gn, start_year: 2000, end_year: 2002} It is possible to define the experiment as a list to concatenate two experiments. Here it is an example concatenating the `historical` experiment with `rcp85` @@ -136,6 +139,14 @@ Please, bear in mind that this syntax can only be used in the ensemble tag. Also, note that the combination of multiple experiments and ensembles, like exp: [historical, rcp85], ensemble: [r1i1p1, "r(2:3)i1p1"] is not supported and will raise an error. +The same simplified syntax can be used to add multiple sub-experiment ids: + +.. 
code-block:: yaml + + datasets: + - {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s(2000:2002), grid: gn, start_year: 2003, end_year: 2004} + + Note that this section is not required, as datasets can also be provided in the Diagnostics_ section. diff --git a/esmvalcore/_data_finder.py b/esmvalcore/_data_finder.py index ad0367799f..8c700b2210 100644 --- a/esmvalcore/_data_finder.py +++ b/esmvalcore/_data_finder.py @@ -92,13 +92,21 @@ def select_files(filenames, start_year, end_year): def _replace_tags(paths, variable): """Replace tags in the config-developer's file with actual values.""" if isinstance(paths, str): - paths = (paths.strip('/'), ) + paths = set((paths.strip('/'),)) else: - paths = [path.strip('/') for path in paths] + paths = set(path.strip('/') for path in paths) tlist = set() - for path in paths: tlist = tlist.union(re.findall(r'{([^}]*)}', path)) + if 'sub_experiment' in variable: + new_paths = [] + for path in paths: + new_paths.extend(( + re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path), + re.sub(r'({ensemble})', r'{sub_experiment}-\1', path) + )) + tlist.add('sub_experiment') + paths = new_paths logger.debug(tlist) for tag in tlist: @@ -112,7 +120,6 @@ def _replace_tags(paths, variable): else: raise KeyError("Dataset key {} must be specified for {}, check " "your recipe entry".format(tag, variable)) - paths = _replace_tag(paths, original_tag, replacewith) return paths @@ -127,7 +134,7 @@ def _replace_tag(paths, tag, replacewith): else: text = _apply_caps(str(replacewith), lower, upper) result.extend(p.replace('{' + tag + '}', text) for p in paths) - return result + return list(set(result)) def _get_caps_options(tag): diff --git a/esmvalcore/_recipe.py b/esmvalcore/_recipe.py index a57dac9675..bf536c84cc 100644 --- a/esmvalcore/_recipe.py +++ b/esmvalcore/_recipe.py @@ -1027,37 +1027,38 @@ def _initialize_datasets(raw_datasets): return datasets @staticmethod - def _expand_ensemble(variables): - 
"""Expand ensemble members to multiple datasets. + def _expand_tag(variables, input_tag): + """ + Expand tags such as ensemble members or stardates to multiple datasets. Expansion only supports ensembles defined as strings, not lists. """ expanded = [] regex = re.compile(r'\(\d+:\d+\)') - def expand_ensemble(variable): - ens = variable.get('ensemble', "") - match = regex.search(ens) + def expand_tag(variable, input_tag): + tag = variable.get(input_tag, "") + match = regex.search(tag) if match: start, end = match.group(0)[1:-1].split(':') for i in range(int(start), int(end) + 1): expand = deepcopy(variable) - expand['ensemble'] = regex.sub(str(i), ens, 1) - expand_ensemble(expand) + expand[input_tag] = regex.sub(str(i), tag, 1) + expand_tag(expand, input_tag) else: expanded.append(variable) for variable in variables: - ensemble = variable.get('ensemble', "") - if isinstance(ensemble, (list, tuple)): - for elem in ensemble: + tag = variable.get(input_tag, "") + if isinstance(tag, (list, tuple)): + for elem in tag: if regex.search(elem): raise RecipeError( - f"In variable {variable}: ensemble expansion " - "cannot be combined with ensemble lists") + f"In variable {variable}: {input_tag} expansion " + f"cannot be combined with {input_tag} lists") expanded.append(variable) else: - expand_ensemble(variable) + expand_tag(variable, input_tag) return expanded @@ -1104,8 +1105,14 @@ def _initialize_variables(self, raw_variable, raw_datasets): activity = get_activity(variable) if activity: variable['activity'] = activity - check.variable(variable, required_keys) - variables = self._expand_ensemble(variables) + if 'sub_experiment' in variable: + subexperiment_keys = deepcopy(required_keys) + subexperiment_keys.update({'sub_experiment'}) + check.variable(variable, subexperiment_keys) + else: + check.variable(variable, required_keys) + variables = self._expand_tag(variables, 'ensemble') + variables = self._expand_tag(variables, 'sub_experiment') return variables def 
_initialize_preprocessor_output(self, diagnostic_name, raw_variables, diff --git a/tests/unit/data_finder/test_replace_tags.py b/tests/unit/data_finder/test_replace_tags.py index 93ba42b41a..097cc7b328 100644 --- a/tests/unit/data_finder/test_replace_tags.py +++ b/tests/unit/data_finder/test_replace_tags.py @@ -1,22 +1,65 @@ """Tests for _replace_tags in _data_finder.py.""" - from esmvalcore._data_finder import _replace_tags VARIABLE = { + 'project': 'CMIP6', + 'dataset': 'ACCURATE-MODEL', + 'activity': 'act', + 'exp': 'experiment', + 'institute': 'HMA', + 'ensemble': 'r1i1p1f1', + 'mip': 'Amon', 'short_name': 'tas', + 'grid': 'gr', } -def test_replace_tags_str(): - assert _replace_tags('folder/subfolder/{short_name}', - VARIABLE) == ['folder/subfolder/tas'] +def test_replace_tags(): + """Tests for _replace_tags function.""" + path = _replace_tags( + '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/' + '{grid}/{latestversion}', VARIABLE) + input_file = _replace_tags( + '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', VARIABLE) + output_file = _replace_tags( + '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', VARIABLE) + assert path == [ + 'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/' + '{latestversion}' + ] + assert input_file == ['tas_Amon_ACCURATE-MODEL_experiment_r1i1p1f1_gr*.nc'] + assert output_file == ['CMIP6_ACCURATE-MODEL_Amon_experiment_r1i1p1f1_tas'] def test_replace_tags_list_of_str(): - assert _replace_tags(('folder/subfolder/{short_name}', - 'folder2/{short_name}', 'subfolder/{short_name}'), - VARIABLE) == [ - 'folder/subfolder/tas', - 'folder2/tas', - 'subfolder/tas', - ] + assert sorted( + _replace_tags(('folder/subfolder/{short_name}', 'folder2/{short_name}', + 'subfolder/{short_name}'), VARIABLE)) == sorted([ + 'folder2/tas', + 'folder/subfolder/tas', + 'subfolder/tas', + ]) + + +def test_replace_tags_with_subexperiment(): + """Tests for _replace_tags function with a sub-experiment.""" + variable = 
{'sub_experiment': '199411', **VARIABLE} + path = _replace_tags( + '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/' + '{grid}/{latestversion}', variable) + input_file = _replace_tags( + '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', variable) + output_file = _replace_tags( + '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', variable) + assert sorted(path) == sorted([ + 'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/' + '{latestversion}', + 'act/HMA/ACCURATE-MODEL/experiment/199411-r1i1p1f1/Amon/tas/gr/' + '{latestversion}' + ]) + assert input_file == [ + 'tas_Amon_ACCURATE-MODEL_experiment_199411-r1i1p1f1_gr*.nc' + ] + assert output_file == [ + 'CMIP6_ACCURATE-MODEL_Amon_experiment_199411-r1i1p1f1_tas' + ] diff --git a/tests/unit/test_recipe.py b/tests/unit/test_recipe.py index 885684cee2..493f366fef 100644 --- a/tests/unit/test_recipe.py +++ b/tests/unit/test_recipe.py @@ -14,7 +14,7 @@ def test_expand_ensemble(self): }, ] - expanded = Recipe._expand_ensemble(datasets) + expanded = Recipe._expand_tag(datasets, 'ensemble') ensembles = [ 'r1i2p3', @@ -29,6 +29,31 @@ def test_expand_ensemble(self): for i, ensemble in enumerate(ensembles): assert expanded[i] == {'dataset': 'XYZ', 'ensemble': ensemble} + def test_expand_subexperiment(self): + + datasets = [ + { + 'dataset': 'XYZ', + 'sub_experiment': 's(1998:2005)', + }, + ] + + expanded = Recipe._expand_tag(datasets, 'sub_experiment') + + subexperiments = [ + 's1998', + 's1999', + 's2000', + 's2001', + 's2002', + 's2003', + 's2004', + 's2005', + ] + for i, subexperiment in enumerate(subexperiments): + assert expanded[i] == {'dataset': 'XYZ', + 'sub_experiment': subexperiment} + def test_expand_ensemble_nolist(self): datasets = [ @@ -39,7 +64,7 @@ def test_expand_ensemble_nolist(self): ] with pytest.raises(RecipeError): - Recipe._expand_ensemble(datasets) + Recipe._expand_tag(datasets, 'ensemble') VAR_A = {'dataset': 'A'}