ESMValGroup · valeriupredoi · May 13, 2021 · May 11, 2020 · May 12, 2020 · May 13, 2020
diff --git a/doc/recipe/overview.rst b/doc/recipe/overview.rst
@@ -82,6 +82,8 @@ data specifications:
   ``RCP8.5``)
 - mip (for CMIP data, key ``mip``, value e.g. ``Amon``, ``Omon``, ``LImon``)
 - ensemble member (key ``ensemble``, value e.g. ``r1i1p1``, ``r1i1p1f1``)
+- sub-experiment id (key ``sub_experiment``, value e.g. ``s2000``,
+  ``s(2000:2002)``, for DCPP data only)
 - time range (e.g. key-value ``start_year: 1982``, ``end_year: 1990``. Please
   note that `yaml`_ interprets numbers with a leading ``0`` as octal numbers,
   so we recommend to avoid them. For example, use ``128`` to specify the year
@@ -97,6 +99,7 @@ For example, a datasets section could be:
       - {dataset: CanESM2, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004}
       - {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, start_year: 2001, end_year: 2004, grid: gn}
       - {dataset: EC-EARTH3, alias: custom_alias, project: CMIP6, exp: historical, ensemble: r1i1p1f1, start_year: 2001, end_year: 2004, grid: gn}
+      - {dataset: HadGEM3-GC31-MM, alias: custom_alias, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s2000, grid: gn, start_year: 2000, end_year: 2002}
 
 It is possible to define the experiment as a list to concatenate two experiments.
 Here it is an example concatenating the `historical` experiment with `rcp85`
@@ -136,6 +139,14 @@ Please, bear in mind that this syntax can only be used in the ensemble tag.
 Also, note that the combination of multiple experiments and ensembles, like
 exp: [historical, rcp85], ensemble: [r1i1p1, "r(2:3)i1p1"] is not supported and will raise an error.
 
+The same simplified syntax can be used to add multiple sub-experiment ids:
+
+.. code-block:: yaml
+
+    datasets:
+      - {dataset: MIROC6, project: CMIP6, exp: dcppA-hindcast, ensemble: r1i1p1f1, sub_experiment: s(2000:2002), grid: gn, start_year: 2003, end_year: 2004}
+
+
 Note that this section is not required, as datasets can also be provided in the
 Diagnostics_ section.
 

diff --git a/esmvalcore/_data_finder.py b/esmvalcore/_data_finder.py
@@ -92,13 +92,21 @@ def select_files(filenames, start_year, end_year):
 def _replace_tags(paths, variable):
     """Replace tags in the config-developer's file with actual values."""
     if isinstance(paths, str):
-        paths = (paths.strip('/'), )
+        paths = set((paths.strip('/'),))
     else:
-        paths = [path.strip('/') for path in paths]
+        paths = set(path.strip('/') for path in paths)
     tlist = set()
-
     for path in paths:
         tlist = tlist.union(re.findall(r'{([^}]*)}', path))
+    if 'sub_experiment' in variable:
+        new_paths = []
+        for path in paths:
+            new_paths.extend((
+                re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path),
+                re.sub(r'({ensemble})', r'{sub_experiment}-\1', path)
+            ))
+            tlist.add('sub_experiment')
+        paths = new_paths
     logger.debug(tlist)
 
     for tag in tlist:
@@ -112,7 +120,6 @@ def _replace_tags(paths, variable):
         else:
             raise KeyError("Dataset key {} must be specified for {}, check "
                            "your recipe entry".format(tag, variable))
-
         paths = _replace_tag(paths, original_tag, replacewith)
     return paths
 
@@ -127,7 +134,7 @@ def _replace_tag(paths, tag, replacewith):
     else:
         text = _apply_caps(str(replacewith), lower, upper)
         result.extend(p.replace('{' + tag + '}', text) for p in paths)
-    return result
+    return list(set(result))
 
 
 def _get_caps_options(tag):

diff --git a/esmvalcore/_recipe.py b/esmvalcore/_recipe.py
@@ -1027,37 +1027,38 @@ def _initialize_datasets(raw_datasets):
         return datasets
 
     @staticmethod
-    def _expand_ensemble(variables):
-        """Expand ensemble members to multiple datasets.
+    def _expand_tag(variables, input_tag):
+        """
+        Expand tags (e.g. ensemble members, start dates) to multiple datasets.
 
         Expansion only supports ensembles defined as strings, not lists.
         """
         expanded = []
         regex = re.compile(r'\(\d+:\d+\)')
 
-        def expand_ensemble(variable):
-            ens = variable.get('ensemble', "")
-            match = regex.search(ens)
+        def expand_tag(variable, input_tag):
+            tag = variable.get(input_tag, "")
+            match = regex.search(tag)
             if match:
                 start, end = match.group(0)[1:-1].split(':')
                 for i in range(int(start), int(end) + 1):
                     expand = deepcopy(variable)
-                    expand['ensemble'] = regex.sub(str(i), ens, 1)
-                    expand_ensemble(expand)
+                    expand[input_tag] = regex.sub(str(i), tag, 1)
+                    expand_tag(expand, input_tag)
             else:
                 expanded.append(variable)
 
         for variable in variables:
-            ensemble = variable.get('ensemble', "")
-            if isinstance(ensemble, (list, tuple)):
-                for elem in ensemble:
+            tag = variable.get(input_tag, "")
+            if isinstance(tag, (list, tuple)):
+                for elem in tag:
                     if regex.search(elem):
                         raise RecipeError(
-                            f"In variable {variable}: ensemble expansion "
-                            "cannot be combined with ensemble lists")
+                            f"In variable {variable}: {input_tag} expansion "
+                            f"cannot be combined with {input_tag} lists")
                 expanded.append(variable)
             else:
-                expand_ensemble(variable)
+                expand_tag(variable, input_tag)
 
         return expanded
 
@@ -1104,8 +1105,14 @@ def _initialize_variables(self, raw_variable, raw_datasets):
                 activity = get_activity(variable)
                 if activity:
                     variable['activity'] = activity
-            check.variable(variable, required_keys)
-        variables = self._expand_ensemble(variables)
+            if 'sub_experiment' in variable:
+                subexperiment_keys = deepcopy(required_keys)
+                subexperiment_keys.update({'sub_experiment'})
+                check.variable(variable, subexperiment_keys)
+            else:
+                check.variable(variable, required_keys)
+        variables = self._expand_tag(variables, 'ensemble')
+        variables = self._expand_tag(variables, 'sub_experiment')
         return variables
 
     def _initialize_preprocessor_output(self, diagnostic_name, raw_variables,

diff --git a/tests/unit/data_finder/test_replace_tags.py b/tests/unit/data_finder/test_replace_tags.py
@@ -1,22 +1,65 @@
 """Tests for _replace_tags in _data_finder.py."""
-
 from esmvalcore._data_finder import _replace_tags
 
 VARIABLE = {
+    'project': 'CMIP6',
+    'dataset': 'ACCURATE-MODEL',
+    'activity': 'act',
+    'exp': 'experiment',
+    'institute': 'HMA',
+    'ensemble': 'r1i1p1f1',
+    'mip': 'Amon',
     'short_name': 'tas',
+    'grid': 'gr',
 }
 
 
-def test_replace_tags_str():
-    assert _replace_tags('folder/subfolder/{short_name}',
-                         VARIABLE) == ['folder/subfolder/tas']
+def test_replace_tags():
+    """Tests for _replace_tags function."""
+    path = _replace_tags(
+        '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
+        '{grid}/{latestversion}', VARIABLE)
+    input_file = _replace_tags(
+        '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', VARIABLE)
+    output_file = _replace_tags(
+        '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', VARIABLE)
+    assert path == [
+        'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}'
+    ]
+    assert input_file == ['tas_Amon_ACCURATE-MODEL_experiment_r1i1p1f1_gr*.nc']
+    assert output_file == ['CMIP6_ACCURATE-MODEL_Amon_experiment_r1i1p1f1_tas']
 
 
 def test_replace_tags_list_of_str():
-    assert _replace_tags(('folder/subfolder/{short_name}',
-                          'folder2/{short_name}', 'subfolder/{short_name}'),
-                         VARIABLE) == [
-                             'folder/subfolder/tas',
-                             'folder2/tas',
-                             'subfolder/tas',
-                         ]
+    assert sorted(
+        _replace_tags(('folder/subfolder/{short_name}', 'folder2/{short_name}',
+                       'subfolder/{short_name}'), VARIABLE)) == sorted([
+                           'folder2/tas',
+                           'folder/subfolder/tas',
+                           'subfolder/tas',
+                       ])
+
+
+def test_replace_tags_with_subexperiment():
+    """Tests for _replace_tags function with a sub_experiment key."""
+    variable = {'sub_experiment': '199411', **VARIABLE}
+    path = _replace_tags(
+        '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/'
+        '{grid}/{latestversion}', variable)
+    input_file = _replace_tags(
+        '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc', variable)
+    output_file = _replace_tags(
+        '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}', variable)
+    assert sorted(path) == sorted([
+        'act/HMA/ACCURATE-MODEL/experiment/r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}',
+        'act/HMA/ACCURATE-MODEL/experiment/199411-r1i1p1f1/Amon/tas/gr/'
+        '{latestversion}'
+    ])
+    assert input_file == [
+        'tas_Amon_ACCURATE-MODEL_experiment_199411-r1i1p1f1_gr*.nc'
+    ]
+    assert output_file == [
+        'CMIP6_ACCURATE-MODEL_Amon_experiment_199411-r1i1p1f1_tas'
+    ]
diff --git a/tests/unit/test_recipe.py b/tests/unit/test_recipe.py
@@ -14,7 +14,7 @@ def test_expand_ensemble(self):
             },
         ]
 
-        expanded = Recipe._expand_ensemble(datasets)
+        expanded = Recipe._expand_tag(datasets, 'ensemble')
 
         ensembles = [
             'r1i2p3',
@@ -29,6 +29,31 @@ def test_expand_ensemble(self):
         for i, ensemble in enumerate(ensembles):
             assert expanded[i] == {'dataset': 'XYZ', 'ensemble': ensemble}
 
+    def test_expand_subexperiment(self):
+
+        datasets = [
+            {
+                'dataset': 'XYZ',
+                'sub_experiment': 's(1998:2005)',
+            },
+        ]
+
+        expanded = Recipe._expand_tag(datasets, 'sub_experiment')
+
+        subexperiments = [
+            's1998',
+            's1999',
+            's2000',
+            's2001',
+            's2002',
+            's2003',
+            's2004',
+            's2005',
+        ]
+        for i, subexperiment in enumerate(subexperiments):
+            assert expanded[i] == {'dataset': 'XYZ',
+                                   'sub_experiment': subexperiment}
+
     def test_expand_ensemble_nolist(self):
 
         datasets = [
@@ -39,7 +64,7 @@ def test_expand_ensemble_nolist(self):
         ]
 
         with pytest.raises(RecipeError):
-            Recipe._expand_ensemble(datasets)
+            Recipe._expand_tag(datasets, 'ensemble')
 
 
 VAR_A = {'dataset': 'A'}