Merged
5 changes: 5 additions & 0 deletions doc/api/esmvalcore.local.rst
@@ -0,0 +1,5 @@
Find files on the local filesystem
==================================

.. automodule:: esmvalcore.local
:no-inherited-members:
2 changes: 2 additions & 0 deletions doc/api/esmvalcore.rst
@@ -14,5 +14,7 @@ library. This section documents the public API of ESMValCore.
esmvalcore.esgf
esmvalcore.exceptions
esmvalcore.iris_helpers
esmvalcore.local
esmvalcore.preprocessor
esmvalcore.typing
esmvalcore.experimental
6 changes: 6 additions & 0 deletions doc/api/esmvalcore.typing.rst
@@ -0,0 +1,6 @@
Type hints
==========

.. automodule:: esmvalcore.typing
:no-inherited-members:
:no-special-members:
2 changes: 1 addition & 1 deletion doc/develop/fixing_data.rst
@@ -377,7 +377,7 @@ To allow ESMValCore to locate the data files, use the following steps:
native6:
...
input_dir:
default: 'Tier{tier}/{dataset}/{latestversion}/{frequency}/{short_name}'
default: 'Tier{tier}/{dataset}/{version}/{frequency}/{short_name}'
MY_DATA_ORG: '{dataset}/{exp}/{simulation}/{version}/{type}'
input_file:
default: '*.nc'
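
To illustrate the change above: the ``input_dir`` entries are ``str.format``-style templates that ESMValCore fills in from dataset facets, so after this PR the facet key is ``version`` rather than ``latestversion``. A minimal sketch, using hypothetical facet values (the MSWEP example from the documentation, not values taken from this PR):

```python
# Sketch: how a drs template like the 'default' entry above expands
# into a concrete subdirectory. Facet values here are illustrative.
template = 'Tier{tier}/{dataset}/{version}/{frequency}/{short_name}'

facets = {
    'tier': 3,
    'dataset': 'MSWEP',
    'version': 'V220',
    'frequency': 'mon',
    'short_name': 'pr',
}

path = template.format(**facets)
print(path)  # Tier3/MSWEP/V220/mon/pr
```

With the old ``{latestversion}`` placeholder, the same ``facets`` dictionary (which carries a ``version`` key) would raise a ``KeyError``, which is the motivation for renaming the placeholder consistently.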
12 changes: 6 additions & 6 deletions doc/quickstart/configure.rst
@@ -438,8 +438,8 @@ Example of the CMIP6 project configuration:
CMIP6:
input_dir:
default: '/'
BADC: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{latestversion}'
DKRZ: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{latestversion}'
BADC: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}'
DKRZ: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}'
ETHZ: '{exp}/{mip}/{short_name}/{dataset}/{ensemble}/{grid}/'
input_file: '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc'
output_file: '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}'
@@ -462,7 +462,7 @@ at each site. As an example, the CMIP6 directory path on BADC would be:

.. code-block:: yaml

'{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{latestversion}'
'{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}'

The resulting directory path would look something like this:

@@ -475,8 +475,8 @@ which may be needed:

.. code-block:: yaml

- '{exp}/{ensemble}/original/{mip}/{short_name}/{grid}/{latestversion}'
- '{exp}/{ensemble}/computed/{mip}/{short_name}/{grid}/{latestversion}'
- '{exp}/{ensemble}/original/{mip}/{short_name}/{grid}/{version}'
- '{exp}/{ensemble}/computed/{mip}/{short_name}/{grid}/{version}'

In that case, the resultant directories will be:

@@ -629,7 +629,7 @@ Example:
native6:
cmor_strict: false
input_dir:
default: 'Tier{tier}/{dataset}/{latestversion}/{frequency}/{short_name}'
default: 'Tier{tier}/{dataset}/{version}/{frequency}/{short_name}'
input_file:
default: '*.nc'
output_file: '{project}_{dataset}_{type}_{version}_{mip}_{short_name}'
38 changes: 19 additions & 19 deletions doc/quickstart/find_data.rst
@@ -33,16 +33,16 @@ ensures that files and paths to them are named according to a
standardized convention. Examples of this convention, also used by
ESMValTool for file discovery and data retrieval, include:

* CMIP6 file: ``[variable_short_name]_[mip]_[dataset_name]_[experiment]_[ensemble]_[grid]_[start-date]-[end-date].nc``
* CMIP5 file: ``[variable_short_name]_[mip]_[dataset_name]_[experiment]_[ensemble]_[start-date]-[end-date].nc``
* OBS file: ``[project]_[dataset_name]_[type]_[version]_[mip]_[short_name]_[start-date]-[end-date].nc``
* CMIP6 file: ``{variable_short_name}_{mip}_{dataset_name}_{experiment}_{ensemble}_{grid}_{start-date}-{end-date}.nc``
* CMIP5 file: ``{variable_short_name}_{mip}_{dataset_name}_{experiment}_{ensemble}_{start-date}-{end-date}.nc``
* OBS file: ``{project}_{dataset_name}_{type}_{version}_{mip}_{short_name}_{start-date}-{end-date}.nc``
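
The switch from ``[...]`` to ``{...}`` in this hunk aligns the documentation with Python format-field syntax. A sketch of how such a file name is assembled, with illustrative facet values and with the ``{start-date}``/``{end-date}`` fields renamed to valid identifiers (hyphens are not allowed in ``str.format`` field names):

```python
# Sketch: building a CMIP6-style file name from the template above.
# All facet values are hypothetical examples.
cmip6_template = ('{variable_short_name}_{mip}_{dataset_name}_{experiment}'
                  '_{ensemble}_{grid}_{start_date}-{end_date}.nc')

name = cmip6_template.format(
    variable_short_name='tas',
    mip='Amon',
    dataset_name='UKESM1-0-LL',
    experiment='historical',
    ensemble='r1i1p1f2',
    grid='gn',
    start_date='185001',
    end_date='201412',
)
print(name)  # tas_Amon_UKESM1-0-LL_historical_r1i1p1f2_gn_185001-201412.nc
```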

Similar standards exist for the standard paths (input directories); for the
ESGF data nodes, these paths differ slightly, for example:

* CMIP6 path for BADC: ``ROOT-BADC/[institute]/[dataset_name]/[experiment]/[ensemble]/[mip]/
[variable_short_name]/[grid]``;
* CMIP6 path for ETHZ: ``ROOT-ETHZ/[experiment]/[mip]/[variable_short_name]/[dataset_name]/[ensemble]/[grid]``
* CMIP6 path for BADC: ``ROOT-BADC/{institute}/{dataset_name}/{experiment}/{ensemble}/{mip}/
{variable_short_name}/{grid}``;
* CMIP6 path for ETHZ: ``ROOT-ETHZ/{experiment}/{mip}/{variable_short_name}/{dataset_name}/{ensemble}/{grid}``

From the ESMValTool user perspective the number of data input parameters is
optimized to allow for ease of use. We detail this procedure in the next
@@ -130,7 +130,7 @@ MSWEP
- Supported frequencies: ``mon``, ``day``, ``3hr``.
- Tier: 3

For example for monthly data, place the files in the ``/Tier3/MSWEP/latestversion/mon/pr`` subdirectory of your ``native6`` project location.
For example for monthly data, place the files in the ``/Tier3/MSWEP/version/mon/pr`` subdirectory of your ``native6`` project location.

.. note::
For monthly data (``V220``), the data must be postfixed with the date, i.e. rename ``global_monthly_050deg.nc`` to ``global_monthly_050deg_197901-201710.nc``
@@ -168,9 +168,9 @@ The default naming conventions for input directories and files for CESM are

* input directories: 3 different types supported:
* ``/`` (run directory)
* ``[case]/[gcomp]/hist`` (short-term archiving)
* ``[case]/[gcomp]/proc/[tdir]/[tperiod]`` (post-processed data)
* input files: ``[case].[scomp].[type].[string]*nc``
* ``{case}/{gcomp}/hist`` (short-term archiving)
* ``{case}/{gcomp}/proc/{tdir}/{tperiod}`` (post-processed data)
* input files: ``{case}.{scomp}.{type}.{string}*nc``

as configured in the :ref:`config-developer file <config-developer>` (using the
default DRS ``drs: default`` in the :ref:`user configuration file`).
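
The trailing ``*`` in the ``{case}.{scomp}.{type}.{string}*nc`` template above is a shell-style wildcard, so once the facets are substituted the result is matched against candidate file names as a glob. A sketch with hypothetical case and component names (not taken from this PR):

```python
import fnmatch

# Sketch: glob-matching CESM history files against the input-file
# template above. Case/component/string values are illustrative.
pattern = '{case}.{scomp}.{type}.{string}*nc'.format(
    case='b.e21.BHIST.f09_g17',
    scomp='cam',
    type='h0',
    string='2000',
)

candidates = [
    'b.e21.BHIST.f09_g17.cam.h0.2000-01.nc',   # matches
    'b.e21.BHIST.f09_g17.clm2.h0.2000-01.nc',  # different scomp: no match
]
matches = fnmatch.filter(candidates, pattern)
print(matches)  # ['b.e21.BHIST.f09_g17.cam.h0.2000-01.nc']
```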
@@ -179,12 +179,12 @@ More information about CESM naming conventions are given `here

.. note::

The ``[string]`` entry in the input file names above does not only
The ``{string}`` entry in the input file names above does not only
correspond to the (optional) ``$string`` entry for `CESM model output files
<https://www.cesm.ucar.edu/models/cesm2/naming_conventions.html#modelOutputFilenames>`__,
but can also be used to read `post-processed files
<https://www.cesm.ucar.edu/models/cesm2/naming_conventions.html#ppDataFilenames>`__.
In the latter case, ``[string]`` corresponds to the combination
In the latter case, ``{string}`` corresponds to the combination
``$SSTRING.$TSTRING``.

Thus, example dataset entries could look like this:
@@ -244,8 +244,8 @@ model output.

The default naming conventions for input directories and files for EMAC are

* input directories: ``[exp]/[channel]``
* input files: ``[exp]*[channel][postproc_flag].nc``
* input directories: ``{exp}/{channel}``
* input files: ``{exp}*{channel}{postproc_flag}.nc``

as configured in the :ref:`config-developer file <config-developer>` (using the
default DRS ``drs: default`` in the :ref:`user configuration file`).
@@ -313,8 +313,8 @@ ESMValTool is able to read native `ICON

The default naming conventions for input directories and files for ICON are

* input directories: ``[exp]`` or ``{exp}/outdata``
* input files: ``[exp]_[var_type]*.nc``
* input directories: ``{exp}`` or ``{exp}/outdata``
* input files: ``{exp}_{var_type}*.nc``

as configured in the :ref:`config-developer file <config-developer>` (using the
default DRS ``drs: default`` in the :ref:`user configuration file`).
@@ -478,11 +478,11 @@ type of root paths they need the data from, e.g.:
will tell the tool that the user needs data from a repository structured
according to the BADC DRS structure, i.e.:

``ROOT/[institute]/[dataset_name]/[experiment]/[ensemble]/[mip]/[variable_short_name]/[grid]``;
``ROOT/{institute}/{dataset_name}/{experiment}/{ensemble}/{mip}/{variable_short_name}/{grid}``;

setting the ``ROOT`` parameter is explained below. This is a
strictly-structured repository tree and if there are any sort of irregularities
(e.g. there is no ``[mip]`` directory) the data will not be found! ``BADC`` can
(e.g. there is no ``{mip}`` directory) the data will not be found! ``BADC`` can
be replaced with ``DKRZ`` or ``ETHZ`` depending on the existing ``ROOT``
directory structure.
The snippet
@@ -561,7 +561,7 @@ datasets are listed in any recipe, under either the ``datasets`` and/or
- {dataset: HadGEM2-CC, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004}
- {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, grid: gn, start_year: 2004, end_year: 2014}

``_data_finder`` will use this information to find data for **all** the variables specified in ``diagnostics/variables``.
The data finding feature will use this information to find data for **all** the variables specified in ``diagnostics/variables``.

Recap and example
=================
2 changes: 1 addition & 1 deletion esmvalcore/_provenance.py
@@ -194,7 +194,7 @@ def _initialize_entity(self):
for k, v in self.attributes.items()
if k not in ('authors', 'projects')
}
self.entity = self.provenance.entity('file:' + self.filename,
self.entity = self.provenance.entity(f'file:{self.filename}',
attributes)

attribute_to_authors(self.entity, self.attributes.get('authors', []))
62 changes: 31 additions & 31 deletions esmvalcore/_recipe.py
@@ -16,17 +16,6 @@
from . import __version__
from . import _recipe_checks as check
from . import esgf
from ._data_finder import (
_find_input_files,
_get_timerange_from_years,
_parse_period,
_truncate_dates,
dates_to_timerange,
get_input_filelist,
get_multiproduct_filename,
get_output_file,
get_start_end_date,
)
from ._provenance import TrackedFile, get_recipe_provenance
from ._task import DiagnosticTask, ResumeTask, TaskSet
from .cmor.check import CheckLevels
@@ -39,6 +28,16 @@
)
from .config._diagnostics import TAGS
from .exceptions import InputFilesNotFound, RecipeError
from .local import _dates_to_timerange as dates_to_timerange
from .local import _get_multiproduct_filename as get_multiproduct_filename
from .local import _get_output_file as get_output_file
from .local import _get_start_end_date as get_start_end_date
from .local import (
_get_timerange_from_years,
_parse_period,
_truncate_dates,
find_files,
)
from .preprocessor import (
DEFAULT_ORDER,
FINAL_STEPS,
@@ -225,20 +224,19 @@ def _augment(base, update):

def _dataset_to_file(variable, config_user):
"""Find the first file belonging to dataset from variable info."""
(files, dirnames, filenames) = _get_input_files(variable, config_user)
(files, globs) = _get_input_files(variable, config_user)
if not files and variable.get('derive'):
required_vars = get_required(variable['short_name'],
variable['project'])
for required_var in required_vars:
_augment(required_var, variable)
_add_cmor_info(required_var, override=True)
_add_extra_facets(required_var, config_user['extra_facets_dir'])
(files, dirnames,
filenames) = _get_input_files(required_var, config_user)
(files, globs) = _get_input_files(required_var, config_user)
if files:
variable = required_var
break
check.data_availability(files, variable, dirnames, filenames)
check.data_availability(files, variable, globs)
return files[0]


@@ -584,10 +582,13 @@ def _get_input_files(variable, config_user):

variable['start_year'] = start_year
variable['end_year'] = end_year
(input_files, dirnames,
filenames) = get_input_filelist(variable=variable,
rootpath=config_user['rootpath'],
drs=config_user['drs'])

variable = dict(variable)
if variable['project'] == 'CMIP5' and variable['frequency'] == 'fx':
variable['ensemble'] = 'r0i0p0'
if variable['frequency'] == 'fx':
variable.pop('timerange', None)
input_files, globs = find_files(debug=True, **variable)

# Set up downloading from ESGF if requested.
if (not config_user['offline']
@@ -596,8 +597,7 @@ def _get_input_files(variable, config_user):
check.data_availability(
input_files,
variable,
dirnames,
filenames,
globs,
log=False,
)
except RecipeError:
@@ -611,15 +611,14 @@
DOWNLOAD_FILES.add(file)
input_files.append(str(local_copy))

dirnames.append('ESGF:')
globs.append('ESGF')

return (input_files, dirnames, filenames)
return (input_files, globs)


def _get_ancestors(variable, config_user):
"""Get the input files for a single dataset and setup provenance."""
(input_files, dirnames,
filenames) = _get_input_files(variable, config_user)
(input_files, globs) = _get_input_files(variable, config_user)

logger.debug(
"Using input files for variable %s of dataset %s:\n%s",
Expand All @@ -629,7 +628,7 @@ def _get_ancestors(variable, config_user):
f'{f} (will be downloaded)' if not os.path.exists(f) else str(f)
for f in input_files),
)
check.data_availability(input_files, variable, dirnames, filenames)
check.data_availability(input_files, variable, globs)
logger.info("Found input files for %s",
variable['alias'].replace('_', ' '))

@@ -836,11 +835,10 @@ def _update_timerange(variable, config_user):
check.valid_time_selection(timerange)

if '*' in timerange:
(files, _, _) = _find_input_files(
variable, config_user['rootpath'], config_user['drs'])
facets = deepcopy(variable)
facets.pop('timerange', None)
files = find_files(**facets)
if not files and not config_user.get('offline', True):
facets = deepcopy(variable)
facets.pop('timerange', None)
files = [file.name for file in esgf.find_files(**facets)]

if not files:
@@ -928,6 +926,8 @@ def _get_preprocessor_products(variables, profile, order, ancestor_products,
preproc_dir = config_user['preproc_dir']

for variable in variables:
if variable['frequency'] == 'fx':
variable.pop('timerange', None)
_update_timerange(variable, config_user)
variable['filename'] = get_output_file(variable,
config_user['preproc_dir'])
@@ -1094,7 +1094,7 @@ def _get_single_preprocessor_task(variables,

logger.info("PreprocessingTask %s created.", task.name)
logger.debug("PreprocessingTask %s will create the files:\n%s", task.name,
'\n'.join(p.filename for p in task.products))
'\n'.join(str(p.filename) for p in task.products))

return task
