diff --git a/.circleci/config.yml b/.circleci/config.yml index 2950d4feed..e22070c4d4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -66,7 +66,7 @@ commands: - run: name: Install git+ssh environment: - DEBIAN_FRONTEND: noninteractive # needed to install tzdata + DEBIAN_FRONTEND: noninteractive # needed to install tzdata command: apt update && apt install -y git ssh - checkout - check_changes @@ -141,6 +141,7 @@ jobs: . /opt/conda/etc/profile.d/conda.sh mkdir /logs conda activate esmvaltool + pip install "intake-esgf>=2025.10.22" # TODO: remove after merge pip install --no-deps .[test] > /logs/install.txt 2>&1 pip check - test_and_report: @@ -155,7 +156,7 @@ jobs: name: Install gpg (required by codecov orb) command: apt update && apt install -y gpg - codecov/upload: - files: 'test-reports/coverage.xml' + files: "test-reports/coverage.xml" disable_search: true test_installation_from_source_test_mode: diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index 93ce6ae46a..9e9af922ed 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -1,2 +1,7 @@ # Use ruff formatter (#2524) 436558caacda69d4966a5aff35959ce9188cac37 +# Enable more ruff rules (#2715) +77c370314bac1e72400392ed98c8f2f75b1b5a98 +02583113c99304e4e99bd010a05856ea429259ea +0e8e7164ef96c574f77367948f51c7ab822bb694 +e2f8cab2cc5a509452ca720e50f041f46f380ee2 diff --git a/.gitignore b/.gitignore index 7f17eca52c..ee821184de 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Autogenerated files _sidebar.rst.inc +jupyter_execute/ # Distribution / packaging .Python diff --git a/doc/api/esmvalcore.esgf.rst b/doc/api/esmvalcore.esgf.rst index c6fac3553b..58f84f9401 100644 --- a/doc/api/esmvalcore.esgf.rst +++ b/doc/api/esmvalcore.esgf.rst @@ -1,18 +1,10 @@ Find and download files from ESGF ================================= -This module provides the function :py:func:`esmvalcore.esgf.find_files` -for searching for files on ESGF using the ESMValTool vocabulary. -It returns :py:class:`esmvalcore.esgf.ESGFFile` objects, which have a convenient -:py:meth:`esmvalcore.esgf.ESGFFile.download` method for downloading the files. - -See :ref:`config-esgf` for instructions on configuring this module. - esmvalcore.esgf --------------- -.. autofunction:: esmvalcore.esgf.find_files -.. autofunction:: esmvalcore.esgf.download -.. autoclass:: esmvalcore.esgf.ESGFFile +.. automodule:: esmvalcore.esgf + :no-inherited-members: esmvalcore.esgf.facets ---------------------- diff --git a/doc/api/esmvalcore.io.intake_esgf.rst b/doc/api/esmvalcore.io.intake_esgf.rst new file mode 100644 index 0000000000..4fcb6c0bde --- /dev/null +++ b/doc/api/esmvalcore.io.intake_esgf.rst @@ -0,0 +1,5 @@ +esmvalcore.io.intake_esgf +========================= + +.. automodule:: esmvalcore.io.intake_esgf + :no-inherited-members: diff --git a/doc/api/esmvalcore.io.protocol.rst b/doc/api/esmvalcore.io.protocol.rst new file mode 100644 index 0000000000..f785893af9 --- /dev/null +++ b/doc/api/esmvalcore.io.protocol.rst @@ -0,0 +1,5 @@ +esmvalcore.io.protocol +====================== + +.. automodule:: esmvalcore.io.protocol + :no-inherited-members: diff --git a/doc/api/esmvalcore.io.rst b/doc/api/esmvalcore.io.rst new file mode 100644 index 0000000000..5d41a029c0 --- /dev/null +++ b/doc/api/esmvalcore.io.rst @@ -0,0 +1,18 @@ +Access data from any source +=========================== + +ESMValCore supports a modular system for reading data from various data sources. +In the future, this module may be extended with support for writing output data. 
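+
+As a rough illustration of the configuration format (a sketch only: the
+``type`` key and the exact nesting are assumptions of this example; see the
+modules below and the bundled example files for the authoritative schema),
+a data source is defined per project under the ``data`` configuration key:
+
+.. code-block:: yaml
+
+   projects:
+     CMIP6:
+       data:
+         my_local_data:  # arbitrary name for this data source
+           type: esmvalcore.local.LocalDataSource  # hypothetical key name
+           rootpath: ~/climate_data
+           dirname_template: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}'
+           filename_template: '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc'
+           priority: 1
+
+Copying one of the bundled example files with ``esmvaltool config copy`` is
+the recommended way to obtain a known-good starting point.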
+ +The interface is defined in the :mod:`esmvalcore.io.protocol` module and +the other modules here provide an implementation for a particular data source. + +.. toctree:: + :maxdepth: 1 + + esmvalcore.io.protocol + esmvalcore.io.intake_esgf + +esmvalcore.io +------------- +.. automodule:: esmvalcore.io diff --git a/doc/api/esmvalcore.rst b/doc/api/esmvalcore.rst index d160246243..a2833b821e 100644 --- a/doc/api/esmvalcore.rst +++ b/doc/api/esmvalcore.rst @@ -14,6 +14,7 @@ library. This section documents the public API of ESMValCore. esmvalcore.dataset esmvalcore.esgf esmvalcore.exceptions + esmvalcore.io esmvalcore.iris_helpers esmvalcore.local esmvalcore.preprocessor diff --git a/doc/conf.py b/doc/conf.py index 9279513182..0b656a4461 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -466,6 +466,7 @@ 'dask': ('https://docs.dask.org/en/stable/', None), 'distributed': ('https://distributed.dask.org/en/stable/', None), 'iris': ('https://scitools-iris.readthedocs.io/en/stable/', None), + 'intake_esgf': ('https://intake-esgf.readthedocs.io/en/stable/', None), 'esmf_regrid': ('https://iris-esmf-regrid.readthedocs.io/en/stable/', None), 'matplotlib': ('https://matplotlib.org/stable/', None), 'ncdata': ('https://ncdata.readthedocs.io/en/stable/', None), diff --git a/doc/configurations b/doc/configurations new file mode 120000 index 0000000000..17a515d17e --- /dev/null +++ b/doc/configurations @@ -0,0 +1 @@ +../esmvalcore/config/configurations \ No newline at end of file diff --git a/doc/contributing.rst b/doc/contributing.rst index 5f4d2d5102..69837e3e62 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -409,7 +409,10 @@ is cloned and run the command Optionally you can skip tests which require additional dependencies for supported diagnostic script languages by adding ``-m 'not installation'`` to the -previous command. To only run tests from a single file, run the command +previous command. If you are working in a place with a slow internet connection, +you may want to skip tests that require an internet connection by adding +``-m 'not online'`` to the command. +To only run tests from a single file, run the command .. code-block:: bash diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst index c8f4d1e550..c34fab928a 100644 --- a/doc/quickstart/configure.rst +++ b/doc/quickstart/configure.rst @@ -56,8 +56,7 @@ A file could look like this (for example, located at .. code-block:: yaml output_dir: ~/esmvaltool_output - search_esgf: when_missing - download_dir: ~/downloaded_data + max_parallel_tasks: 1 ESMValCore searches for **all** YAML files in **each** of the following locations and merges them together: @@ -80,15 +79,19 @@ Within a directory, files are sorted lexicographically, and later files (e.g., files (like the old ``config-developer.yml`` files) will lead to errors. Make sure to move these files to a different directory. -To get a copy of the default configuration file, you can run +The minimal required configuration for the tool is that you configure where +it can find :ref:`input data `. In addition to that, you +may copy the default configuration file with :ref:`top level options ` -.. code-block:: bash +To get a copy of the default configuration file, you can run the command: - esmvaltool config get_config_user --path=/target/file.yml +.. code-block:: bash -If the option ``--path`` is omitted, the file will be copied to -``~/.config/esmvaltool/config-user.yml``. 
+ esmvaltool config copy defaults/config-user.yml +This will copy the file to your configuration directory and you can tailor it +for your system, e.g. set the ``output_dir`` to a path where ESMValTool can +store its output files. Command line arguments ---------------------- @@ -100,7 +103,7 @@ Example: .. code-block:: bash - esmvaltool run --search_esgf=when_missing --max_parallel_tasks=2 /path/to/recipe.yml + esmvaltool run --max_parallel_tasks=2 /path/to/recipe.yml Options given via command line arguments will always take precedence over options specified via YAML files. @@ -149,86 +152,120 @@ Note: the following entries use Python syntax. For example, Python's ``None`` is YAML's ``null``, Python's ``True`` is YAML's ``true``, and Python's ``False`` is YAML's ``false``. -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| Option | Description | Type | Default value | -+===============================+========================================+=============================+========================================+ -| ``auxiliary_data_dir`` | Directory where auxiliary data is | :obj:`str` | ``~/auxiliary_data`` | -| | stored. [#f1]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``check_level`` | Sensitivity of the CMOR check | :obj:`str` | ``default`` | -| | (``debug``, ``strict``, ``default`` | | | -| | ``relaxed``, ``ignore``), see | | | -| | :ref:`cmor_check_strictness`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``compress_netcdf`` | Use netCDF compression. | :obj:`bool` | ``False`` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``config_developer_file`` | Path to custom | :obj:`str` | ``None`` (default file) | -| | :ref:`config-developer`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``dask`` | :ref:`config-dask`. | :obj:`dict` | See :ref:`config-dask-defaults` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``diagnostics`` | Only run the selected diagnostics from | :obj:`list` or :obj:`str` | ``None`` (all diagnostics) | -| | the recipe, see :ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``download_dir`` | Directory where downloaded data will | :obj:`str` | ``~/climate_data`` | -| | be stored. [#f4]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``drs`` | Directory structure for input data. | :obj:`dict` | ``{CMIP3: ESGF, CMIP5: ESGF, CMIP6: | -| | [#f2]_ | | ESGF, CORDEX: ESGF, obs4MIPs: ESGF}`` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``exit_on_warning`` | Exit on warning (only used in NCL | :obj:`bool` | ``False`` | -| | diagnostic scripts). 
| | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``log_level`` | Log level of the console (``debug``, | :obj:`str` | ``info`` | -| | ``info``, ``warning``, ``error``). | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``logging`` | :ref:`config-logging`. | :obj:`dict` | See :ref:`config-logging` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``max_datasets`` | Maximum number of datasets to use, see | :obj:`int` | ``None`` (all datasets from recipe) | -| | :ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``max_parallel_tasks`` | Maximum number of parallel processes, | :obj:`int` | ``None`` (number of available CPUs) | -| | see :ref:`task_priority`. [#f5]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``max_years`` | Maximum number of years to use, see | :obj:`int` | ``None`` (all years from recipe) | -| | :ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``output_dir`` | Directory where all output will be | :obj:`str` | ``~/esmvaltool_output`` | -| | written, see :ref:`outputdata`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``output_file_type`` | Plot file type. | :obj:`str` | ``png`` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``profile_diagnostic`` | Use a profiling tool for the | :obj:`bool` | ``False`` | -| | diagnostic run. [#f3]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``projects`` | :ref:`config-projects`. | :obj:`dict` | See table in :ref:`config-projects` | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``remove_preproc_dir`` | Remove the ``preproc`` directory if | :obj:`bool` | ``True`` | -| | the run was successful, see also | | | -| | :ref:`preprocessed_datasets`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``resume_from`` | Resume previous run(s) by using | :obj:`list` of :obj:`str` | ``[]`` | -| | preprocessor output files from these | | | -| | output directories, see | | | -| | ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``rootpath`` | Rootpaths to the data from different | :obj:`dict` | ``{default: ~/climate_data}`` | -| | projects. 
[#f2]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``run_diagnostic`` | Run diagnostic scripts, see | :obj:`bool` | ``True`` | -| | :ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``save_intermediary_cubes`` | Save intermediary cubes from the | :obj:`bool` | ``False`` | -| | preprocessor, see also | | | -| | :ref:`preprocessed_datasets`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``search_esgf`` | Automatic data download from ESGF | :obj:`str` | ``never`` | -| | (``never``, ``when_missing``, | | | -| | ``always``). [#f4]_ | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ -| ``skip_nonexistent`` | Skip non-existent datasets, see | :obj:`bool` | ``False`` | -| | :ref:`running`. | | | -+-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ +.. list-table:: + :widths: 15 50 15 20 + :header-rows: 1 + + * - Option + - Description + - Type + - Default value + * - ``auxiliary_data_dir`` + - Directory where auxiliary data is stored. [#f1]_ + - :obj:`str` + - ``~/auxiliary_data`` + * - ``check_level`` + - Sensitivity of the CMOR check (``debug``, ``strict``, ``default``, ``relaxed``, ``ignore``), see :ref:`cmor_check_strictness`. + - :obj:`str` + - ``default`` + * - ``compress_netcdf`` + - Use netCDF compression. + - :obj:`bool` + - ``False`` + * - ``config_developer_file`` + - Path to custom :ref:`config-developer`. + - :obj:`str` + - ``None`` (default file) + * - ``dask`` + - :ref:`config-dask`. + - :obj:`dict` + - See :ref:`config-dask-defaults` + * - ``diagnostics`` + - Only run the selected diagnostics from the recipe, see :ref:`running`. + - :obj:`list` or :obj:`str` + - ``None`` (all diagnostics) + * - ``download_dir`` + - [deprecated] Directory where downloaded data will be stored. [#f2]_ + - :obj:`str` + - ``~/climate_data`` + * - ``drs`` + - [deprecated] Directory structure for input data. [#f2]_ + - :obj:`dict` + - ``{CMIP3: ESGF, CMIP5: ESGF, CMIP6: ESGF, CORDEX: ESGF, obs4MIPs: ESGF}`` + * - ``exit_on_warning`` + - Exit on warning (only used in NCL diagnostic scripts). + - :obj:`bool` + - ``False`` + * - ``log_level`` + - Log level of the console (``debug``, ``info``, ``warning``, ``error``). + - :obj:`str` + - ``info`` + * - ``logging`` + - :ref:`config-logging`. + - :obj:`dict` + - See :ref:`config-logging` + * - ``max_datasets`` + - Maximum number of datasets to use, see :ref:`running`. + - :obj:`int` + - ``None`` (all datasets from recipe) + * - ``max_parallel_tasks`` + - Maximum number of parallel processes, see :ref:`task_priority`. [#f5]_ + - :obj:`int` + - ``None`` (number of available CPUs) + * - ``max_years`` + - Maximum number of years to use, see :ref:`running`. + - :obj:`int` + - ``None`` (all years from recipe) + * - ``output_dir`` + - Directory where all output will be written, see :ref:`outputdata`. + - :obj:`str` + - ``~/esmvaltool_output`` + * - ``output_file_type`` + - Plot file type. + - :obj:`str` + - ``png`` + * - ``profile_diagnostic`` + - Use a profiling tool for the diagnostic run. 
[#f3]_ + - :obj:`bool` + - ``False`` + * - ``projects`` + - :ref:`config-projects`. + - :obj:`dict` + - See table in :ref:`config-projects` + * - ``remove_preproc_dir`` + - Remove the ``preproc`` directory if the run was successful, see :ref:`preprocessed_datasets`. + - :obj:`bool` + - ``True`` + * - ``resume_from`` + - Resume previous run(s) by using preprocessor output files from these output directories, see :ref:`running`. + - :obj:`list` of :obj:`str` + - ``[]`` + * - ``rootpath`` + - [deprecated] Rootpaths to the data from different projects. [#f2]_ + - :obj:`dict` + - ``{default: ~/climate_data}`` + * - ``run_diagnostic`` + - Run diagnostic scripts, see :ref:`running`. + - :obj:`bool` + - ``True`` + * - ``save_intermediary_cubes`` + - Save intermediary cubes from the preprocessor, see also :ref:`preprocessed_datasets`. + - :obj:`bool` + - ``False`` + * - ``search_data`` + - Perform a quick or complete search for input data. When set to ``quick``, + search will stop as soon as a result is found. :ref:`Data sources ` + with a lower value for ``priority`` will be searched first. (``quick``, ``complete``) + - :obj:`str` + - ``quick`` + * - ``search_esgf`` + - [deprecated] Automatic data download from ESGF (``never``, ``when_missing``, ``always``). [#f2]_ + - :obj:`str` + - ``never`` + * - ``skip_nonexistent`` + - Skip non-existent datasets, see :ref:`running`. + - :obj:`bool` + - ``False`` .. [#f1] The ``auxiliary_data_dir`` setting is the path to place any required additional auxiliary data files. @@ -251,10 +288,8 @@ For example, Python's ``None`` is YAML's ``null``, Python's ``True`` is YAML's This setting is not for model or observational datasets, rather it is for extra data files such as shapefiles or other data sources needed by the diagnostics. -.. [#f2] A detailed explanation of the data finding-related options ``drs`` - and ``rootpath`` is presented in the :ref:`data-retrieval` section. - These sections relate directly to the data finding capabilities of - ESMValCore and are very important to be understood by the user. +.. [#f2] This option is scheduled for removal in v2.14.0. Please use + :ref:`data sources ` to configure data finding instead. .. [#f3] The ``profile_diagnostic`` setting triggers profiling of Python diagnostics, this will tell you which functions in the diagnostic took most time to run. @@ -271,19 +306,6 @@ For example, Python's ``None`` is YAML's ``null``, Python's ``True`` is YAML's Note that it is also possible to use vprof to understand other resources used while running the diagnostic, including execution time of different code blocks and memory usage. -.. [#f4] The ``search_esgf`` setting can be used to disable or enable automatic - downloads from ESGF. - If ``search_esgf`` is set to ``never``, the tool does not download any data - from the ESGF. - If ``search_esgf`` is set to ``when_missing``, the tool will download any - CMIP3, CMIP5, CMIP6, CORDEX, and obs4MIPs data that is required to run a - recipe but not available locally and store it in ``download_dir`` using the - ``ESGF`` directory structure defined in the :ref:`config-developer`. - If ``search_esgf`` is set to ``always``, the tool will first check the ESGF - for the needed data, regardless of any local data availability; if the data - found on ESGF is newer than the local data (if any) or the user specifies a - version of the data that is available only from the ESGF, then that data - will be downloaded; otherwise, local data will be used. .. 
[#f5] When using ``max_parallel_tasks`` with a value larger than 1 with the Dask threaded scheduler, every task will start ``num_workers`` threads. To avoid running out of memory or slowing down computations due to competition @@ -355,6 +377,13 @@ Available predefined Dask profiles: debugging purposes. Best used with ``max_parallel_tasks: 1``. +To copy these predefined profiles to your configuration directory for further +customization, run the command: + +.. code:: bash + + esmvaltool config copy defaults/dask.yml + Dask distributed scheduler configuration ---------------------------------------- @@ -659,6 +688,12 @@ The following project-specific options are available: +-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ | Option | Description | Type | Default value | +===============================+========================================+=============================+========================================+ +| ``data`` | Data sources are used to find input | :obj:`dict` | {} | +| | data and have to be configured before | | | +| | running the tool. See | | | +| | :ref:`config-data-sources` for | | | +| | details. | | | ++-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ | ``extra_facets`` | Extra key-value pairs ("*facets*") | :obj:`dict` | See | | | added to datasets in addition to the | | :ref:`config-extra-facets-defaults` | | | facets defined in the recipe. See | | | @@ -666,6 +701,147 @@ The following project-specific options are available: | | details. | | | +-------------------------------+----------------------------------------+-----------------------------+----------------------------------------+ + +.. _config-data-sources: + +Data sources +------------ +The ``data`` section defines sources of input data. The easiest way to get +started with these is to copy one of the example configuration files and tailor +it to your needs. + +To list the available example configuration files, run the command: + +.. code-block:: bash + + esmvaltool config list + +To use one of the example configuration files, copy it to +your configuration directory by running the command: + +.. code-block:: bash + + esmvaltool config copy data-intake-esgf.yml + +where ``data-intake-esgf.yml`` needs to be replaced by the name of the example +configuration you would like to use. The format of the configuration file +is described in :mod:`esmvalcore.io`. + +There are three modules available as part of ESMValCore that provide data sources: + +- :mod:`esmvalcore.io.intake_esgf`: Use the + `intake-esgf `_ library to load data that + is available from ESGF. +- :mod:`esmvalcore.local`: Use :mod:`glob` patterns to find files on a filesystem. +- :mod:`esmvalcore.esgf`: Use the legacy `esgf-pyclient + `_ library to find and download data + from ESGF. + +Adding a custom data source is relatively easy and is explained in +:mod:`esmvalcore.io.protocol`. + +There are various use cases and we provide example configurations for each of +them below. + +Personal computer +````````````````` + +On a personal computer, the recommended setup can be obtained by running the +commands: + +.. 
code-block:: bash + + esmvaltool config copy data-intake-esgf.yml + esmvaltool config copy data-local-esmvaltool.yml + +This will use the :mod:`esmvalcore.io.intake_esgf` module to access data +that is available through ESGF and use :mod:`esmvalcore.local` to find +observational and reanalysis datasets that have been +:ref:`CMORized with ESMValTool ` +(``OBS6`` and ``OBS`` projects for CMIP6- and CMIP5-style CMORization +respectively) or are supported in their :ref:`native format ` +through the ``native6`` project. + +.. warning:: + + It is important to :doc:`configure intake-esgf ` + for your system before using it. Make sure to set ``local_cache`` to a path + where it can store downloaded files, and if (some) ESGF data is already + available on your system, point ``esg_dataroot`` to it. If you are + missing certain search results, you may want to choose a different + index node for searching the ESGF. + +HPC system +`````````` + +On HPC systems, data is often stored in large shared filesystems. We have +several example configurations for popular HPC systems. To list the available +example files, run the command: + +.. code-block:: bash + + esmvaltool config list data-hpc + +If you are using one of the supported HPC systems, for example Jasmin, you can +copy the example configuration file by running the command: + +.. code-block:: bash + + esmvaltool config copy data-hpc-badc.yml + +and you should be good to go. If your HPC system is not supported yet, you can +copy one of the other example configuration files, e.g. ``data-hpc-dkrz.yml`` +and tailor it for your system. + +.. warning:: + + It is important to :doc:`configure intake-esgf ` + for your system before using it. Make sure to set ``local_cache`` to a path + where it can store downloaded files, and if (some) ESGF data is already + available on your system, point ``esg_dataroot`` to it. If you are + missing certain search results, you may want to choose a different + index node for searching the ESGF. + +.. note:: + + Deduplicating data found via :mod:`esmvalcore.io.intake_esgf` data sources + and the :mod:`esmvalcore.local` data sources has not yet been implemented. + Therefore it is recommended not to use the configuration option + ``search_data: complete`` when using both data sources for the same project. + The ``search_data: quick`` option can be safely used. + +Climate model data in its native format +``````````````````````````````````````` + +For each of the climate models that are supported in their +native format as described in :ref:`read_native_models`, an example configuration +file is available. To list the available example files, run the command: + +.. code-block:: bash + + esmvaltool config list data-native + +.. _filter_load_warnings: + +Filter Iris load warnings +````````````````````````` + +It is possible to ignore specific warnings when loading data with Iris. +This is particularly useful for native datasets which do not follow the CMOR +standard by default and consequently produce a lot of warnings when handled by +Iris. +This can be configured using the ``ignore_warnings`` argument to +:class:`esmvalcore.local.LocalDataSource`. + +Here is an example on how to ignore specific warnings when loading data from +the ``EMAC`` model in its native format: + +.. literalinclude:: ../configurations/data-native-emac.yml + :language: yaml + +The keyword arguments specified in the list items are directly passed to +:func:`warnings.filterwarnings` in addition to ``action=ignore``. + .. 
_config-extra-facets: Extra Facets @@ -786,10 +962,11 @@ ESGF configuration The ``esmvaltool run`` command can automatically download the files required to run a recipe from ESGF for the projects CMIP3, CMIP5, CMIP6, CORDEX, and obs4MIPs. -The downloaded files will be stored in the directory specified via the -:ref:`configuration option ` ``download_dir``. -To enable automatic downloads from ESGF, use the :ref:`configuration options -` ``search_esgf: when_missing`` or ``search_esgf: always``. + +Refer to :ref:`config-data-sources` for instructions on how to set this up. This +section describes additional configuration options for the :mod:`esmvalcore.esgf` +module, which is based on the legacy esgf-pyclient_ library. Most users +will not need this. .. note:: @@ -809,8 +986,8 @@ To enable automatic downloads from ESGF, use the :ref:`configuration options Configuration file ------------------ -An optional configuration file can be created for configuring how the tool uses -`esgf-pyclient `_ +An optional configuration file can be created for configuring how the +:class:`esmvalcore.esgf.ESGFDataSource` uses esgf-pyclient_ to find and download data. The name of this file is ``~/.esmvaltool/esgf-pyclient.yml``. @@ -896,18 +1073,18 @@ Developer configuration file Most users and diagnostic developers will not need to change this file, but it may be useful to understand its content. -It will be installed along with ESMValCore and can also be viewed on GitHub: +The settings from this file are being moved to the +:ref:`new configuration system `. In particular, the +``input_dir``, ``input_file``, and ``ignore_warnings`` settings have already +been replaced by the :class:`esmvalcore.local.LocalDataSource` that can be +configured via :ref:`data sources `. +The developer configuration file will be installed along with ESMValCore and can +also be viewed on GitHub: `esmvalcore/config-developer.yml `_. -This configuration file describes the file system structure and CMOR tables for several -key projects (CMIP6, CMIP5, obs4MIPs, OBS6, OBS) on several key machines (e.g. BADC, CP4CDS, DKRZ, -ETHZ, SMHI, BSC), and for native output data for some +This configuration file describes the CMOR tables for several +key projects (CMIP6, CMIP5, obs4MIPs, OBS6, OBS), and for native output data for some models (ICON, IPSL, ... see :ref:`configure_native_models`). -CMIP data is stored as part of the Earth System Grid -Federation (ESGF) and the standards for file naming and paths to files are set -out by CMOR and DRS. For a detailed description of these standards and their -adoption in ESMValCore, we refer the user to :ref:`CMOR-DRS` section where we -relate these standards to the data retrieval mechanism of the ESMValCore. Users can get a copy of this file with default values by running @@ -936,64 +1113,17 @@ Example of the CMIP6 project configuration: .. 
code-block:: yaml CMIP6: - input_dir: - default: '/' - BADC: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}' - DKRZ: '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}' - ETHZ: '{exp}/{mip}/{short_name}/{dataset}/{ensemble}/{grid}/' - input_file: '{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc' output_file: '{project}_{dataset}_{mip}_{exp}_{ensemble}_{short_name}' cmor_type: 'CMIP6' cmor_strict: true -Input file paths ----------------- - -When looking for input files, the ``esmvaltool`` command provided by -ESMValCore replaces the placeholders ``{item}`` in -``input_dir`` and ``input_file`` with the values supplied in the recipe. -ESMValCore will try to automatically fill in the values for institute, frequency, -and modeling_realm based on the information provided in the CMOR tables -and/or :ref:`config-extra-facets` when reading the recipe. -If this fails for some reason, these values can be provided in the recipe too. - -The data directory structure of the CMIP projects is set up differently -at each site. As an example, the CMIP6 directory path on BADC would be: - -.. code-block:: yaml - - '{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}' - -The resulting directory path would look something like this: - -.. code-block:: bash - - CMIP/MOHC/HadGEM3-GC31-LL/historical/r1i1p1f3/Omon/tos/gn/latest - -Please, bear in mind that ``input_dirs`` can also be a list for those cases in -which may be needed: - -.. code-block:: yaml - - - '{exp}/{ensemble}/original/{mip}/{short_name}/{grid}/{version}' - - '{exp}/{ensemble}/computed/{mip}/{short_name}/{grid}/{version}' - -In that case, the resultant directories will be: - -.. code-block:: bash - - historical/r1i1p1f3/original/Omon/tos/gn/latest - historical/r1i1p1f3/computed/Omon/tos/gn/latest - -For a more in-depth description of how to configure ESMValCore so it can find -your data please see :ref:`CMOR-DRS`. - Preprocessor output files ------------------------- -The filename to use for preprocessed data is configured in a similar manner -using ``output_file``. Note that the extension ``.nc`` (and if applicable, -a start and end time) will automatically be appended to the filename. +The filename to use for preprocessed data is configured using ``output_file``, +similar to the filename template in :class:`esmvalcore.local.LocalDataSource`. +Note that the extension ``.nc`` (and if applicable, a start and end time) will +automatically be appended to the filename. .. _cmor_table_configuration: @@ -1086,38 +1216,6 @@ which will extend the entries from the default one `_). -.. _filterwarnings_config-developer: - -Filter preprocessor warnings ----------------------------- - -It is possible to ignore specific warnings of the preprocessor for a given -``project``. -This is particularly useful for native datasets which do not follow the CMOR -standard by default and consequently produce a lot of warnings when handled by -Iris. -This can be configured in the ``config-developer.yml`` file for some steps of -the preprocessing chain. - -Currently supported preprocessor steps: - -* :func:`~esmvalcore.preprocessor.load` - -Here is an example on how to ignore specific warnings during the preprocessor -step ``load`` for all datasets of project ``EMAC`` (taken from the default -``config-developer.yml`` file): - -.. 
code-block:: yaml - - ignore_warnings: - load: - - {message: 'Missing CF-netCDF formula term variable .*, referenced by netCDF variable .*', module: iris} - - {message: 'Ignored formula of unrecognised type: .*', module: iris} - -The keyword arguments specified in the list items are directly passed to -:func:`warnings.filterwarnings` in addition to ``action=ignore`` (may be -overwritten in ``config-developer.yml``). - .. _configure_native_models: Configuring datasets in native format @@ -1135,23 +1233,12 @@ Example: native6: cmor_strict: false - input_dir: - default: 'Tier{tier}/{dataset}/{version}/{frequency}/{short_name}' - input_file: - default: '*.nc' output_file: '{project}_{dataset}_{type}_{version}_{mip}_{short_name}' cmor_type: 'CMIP6' cmor_default_table_prefix: 'CMIP6_' ICON: cmor_strict: false - input_dir: - default: - - '{exp}' - - '{exp}/outdata' - - '{exp}/output' - input_file: - default: '{exp}_{var_type}*.nc' output_file: '{project}_{dataset}_{exp}_{var_type}_{mip}_{short_name}' cmor_type: 'CMIP6' cmor_default_table_prefix: 'CMIP6_' diff --git a/doc/quickstart/find_data.rst b/doc/quickstart/find_data.rst index d8f13b1fc5..939e65f378 100644 --- a/doc/quickstart/find_data.rst +++ b/doc/quickstart/find_data.rst @@ -70,11 +70,8 @@ CMOR-DRS_ are used again and the file will be automatically found: /gws/nopw/j04/esmeval/obsdata-v2/Tier3/ERA-Interim/OBS_ERA-Interim_reanaly_1_Amon_ta_201401-201412.nc -Since observational data are organized in Tiers depending on their level of -public availability, the ``default`` directory must be structured accordingly -with sub-directories ``TierX`` (``Tier1``, ``Tier2`` or ``Tier3``), even when -``drs: default``. - +Observational datasets CMORized by ESMValTool are organized in Tiers depending on +their level of public availability. .. _read_native_datasets: @@ -105,10 +102,17 @@ Supported native reanalysis/observational datasets The following native reanalysis/observational datasets are supported under the ``native6`` project. To use these datasets, put the files containing the data in the directory that -you have :ref:`configured ` for the ``rootpath`` of the +you have :ref:`configured ` for the ``rootpath`` of the ``native6`` project, in a subdirectory called -``Tier{tier}/{dataset}/{version}/{frequency}/{short_name}`` (assuming you are -using the ``default`` DRS for ``native6``). +``Tier{tier}/{dataset}/{version}/{frequency}/{short_name}``, assuming you are +using the default DRS for ``native6`` as defined in the file: + +.. literalinclude:: ../configurations/data-local-esmvaltool.yml + :language: yaml + :caption: Contents of ``data-local-esmvaltool.yml`` + :start-after: # Read native6, OBS6, and OBS data from the filesystem on a personal computer. + :end-before: # Data that has been CMORized by ESMValTool according to the CMIP6 standard. + Replace the items in curly braces by the values used in the variable/dataset definition in the :ref:`recipe `. @@ -121,8 +125,8 @@ ERA5 data can be downloaded from the Copernicus Climate Data Store (CDS) using the convenient tool `era5cli `__. For example for monthly data, place the files in the ``/Tier3/ERA5/version/mon/pr`` subdirectory of your ``rootpath`` that you have -configured for the ``native6`` project (assuming you are using the ``default`` -DRS for ``native6``). +configured for the ``native6`` project (assuming you are using the default DRS +for ``native6`` described :ref:`above `). 
- Supported variables: ``cl``, ``clt``, ``evspsbl``, ``evspsblpot``, ``mrro``, ``pr``, ``prsn``, ``ps``, ``psl``, ``ptype``, ``rls``, ``rlds``, ``rsds``, @@ -157,20 +161,18 @@ in its native GRIB format. `__. Reading self-downloaded ERA5 data in GRIB format is experimental and likely requires additional setup from the user like setting up the proper directory - structure for the input files and/or creating a custom :ref:`DRS - `. + structure for the input files. -To read these data with ESMValCore, use the :ref:`rootpath -` ``/pool/data/ERA5`` with :ref:`DRS -` ``DKRZ-ERA5-GRIB`` in your configuration, for example: +To read these data with ESMValCore, use the data definition for the ``native6`` +project: -.. code-block:: yaml +.. literalinclude:: ../configurations/data-hpc-dkrz.yml + :language: yaml + :caption: Contents of ``data-hpc-dkrz.yml`` + :start-at: # ERA5 data in GRIB format: + :end-before: OBS6: - rootpath: - ... - native6: - /pool/data/ERA5: DKRZ-ERA5-GRIB - ... +To use this configuration, run ``esmvaltool config copy data-hpc-dkrz.yml``. The `naming conventions `__ @@ -230,8 +232,8 @@ MSWEP For example for monthly data, place the files in the ``/Tier3/MSWEP/version/mon/pr`` subdirectory of your ``rootpath`` that you have -configured for the ``native6`` project (assuming you are using the ``default`` -DRS for ``native6``). +configured for the ``native6`` project (assuming you are using the default DRS +for ``native6`` described :ref:`above `). .. note:: For monthly data (``V220``), the data must be postfixed with the date, i.e. rename ``global_monthly_050deg.nc`` to ``global_monthly_050deg_197901-201710.nc`` @@ -273,8 +275,14 @@ The default naming conventions for input directories and files for CESM are * ``{case}/{gcomp}/proc/{tdir}/{tperiod}`` (post-processed data) * input files: ``{case}.{scomp}.{type}.{string}*nc`` -as configured in the :ref:`config-developer file ` (using the -:ref:`configuration option ` ``drs: default``). +as configured in: + +.. literalinclude:: ../configurations/data-native-cesm.yml + :language: yaml + :caption: Contents of ``data-native-cesm.yml`` + +To use this configuration, run ``esmvaltool config copy data-native-cesm.yml`` and +adapt the ``rootpath`` to your system. More information about CESM naming conventions are given `here `__. @@ -352,8 +360,14 @@ The default naming conventions for input directories and files for EMAC are * input directories: ``{exp}/{channel}`` * input files: ``{exp}*{channel}{postproc_flag}.nc`` -as configured in the :ref:`config-developer file ` (using the -:ref:`configuration option ` ``drs: default``). +as configured in: + +.. literalinclude:: ../configurations/data-native-emac.yml + :language: yaml + :caption: Contents of ``data-native-emac.yml`` + +To use this configuration, run ``esmvaltool config copy data-native-emac.yml`` and +adapt the ``rootpath`` to your system. Thus, example dataset entries could look like this: @@ -428,8 +442,14 @@ The default naming conventions for input directories and files for ICON are * input directories: ``{exp}``, ``{exp}/outdata``, or ``{exp}/output`` * input files: ``{exp}_{var_type}*.nc`` -as configured in the :ref:`config-developer file ` (using the -:ref:`configuration option ` ``drs: default``). +as configured in: + +.. literalinclude:: ../configurations/data-native-icon.yml + :language: yaml + :caption: Contents of ``data-native-icon.yml`` + +To use this configuration, run ``esmvaltool config copy data-native-icon.yml`` and +adapt the ``rootpath`` to your system. 
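+
+After copying, adapting the ``rootpath`` is normally the only edit needed.
+A minimal sketch of such an adapted entry (the nesting and the ``type`` key
+shown here are assumptions; the copied ``data-native-icon.yml`` itself is
+authoritative) might look like:
+
+.. code-block:: yaml
+
+   projects:
+     ICON:
+       data:
+         native_icon:  # arbitrary name for this data source
+           type: esmvalcore.local.LocalDataSource  # hypothetical key name
+           rootpath: /path/to/your/icon/experiments
+           dirname_template: '{exp}/outdata'  # or '{exp}' or '{exp}/output'
+           filename_template: '{exp}_{var_type}*.nc'
+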
Currently, two different versions of ICON are supported: @@ -653,6 +673,9 @@ formats) are supported, and should be configured in recipes as e.g.: account: p86caub, status: PROD, dataset: IPSL-CM6, project: IPSLCM, root: /thredds/tgcc/store} +and data can be found by running ``esmvaltool config copy data-native-ipslcm.yml`` +and adapting the ``rootpath`` to your system. + .. _ipslcm_extra_facets_example: The ``Output`` format is an example of a case where variables are grouped in @@ -682,11 +705,14 @@ The default naming conventions for input directories and files for ACCESS output * input directories: ``{institute}/{sub_dataset}/{exp}/{modeling_realm}/netCDF`` * input files: ``{sub_dataset}.{special_attr}-*.nc`` -.. hint:: +as configured in: - We only provide one default `input_dir` since this is how ACCESS-ESM native data was - stored on NCI. Users can modify this path in the :ref:`config-developer` to match their local file structure. +.. literalinclude:: ../configurations/data-native-access.yml + :language: yaml + :caption: Contents of ``data-native-access.yml`` +To use this configuration, run ``esmvaltool config copy data-native-access.yml`` and +adapt the ``rootpath`` to your system. Thus, example dataset entries could look like this: @@ -729,228 +755,9 @@ Key Description Default value if Data retrieval ============== -Data retrieval in ESMValCore has two main aspects from the user's point of -view: - -* data can be found by the tool, subject to availability on disk or `ESGF `_; -* it is the user's responsibility to set the correct data retrieval parameters; - -The first point is self-explanatory: if the user runs the tool on a machine -that has access to a data repository or multiple data repositories, then -ESMValCore will look for and find the available data requested by the user. -If the files are not found locally, the tool can search the ESGF_ and download -the missing files, provided that they are available. - -The second point underlines the fact that the user has full control over what -type and the amount of data is needed for the analyses. Setting the data -retrieval parameters is explained below. - -Enabling automatic downloads from the ESGF ------------------------------------------- -To enable automatic downloads from ESGF, use the :ref:`configuration option -` ``search_esgf: when_missing`` (use local files -whenever possible) or ``search_esgf: always`` (always search ESGF for latest -version of files and only use local data if it is the latest version). -The files will be stored in the directory specified via the :ref:`configuration -option ` ``download_dir``. - -Setting the correct root paths ------------------------------- -The first step towards providing ESMValCore the correct set of parameters for -data retrieval is setting the root paths to the data. This is done in the -configuration. The two sections where the user will -set the paths are ``rootpath`` and ``drs``. ``rootpath`` contains pointers to -``CMIP``, ``OBS``, ``default`` and ``RAWOBS`` root paths; ``drs`` sets the type -of directory structure the root paths are structured by. It is important to -first discuss the ``drs`` parameter: as we've seen in the previous section, the -DRS as a standard is used for both file naming conventions and for directory -structures. - -.. 
_config_option_drs: - -Explaining ``drs: CMIP5:`` or ``drs: CMIP6:`` ---------------------------------------------- -Whereas ESMValCore will by default use the CMOR standard for file naming (please -refer above), by setting the ``drs`` parameter the user tells the tool what -type of root paths they need the data from, e.g.: - - .. code-block:: yaml - - drs: - CMIP6: BADC - -will tell the tool that the user needs data from a repository structured -according to the BADC DRS structure, i.e.: - -``ROOT/{institute}/{dataset_name}/{experiment}/{ensemble}/{mip}/{variable_short_name}/{grid}``; - -setting the ``ROOT`` parameter is explained below. This is a -strictly-structured repository tree and if there are any sort of irregularities -(e.g. there is no ``{mip}`` directory) the data will not be found! ``BADC`` can -be replaced with ``DKRZ`` or ``ETHZ`` depending on the existing ``ROOT`` -directory structure. -The snippet - - .. code-block:: yaml - - drs: - CMIP6: default - -is another way to retrieve data from a ``ROOT`` directory that has no DRS-like -structure; ``default`` indicates that the data lies in a directory that -contains all the files without any structure. - -The names of the directories trees that can be used under `drs` are defined in -:ref:`config-developer`. - -.. note:: - When using ``CMIP6: default`` or ``CMIP5: default``, all the needed files - must be in the same top-level directory specified under ``rootpath``. - However, it is not recommended to use this, as it makes it impossible for - the tool to read the facets from the directory tree. - Moreover, this way of organizing data makes it impossible to store multiple - versions of the same file because the files typically have the same name - for different versions. - -.. _config_option_rootpath: - -Explaining ``rootpath:`` ------------------------- - -``rootpath`` identifies the root directory for different data types (``ROOT`` as we used it above): - -* ``CMIP`` e.g. ``CMIP5`` or ``CMIP6``: this is the `root` path(s) to where the - CMIP files are stored; it can be a single path, a list of paths, or a mapping - with paths as keys and `drs` names as values; it can - point to an ESGF node or it can point to a user private repository. Example - for a CMIP5 root path pointing to the ESGF node mounted on CEDA-Jasmin (formerly - known as BADC): - - .. code-block:: yaml - - rootpath: - CMIP5: /badc/cmip5/data/cmip5/output1 - - Example for a CMIP6 root path pointing to the ESGF node on CEDA-Jasmin: - - .. code-block:: yaml - - rootpath: - CMIP6: /badc/cmip6/data/CMIP6 - - Example for a mix of CMIP6 root path pointing to the ESGF node on CEDA-Jasmin - and a user-specific data repository for extra data: - - .. code-block:: yaml - - rootpath: - CMIP6: - /badc/cmip6/data/CMIP6: BADC - ~/climate_data: ESGF - - Note that this notation combines the ``rootpath`` and ``drs`` settings, so it - is not necessary to specify the directory structure in under ``drs`` in this - case. - -* ``OBS``: this is the `root` path(s) to where the observational datasets are - stored; again, this could be a single path or a list of paths, just like for - CMIP data. Example for the OBS path for a large cache of observation datasets - on CEDA-Jasmin: - - .. code-block:: yaml - - rootpath: - OBS: /gws/nopw/j04/esmeval/obsdata-v2 - -* ``default``: this is the `root` path(s) where the tool will look for data - from projects that do not have their own rootpath set. 
- -* ``RAWOBS``: this is the `root` path(s) to where the raw observational data - files are stored; this is used by ``esmvaltool data format``. - -Synda ------ - -If the `synda install `_ command is used to download data, -it maintains the directory structure as on ESGF. To find data downloaded by -synda, use the ``SYNDA`` ``drs`` parameter. - -.. code-block:: yaml - - drs: - CMIP6: SYNDA - CMIP5: SYNDA - -Dataset definitions in ``recipe`` ---------------------------------- -Once the correct paths have been established, ESMValCore collects the -information on the specific datasets that are needed for the analysis. This -information, together with the CMOR convention for naming files (see CMOR-DRS_) -will allow the tool to search and find the right files. The specific -datasets are listed in any recipe, under either the ``datasets`` and/or -``additional_datasets`` sections, e.g. - -.. code-block:: yaml - - datasets: - - {dataset: HadGEM2-CC, project: CMIP5, exp: historical, ensemble: r1i1p1, start_year: 2001, end_year: 2004} - - {dataset: UKESM1-0-LL, project: CMIP6, exp: historical, ensemble: r1i1p1f2, grid: gn, start_year: 2004, end_year: 2014} - -The data finding feature will use this information to find data for **all** the variables specified in ``diagnostics/variables``. - -Recap and example -================= -Let us look at a practical example for a recap of the information above: -suppose you are using configuration that has the following entries for -data finding: - -.. code-block:: yaml - - rootpath: # running on CEDA-Jasmin - CMIP6: /badc/cmip6/data/CMIP6/CMIP - drs: - CMIP6: BADC # since you are on CEDA-Jasmin - -and the dataset you need is specified in your ``recipe.yml`` as: - -.. code-block:: yaml - - - {dataset: UKESM1-0-LL, project: CMIP6, mip: Amon, exp: historical, grid: gn, ensemble: r1i1p1f2, start_year: 2004, end_year: 2014} - -for a variable, e.g.: - -.. code-block:: yaml - - diagnostics: - some_diagnostic: - description: some_description - variables: - ta: - preprocessor: some_preprocessor - -The tool will then use the root path ``/badc/cmip6/data/CMIP6/CMIP`` and the -dataset information and will assemble the full DRS path using information from -CMOR-DRS_ and establish the path to the files as: - -.. code-block:: bash - - /badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r1i1p1f2/Amon - -then look for variable ``ta`` and specifically the latest version of the data -file: - -.. code-block:: bash - - /badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r1i1p1f2/Amon/ta/gn/latest/ - -and finally, using the file naming definition from CMOR-DRS_ find the file: - -.. code-block:: bash - - /badc/cmip6/data/CMIP6/CMIP/MOHC/UKESM1-0-LL/historical/r1i1p1f2/Amon/ta/gn/latest/ta_Amon_UKESM1-0-LL_historical_r1i1p1f2_gn_195001-201412.nc - -.. _observations: +Please go to :ref:`config-data-sources` for instructions and background on how +to configure data retrieval. Data loading ============ @@ -1002,7 +809,9 @@ Another use case is files that use different names for variables in their file name than for the netCDF4 variable name. To apply the extra facets for this purpose, simply use the corresponding tag in -the applicable DRS inside the :ref:`config-developer`. +the applicable ``filename_template`` or ``dirname_template`` in +:class:`esmvalcore.local.LocalDataSource`. + For example, given the extra facets .. 
code-block:: yaml @@ -1015,14 +824,15 @@ For example, given the extra facets tas: source_var_name: t2m -a corresponding entry in the developer configuration file could look like: - -.. code-block:: yaml - :caption: Contents of ``config-developer.yml`` +a corresponding entry in the configuration file could look like: - native6: - input_file: - default: '{source_var_name}_*.nc' +.. literalinclude:: ../configurations/data-local-esmvaltool.yml + :language: yaml + :caption: Contents of ``data-local-esmvaltool.yml`` + :start-at: # Data that can be read in its native format by ESMValCore. + :end-before: # Data that has been CMORized by ESMValTool according to the CMIP6 standard. The same replacement mechanism can be employed everywhere where tags can be -used, particularly in ``input_dir``, ``input_file``, and ``output_file``. +used, particularly in ``dirname_template`` and ``filename_template`` in +:class:`esmvalcore.local.LocalDataSource`, and in ``output_file`` in +:ref:`config-developer.yml `. diff --git a/doc/quickstart/run.rst b/doc/quickstart/run.rst index 61709bc778..6696f53d85 100644 --- a/doc/quickstart/run.rst +++ b/doc/quickstart/run.rst @@ -62,25 +62,6 @@ arguments: esmvaltool run --argument_name argument_value recipe_example.yml -To automatically download the files required to run a recipe from ESGF, use the -:ref:`configuration option ` ``search_esgf=when_missing`` (use -local files whenever possible) or ``search_esgf=always`` (always search ESGF -for latest version of files and only use local data if it is the latest -version): - -.. code:: bash - - esmvaltool run --search_esgf=when_missing recipe_example.yml - -or - -.. code:: bash - - esmvaltool run --search_esgf=always recipe_example.yml - -This feature is available for projects that are hosted on the ESGF, i.e. -CMIP3, CMIP5, CMIP6, CORDEX, and obs4MIPs. - To control the strictness of the CMOR checker and the checks during concatenation on auxiliary coordinates, supplementary variables, and derived coordinates, use the flag ``--check_level``: diff --git a/doc/recipe/overview.rst b/doc/recipe/overview.rst index 0218156bb3..eeca7e25e2 100644 --- a/doc/recipe/overview.rst +++ b/doc/recipe/overview.rst @@ -131,8 +131,8 @@ See :ref:`CMOR-DRS` for more information on this kind of file organization. When (some) files are available locally, the tool will not automatically look for more files on ESGF. To populate a recipe with all available datasets from ESGF, the -:ref:`configuration option ` ``search_esgf`` should be set to -``always``. +:ref:`configuration option ` ``search_data`` should be set to +``complete`` and an :ref:`ESGF data source ` needs to be configured. For more control over which datasets are selected, it is recommended to use a Python script or `Jupyter notebook `_ to compose diff --git a/environment.yml b/environment.yml index ae382ce831..78102cc380 100644 --- a/environment.yml +++ b/environment.yml @@ -19,6 +19,7 @@ dependencies: - fire - geopy - humanfriendly + - intake-esgf >=2025.10.22 - intake-esm - iris >=3.12.2 # https://github.com/SciTools/iris/issues/6417 - iris-esmf-regrid >=0.11.0 diff --git a/esmvalcore/_main.py b/esmvalcore/_main.py index 2f8d95d84e..9c8d413ef7 100644 --- a/esmvalcore/_main.py +++ b/esmvalcore/_main.py @@ -31,6 +31,7 @@ import logging import os +import re import sys from importlib.metadata import entry_points from pathlib import Path @@ -164,6 +165,154 @@ class Config: files. 
""" + def __init__(self) -> None: + from rich.console import Console + + self.console = Console(soft_wrap=True) + + def show( + self, + filter: tuple[str] | None = ("extra_facets",), # noqa: A002 + ) -> None: + """Show the current configuration. + + Parameters + ---------- + filter: + Filter this list of keys. By default, the `extra_facets` + key is filtered out, as it can be very large. + + """ + import yaml + from nested_lookup import nested_delete + from rich.syntax import Syntax + + from esmvalcore.config import CFG + + cfg = dict(CFG) + if filter: + for key in filter: + cfg = nested_delete(cfg, key) + exclude_msg = ( + ", excluding the keys " + ", ".join(f"'{f}'" for f in filter) + if filter + else "" + ) + self.console.print(f"# Current configuration{exclude_msg}:") + self.console.print( + Syntax( + yaml.safe_dump(cfg), + "yaml", + background_color="default", + ), + ) + + def list(self, name: str = "") -> None: + """List all available example configuration files. + + Arguments + --------- + name: + Only show configuration files that have this string in their name. + For example, to only show configuration files for data sources, + use `--name='data'`. + """ + from rich.markdown import Markdown + + import esmvalcore.config + + headers = { + "defaults": "Defaults", + "data": "Data Sources", + } + config_dir = Path(esmvalcore.config.__file__).parent / "configurations" + available_files = [ + file + for file in config_dir.rglob("*.yml") + if name.lower() in file.name.lower() + ] + + def description(file: Path) -> str: + if first_comment := re.search( + r"\A((?: *#.*\r?\n)+)", + file.read_text(encoding="utf-8"), + flags=re.MULTILINE, + ): + description = " ".join( + line.lstrip(" #").strip() + for line in first_comment.group(1).split("\n") + ).strip() + else: + description = "" + return description + + msg = [] + for header_name, header in headers.items(): + files = sorted( + f + for f in available_files + if str(f.relative_to(config_dir)).startswith(header_name) + ) + if files: + msg.append(f"\n# {header}\n") + msg += [ + f"- `{f.relative_to(config_dir)}`: {description(f)}" + for f in files + ] + self.console.print(Markdown("\n".join(msg))) + + def copy( + self, + source_file: Path, + target_file: Path | None = None, + overwrite: bool = False, + ) -> None: + """Copy one of the available example configuration files to the configuration directory. + + Arguments + --------- + source_file: + Source configuration file to copy. Use `esmvaltool config list` + to see all available configuration files. + target_file: + Target file name. If not provided, the file will be copied to + the configuration directory with the same filename as the source + file. + overwrite: + Overwrite an existing file. 
+
+        """
+        import esmvalcore.config
+
+        source_file = Path(source_file)
+        target_dir = esmvalcore.config._config_object._get_user_config_dir()  # noqa: SLF001
+        target_file = target_dir / (
+            source_file.name if target_file is None else target_file
+        )
+        config_dir = Path(esmvalcore.config.__file__).parent / "configurations"
+
+        available_files = {
+            f.relative_to(config_dir) for f in config_dir.rglob("*.yml")
+        }
+        if source_file not in available_files:
+            esmvalcore.config._logging.configure_logging(  # noqa: SLF001
+                console_log_level="info",
+            )
+            self.list()
+            logger.error(
+                (
+                    "Configuration file '%s' not found, choose from one of the "
+                    "available files listed above"
+                ),
+                source_file,
+            )
+            sys.exit(1)
+
+        self._copy_config_file(
+            config_dir / source_file,
+            target_file,
+            overwrite=overwrite,
+        )
+
     @staticmethod
     def _copy_config_file(
         in_file: Path,
@@ -177,19 +326,19 @@ def _copy_config_file(
 
         configure_logging(console_log_level="info")
 
+        logger.info("Copying file %s to path %s", in_file, out_file)
         if out_file.is_file():
             if overwrite:
                 logger.info("Overwriting file %s.", out_file)
             else:
-                logger.info("Copy aborted. File %s already exists.", out_file)
-                return
+                logger.error("Copy aborted. File %s already exists.", out_file)
+                sys.exit(1)
 
         target_folder = out_file.parent
         if not target_folder.is_dir():
             logger.info("Creating folder %s", target_folder)
             target_folder.mkdir(parents=True, exist_ok=True)
 
-        logger.info("Copying file %s to path %s.", in_file, out_file)
         shutil.copy2(in_file, out_file)
         logger.info("Copy finished.")
 
@@ -212,7 +361,26 @@ def get_config_user(
             If not provided, the file will be copied to
             `~/.config/esmvaltool/`.
 
+        .. deprecated:: 2.13.0
+
+            This function is deprecated and will be removed in ESMValCore
+            version 2.16.0. Use the ``copy`` method instead.
+
         """
+        import warnings
+
+        from esmvalcore.exceptions import ESMValCoreDeprecationWarning
+
+        deprecation_msg = (
+            "The 'esmvaltool config get_config_user' command is deprecated and "
+            "will be removed in ESMValCore version 2.16.0. Use the command "
+            "`esmvaltool config copy defaults/config-user.yml` instead."
+    )
+    warnings.warn(
+        deprecation_msg,
+        category=ESMValCoreDeprecationWarning,
+        stacklevel=1,
+    )
 
         from .config._config_object import DEFAULT_CONFIG_DIR
 
         in_file = DEFAULT_CONFIG_DIR / "config-user.yml"
@@ -587,7 +755,7 @@ def _log_header(self, log_files, cli_config_dir):
 
 def run():
     """Run the `esmvaltool` program, logging any exceptions."""
-    from .exceptions import RecipeError
+    from esmvalcore.exceptions import SuppressedError
 
     # Workaround to avoid using more for the output
 
@@ -601,7 +769,7 @@ def display(lines, out):
         fire.Fire(ESMValTool())
     except fire.core.FireExit:
         raise
-    except RecipeError as exc:
-        # Hide the stack trace for RecipeErrors
+    except SuppressedError as exc:
+        # Hide the stack trace for suppressed errors
         logger.error("%s", exc)
         logger.debug("Stack trace for debugging:", exc_info=True)
diff --git a/esmvalcore/_provenance.py b/esmvalcore/_provenance.py
index dc669731e5..a4f3b4c79d 100644
--- a/esmvalcore/_provenance.py
+++ b/esmvalcore/_provenance.py
@@ -1,33 +1,48 @@
 """Provenance module."""
 
+from __future__ import annotations
+
 import copy
 import logging
 import os
 from functools import total_ordering
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
 
 from netCDF4 import Dataset
 from PIL import Image
 from PIL.PngImagePlugin import PngInfo
 from prov.model import ProvDerivation, ProvDocument
 
-from ._version import __version__
+from esmvalcore._version import __version__
+from esmvalcore.io.protocol import DataElement
+
+if TYPE_CHECKING:
+    from collections.abc import Iterable
+
+    import prov.model
+
+    from esmvalcore._task import BaseTask
 
 logger = logging.getLogger(__name__)
 
 ESMVALTOOL_URI_PREFIX = "https://www.esmvaltool.org/"
 
 
-def create_namespace(provenance, namespace):
+def create_namespace(
+    provenance: prov.model.ProvBundle,
+    namespace: str,
+) -> None:
     """Create an esmvaltool namespace."""
     provenance.add_namespace(namespace, uri=ESMVALTOOL_URI_PREFIX + namespace)
 
 
-def get_esmvaltool_provenance():
+def get_esmvaltool_provenance() -> prov.model.ProvActivity:
     """Create an esmvaltool run activity."""
     provenance = ProvDocument()
     namespace = "software"
     create_namespace(provenance, namespace)
-    attributes = {}  # TODO: add dependencies with versions here
+    attributes: dict = {}  # TODO: add dependencies with versions here
     return provenance.activity(
         namespace + ":esmvaltool==" + __version__,
         other_attributes=attributes,
@@ -37,7 +52,10 @@ def get_esmvaltool_provenance():
 ESMVALTOOL_PROVENANCE = get_esmvaltool_provenance()
 
 
-def attribute_to_authors(entity, authors):
+def attribute_to_authors(
+    entity: prov.model.ProvEntity,
+    authors: list[dict[str, str]],
+) -> None:
     """Attribute entity to authors."""
     namespace = "author"
     create_namespace(entity.bundle, namespace)
@@ -53,7 +71,10 @@ def attribute_to_authors(entity, authors):
         entity.wasAttributedTo(agent)
 
 
-def attribute_to_projects(entity, projects):
+def attribute_to_projects(
+    entity: prov.model.ProvEntity,
+    projects: list[str],
+) -> None:
     """Attribute entity to projects."""
     namespace = "project"
     create_namespace(entity.bundle, namespace)
@@ -63,7 +84,10 @@ def attribute_to_projects(entity, projects):
         entity.wasAttributedTo(agent)
 
 
-def get_recipe_provenance(documentation, filename):
+def get_recipe_provenance(
+    documentation: dict[str, Any],
+    filename: Path,
+) -> prov.model.ProvEntity:
     """Create a provenance entity describing a recipe."""
     provenance = ProvDocument()
@@ -84,7 +108,10 @@ def get_recipe_provenance(documentation, filename):
     return entity
 
 
-def get_task_provenance(task, recipe_entity):
+def get_task_provenance(
+    task: BaseTask,
+    recipe_entity: prov.model.ProvEntity,
+) -> prov.model.ProvActivity:
     """Create a provenance activity describing a task."""
     provenance = ProvDocument()
     create_namespace(provenance, "task")
@@ -108,81 +135,100 @@ class TrackedFile:
 
     def __init__(
         self,
-        filename,
-        attributes=None,
-        ancestors=None,
-        prov_filename=None,
+        filename: DataElement | Path,
+        attributes: dict[str, Any] | None = None,
+        ancestors: Iterable[TrackedFile] | None = None,
+        prov_filename: str | None = None,
     ):
         """Create an instance of a file with provenance tracking.
 
         Arguments
         ---------
-        filename: str
-            Path to the file on disk.
-        attributes: dict
+        filename:
+            Path or data element containing the data described by the provenance.
+        attributes:
             Dictionary with facets describing the file. If set to None, this
             will be read from the file when provenance is initialized.
-        ancestors: :obj:`list` of :obj:`TrackedFile`
+        ancestors:
             Ancestor files.
-        prov_filename: str
+        prov_filename:
             The path this file has in the provenance record. This can differ
             from `filename` if the file was moved before resuming processing.
 
         """
         self._filename = filename
         if prov_filename is None:
-            self.prov_filename = filename
+            self.prov_filename = (
+                str(filename) if isinstance(filename, Path) else filename.name
+            )
         else:
             self.prov_filename = prov_filename
+
         self.attributes = copy.deepcopy(attributes)
 
         self.provenance = None
         self.entity = None
         self.activity = None
-        self._ancestors = [] if ancestors is None else ancestors
+        self._ancestors = [] if ancestors is None else list(ancestors)
+
+    @property
+    def attributes(self) -> dict[str, Any]:
+        """Attributes describing the file."""
+        if self._attributes is None:
+            msg = f"Call {self.__class__.__name__}.initialize_provenance before accessing attributes"
+            raise ValueError(msg)
+        return self._attributes
 
-    def __str__(self):
+    @attributes.setter
+    def attributes(self, value: dict[str, Any] | None) -> None:
+        """Set attributes describing the file."""
+        self._attributes = value
+
+    def __str__(self) -> str:
         """Return summary string."""
         return f"{self.__class__.__name__}: {self.filename}"
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         """Return representation string (e.g., used by ``pformat``)."""
         return f"{self.__class__.__name__}: {self.filename}"
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
         """Check if `other` equals `self`."""
         return hasattr(other, "filename") and self.filename == other.filename
 
-    def __lt__(self, other):
+    def __lt__(self, other) -> bool:
        """Check if `other` should be sorted before `self`."""
         return hasattr(other, "filename") and self.filename < other.filename
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         """Return a unique hash for the file."""
         return hash(self.filename)
 
-    def copy_provenance(self):
+    def copy_provenance(self) -> TrackedFile:
         """Create a copy with identical provenance information."""
         if self.provenance is None:
             msg = f"Provenance of {self} not initialized"
             raise ValueError(msg)
-        new = TrackedFile(self.filename, self.attributes)
+        new = TrackedFile(Path(self.filename), self.attributes)
         new.provenance = copy.deepcopy(self.provenance)
         new.entity = new.provenance.get_record(self.entity.identifier)[0]
         new.activity = new.provenance.get_record(self.activity.identifier)[0]
         return new
 
     @property
-    def filename(self):
-        """Filename."""
+    def filename(self) -> DataElement | Path:
+        """Name of data described by this provenance document."""
         return self._filename
 
     @property
-    def provenance_file(self):
-        """Filename of provenance."""
-        return os.path.splitext(self.filename)[0] + "_provenance.xml"
-
-    def initialize_provenance(self, activity):
+    def provenance_file(self) -> Path:
+        """Filename of provenance file."""
+        if not isinstance(self.filename, Path):
+            msg = f"Saving provenance is only supported for pathlib.Path, not {type(self.filename)}"
+            raise NotImplementedError(msg)
+        return self.filename.with_name(f"{self.filename.stem}_provenance.xml")
+
+    def initialize_provenance(self, activity: prov.model.ProvActivity) -> None:
         """Initialize the provenance document.
 
         Note: this also copies the ancestor provenance. Therefore, changes
@@ -191,30 +239,33 @@ def initialize_provenance(self, activity):
         """
         if self.provenance is not None:
             msg = f"Provenance of {self} already initialized"
-            raise ValueError(
-                msg,
-            )
+            raise ValueError(msg)
         self.provenance = ProvDocument()
         self._initialize_namespaces()
         self._initialize_activity(activity)
         self._initialize_entity()
         self._initialize_ancestors(activity)
 
-    def _initialize_namespaces(self):
+    def _initialize_namespaces(self) -> None:
         """Initialize the namespaces."""
         for namespace in ("file", "attribute", "preprocessor", "task"):
             create_namespace(self.provenance, namespace)
 
-    def _initialize_activity(self, activity):
+    def _initialize_activity(self, activity: prov.model.ProvActivity) -> None:
         """Copy the preprocessor task activity."""
         self.activity = activity
-        self.provenance.update(activity.bundle)
+        self.provenance.update(activity.bundle)  # type: ignore[attr-defined]
 
-    def _initialize_entity(self):
+    def _initialize_entity(self) -> None:
         """Initialize the entity representing the file."""
-        if self.attributes is None:
-            # This happens for ancestor files of preprocessor files as created
-            # in esmvalcore.preprocessor.Processorfile.__init__.
+        if self._attributes is None:
+            if not isinstance(self.filename, DataElement):
+                msg = "Delayed reading of attributes is only supported for `DataElement`s"
+                raise TypeError(msg)
+            # This is used to delay reading the attributes of ancestor files of
+            # preprocessor files as created in
+            # esmvalcore.preprocessor.PreprocessorFile.__init__ until after
+            # the data has been loaded.
self.attributes = copy.deepcopy(self.filename.attributes) attributes = { @@ -222,38 +273,44 @@ def _initialize_entity(self): for k, v in self.attributes.items() if k not in ("authors", "projects") } - self.entity = self.provenance.entity( - f"file:{self.filename}", + self.entity = self.provenance.entity( # type: ignore[attr-defined] + f"file:{self.prov_filename}", attributes, ) attribute_to_authors(self.entity, self.attributes.get("authors", [])) attribute_to_projects(self.entity, self.attributes.get("projects", [])) - def _initialize_ancestors(self, activity): + def _initialize_ancestors(self, activity: prov.model.ProvActivity) -> None: """Register ancestor files for provenance tracking.""" for ancestor in self._ancestors: if ancestor.provenance is None: - if os.path.exists(ancestor.provenance_file): + if ( + isinstance(ancestor.filename, Path) + and ancestor.provenance_file.exists() + ): ancestor.restore_provenance() else: ancestor.initialize_provenance(activity) - self.provenance.update(ancestor.provenance) + self.provenance.update(ancestor.provenance) # type: ignore[attr-defined] self.wasderivedfrom(ancestor) - def wasderivedfrom(self, other): + def wasderivedfrom( + self, + other: TrackedFile | prov.model.ProvEntity, + ) -> None: """Let the file know that it was derived from other.""" if isinstance(other, TrackedFile): other_entity = other.entity else: other_entity = other - self.provenance.update(other_entity.bundle) if not self.activity: - msg = "Activity not initialized." + msg = f"Provenance of {self} not initialized" raise ValueError(msg) + self.provenance.update(other_entity.bundle) # type: ignore[attr-defined, union-attr] self.entity.wasDerivedFrom(other_entity, self.activity) - def _select_for_include(self): + def _select_for_include(self) -> dict[str, str]: attributes = { "software": f"Created with ESMValTool v{__version__}", } @@ -262,13 +319,19 @@ def _select_for_include(self): return attributes @staticmethod - def _include_provenance_nc(filename, attributes): + def _include_provenance_nc( + filename: Path, + attributes: dict[str, str], + ) -> None: with Dataset(filename, "a") as dataset: for key, value in attributes.items(): setattr(dataset, key, value) @staticmethod - def _include_provenance_png(filename, attributes): + def _include_provenance_png( + filename: Path, + attributes: dict[str, str], + ) -> None: pnginfo = PngInfo() exif_tags = { "caption": "ImageDescription", @@ -279,8 +342,11 @@ def _include_provenance_png(filename, attributes): with Image.open(filename) as image: image.save(filename, pnginfo=pnginfo) - def _include_provenance(self): + def _include_provenance(self) -> None: """Include provenance information as metadata.""" + if not isinstance(self.filename, Path): + msg = f"Writing attributes is only supported for pathlib.Path, not {type(self.filename)}" + raise NotImplementedError(msg) attributes = self._select_for_include() # Attach provenance to supported file types @@ -289,32 +355,32 @@ def _include_provenance(self): if write: write(self.filename, attributes) - def save_provenance(self): + def save_provenance(self) -> None: """Export provenance information.""" self.provenance = ProvDocument( - records=set(self.provenance.records), - namespaces=self.provenance.namespaces, + records=set(self.provenance.records), # type: ignore[attr-defined] + namespaces=self.provenance.namespaces, # type: ignore[attr-defined] ) self._include_provenance() with open(self.provenance_file, "wb") as file: # Create file with correct permissions before saving. 
- self.provenance.serialize(file, format="xml") + self.provenance.serialize(file, format="xml") # type: ignore[attr-defined] self.activity = None self.entity = None self.provenance = None - def restore_provenance(self): + def restore_provenance(self) -> None: """Import provenance information from a previously saved file.""" self.provenance = ProvDocument.deserialize( self.provenance_file, format="xml", ) entity_uri = f"{ESMVALTOOL_URI_PREFIX}file{self.prov_filename}" - self.entity = self.provenance.get_record(entity_uri)[0] + self.entity = self.provenance.get_record(entity_uri)[0] # type: ignore[attr-defined] # Find the associated activity - for rec in self.provenance.records: + for rec in self.provenance.records: # type: ignore[attr-defined] if isinstance(rec, ProvDerivation): - if rec.args[0] == self.entity.identifier: + if rec.args[0] == self.entity.identifier: # type: ignore[attr-defined] activity_id = rec.args[2] - self.activity = self.provenance.get_record(activity_id)[0] + self.activity = self.provenance.get_record(activity_id)[0] # type: ignore[attr-defined] break diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index 70bc46eeb6..99e40baff9 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -16,7 +16,7 @@ import esmvalcore.preprocessor from esmvalcore.exceptions import InputFilesNotFound, RecipeError -from esmvalcore.local import _get_start_end_year, _parse_period +from esmvalcore.local import _parse_period from esmvalcore.preprocessor import TIME_PREPROCESSORS, PreprocessingTask from esmvalcore.preprocessor._multimodel import _get_operator_and_kwargs from esmvalcore.preprocessor._other import _get_var_info @@ -178,18 +178,26 @@ def variable( raise RecipeError(msg) +def get_no_data_message(dataset: Dataset) -> str: + """Generate a message for debugging missing data in dataset.""" + lines = [ + f"No files were found for {dataset},\nusing data sources:", + "\n".join( + f"- data source: {data_source}\n message: {data_source.debug_info}" + for data_source in sorted( + dataset._used_data_sources, # noqa: SLF001 + key=lambda d: d.priority, + ) + ), + ] + return "\n".join(lines) + + def _log_data_availability_errors(dataset: Dataset) -> None: """Check if the required input data is available.""" - input_files = dataset.files - patterns = dataset._file_globs # noqa: SLF001 - if not input_files: - logger.error("No input files found for %s", dataset) - if patterns: - if len(patterns) == 1: - msg = f": {patterns[0]}" - else: - msg = "\n{}".format("\n".join(str(p) for p in patterns)) - logger.error("Looked for files matching%s", msg) + if not dataset.files: + msg = get_no_data_message(dataset) + logger.error(msg) logger.error("Set 'log_level' to 'debug' to get more information") @@ -231,7 +239,9 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: msg = f"Missing data for {dataset.summary(True)}" raise InputFilesNotFound(msg) - if "timerange" not in facets: + if "timerange" not in facets or any( + "timerange" not in f.facets for f in input_files + ): return start_date, end_date = _parse_period(facets["timerange"]) @@ -241,8 +251,10 @@ def data_availability(dataset: Dataset, log: bool = True) -> None: available_years: set[int] = set() for file in input_files: - start, end = _get_start_end_year(file) - available_years.update(range(start, end + 1)) + start_date, end_date = file.facets["timerange"].split("/") # type: ignore[union-attr] + start_year = int(start_date[:4]) + end_year = int(end_date[:4]) + 
available_years.update(range(start_year, end_year + 1)) missing_years = required_years - available_years if missing_years: diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index d6285c9aed..da159183ba 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -23,6 +23,7 @@ from esmvalcore.dataset import Dataset from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.local import ( + GRIB_FORMATS, _dates_to_timerange, _get_multiproduct_filename, _get_output_file, @@ -38,7 +39,6 @@ PreprocessorFile, ) from esmvalcore.preprocessor._area import _update_shapefile_path -from esmvalcore.preprocessor._io import GRIB_FORMATS from esmvalcore.preprocessor._multimodel import _get_stat_identifier from esmvalcore.preprocessor._regrid import ( _spec_to_latlonvals, @@ -60,6 +60,7 @@ from collections.abc import Iterable, Sequence from esmvalcore.config import Session + from esmvalcore.io.protocol import DataElement from esmvalcore.typing import Facets logger = logging.getLogger(__name__) @@ -328,20 +329,12 @@ def _update_weighting_settings( _exclude_dataset(settings, facets, "weighting_landsea_fraction") -def _add_to_download_list(dataset: Dataset) -> None: - """Add the files of `dataset` to `DOWNLOAD_FILES`.""" - for i, file in enumerate(dataset.files): - if isinstance(file, esgf.ESGFFile): - DOWNLOAD_FILES.add(file) - dataset.files[i] = file.local_file(dataset.session["download_dir"]) - - def _schedule_for_download(datasets: Iterable[Dataset]) -> None: """Schedule files for download.""" for dataset in datasets: - _add_to_download_list(dataset) + DOWNLOAD_FILES.update(dataset.files) for supplementary_ds in dataset.supplementaries: - _add_to_download_list(supplementary_ds) + DOWNLOAD_FILES.update(supplementary_ds.files) def _log_input_files(datasets: Iterable[Dataset]) -> None: @@ -367,12 +360,7 @@ def _log_input_files(datasets: Iterable[Dataset]) -> None: def _get_files_str(dataset: Dataset) -> str: """Get nice string representation of all files of a dataset.""" - return "\n".join( - f" {f}" - if f.exists() # type: ignore - else f" {f} (will be downloaded)" - for f in dataset.files - ) + return "\n".join(f" {f}" for f in dataset.files) def _check_input_files(input_datasets: Iterable[Dataset]) -> set[str]: @@ -455,10 +443,7 @@ def _get_common_attributes( # Ensure that attributes start_year and end_year are always available if at # least one of the input datasets defines it - if "timerange" in attributes: - start_year, end_year = _parse_period(attributes["timerange"]) - attributes["start_year"] = int(str(start_year[0:4])) - attributes["end_year"] = int(str(end_year[0:4])) + _set_start_end_year(attributes) return attributes @@ -722,7 +707,7 @@ def _get_preprocessor_products( ) for product in products: - _set_start_end_year(product) + _set_start_end_year(product.attributes) product.check() return products @@ -782,18 +767,18 @@ def _configure_multi_product_preprocessor( for product in multimodel_products | ensemble_products: product.check() - _set_start_end_year(product) + _set_start_end_year(product.attributes) -def _set_start_end_year(product: PreprocessorFile) -> None: +def _set_start_end_year(attributes: dict[str, Any]) -> None: """Set the attributes `start_year` and `end_year`. These attributes are used by many diagnostic scripts in ESMValTool. 
""" - if "timerange" in product.attributes: - start_year, end_year = _parse_period(product.attributes["timerange"]) - product.attributes["start_year"] = int(str(start_year[0:4])) - product.attributes["end_year"] = int(str(end_year[0:4])) + if "timerange" in attributes: + start_year, end_year = _parse_period(attributes["timerange"]) + attributes["start_year"] = int(str(start_year[0:4])) + attributes["end_year"] = int(str(end_year[0:4])) def _update_preproc_functions( @@ -916,7 +901,7 @@ def __init__( # Clear the global variable containing the set of files to download DOWNLOAD_FILES.clear() USED_DATASETS.clear() - self._download_files: set[esgf.ESGFFile] = set() + self._download_files: set[DataElement] = set() self.session = session self.session["write_ncl_interface"] = self._need_ncl( raw_recipe["diagnostics"], @@ -945,7 +930,7 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: for task in exc.failed_tasks: logger.error(task.message) - if self.session["search_esgf"] == "never" and any( + if any( isinstance(err, InputFilesNotFound) for err in exc.failed_tasks ): logger.error( @@ -953,27 +938,20 @@ def _log_recipe_errors(self, exc: RecipeError) -> None: "found.", ) logger.error( - "If the files are available locally, please check " - "your `rootpath` and `drs` settings in your configuration " - "file(s)", + "If the files are available, please check the debug log and " + "the configuration of your data sources.", ) logger.error( - "To automatically download the required files to " - "`download_dir: %s`, use `search_esgf: when_missing` or " - "`search_esgf: always` in your configuration file(s), or run " - "the recipe with the command line argument " - "--search_esgf=when_missing or --search_esgf=always", - self.session["download_dir"], + "To see your current configuration, run the command " + '`esmvaltool config show` and check the "data" entries.', ) - logger.info( - "Note that automatic download is only available for files" - " that are hosted on the ESGF, i.e. 
for projects: %s, and %s", - ", ".join(list(esgf.facets.FACETS)[:-1]), - list(esgf.facets.FACETS)[-1], + logger.error( + "To see the available example configurations, run the " + "command: `esmvaltool config list`.", ) @staticmethod - def _need_ncl(raw_diagnostics: Diagnostic) -> bool: + def _need_ncl(raw_diagnostics: dict[str, Diagnostic]) -> bool: if not raw_diagnostics: return False for diagnostic in raw_diagnostics.values(): @@ -996,8 +974,8 @@ def _initialize_provenance(self, raw_documentation: dict[str, Any]): def _initialize_diagnostics( self, - raw_diagnostics: Diagnostic, - ) -> Diagnostic: + raw_diagnostics: dict[str, Diagnostic], + ) -> dict[str, Diagnostic]: """Define diagnostics in recipe.""" logger.debug("Retrieving diagnostics from recipe") check.diagnostics(raw_diagnostics) @@ -1013,7 +991,7 @@ def _initialize_diagnostics( variable_names = tuple(raw_diagnostic.get("variables", {})) diagnostic["scripts"] = self._initialize_scripts( name, - raw_diagnostic.get("scripts"), + raw_diagnostic.get("scripts", {}), variable_names, ) for key in ("themes", "realms"): @@ -1342,8 +1320,10 @@ def run(self) -> None: filled_recipe = self.write_filled_recipe() # Download required data - if self.session["search_esgf"] != "never": - esgf.download(self._download_files, self.session["download_dir"]) + # Add a special case for ESGF files to enable parallel downloads + esgf.download(self._download_files) + for file in self._download_files: + file.prepare() self.tasks.run(max_parallel_tasks=self.session["max_parallel_tasks"]) logger.info( diff --git a/esmvalcore/_recipe/to_datasets.py b/esmvalcore/_recipe/to_datasets.py index 7aab83719b..e335417175 100644 --- a/esmvalcore/_recipe/to_datasets.py +++ b/esmvalcore/_recipe/to_datasets.py @@ -5,14 +5,14 @@ import logging from collections.abc import Iterable, Iterator, Sequence from copy import deepcopy -from numbers import Number from typing import TYPE_CHECKING, Any +from esmvalcore._recipe.check import get_no_data_message from esmvalcore.cmor.table import _CMOR_KEYS, _update_cmor_facets from esmvalcore.dataset import INHERITED_FACETS, Dataset, _isglob from esmvalcore.esgf.facets import FACETS from esmvalcore.exceptions import RecipeError -from esmvalcore.local import LocalFile, _replace_years_with_timerange +from esmvalcore.local import _replace_years_with_timerange from esmvalcore.preprocessor._derive import get_required from esmvalcore.preprocessor._io import DATASET_KEYS from esmvalcore.preprocessor._supplementary_vars import ( @@ -219,7 +219,7 @@ def _get_supplementary_short_names( var_facets = dict(facets) _update_cmor_facets(var_facets) realms = var_facets.get("modeling_realm", []) - if isinstance(realms, (str, Number, bool)): + if isinstance(realms, (str, int)): realms = [str(realms)] ocean_realms = {"ocean", "seaIce", "ocnBgchem"} is_ocean_variable = any(realm in ocean_realms for realm in realms) @@ -504,37 +504,31 @@ def _report_unexpanded_globs( msg = ( "Unable to replace " + ", ".join(f"{k}={v}" for k, v in unexpanded_globs.items()) - + f" by a value for\n{unexpanded_ds}" + + f" by a value for {unexpanded_ds}" ) # Set supplementaries to [] to avoid searching for supplementary files expanded_ds.supplementaries = [] if expanded_ds.files: - if any(isinstance(f, LocalFile) for f in expanded_ds.files): - paths_msg = "paths to the " - else: - paths_msg = "" msg = ( - f"{msg}\nDo the {paths_msg}files:\n" + f"{msg}\nPlease check why the files:\n" + "\n".join( f"{f} with facets: {f.facets}" for f in expanded_ds.files ) - + "\nprovide the missing 
facet values?" + + "\ndo not provide the missing facet values. This will depend on " + "the data source they come from, e.g. can they be extracted from the " + "path for local files, or are they available from ESGF when " + "when searching ESGF for files?" ) else: timerange = expanded_ds.facets.get("timerange") - patterns = expanded_ds._file_globs # noqa: SLF001 + no_data_message = get_no_data_message(expanded_ds) msg = ( - f"{msg}\nNo files found matching:\n" - + "\n".join(str(p) for p in patterns) # type: ignore[union-attr] - + ( # type:ignore - f"\nwithin the requested timerange {timerange}." - if timerange - else "" - ) + f"{msg}\nbecause {no_data_message[0].lower()}{no_data_message[1:]}" ) - + if timerange: + msg += f"\nwithin the requested timerange {timerange}." return msg diff --git a/esmvalcore/_task.py b/esmvalcore/_task.py index d2c0831ed3..fe77472888 100644 --- a/esmvalcore/_task.py +++ b/esmvalcore/_task.py @@ -351,7 +351,7 @@ def __init__(self, prev_preproc_dir, preproc_dir, name): for prov_filename, attributes in prev_metadata.items(): # Update the filename in case the output directory was moved # since the original run - filename = str(prev_preproc_dir / Path(prov_filename).name) + filename = prev_preproc_dir / Path(prov_filename).name attributes["filename"] = filename product = TrackedFile( filename, @@ -676,7 +676,7 @@ def _run(self, input_files): msg, ) - def _collect_provenance(self): + def _collect_provenance(self) -> None: """Process provenance information provided by the diagnostic script.""" provenance_file = ( Path(self.settings["run_dir"]) / "diagnostic_provenance.yml" @@ -766,7 +766,7 @@ def _collect_provenance(self): TAGS.replace_tags_in_dict(attributes) - product = TrackedFile(filename, attributes, ancestors) + product = TrackedFile(Path(filename), attributes, ancestors) product.initialize_provenance(self.activity) _write_citation_files(product.filename, product.provenance) product.save_provenance() diff --git a/esmvalcore/cmor/_fixes/icon/_base_fixes.py b/esmvalcore/cmor/_fixes/icon/_base_fixes.py index c4c12da334..4023851862 100644 --- a/esmvalcore/cmor/_fixes/icon/_base_fixes.py +++ b/esmvalcore/cmor/_fixes/icon/_base_fixes.py @@ -23,9 +23,10 @@ from iris.cube import Cube, CubeList from iris.mesh import Connectivity, MeshXY +import esmvalcore.local from esmvalcore.cmor._fixes.native_datasets import NativeDatasetFix +from esmvalcore.config._data_sources import _get_data_sources from esmvalcore.iris_helpers import add_leading_dim_to_cube, date2num -from esmvalcore.local import _get_data_sources logger = logging.getLogger(__name__) @@ -322,10 +323,11 @@ def _get_grid_from_cube_attr(self, cube: Cube) -> Cube: def _get_grid_from_rootpath(self, grid_name: str) -> CubeList | None: """Try to get grid from the ICON rootpath.""" glob_patterns: list[Path] = [] - for data_source in _get_data_sources("ICON"): - glob_patterns.extend( - data_source.get_glob_patterns(**self.extra_facets), - ) + for data_source in _get_data_sources(self.session, "ICON"): # type: ignore[arg-type] + if isinstance(data_source, esmvalcore.local.LocalDataSource): + glob_patterns.extend( + data_source._get_glob_patterns(**self.extra_facets), # noqa: SLF001 + ) possible_grid_paths = [d.parent / grid_name for d in glob_patterns] for grid_path in possible_grid_paths: if grid_path.is_file(): diff --git a/esmvalcore/config/_config.py b/esmvalcore/config/_config.py index 6a3670a7ca..121ee2b126 100644 --- a/esmvalcore/config/_config.py +++ b/esmvalcore/config/_config.py @@ -94,7 +94,7 @@ def 
warn_if_old_extra_facets_exist() -> None: ) -def load_config_developer(cfg_file): +def load_config_developer(cfg_file) -> dict: """Read the developer's configuration file.""" with open(cfg_file, encoding="utf-8") as file: cfg = yaml.safe_load(file) @@ -120,6 +120,7 @@ def load_config_developer(cfg_file): CFG[project] = settings read_cmor_tables(cfg_file) + return cfg def get_project_config(project): diff --git a/esmvalcore/config/_config_object.py b/esmvalcore/config/_config_object.py index bb3dcaba0d..fcdddc9c37 100644 --- a/esmvalcore/config/_config_object.py +++ b/esmvalcore/config/_config_object.py @@ -11,15 +11,14 @@ import dask.config import esmvalcore +from esmvalcore.config._config import load_config_developer from esmvalcore.config._config_validators import ( _deprecated_options_defaults, _deprecators, _validators, ) from esmvalcore.config._validated_config import ValidatedConfig -from esmvalcore.exceptions import ( - InvalidConfigParameter, -) +from esmvalcore.exceptions import InvalidConfigParameter if TYPE_CHECKING: from collections.abc import Iterable, Mapping @@ -77,10 +76,7 @@ class Config(ValidatedConfig): _validate = _validators _deprecate = _deprecators _deprecated_defaults = _deprecated_options_defaults - _warn_if_missing = ( - ("drs", URL), - ("rootpath", URL), - ) + _warn_if_missing = (("projects", URL),) def __init__(self, *args, **kwargs): """Initialize class instance.""" @@ -133,7 +129,10 @@ def load_from_dirs(self, dirs: Iterable[str | Path]) -> None: new_config_dict = self._get_config_dict_from_dirs(dirs) self.clear() self.update(new_config_dict) - + # Add known projects from config-developer file while we still have it. + for project in load_config_developer(self["config_developer_file"]): + if project not in self["projects"]: + self["projects"][project] = {} self.check_missing() def reload(self) -> None: diff --git a/esmvalcore/config/_config_validators.py b/esmvalcore/config/_config_validators.py index 1b995c6d32..efb5e2538f 100644 --- a/esmvalcore/config/_config_validators.py +++ b/esmvalcore/config/_config_validators.py @@ -33,6 +33,11 @@ "always", # Always search ESGF for files ) +SEARCH_DATA_OPTIONS = ( + "quick", # Stop searching as soon as a result is found + "complete", # Search all configured data sources +) + class ValidationError(ValueError): """Custom validation error.""" @@ -307,6 +312,19 @@ def validate_search_esgf(value): return value +def validate_search_data(value): + """Validate options for data search.""" + value = validate_string(value) + value = value.lower() + if value not in SEARCH_DATA_OPTIONS: + msg = ( + f"`{value}` is not a valid option for `search_data`, possible " + f"values are {SEARCH_DATA_OPTIONS}" + ) + raise ValidationError(msg) + return value + + def validate_diagnostics( diagnostics: Iterable[str] | str | None, ) -> set[str] | None: @@ -347,6 +365,7 @@ def validate_projects(value: Any) -> Any: """Validate projects mapping.""" mapping = validate_dict(value) options_for_project: dict[str, Callable[[Any], Any]] = { + "data": validate_dict, # TODO: try to create data sources here "extra_facets": validate_dict, } for project, project_config in mapping.items(): @@ -386,6 +405,7 @@ def validate_projects(value: Any) -> Any: "rootpath": validate_rootpath, "run_diagnostic": validate_bool, "save_intermediary_cubes": validate_bool, + "search_data": validate_search_data, "search_esgf": validate_search_esgf, "skip_nonexistent": validate_bool, # From recipe @@ -459,10 +479,131 @@ def deprecate_extra_facets_dir( _handle_deprecation(option, 
deprecated_version, remove_version, more_info)
+
+
+def deprecate_rootpath(
+    validated_config: ValidatedConfig,
+    value: Any,
+    validated_value: Any,
+) -> None:
+    """Deprecate ``rootpath`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``rootpath`` option.
+    validated_value:
+        Validated value for ``rootpath`` option.
+
+    """
+    validated_config  # noqa: B018
+    value  # noqa: B018
+    validated_value  # noqa: B018
+    option = "rootpath"
+    deprecated_version = "2.14.0"
+    remove_version = "2.16.0"
+    more_info = " Please configure data sources under `projects` instead."
+    _handle_deprecation(option, deprecated_version, remove_version, more_info)
+
+
+def deprecate_drs(
+    validated_config: ValidatedConfig,  # noqa: ARG001
+    value: Any,  # noqa: ARG001
+    validated_value: Any,  # noqa: ARG001
+) -> None:
+    """Deprecate ``drs`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``drs`` option.
+    validated_value:
+        Validated value for ``drs`` option.
+
+    """
+    more_info = " Please configure data sources under `projects` instead."
+    _handle_deprecation("drs", "2.14.0", "2.16.0", more_info)
+
+
+def deprecate_download_dir(
+    validated_config: ValidatedConfig,  # noqa: ARG001
+    value: Any,  # noqa: ARG001
+    validated_value: Any,  # noqa: ARG001
+) -> None:
+    """Deprecate ``download_dir`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``download_dir`` option.
+    validated_value:
+        Validated value for ``download_dir`` option.
+
+    """
+    more_info = " Please configure data sources under `projects` instead."
+    _handle_deprecation("download_dir", "2.14.0", "2.16.0", more_info)
+
+
+def deprecate_search_esgf(
+    validated_config: ValidatedConfig,
+    value: Any,  # noqa: ARG001
+    validated_value: Any,
+) -> None:
+    """Deprecate ``search_esgf`` option.
+
+    Parameters
+    ----------
+    validated_config:
+        ``ValidatedConfig`` instance which will be modified in place.
+    value:
+        Raw input value for ``search_esgf`` option.
+    validated_value:
+        Validated value for ``search_esgf`` option.
+
+    """
+    translate = {
+        "when_missing": "quick",
+        "always": "complete",
+    }
+    messages = {
+        "never": " Please configure only offline data sources under `projects` instead.",
+    } | {
+        k: f" Please use `search_data: {v}` instead of `search_esgf: {k}`."
+        for k, v in translate.items()
+    }
+
+    if (
+        validated_value in translate
+        and validated_config["search_data"] != translate[validated_value]
+    ):
+        logger.warning(
+            "Changing `search_data` to `%s` due to use of deprecated `search_esgf: %s`."
+            " Please update your configuration to use `search_data` directly. Support for "
+            "the `search_esgf` option will no longer be available in ESMValCore version 2.16.0.",
+            translate[validated_value],
+            validated_value,
+        )
+
+    _handle_deprecation(
+        "search_esgf",
+        "2.14.0",
+        "2.16.0",
+        more_info=messages.get(validated_value, ""),
+    )
+
+
 # Example usage: see removed files in
 # https://github.com/ESMValGroup/ESMValCore/pull/2213
 _deprecators: dict[str, Callable] = {
     "extra_facets_dir": deprecate_extra_facets_dir,  # TODO: remove in v2.15.0
+    "drs": deprecate_drs,  # TODO: remove in v2.16.0
+    "rootpath": deprecate_rootpath,  # TODO: remove in v2.16.0
+    "download_dir": deprecate_download_dir,  # TODO: remove in v2.16.0
+    "search_esgf": deprecate_search_esgf,  # TODO: remove in v2.16.0
 }
diff --git a/esmvalcore/config/_data_sources.py b/esmvalcore/config/_data_sources.py
new file mode 100644
index 0000000000..903ae7c5bf
--- /dev/null
+++ b/esmvalcore/config/_data_sources.py
@@ -0,0 +1,87 @@
+"""Module for configuring data sources."""
+
+import logging
+
+import yaml
+
+import esmvalcore.esgf
+import esmvalcore.esgf.facets
+import esmvalcore.local
+from esmvalcore.config import Session
+from esmvalcore.exceptions import InvalidConfigParameter, RecipeError
+from esmvalcore.io import load_data_sources
+from esmvalcore.io.protocol import DataSource
+
+logger = logging.getLogger(__name__)
+
+
+def _get_data_sources(
+    session: Session,
+    project: str,
+) -> list[DataSource]:
+    """Get the list of available data sources, including legacy configuration.
+
+    Parameters
+    ----------
+    session:
+        The configuration.
+    project:
+        The project to get data sources for.
+
+    Returns
+    -------
+    :obj:`list` of :obj:`DataSource`:
+        A list of available data sources.
+
+    Raises
+    ------
+    InvalidConfigParameter:
+        If the project or its settings are not found in the configuration.
+
+    """
+    try:
+        return load_data_sources(session, project)
+    except ValueError:
+        pass
+
+    # Use legacy data sources from config-user.yml and config-developer.yml.
+    data_sources: list[DataSource] = []
+    try:
+        legacy_local_data_sources = esmvalcore.local._get_data_sources(project)  # noqa: SLF001
+    except (RecipeError, KeyError):
+        # The project is not configured in config-developer.yml
+        legacy_local_data_sources = []
+    else:
+        if (
+            session.get("search_esgf", "") != "never"
+            and project in esmvalcore.esgf.facets.FACETS
+        ):
+            data_source = esmvalcore.esgf.ESGFDataSource(
+                name="legacy-esgf",
+                project=project,
+                priority=2,
+                download_dir=session["download_dir"],
+            )
+            data_sources.append(data_source)
+        data_sources.extend(legacy_local_data_sources)
+
+    if not data_sources:
+        cfg_snippet = {
+            "projects": {
+                p: {
+                    "data": session["projects"].get(p, {}).get("data", {}),
+                }
+                for p in (
+                    session["projects"] if project is None else [project]
+                )
+            },
+        }
+        msg = (
+            f"No data sources found for project '{project}'. 
Current configuration:\n" + f"{yaml.safe_dump(cfg_snippet)}" + "Please configure a data source by following the instructions at " + "https://docs.esmvaltool.org/projects/ESMValCore/en/latest/" + "quickstart/configure.html#project-specific-configuration" + ) + raise InvalidConfigParameter(msg) + return data_sources diff --git a/esmvalcore/config/_validated_config.py b/esmvalcore/config/_validated_config.py index 0dfca3b521..624068c411 100644 --- a/esmvalcore/config/_validated_config.py +++ b/esmvalcore/config/_validated_config.py @@ -57,7 +57,7 @@ class ValidatedConfig(MutableMapping): """ # validate values on the way in - def __init__(self, *args, **kwargs): + def __init__(self, *args, **kwargs) -> None: super().__init__() self._mapping: dict[str, Any] = {} self.update(*args, **kwargs) diff --git a/esmvalcore/config/config-logging.yml b/esmvalcore/config/config-logging.yml index 6635ca63ec..64c0f0bb38 100644 --- a/esmvalcore/config/config-logging.yml +++ b/esmvalcore/config/config-logging.yml @@ -1,25 +1,32 @@ # Logger configuration --- - version: 1 disable_existing_loggers: false formatters: console: - format: '%(asctime)s UTC [%(process)d] %(levelname)-7s %(message)s' + format: "%(asctime)s UTC [%(process)d] %(levelname)-7s %(message)s" brief: - format: '%(levelname)-7s [%(process)d] %(message)s' + format: "%(levelname)-7s [%(process)d] %(message)s" debug: - format: '%(asctime)s UTC [%(process)d] %(levelname)-7s %(name)s:%(lineno)s %(message)s' + format: "%(asctime)s UTC [%(process)d] %(levelname)-7s %(name)s:%(lineno)s %(message)s" filters: - only_cmor: # only events from CMOR check and generic fixes + only_ours: # only keep events from known loggers + (): esmvalcore.config._logging.FilterMultipleNames + names: [esmvalcore, intake-esgf] + mode: allow + only_cmor: # only events from CMOR check and generic fixes (): esmvalcore.config._logging.FilterMultipleNames names: [esmvalcore.cmor.check, esmvalcore.cmor._fixes.fix.genericfix] mode: allow - no_cmor: # no events from CMOR check and generic fixes + no_cmor: # no events from CMOR check and generic fixes (): esmvalcore.config._logging.FilterMultipleNames names: [esmvalcore.cmor.check, esmvalcore.cmor._fixes.fix.genericfix] mode: disallow - no_external_warnings: # no events from external Python warnings + no_intake_esgf: # no events from intake-esgf + (): esmvalcore.config._logging.FilterMultipleNames + names: ["intake-esgf"] + mode: disallow + no_external_warnings: # no events from external Python warnings (): esmvalcore.config._logging.FilterExternalWarnings handlers: console: @@ -27,21 +34,22 @@ handlers: level: INFO formatter: console stream: ext://sys.stdout - filters: [no_cmor, no_external_warnings] + filters: [only_ours, no_cmor, no_external_warnings, no_intake_esgf] simple_log_file: class: logging.FileHandler level: INFO formatter: brief filename: main_log.txt mode: w - filters: [no_cmor, no_external_warnings] + filters: [only_ours, no_cmor, no_external_warnings, no_intake_esgf] debug_log_file: class: logging.FileHandler level: DEBUG formatter: debug filename: main_log_debug.txt mode: w - cmor_log: # only contains output from CMOR check and generic fixes + filters: [only_ours] + cmor_log: # only contains output from CMOR check and generic fixes class: logging.FileHandler level: INFO formatter: brief diff --git a/esmvalcore/config/configurations/data-esmvalcore-esgf.yml b/esmvalcore/config/configurations/data-esmvalcore-esgf.yml new file mode 100644 index 0000000000..4ebae5e04a --- /dev/null +++ 
b/esmvalcore/config/configurations/data-esmvalcore-esgf.yml
@@ -0,0 +1,20 @@
+# Download CMIP, CORDEX, and obs4MIPs data from ESGF using the `esmvalcore.esgf`
+# module, which uses the legacy ESGF search interface.
+projects:
+  CMIP6: &esgf-pyclient-data
+    data:
+      esgf-pyclient:
+        type: "esmvalcore.esgf.ESGFDataSource"
+        download_dir: ~/climate_data
+        # Use a lower priority than for esmvalcore.local.LocalDataSource
+        # sources, so that ESGF is only searched when no local files are
+        # found, as with the legacy setting `search_esgf: when_missing`.
+        priority: 10
+  CMIP5:
+    <<: *esgf-pyclient-data
+  CMIP3:
+    <<: *esgf-pyclient-data
+  CORDEX:
+    <<: *esgf-pyclient-data
+  obs4MIPs:
+    <<: *esgf-pyclient-data
diff --git a/esmvalcore/config/configurations/data-hpc-badc.yml b/esmvalcore/config/configurations/data-hpc-badc.yml
new file mode 100644
index 0000000000..9bcc90e8d9
--- /dev/null
+++ b/esmvalcore/config/configurations/data-hpc-badc.yml
@@ -0,0 +1,51 @@
+# Read CMIP, CORDEX, obs4MIPs, OBS6, and OBS data from the filesystem at CEDA/JASMIN.
+projects:
+  CMIP6:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /badc/cmip6/data
+        dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc"
+  CMIP5:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /badc/cmip5/data
+        dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}"
+        filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc"
+  CMIP3:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /badc/cmip3_drs/data
+        dirname_template: "{project.lower}/output/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{short_name}/{ensemble}/{version}"
+        filename_template: "{short_name}_*.nc"
+  CORDEX:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /badc/cordex/data
+        dirname_template: "{project}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}"
+        filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc"
+  obs4MIPs:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /gws/nopw/j04/esmeval/obsdata-v2
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{short_name}_*.nc"
+  OBS6:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /gws/nopw/j04/esmeval/obsdata-v2
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
+  OBS:
+    data:
+      badc:
+        type: "esmvalcore.local.LocalDataSource"
+        rootpath: /gws/nopw/j04/esmeval/obsdata-v2
+        dirname_template: "Tier{tier}/{dataset}"
+        filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc"
diff --git a/esmvalcore/config/configurations/data-hpc-dkrz.yml b/esmvalcore/config/configurations/data-hpc-dkrz.yml
new file mode 100644
index 0000000000..3ad4a4fb31
--- /dev/null
+++ b/esmvalcore/config/configurations/data-hpc-dkrz.yml
@@ -0,0 +1,93 @@
+# Read CMIP, CORDEX, obs4MIPs, native6, OBS6, and OBS data from the filesystem of Levante at DKRZ.
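+
+# Each project lists both the central DKRZ data pool and the shared ESMValTool
+# download cache (the `esgf-cache` entries) as data sources.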
+projects: + CMIP6: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/ik1017/CMIP6/data + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/kd0956/CMIP5/data + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/CMIP3 + dirname_template: "{exp}/{modeling_realm}/{frequency}/{short_name}/{dataset}/{ensemble}" + filename_template: "{short_name}_*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}" + filename_template: "{short_name}_*.nc" + CORDEX: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/ik1017/C3SCORDEX/data/c3s-cordex/output + dirname_template: "{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{dataset}/{rcm_version}/{frequency}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + obs4MIPs: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/OBS + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + esgf-cache: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/download + dirname_template: "{project}/{dataset}/{version}" + filename_template: "{short_name}_*.nc" + native6: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/RAWOBS + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + # ERA5 data in GRIB format: + # https://docs.dkrz.de/doc/dataservices/finding_and_accessing_data/era_data/index.html#pool-data-era5-file-and-directory-names + dkrz-era5: + type: "esmvalcore.local.LocalDataSource" + rootpath: /pool/data/ERA5 + dirname_template: "{family}/{level}/{type}/{tres}/{grib_id}" + filename_template: "{family}{level}{typeid}_{tres}_*_{grib_id}.grb" + OBS6: + data: + 
dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/OBS + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + dkrz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /work/bd0854/DATA/ESMValTool2/OBS + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/data-hpc-ethz.yml b/esmvalcore/config/configurations/data-hpc-ethz.yml new file mode 100644 index 0000000000..390ed055ac --- /dev/null +++ b/esmvalcore/config/configurations/data-hpc-ethz.yml @@ -0,0 +1,30 @@ +# Read CMIP and OBS data from the filesystem at ETHZ. +projects: + CMIP6: + data: + ethz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /net/atmos/data + dirname_template: "{project.lower}/{exp}/{mip}/{short_name}/{dataset}/{ensemble}/{grid}/" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + ethz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /net/atmos/data + dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + ethz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /net/atmos/data + dirname_template: "{project.lower}/{exp}/{modeling_realm}/{frequency}/{short_name}/{dataset}/{ensemble}" + filename_template: "{short_name}_*.nc" + OBS: + data: + ethz: + type: "esmvalcore.local.LocalDataSource" + rootpath: /net/exo/landclim/PROJECTS/C3S/datadir/obsdir/ + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/data-hpc-ipsl.yml b/esmvalcore/config/configurations/data-hpc-ipsl.yml new file mode 100644 index 0000000000..914409643d --- /dev/null +++ b/esmvalcore/config/configurations/data-hpc-ipsl.yml @@ -0,0 +1,37 @@ +# Read CMIP, CORDEX, and obs4MIPs data from the filesystem at IPSL. 
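+
+# To enable these data sources, copy this file into your configuration
+# directory, e.g. with the command: `esmvaltool config copy data-hpc-ipsl.yml`.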
+projects: + CMIP6: + data: + ipsl: + type: "esmvalcore.local.LocalDataSource" + rootpath: /bdd + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + ipsl: + type: "esmvalcore.local.LocalDataSource" + rootpath: /bdd + dirname_template: "{project}/output/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + ipsl: + type: "esmvalcore.local.LocalDataSource" + rootpath: /bdd + dirname_template: "{project}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}/{short_name}" + filename_template: "{short_name}_*.nc" + CORDEX: + data: + ipsl: + type: "esmvalcore.local.LocalDataSource" + rootpath: /bdd + dirname_template: "{project}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + obs4MIPs: + data: + ipsl: + type: "esmvalcore.local.LocalDataSource" + rootpath: /bdd + dirname_template: "{project}/obs-CFMIP/observations/{realm}/{short_name}/{frequency}/{grid}/{institute}/{dataset}/{version}" + filename_template: "{short_name}_*.nc" diff --git a/esmvalcore/config/configurations/data-hpc-mo.yml b/esmvalcore/config/configurations/data-hpc-mo.yml new file mode 100644 index 0000000000..686c93b4d9 --- /dev/null +++ b/esmvalcore/config/configurations/data-hpc-mo.yml @@ -0,0 +1,63 @@ +# Read CMIP, CORDEX, obs4MIPs, OBS6, and OBS data from the filesystem at the UK Met Office. 
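+
+# The `mo-old-vdi` entries reuse the settings of the `mo` entries via YAML
+# anchors and override only `rootpath` and `priority`; the larger `priority`
+# value should make them fallbacks that are searched after the `mo` sources.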
+projects: + CMIP6: + data: + mo: &cmip6 + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/managecmip/champ + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + mo-old-vdi: + <<: *cmip6 + priority: 2 + rootpath: /project/champ/data + CMIP5: + data: + mo: &cmip5 + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/managecmip/champ + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + mo-old-vdi: + <<: *cmip5 + priority: 2 + rootpath: /project/champ/data + CORDEX: + data: + mo: &cordex + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/managecmip/champ + dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{institute}-{dataset}/{rcm_version}/{mip}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + mo-old-vdi: + <<: *cordex + priority: 2 + rootpath: /project/champ/data + obs4MIPs: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + native6: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/rawobs + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + OBS6: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + mo: + type: "esmvalcore.local.LocalDataSource" + rootpath: /data/users/esmval/ESMValTool/obs + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/data-hpc-nci.yml b/esmvalcore/config/configurations/data-hpc-nci.yml new file mode 100644 index 0000000000..dc960c0efc --- /dev/null +++ b/esmvalcore/config/configurations/data-hpc-nci.yml @@ -0,0 +1,67 @@ +# Read CMIP, obs4MIPs, native6, OBS6, and OBS data from the filesystem at NCI. 
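+
+# Several data sources are configured for some projects; with the option
+# `search_data: quick` the search stops as soon as one source returns files,
+# while `search_data: complete` searches all configured sources.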
+projects: + CMIP6: + data: + oi10: &cmip6 + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/oi10/replicas + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + fs38: + <<: *cmip6 + rootpath: /g/data/fs38/publications + xp65: + <<: *cmip6 + rootpath: /g/data/xp65/public/apps/esmvaltool/replicas + CMIP5: + data: + r87: &cmip5 + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/r87/DRSv3/CMIP5 + dirname_template: "{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + al33: + <<: *cmip5 + rootpath: /g/data/al33/replicas/CMIP5/combined + rr3: &cmip5-default + <<: *cmip5 + rootpath: /g/data/rr3/publications + dirname_template: "{project}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}/{short_name}" + xp65: + <<: *cmip5-default + rootpath: /g/data/xp65/public/apps/esmvaltool/replicas + CMIP3: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/r87/DRSv3/CMIP3 + dirname_template: "{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{latestversion}" + filename_template: "{short_name}_*.nc" + obs4MIPs: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{short_name}_*.nc" + native6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/xp65/public/apps/esmvaltool/native6 + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + OBS6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + OBS: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/data-intake-esgf.yml b/esmvalcore/config/configurations/data-intake-esgf.yml new file mode 100644 index 0000000000..22b6b7c7a0 --- /dev/null +++ b/esmvalcore/config/configurations/data-intake-esgf.yml @@ -0,0 +1,77 @@ +# Read CMIP and obs4MIPs data from ESGF using intake-esgf. 
+projects: + CMIP6: + data: + intake-esgf: + type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource" + facets: + activity: "activity_drs" + dataset: "source_id" + ensemble: "member_id" + exp: "experiment_id" + institute: "institution_id" + grid: "grid_label" + mip: "table_id" + project: "project" + short_name: "variable_id" + CMIP5: + data: + intake-esgf: + type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource" + facets: + dataset: "model" + ensemble: "ensemble" + exp: "experiment" + frequency: "time_frequency" + institute: "institute" + mip: "cmor_table" + product: "product" + project: "project" + short_name: "variable" + values: + dataset: + "ACCESS1-0": "ACCESS1.0" + "ACCESS1-3": "ACCESS1.3" + "bcc-csm1-1": "BCC-CSM1.1" + "bcc-csm1-1-m": "BCC-CSM1.1(m)" + "CESM1-BGC": "CESM1(BGC)" + "CESM1-CAM5": "CESM1(CAM5)" + "CESM1-CAM5-1-FV2": "CESM1(CAM5.1,FV2)" + "CESM1-FASTCHEM": "CESM1(FASTCHEM)" + "CESM1-WACCM": "CESM1(WACCM)" + "CSIRO-Mk3-6-0": "CSIRO-Mk3.6.0" + "fio-esm": "FIO-ESM" + "GFDL-CM2p1": "GFDL-CM2.1" + "inmcm4": "INM-CM4" + "MRI-AGCM3-2H": "MRI-AGCM3.2H" + "MRI-AGCM3-2S": "MRI-AGCM3.2S" + CMIP3: + data: + intake-esgf: + type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource" + facets: + dataset: "model" + ensemble: "ensemble" + exp: "experiment" + frequency: "time_frequency" + project: "project" + short_name: "variable" + obs4MIPs: + data: + intake-esgf-v2: + type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource" + facets: + dataset: "source_id" + frequency: "frequency" + institute: "institution_id" + project: "project" + short_name: "variable_id" + # TODO: Add support for older ODS V1.0 obs4MIPs (CMIP5 style) data to intake-esgf + # intake-esgf-v1: + # type: "esmvalcore.io.intake_esgf.IntakeESGFDataSource" + # facets: + # dataset: "source_id" + # frequency: "time_frequency" + # institute: "institute" + # project: "project" + # short_name: "variable" diff --git a/esmvalcore/config/configurations/data-local-esmvaltool.yml b/esmvalcore/config/configurations/data-local-esmvaltool.yml new file mode 100644 index 0000000000..f4549e7775 --- /dev/null +++ b/esmvalcore/config/configurations/data-local-esmvaltool.yml @@ -0,0 +1,26 @@ +# Read native6, OBS6, and OBS data from the filesystem on a personal computer. +projects: + # Data that can be read in its native format by ESMValCore. + native6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "Tier{tier}/{dataset}/{version}/{frequency}/{short_name}" + filename_template: "*.nc" + # Data that has been CMORized by ESMValTool according to the CMIP6 standard. + OBS6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" + # Data that has been CMORized by ESMValTool according to the CMIP5 standard. + OBS: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "Tier{tier}/{dataset}" + filename_template: "{project}_{dataset}_{type}_{version}_{mip}_{short_name}[_.]*nc" diff --git a/esmvalcore/config/configurations/data-local.yml b/esmvalcore/config/configurations/data-local.yml new file mode 100644 index 0000000000..1080bf2c17 --- /dev/null +++ b/esmvalcore/config/configurations/data-local.yml @@ -0,0 +1,37 @@ +# Read CMIP, CORDEX, and obs4MIPs data from the filesystem on a personal computer. 
+projects: + CMIP6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + CMIP5: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc" + CMIP3: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{project.lower}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{ensemble}/{short_name}/{version}" + filename_template: "{short_name}_*.nc" + CORDEX: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{project.lower}/output/{domain}/{institute}/{driver}/{exp}/{ensemble}/{dataset}/{rcm_version}/{frequency}/{short_name}/{version}" + filename_template: "{short_name}_{domain}_{driver}_{exp}_{ensemble}_{institute}-{dataset}_{rcm_version}_{mip}*.nc" + obs4MIPs: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{project}/{dataset}/{version}" + filename_template: "{short_name}_*.nc" diff --git a/esmvalcore/config/configurations/data-native-access.yml b/esmvalcore/config/configurations/data-native-access.yml new file mode 100644 index 0000000000..832479c3e2 --- /dev/null +++ b/esmvalcore/config/configurations/data-native-access.yml @@ -0,0 +1,14 @@ +# Read data from the ACCESS model in its native format. +projects: + ACCESS: + data: + access-sub-dataset: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{dataset}/{sub_dataset}/{exp}/{modeling_realm}/netCDF" + filename_template: "{sub_dataset}.{freq_attribute}-*.nc" + access-ocean: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{dataset}/{sub_dataset}/{exp}/{modeling_realm}/netCDF" + filename_template: "ocean_{freq_attribute}.nc-*" diff --git a/esmvalcore/config/configurations/data-native-cesm.yml b/esmvalcore/config/configurations/data-native-cesm.yml new file mode 100644 index 0000000000..fdfb84bb5f --- /dev/null +++ b/esmvalcore/config/configurations/data-native-cesm.yml @@ -0,0 +1,15 @@ +# Read data from the CESM model in its native format. +projects: + CESM: + data: + run: &cesm + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "" # run directory + filename_template: "{case}.{scomp}.{type}.{string}*nc" + short-term-archive: + <<: *cesm + dirname_template: "{case}/{gcomp}/hist" # short-term archiving + postprocessed: + <<: *cesm + dirname_template: "{case}/{gcomp}/proc/{tdir}/{tperiod}" # postprocessed data diff --git a/esmvalcore/config/configurations/data-native-emac.yml b/esmvalcore/config/configurations/data-native-emac.yml new file mode 100644 index 0000000000..eb894e7115 --- /dev/null +++ b/esmvalcore/config/configurations/data-native-emac.yml @@ -0,0 +1,18 @@ +# Read data from the EMAC model in its native format. 
+projects: + EMAC: + data: + emac: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{exp}/{channel}" + filename_template: "{exp}*{channel}{postproc_flag}.nc" + ignore_warnings: + - message: "Ignored formula of unrecognised type: .*" + module: iris + - message: "Ignoring formula terms variable .* referenced by data variable .* via variable .*" + module: iris + - message: "Missing CF-netCDF formula term variable .*, referenced by netCDF variable .*" + module: iris + - message: "NetCDF variable .* contains unknown cell method .*" + module: iris diff --git a/esmvalcore/config/configurations/data-native-icon.yml b/esmvalcore/config/configurations/data-native-icon.yml new file mode 100644 index 0000000000..6f5332dfd2 --- /dev/null +++ b/esmvalcore/config/configurations/data-native-icon.yml @@ -0,0 +1,15 @@ +# Read data from the ICON model in its native format. +projects: + ICON: + data: + icon: &icon + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{exp}" + filename_template: "{exp}_{var_type}*.nc" + icon-outdata: + <<: *icon + dirname_template: "{exp}/outdata" + icon-output: + <<: *icon + dirname_template: "{exp}/output" diff --git a/esmvalcore/config/configurations/data-native-ipslcm.yml b/esmvalcore/config/configurations/data-native-ipslcm.yml new file mode 100644 index 0000000000..109ff7a5c8 --- /dev/null +++ b/esmvalcore/config/configurations/data-native-ipslcm.yml @@ -0,0 +1,14 @@ +# Read data from the IPSL climate model in its native format. +projects: + IPSLCM: + data: + ipslcm-varname: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{root}/{account}/{model}/{status}/{exp}/{simulation}/{dir}/{out}/{freq}" + filename_template: "{simulation}_*_{ipsl_varname}.nc" + ipslcm-group: + type: "esmvalcore.local.LocalDataSource" + rootpath: ~/climate_data + dirname_template: "{root}/{account}/{model}/{status}/{exp}/{simulation}/{dir}/{out}/{freq}" + filename_template: "{simulation}_*_{group}.nc" diff --git a/esmvalcore/config/configurations/defaults/more_top_level_options.yml b/esmvalcore/config/configurations/defaults/command_line_options.yml similarity index 62% rename from esmvalcore/config/configurations/defaults/more_top_level_options.yml rename to esmvalcore/config/configurations/defaults/command_line_options.yml index 80e21c27b3..df79939acd 100644 --- a/esmvalcore/config/configurations/defaults/more_top_level_options.yml +++ b/esmvalcore/config/configurations/defaults/command_line_options.yml @@ -1,3 +1,5 @@ +# Default values for configuration options typically set from the command line. + check_level: default diagnostics: null max_datasets: null diff --git a/esmvalcore/config/configurations/defaults/config-user.yml b/esmvalcore/config/configurations/defaults/config-user.yml index b2f8950a1c..a8b959a70e 100644 --- a/esmvalcore/config/configurations/defaults/config-user.yml +++ b/esmvalcore/config/configurations/defaults/config-user.yml @@ -1,20 +1,12 @@ -############################################################################### -# Default configuration settings -############################################################################### -# +# Default top level options. + # Note for users: # -------------- -# Site-specific entries for different HPC centers are given at the bottom of -# this file. Comment out/replace as needed. This default version of the file -# can be used in combination with the command line argument -# ``search_esgf=when_missing``. 
If only certain values are allowed for an -# option, these are listed after ``---``. The option in square brackets is the -# default value, i.e., the one that is used if this option is omitted in the -# file. -# -############################################################################### ---- +# If only certain values are allowed for an option, these are listed after +# ``---``. The option in square brackets is the default value, i.e., the one +# that is used if this option is omitted in the file. +--- # Destination directory where all output will be written # Includes log files and performance stats. output_dir: ~/esmvaltool_output @@ -23,21 +15,10 @@ output_dir: ~/esmvaltool_output # Used by some recipes to look for additional datasets. auxiliary_data_dir: ~/auxiliary_data -# Automatic data download from ESGF --- [never]/when_missing/always -# Use automatic download of missing CMIP3, CMIP5, CMIP6, CORDEX, and obs4MIPs -# data from ESGF. ``never`` disables this feature, which is useful if you are -# working on a computer without an internet connection, or if you have limited -# disk space. ``when_missing`` enables the automatic download for files that -# are not available locally. ``always`` will always check ESGF for the latest -# version of a file, and will only use local files if they correspond to that -# latest version. -search_esgf: never - -# Directory for storing downloaded climate data -# Make sure to use a directory where you can store multiple GBs of data. Your -# home directory on a HPC is usually not suited for this purpose, so please -# change the default value in this case! -download_dir: ~/climate_data +# Data search mode --- [quick]/complete +# Use ``quick`` to stop searching as soon as a file is found for a dataset. Use +# ``complete`` to search all configured data sources. +search_data: quick # Run at most this many tasks in parallel --- [null]/1/2/3/4/... # Set to ``null`` to use the number of available CPUs. If you run out of @@ -82,225 +63,3 @@ config_developer_file: null # A profiler tells you which functions in your code take most time to run. # Only available for Python diagnostics. profile_diagnostic: false - -# Rootpaths to the data from different projects -# This default setting will work if files have been downloaded by ESMValTool -# via ``search_esgf``. Lists are also possible. For site-specific entries and -# more examples, see below. Comment out these when using a site-specific path. -rootpath: - default: ~/climate_data - -# Directory structure for input data --- [default]/ESGF/BADC/DKRZ/ETHZ/etc. -# This default setting will work if files have been downloaded by ESMValTool -# via ``search_esgf``. See ``config-developer.yml`` for definitions. Comment -# out/replace as per needed. -drs: - CMIP3: ESGF - CMIP5: ESGF - CMIP6: ESGF - CORDEX: ESGF - obs4MIPs: ESGF - -# Example rootpaths and directory structure names for different projects. -# For each project, the entry can be a single path, a list of paths, or a -# mapping from paths to directory structure names. -# For single paths and list of paths, the directory structure names can be -# defined under 'drs'. -# If no path is defined for a project, the tool will look in the 'default' -# path. -# If no directory structure name is given, the name 'default' will be used. -# Directory structures corresponding to the names are defined in the file -# config-developer.yml. -# For site-specific entries, see below. 
-#rootpath: -# CMIP6: -# /path/to/data: DKRZ -# ~/path/to/more/data: ESGF -# CMIP5: -# - ~/cmip5_inputpath1 -# - ~/cmip5_inputpath2 -# CMIP3: ~/cmip6_inputpath -# OBS: ~/obs_inputpath -# OBS6: ~/obs6_inputpath -# obs4MIPs: ~/obs4mips_inputpath -# ana4mips: ~/ana4mips_inputpath -# native6: ~/native6_inputpath -# RAWOBS: ~/rawobs_inputpath -# default: ~/default_inputpath -#drs: -# CMIP3: ESGF -# CMIP5: ESGF -# CORDEX: ESGF -# obs4MIPs: ESGF - -# Directory tree created by automatically downloading from ESGF -# Uncomment the lines below to locate data that has been automatically -# downloaded from ESGF (using ``search_esgf``). -#rootpath: -# CMIP3: ~/climate_data -# CMIP5: ~/climate_data -# CMIP6: ~/climate_data -# CORDEX: ~/climate_data -# obs4MIPs: ~/climate_data -#drs: -# CMIP3: ESGF -# CMIP5: ESGF -# CMIP6: ESGF -# CORDEX: ESGF -# obs4MIPs: ESGF - -# Site-specific entries: JASMIN -# Uncomment the lines below to locate data on JASMIN. -#auxiliary_data_dir: /gws/nopw/j04/esmeval/aux_data/AUX -#rootpath: -# CMIP6: /badc/cmip6/data/CMIP6 -# CMIP5: /badc/cmip5/data/cmip5/output1 -# CMIP3: /badc/cmip3_drs/data/cmip3/output -# OBS: /gws/nopw/j04/esmeval/obsdata-v2 -# OBS6: /gws/nopw/j04/esmeval/obsdata-v2 -# obs4MIPs: /gws/nopw/j04/esmeval/obsdata-v2 -# ana4mips: /gws/nopw/j04/esmeval/obsdata-v2 -# CORDEX: /badc/cordex/data/CORDEX/output -#drs: -# CMIP6: BADC -# CMIP5: BADC -# CMIP3: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: default - -# Site-specific entries: DKRZ-Levante -# For bd0854 members a shared download directory is available -#search_esgf: when_missing -#download_dir: /work/bd0854/DATA/ESMValTool2/download -# Uncomment the lines below to locate data on Levante at DKRZ. -#auxiliary_data_dir: /work/bd0854/DATA/ESMValTool2/AUX -#rootpath: -# CMIP6: -# /work/bd0854/DATA/ESMValTool2/CMIP6_DKRZ: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CMIP5: -# /work/bd0854/DATA/ESMValTool2/CMIP5_DKRZ: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CMIP3: -# /work/bd0854/DATA/ESMValTool2/CMIP3: DKRZ -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# CORDEX: -# /work/ik1017/C3SCORDEX/data/c3s-cordex/output: BADC -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# OBS: /work/bd0854/DATA/ESMValTool2/OBS -# OBS6: /work/bd0854/DATA/ESMValTool2/OBS -# obs4MIPs: -# /work/bd0854/DATA/ESMValTool2/OBS: default -# /work/bd0854/DATA/ESMValTool2/download: ESGF -# ana4mips: /work/bd0854/DATA/ESMValTool2/OBS -# native6: -# /work/bd0854/DATA/ESMValTool2/RAWOBS: default -# /pool/data/ERA5: DKRZ-ERA5-GRIB -# RAWOBS: /work/bd0854/DATA/ESMValTool2/RAWOBS -#drs: -# ana4mips: default -# OBS: default -# OBS6: default -# native6: default - -# Site-specific entries: ETHZ -# Uncomment the lines below to locate data at ETHZ. -#rootpath: -# CMIP6: /net/atmos/data/cmip6 -# CMIP5: /net/atmos/data/cmip5 -# CMIP3: /net/atmos/data/cmip3 -# OBS: /net/exo/landclim/PROJECTS/C3S/datadir/obsdir/ -#drs: -# CMIP6: ETHZ -# CMIP5: ETHZ -# CMIP3: ETHZ - -# Site-specific entries: IPSL -# Uncomment the lines below to locate data on Ciclad at IPSL. 
-#rootpath: -# IPSLCM: / -# CMIP5: /bdd/CMIP5/output -# CMIP6: /bdd/CMIP6 -# CMIP3: /bdd/CMIP3 -# CORDEX: /bdd/CORDEX/output -# obs4MIPs: /bdd/obs4MIPS/obs-CFMIP/observations -# ana4mips: /not_yet -# OBS: /not_yet -# OBS6: /not_yet -# RAWOBS: /not_yet -#drs: -# CMIP6: DKRZ -# CMIP5: DKRZ -# CMIP3: IPSL -# CORDEX: BADC -# obs4MIPs: IPSL -# ana4mips: default -# OBS: not_yet -# OBS6: not_yet - -# Site-specific entries: Met Office - Old VDI -# Uncomment the lines below to locate data at the Met Office. -#rootpath: -# CMIP5: /project/champ/data/cmip5/output1 -# CMIP6: /project/champ/data/CMIP6 -# CORDEX: /project/champ/data/cordex/output -# OBS: /data/users/esmval/ESMValTool/obs -# OBS6: /data/users/esmval/ESMValTool/obs -# obs4MIPs: /data/users/esmval/ESMValTool/obs -# ana4mips: /project/champ/data/ana4MIPs -# native6: /data/users/esmval/ESMValTool/rawobs -# RAWOBS: /data/users/esmval/ESMValTool/rawobs -#drs: -# CMIP5: BADC -# CMIP6: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: BADC -# native6: default - -# Site-specific entries: Met Office - New VDI -# Uncomment the lines below to locate data at the Met Office. -#rootpath: -# CMIP5: /data/users/managecmip/champ/cmip5/output1 -# CMIP6: /data/users/managecmip/champ/CMIP6 -# CORDEX: /data/users/managecmip/champ/cordex/output -# OBS: /data/users/esmval/ESMValTool/obs -# OBS6: /data/users/esmval/ESMValTool/obs -# obs4MIPs: /data/users/esmval/ESMValTool/obs -# ana4mips: /data/users/managecmip/champ/ana4MIPs -# native6: /data/users/esmval/ESMValTool/rawobs -# RAWOBS: /data/users/esmval/ESMValTool/rawobs -#drs: -# CMIP5: BADC -# CMIP6: BADC -# CORDEX: BADC -# OBS: default -# OBS6: default -# obs4MIPs: default -# ana4mips: BADC -# native6: default - -# Site-specific entries: NCI -# Uncomment the lines below to locate data at NCI. -#rootpath: -# CMIP6: [/g/data/oi10/replicas/CMIP6, /g/data/fs38/publications/CMIP6, /g/data/xp65/public/apps/esmvaltool/replicas/CMIP6] -# CMIP5: [/g/data/r87/DRSv3/CMIP5, /g/data/al33/replicas/CMIP5/combined, /g/data/rr3/publications/CMIP5/output1, /g/data/xp65/public/apps/esmvaltool/replicas/cmip5/output1] -# CMIP3: /g/data/r87/DRSv3/CMIP3 -# OBS: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# OBS6: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# obs4MIPs: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# ana4mips: /g/data/ct11/access-nri/replicas/esmvaltool/obsdata-v2 -# native6: /g/data/xp65/public/apps/esmvaltool/native6 -# -#drs: -# CMIP6: NCI -# CMIP5: NCI -# CMIP3: NCI -# CORDEX: ESGF -# obs4MIPs: default -# ana4mips: default diff --git a/esmvalcore/config/configurations/defaults/dask.yml b/esmvalcore/config/configurations/defaults/dask.yml index 33f5579532..8a9422619b 100644 --- a/esmvalcore/config/configurations/defaults/dask.yml +++ b/esmvalcore/config/configurations/defaults/dask.yml @@ -1,3 +1,5 @@ +# Dask configuration profiles. + dask: use: local_threaded # use the `local_threaded` profile defined below profiles: diff --git a/esmvalcore/config/configurations/defaults/extra_facets_access.yml b/esmvalcore/config/configurations/defaults/extra_facets_access.yml index f82f71bf29..371b2fb586 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_access.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_access.yml @@ -1,4 +1,4 @@ -# Extra facets for native ACCESS model output +# Extra facets for native ACCESS model output. 
# A complete list of supported keys is given in the documentation (see # ESMValCore/doc/quickstart/find_data.rst). diff --git a/esmvalcore/config/configurations/defaults/extra_facets_cesm.yml b/esmvalcore/config/configurations/defaults/extra_facets_cesm.yml index 3e5ff17b3c..fb6e0cc9e2 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_cesm.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_cesm.yml @@ -1,4 +1,4 @@ -# Extra facets for native CESM model output +# Extra facets for native CESM model output. # Notes: # - All facets can also be specified in the recipes. The values given here are diff --git a/esmvalcore/config/configurations/defaults/extra_facets_cmip3.yml b/esmvalcore/config/configurations/defaults/extra_facets_cmip3.yml index 30fea3a979..7188ee6e84 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_cmip3.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_cmip3.yml @@ -1,3 +1,5 @@ +# Extra facets for the CMIP3 project. + --- projects: diff --git a/esmvalcore/config/configurations/defaults/extra_facets_cmip5.yml b/esmvalcore/config/configurations/defaults/extra_facets_cmip5.yml index 92dcae64fd..afd7dfa2b4 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_cmip5.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_cmip5.yml @@ -1,3 +1,4 @@ +# Extra facets for the CMIP5 project. --- projects: diff --git a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml index 5b4b2fda30..169a69860f 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_emac.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_emac.yml @@ -1,4 +1,4 @@ -# Extra facets for native EMAC model output +# Extra facets for native EMAC model output. # Notes: # - All facets can also be specified in the recipes. The values given here are diff --git a/esmvalcore/config/configurations/defaults/extra_facets_icon.yml b/esmvalcore/config/configurations/defaults/extra_facets_icon.yml index 96547cbf89..503a9bcecc 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_icon.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_icon.yml @@ -1,4 +1,4 @@ -# Extra facets for native ICON model output +# Extra facets for native ICON model output. # Notes: # - All facets can also be specified in the recipes. The values given here are diff --git a/esmvalcore/config/configurations/defaults/extra_facets_ipslcm.yml b/esmvalcore/config/configurations/defaults/extra_facets_ipslcm.yml index 8b354dad1b..409b82e32b 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_ipslcm.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_ipslcm.yml @@ -1,3 +1,5 @@ +# Extra facets for native IPSL-CM6 model output. + # Mapping, for IPSLCM output formats 'Analyse' and 'Output', between a # CMOR variable name and the labels to use by ESMValTool to find the # corresponding file, and the corresponding variable in the file diff --git a/esmvalcore/config/configurations/defaults/extra_facets_native6.yml b/esmvalcore/config/configurations/defaults/extra_facets_native6.yml index 91861b53bb..6e56e49544 100644 --- a/esmvalcore/config/configurations/defaults/extra_facets_native6.yml +++ b/esmvalcore/config/configurations/defaults/extra_facets_native6.yml @@ -1,5 +1,5 @@ -# Extra facets for native6 ERA5 data in GRIB format -# +# Extra facets for native6 ERA5 data in GRIB format. 
+ # See # https://docs.dkrz.de/doc/dataservices/finding_and_accessing_data/era_data/index.html#file-and-directory-names # for details on these facets. diff --git a/esmvalcore/config/configurations/defaults/logging.yml b/esmvalcore/config/configurations/defaults/logging.yml index d1cd1948f2..ba61355bd3 100644 --- a/esmvalcore/config/configurations/defaults/logging.yml +++ b/esmvalcore/config/configurations/defaults/logging.yml @@ -1,2 +1,7 @@ +# Default logging configuration. + +# Note that most logging options are still configured in the file +# esmvalcore/config/logging.yml and are not yet configurable for users. + logging: log_progress_interval: 0. diff --git a/esmvalcore/dataset.py b/esmvalcore/dataset.py index 229ba59bd9..157bd69bb6 100644 --- a/esmvalcore/dataset.py +++ b/esmvalcore/dataset.py @@ -15,22 +15,21 @@ from pathlib import Path from typing import TYPE_CHECKING, Any -from esmvalcore import esgf, local +from esmvalcore import esgf from esmvalcore._recipe import check from esmvalcore._recipe.from_datasets import datasets_to_recipe from esmvalcore.cmor.table import _get_mips, _update_cmor_facets from esmvalcore.config import CFG, Session from esmvalcore.config._config import ( get_activity, - get_ignored_warnings, get_institutes, load_extra_facets, ) +from esmvalcore.config._data_sources import _get_data_sources from esmvalcore.exceptions import InputFilesNotFound, RecipeError from esmvalcore.local import ( _dates_to_timerange, _get_output_file, - _get_start_end_date, ) from esmvalcore.preprocessor import preprocess @@ -39,6 +38,8 @@ from iris.cube import Cube + from esmvalcore.io.protocol import DataElement, DataSource + from esmvalcore.preprocessor import PreprocessorItem from esmvalcore.typing import Facets, FacetValue __all__ = [ @@ -49,8 +50,6 @@ logger = logging.getLogger(__name__) -File = esgf.ESGFFile | local.LocalFile - INHERITED_FACETS: list[str] = [ "dataset", "domain", @@ -130,8 +129,8 @@ def __init__(self, **facets: FacetValue) -> None: self._persist: set[str] = set() self._session: Session | None = None - self._files: Sequence[File] | None = None - self._file_globs: Sequence[Path] | None = None + self._files: Sequence[DataElement] | None = None + self._used_data_sources: Sequence[DataSource] = [] for key, value in facets.items(): self.set_facet(key, deepcopy(value), persist=True) @@ -192,7 +191,7 @@ def _derivation_necessary(self) -> bool: def _file_to_dataset( self, - file: esgf.ESGFFile | local.LocalFile, + file: DataElement, ) -> Dataset: """Create a dataset from a file with a `facets` attribute.""" facets = dict(file.facets) @@ -243,6 +242,12 @@ def _get_available_datasets(self) -> Iterator[Dataset]: expanded = False for file in dataset_template.files: dataset = self._file_to_dataset(file) + # Do not use the timerange facet from the file because there may be multiple + # files per dataset. + dataset.facets.pop("timerange", None) + # Restore the original timerange facet if it was specified. + if "timerange" in self.facets: + dataset.facets["timerange"] = self.facets["timerange"] # Filter out identical datasets facetset = frozenset( @@ -267,10 +272,11 @@ def _get_available_datasets(self) -> Iterator[Dataset]: for dataset, file in partially_defined: msg = ( f"{dataset} with unexpanded wildcards, created from file " - f"{file} with facets {file.facets}. Are the missing facets " - "in the path to the file?" - if isinstance(file, local.LocalFile) - else "available on ESGF?" + f"{file} with facets {file.facets}. 
Please check why "
+                "the missing facets are not available for the file. "
+                "This will depend on the data source they come from, e.g. can "
+                "they be extracted from the path for local files, or are they "
+                "available when searching ESGF for files?"
             )
             if expanded:
                 logger.info("Ignoring %s", msg)
@@ -287,7 +293,6 @@ def from_files(self) -> Iterator[Dataset]:

         The facet values for local files are retrieved from the directory
         tree where the directories represent the facets values.
-        Reading facet values from file names is not yet supported.

         See :ref:`CMOR-DRS` for more information on this kind of file
         organization.
@@ -659,7 +664,9 @@ def augment_facets(self) -> None:
         """Add additional facets.

         This function will update the dataset with additional facets from
-        various sources.
+        various sources. These include :ref:`config-extra-facets` as well as
+        facets read from the controlled vocabulary included in the CMOR tables
+        if applicable.
         """
         self._augment_facets()
         for supplementary in self.supplementaries:
@@ -750,56 +757,43 @@ def find_files(self) -> None:
             supplementary.find_files()

     def _find_files(self) -> None:
-        self.files, self._file_globs = local.find_files(
-            debug=True,
-            **self.facets,
-        )
-
-        # If project does not support automatic downloads from ESGF, stop here
-        if self.facets["project"] not in esgf.facets.FACETS:
-            return
-
-        # 'never' mode: never download files from ESGF and stop here
-        if self.session["search_esgf"] == "never":
-            return
-
-        # 'when_missing' mode: if files are available locally, do not check
-        # ESGF
-        if self.session["search_esgf"] == "when_missing":
-            try:
-                check.data_availability(self, log=False)
-            except InputFilesNotFound:
-                pass  # search ESGF for files
-            else:
-                return  # use local files
-
-        # Local files are not available in 'when_missing' mode or 'always' mode
-        # is used: check ESGF
-        local_files = {f.name: f for f in self.files}
-        search_result = esgf.find_files(**self.facets)
-        for file in search_result:
-            if file.name not in local_files:
-                # Use ESGF files that are not available locally.
-                self.files.append(file)
-            else:
-                # Use ESGF files that are newer than the locally available
-                # files.
-                local_file = local_files[file.name]
-                if "version" in local_file.facets:
-                    if file.facets["version"] > local_file.facets["version"]:
-                        idx = self.files.index(local_file)
-                        self.files[idx] = file
+        def version(file: DataElement) -> str:
+            return str(file.facets.get("version", ""))
+
+        self._used_data_sources = []
+        files: dict[str, DataElement] = {}
+        for data_source in sorted(
+            _get_data_sources(self.session, self.facets["project"]),  # type: ignore[arg-type]
+            key=lambda ds: ds.priority,
+        ):
+            result = data_source.find_data(**self.facets)
+            for file in result:
+                if file.name not in files:
+                    files[file.name] = file
+                if version(files[file.name]) < version(file):
+                    files[file.name] = file
+            self.files = list(files.values())
+            self._used_data_sources.append(data_source)
+            # 'quick' mode: if files are available from a higher
+            # priority source, do not search lower priority sources.
+            if self.session["search_data"] == "quick":
+                try:
+                    check.data_availability(self, log=False)
+                except InputFilesNotFound:
+                    pass  # continue searching for data
+                else:
+                    return  # use what has been found so far

     @property
-    def files(self) -> list[File]:
+    def files(self) -> list[DataElement]:
         """The files associated with this dataset."""
         if self._files is None:
             self.find_files()
         return self._files  # type: ignore

     @files.setter
-    def files(self, value: Sequence[File]) -> None:
-        self._files = value
+    def files(self, value: Sequence[DataElement]) -> None:
+        self._files = list(value)

     def load(self) -> Cube:
         """Load dataset.
@@ -817,7 +811,9 @@ def load(self) -> Cube:
         input_files = list(self.files)
         for supplementary_dataset in self.supplementaries:
             input_files.extend(supplementary_dataset.files)
-        esgf.download(input_files, self.session["download_dir"])
+        esgf.download(input_files)
+        for file in input_files:
+            file.prepare()

         cube = self._load()
         supplementary_cubes = []
@@ -840,14 +836,7 @@ def load(self) -> Cube:
     def _load(self) -> Cube:
         """Load self.files into an iris cube and return it."""
         if not self.files:
-            lines = [
-                f"No files were found for {self}",
-                "locally using glob patterns:",
-                "\n".join(str(f) for f in self._file_globs or []),
-            ]
-            if self.session["search_esgf"] != "never":
-                lines.append("or on ESGF.")
-            msg = "\n".join(lines)
+            msg = check.get_no_data_message(self)
             raise InputFilesNotFound(msg)

         output_file = _get_output_file(self.facets, self.session.preproc_dir)
@@ -863,12 +852,7 @@ def _load(self) -> Cube:
             "session": self.session,
             **self.facets,
         }
-        settings["load"] = {
-            "ignore_warnings": get_ignored_warnings(
-                self.facets["project"],
-                "load",
-            ),
-        }
+        settings["load"] = {}
         settings["fix_metadata"] = {
             "session": self.session,
             **self.facets,
@@ -897,12 +881,7 @@ def _load(self) -> Cube:
             "short_name": self.facets["short_name"],
         }

-        result = [
-            file.local_file(self.session["download_dir"])
-            if isinstance(file, esgf.ESGFFile)
-            else file
-            for file in self.files
-        ]
+        result: Sequence[PreprocessorItem] = self.files
         for step, kwargs in settings.items():
             result = preprocess(
                 result,
@@ -993,25 +972,37 @@ def _update_timerange(self) -> None:
             check.valid_time_selection(timerange)

         if "*" in timerange:
+            # Replace wildcards in timerange with "timerange" from DataElements,
+            # but only if all DataElements have the "timerange" facet.
             dataset = self.copy()
             dataset.facets.pop("timerange")
             dataset.supplementaries = []
             check.data_availability(dataset)
-            intervals = [_get_start_end_date(f) for f in dataset.files]
-
-            min_date = min(interval[0] for interval in intervals)
-            max_date = max(interval[1] for interval in intervals)
+            if all("timerange" in f.facets for f in dataset.files):
+                # "timerange" can only be reliably computed when all DataElements
+                # provide it.
+                intervals = [
+                    f.facets["timerange"].split("/")  # type: ignore[union-attr]
+                    for f in dataset.files
+                ]

-            if timerange == "*":
-                timerange = f"{min_date}/{max_date}"
-            if "*" in timerange.split("/")[0]:
-                timerange = timerange.replace("*", min_date)
-            if "*" in timerange.split("/")[1]:
-                timerange = timerange.replace("*", max_date)
+                min_date = min(interval[0] for interval in intervals)
+                max_date = max(interval[1] for interval in intervals)

-            # Make sure that years are in format YYYY
-            start_date, end_date = timerange.split("/")
-            timerange = _dates_to_timerange(start_date, end_date)
-            check.valid_time_selection(timerange)
+                if timerange == "*":
+                    timerange = f"{min_date}/{max_date}"
+                if "*" in timerange.split("/")[0]:
+                    timerange = timerange.replace("*", min_date)
+                if "*" in timerange.split("/")[1]:
+                    timerange = timerange.replace("*", max_date)

-        self.set_facet("timerange", timerange)
+        if "*" in timerange:
+            # Drop the timerange facet if it still contains wildcards.
+            self.facets.pop("timerange")
+        else:
+            # Make sure that years are in format YYYY
+            start_date, end_date = timerange.split("/")
+            timerange = _dates_to_timerange(start_date, end_date)
+            # Update the timerange
+            check.valid_time_selection(timerange)
+            self.set_facet("timerange", timerange)
diff --git a/esmvalcore/esgf/__init__.py b/esmvalcore/esgf/__init__.py
index ca8607f964..84ff8c1d95 100644
--- a/esmvalcore/esgf/__init__.py
+++ b/esmvalcore/esgf/__init__.py
@@ -1,10 +1,50 @@
-"""Find files on the ESGF and download them."""
+"""Find files on the ESGF and download them.
+
+.. note::
+
+    This module uses `esgf-pyclient `_
+    to search for and download files from the Earth System Grid Federation (ESGF).
+    `esgf-pyclient`_ uses a
+    `deprecated API `__
+    that is scheduled to be taken offline and replaced by new APIs based on
+    STAC (ESGF East) and Globus (ESGF West). An ESGF node mimicking the deprecated
+    API but built on top of Globus will be kept online for some time at
+    https://esgf-node.ornl.gov/esgf-1-5-bridge, but users are encouraged
+    to migrate to the new APIs as soon as possible by using the
+    :mod:`esmvalcore.io.intake_esgf` module instead.
+
+This module provides the function :py:func:`esmvalcore.esgf.find_files`
+for searching for files on ESGF using the ESMValTool vocabulary.
+It returns :class:`esmvalcore.esgf.ESGFFile` objects, which have a convenient
+:meth:`esmvalcore.esgf.ESGFFile.download` method for downloading the file.
+A :func:`esmvalcore.esgf.download` function for downloading multiple files in
+parallel is also available.
+
+It also provides an :class:`esmvalcore.esgf.ESGFDataSource` that can be
+used to find files on ESGF from the :class:`~esmvalcore.dataset.Dataset`
+or the :ref:`recipe `. To use it, run the command
+
+.. code:: bash
+
+    esmvaltool config copy data-esmvalcore-esgf.yml
+
+to copy the default configuration file for this module to your configuration
+directory. This will create a file with the following content:
+
+.. literalinclude:: ../configurations/data-esmvalcore-esgf.yml
+    :caption: Contents of ``data-esmvalcore-esgf.yml``
+    :language: yaml
+
+See :ref:`config-data-sources` for more information on configuring data sources
+and :ref:`config-esgf` for additional configuration options of this module.
+""" + +from esmvalcore.esgf._download import ESGFFile, download +from esmvalcore.esgf._search import ESGFDataSource, find_files __all__ = [ "ESGFFile", + "ESGFDataSource", "download", "find_files", ] diff --git a/esmvalcore/esgf/_download.py b/esmvalcore/esgf/_download.py index 9a1ff04fcb..ce5f030735 100644 --- a/esmvalcore/esgf/_download.py +++ b/esmvalcore/esgf/_download.py @@ -1,5 +1,7 @@ """Module for downloading files from ESGF.""" +from __future__ import annotations + import concurrent.futures import contextlib import datetime @@ -14,17 +16,31 @@ from pathlib import Path from statistics import median from tempfile import NamedTemporaryFile +from typing import TYPE_CHECKING, Any from urllib.parse import urlparse import requests import yaml from humanfriendly import format_size, format_timespan -from esmvalcore.local import LocalFile -from esmvalcore.typing import Facets +from esmvalcore.config import CFG +from esmvalcore.io.protocol import DataElement +from esmvalcore.local import ( + LocalFile, + _dates_to_timerange, + _get_start_end_date_from_filename, +) from .facets import DATASET_MAP, FACETS +if TYPE_CHECKING: + from collections.abc import Iterable + + import iris.cube + from pyesgf.search.results import FileResult + + from esmvalcore.typing import Facets + logger = logging.getLogger(__name__) TIMEOUT = 5 * 60 @@ -166,7 +182,7 @@ def sort_hosts(urls): @functools.total_ordering -class ESGFFile: +class ESGFFile(DataElement): """File on the ESGF. This is the object returned by :func:`esmvalcore.esgf.find_files`. @@ -185,7 +201,11 @@ class ESGFFile: The URLs where the file can be downloaded. """ - def __init__(self, results): + def __init__( + self, + results: Iterable[FileResult], + dest_folder: Path | None = None, + ) -> None: results = list(results) self.name = str(Path(results[0].filename).with_suffix(".nc")) self.size = results[0].size @@ -196,6 +216,36 @@ def __init__(self, results): for result in results: self.urls.append(result.download_url) self._checksums.append((result.checksum_type, result.checksum)) + self.dest_folder = ( + CFG.get("download_dir") if dest_folder is None else dest_folder + ) + self._attributes: dict[str, Any] | None = None + + def prepare(self) -> None: + """Prepare the data for access.""" + self.download(self.dest_folder) + + @property + def attributes(self) -> dict[str, Any]: + """Attributes are key-value pairs describing the data.""" + if self._attributes is None: + msg = ( + "Attributes have not been read yet. Call the `to_iris` method " + "first to read the attributes from the file." 
+ ) + raise ValueError(msg) + return self._attributes + + @attributes.setter + def attributes(self, value: dict[str, Any]) -> None: + self._attributes = value + + def to_iris(self) -> iris.cube.CubeList: + self.prepare() + local_file = self.local_file(self.dest_folder) + cube = local_file.to_iris() + self.attributes = local_file.attributes + return cube @classmethod def _from_results(cls, results, facets): @@ -275,6 +325,9 @@ def _get_facets(self, results): self.name, ) facets[facet] = value + start_date, end_date = _get_start_end_date_from_filename(self.name) + if start_date and end_date: + facets["timerange"] = _dates_to_timerange(start_date, end_date) return facets @staticmethod @@ -383,16 +436,16 @@ def __lt__(self, other): """Compare `self` to `other`.""" return (self.dataset, self.name) < (other.dataset, other.name) - def __hash__(self): - """Compute a unique hash value.""" + def __hash__(self) -> int: + """Return a number uniquely representing the data element.""" return hash((self.dataset, self.name)) - def local_file(self, dest_folder): + def local_file(self, dest_folder: Path | None) -> LocalFile: """Return the path to the local file after download. Arguments --------- - dest_folder: Path + dest_folder: The destination folder. Returns @@ -400,16 +453,17 @@ def local_file(self, dest_folder): LocalFile The path where the file will be located after download. """ + dest_folder = self.dest_folder if dest_folder is None else dest_folder file = LocalFile(dest_folder, self._get_relative_path()) file.facets = self.facets return file - def download(self, dest_folder): + def download(self, dest_folder: Path | None) -> LocalFile: """Download the file. Arguments --------- - dest_folder: Path + dest_folder: The destination folder. Raises @@ -424,7 +478,6 @@ def download(self, dest_folder): """ local_file = self.local_file(dest_folder) if local_file.exists(): - logger.debug("Skipping download of existing file %s", local_file) return local_file os.makedirs(local_file.parent, exist_ok=True) @@ -528,14 +581,14 @@ def get_download_message(files): return "\n".join(lines) -def download(files, dest_folder, n_jobs=4): +def download(files, dest_folder=None, n_jobs=4): """Download multiple ESGFFiles in parallel. Arguments --------- files: list of :obj:`ESGFFile` The files to download. - dest_folder: Path + dest_folder: Path or None The destination folder. n_jobs: int The number of files to download in parallel. 
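With the changes above, ``dest_folder`` becomes optional for both :meth:`ESGFFile.download` and the module-level :func:`download` helper, which now fall back to the configured ``download_dir``. The following is a minimal usage sketch of the updated API; the facet values are illustrative examples, not taken from this changeset:

.. code-block:: python

    from esmvalcore.esgf import download, find_files

    # Search ESGF for monthly near-surface air temperature from a single
    # CMIP6 simulation (example facet values).
    files = find_files(
        project="CMIP6",
        mip="Amon",
        short_name="tas",
        dataset="CanESM5",
        exp="historical",
        ensemble="r1i1p1f1",
    )
    # With dest_folder omitted, the files are downloaded in parallel to
    # the configured download_dir.
    download(files, n_jobs=4)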
@@ -552,9 +605,6 @@ def download(files, dest_folder, n_jobs=4): and not file.local_file(dest_folder).exists() ] if not files: - logger.debug( - "All required data is available locally, not downloading anything.", - ) return files = sorted(files) diff --git a/esmvalcore/esgf/_search.py b/esmvalcore/esgf/_search.py index 911e44cacb..c067f326c5 100644 --- a/esmvalcore/esgf/_search.py +++ b/esmvalcore/esgf/_search.py @@ -1,15 +1,21 @@ """Module for finding files on ESGF.""" +from __future__ import annotations + import itertools import logging +import os.path +from dataclasses import dataclass, field from functools import lru_cache +from pathlib import Path +from typing import TYPE_CHECKING import pyesgf.search import requests.exceptions from esmvalcore.config._esgf_pyclient import get_esgf_config +from esmvalcore.io.protocol import DataSource from esmvalcore.local import ( - _get_start_end_date, _parse_period, _replace_years_with_timerange, _truncate_dates, @@ -18,6 +24,9 @@ from ._download import ESGFFile from .facets import DATASET_MAP, FACETS +if TYPE_CHECKING: + from esmvalcore.typing import FacetValue + logger = logging.getLogger(__name__) @@ -177,17 +186,16 @@ def select_by_time(files, timerange): for file in files: start_date, end_date = _parse_period(timerange) - try: - start, end = _get_start_end_date(file) - except ValueError: - # If start and end year cannot be read from the filename - # just select everything. - selection.append(file) - else: + if "timerange" in file.facets: + start, end = file.facets["timerange"].split("/") start_date, end = _truncate_dates(start_date, end) end_date, start = _truncate_dates(end_date, start) if start <= end_date and end >= start_date: selection.append(file) + else: + # If start and end year cannot be read from the filename just select + # everything. + selection.append(file) return selection @@ -378,3 +386,50 @@ def cached_search(**facets): logger.debug("Selected files:\n%s", "\n".join(str(f) for f in files)) return files + + +@dataclass +class ESGFDataSource(DataSource): + name: str + """A name identifying the data source.""" + + project: str + """The project that the data source provides data for.""" + + priority: int + """The priority of the data source. Lower values have priority.""" + + download_dir: Path + """The destination directory where data will be downloaded.""" + + debug_info: str = field(init=False, repr=False, default="") + """A string containing debug information when no data is found.""" + + def __post_init__(self) -> None: + self.download_dir = Path( + os.path.expandvars(self.download_dir), + ).expanduser() + + def find_data(self, **facets: FacetValue) -> list[ESGFFile]: + """Find data. + + Parameters + ---------- + **facets : + Find data matching these facets. + + Returns + ------- + :obj:`list` of :obj:`esmvalcore.esgf.ESGFFile` + A list of files that have been found on ESGF. 
+ """ + files = find_files(**facets) + for file in files: + file.dest_folder = self.download_dir + start_msg = "Search" if files else "No search" + self.debug_info = ( + f"{start_msg} results found on ESGF with query: {FIRST_ONLINE_INDEX_NODE}" + "/search?format=application%2Fsolr%2Bjson&distrib=true&type=File&" + + "&".join(f"{k}={v}" for k, v in get_esgf_facets(facets).items()) + ) + return files diff --git a/esmvalcore/exceptions.py b/esmvalcore/exceptions.py index fcfd7df512..e794499d62 100644 --- a/esmvalcore/exceptions.py +++ b/esmvalcore/exceptions.py @@ -32,7 +32,7 @@ class InvalidConfigParameter(Error, SuppressedError): """Config parameter is invalid.""" -class RecipeError(Error): +class RecipeError(Error, SuppressedError): """Recipe contains an error.""" def __init__(self, msg: str) -> None: diff --git a/esmvalcore/io/__init__.py b/esmvalcore/io/__init__.py new file mode 100644 index 0000000000..13022f8bf1 --- /dev/null +++ b/esmvalcore/io/__init__.py @@ -0,0 +1,123 @@ +"""A modular system for reading input data from various sources. + +An input data source can be defined in the configuration by using +:obj:`esmvalcore.config.CFG`, for example: + +.. code-block:: python + + >>> from esmvalcore.config import CFG + >>> CFG["projects"]["CMIP6"]["data"]["local"] = { + "type": "esmvalcore.local.LocalDataSource", + "rootpath": "~/climate_data", + "dirname_template": "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}", + "filename_template": "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc", + } + +or as a :ref:`YAML configuration file `: + +.. code-block:: yaml + + projects: + CMIP6: + data: + local: + type: "esmvalcore.local.LocalDataSource" + rootpath: "~/climate_data" + dirname_template: "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}" + filename_template: "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc" + +where ``CMIP6`` is a project, and ``local`` is a unique name describing the +data source. The data source type, +:class:`esmvalcore.local.LocalDataSource`, in the example above, needs to +implement the :class:`esmvalcore.io.protocol.DataSource` protocol. Any +remaining key-value pairs in the configuration, ``rootpath``, +``dirname_template``, and ``filename_template`` in this example, are passed +as keyword arguments to the data source when it is created. + +If there are multiple data sources configured for a project, deduplication of +search results happens based on the +:attr:`esmvalcore.io.protocol.DataElement.name` attribute and the ``"version"`` +facet in :attr:`esmvalcore.io.protocol.DataElement.facets` of the data elements +provided by the data sources. If no ``version`` facet is specified in the +search, the latest version will be used. If there is a tie, the data element +provided by the data source with the lowest value of +:attr:`esmvalcore.io.protocol.DataSource.priority` is chosen. +""" + +import importlib +import logging + +from esmvalcore.config import Session +from esmvalcore.io.protocol import DataSource + +logger = logging.getLogger(__name__) + + +def load_data_sources( + session: Session, + project: str | None = None, +) -> list[DataSource]: + """Get the list of available data sources. + + If no ``priority`` is configured for a data source, the default priority + of 1 is used. + + Arguments + --------- + session: + The configuration. + project: + If specified, only data sources for this project are returned. 
+ + Returns + ------- + :obj:`list` of :obj:`DataSource`: + A list of available data sources. + + Raises + ------ + ValueError: + If the project or its settings are not found in the configuration. + + """ + data_sources: list[DataSource] = [] + if project is not None and project not in session["projects"]: + msg = f"Unknown project '{project}', please configure it under 'projects'." + raise ValueError(msg) + settings = ( + session["projects"] + if project is None + else {project: session["projects"][project]} + ) + for project_, project_settings in settings.items(): + for name, orig_kwargs in project_settings.get("data", {}).items(): + kwargs = orig_kwargs.copy() + module_name, cls_name = kwargs.pop("type").rsplit(".", 1) + module = importlib.import_module(module_name) + cls = getattr(module, cls_name) + priority = kwargs.pop("priority", 1) + data_source = cls( + name=name, + project=project_, + priority=priority, + **kwargs, + ) + if not isinstance(data_source, DataSource): + msg = ( + "Expected a data source of type `esmvalcore.io.protocol.DataSource`, " + f"but your configuration for project '{project_}' contains " + f"'{data_source}' of type '{type(data_source)}'." + ) + raise TypeError(msg) + data_sources.append(data_source) + + if not data_sources: + if project is None: + msg = "No data sources found. Check your configuration under 'projects'" + else: + msg = ( + f"No data sources found for project '{project}'. " + f"Check your configuration under 'projects: {project}: data'" + ) + raise ValueError(msg) + return data_sources diff --git a/esmvalcore/io/intake_esgf.py b/esmvalcore/io/intake_esgf.py new file mode 100644 index 0000000000..930ef6ad60 --- /dev/null +++ b/esmvalcore/io/intake_esgf.py @@ -0,0 +1,337 @@ +"""Access data using `intake-esgf `_. + +.. note:: + + It is highly recommended that you take a moment to + :doc:`configure intake-esgf ` before using it + with ESMValCore. Make sure to set ``local_cache`` to a path where + it can store downloaded files and if (some) ESGF data is already + available on your system, point ``esg_dataroot`` to it. If you are + missing certain search results, you may want to choose a different + index node for searching the ESGF. + +Run the command ``esmvaltool config copy data-intake-esgf.yml`` to update +your :ref:`configuration ` to use this module. This will +create a file with the following content in your configuration directory: + +.. 
literalinclude:: ../configurations/data-intake-esgf.yml + :language: yaml + :caption: Contents of ``data-intake-esgf.yml`` + +""" + +from __future__ import annotations + +import copy +from dataclasses import dataclass, field +from pathlib import Path +from typing import TYPE_CHECKING, Any + +import intake_esgf +import intake_esgf.exceptions +import isodate + +from esmvalcore.dataset import _isglob, _ismatch +from esmvalcore.io.protocol import DataElement, DataSource +from esmvalcore.iris_helpers import dataset_to_iris +from esmvalcore.local import _parse_period + +if TYPE_CHECKING: + import iris.cube + + from esmvalcore.typing import Facets, FacetValue + + +__all__ = [ + "IntakeESGFDataSource", + "IntakeESGFDataset", +] + + +class _CachingCatalog(intake_esgf.ESGFCatalog): + """An ESGF catalog that caches to_path_dict results.""" + + def __init__(self): + super().__init__() + self._result = {} + + @classmethod + def from_catalog( + cls, + catalog: intake_esgf.ESGFCatalog, + ) -> _CachingCatalog: + """Create a CachingCatalog from an existing ESGFCatalog.""" + cat = cls() + cat.indices = catalog.indices + cat.local_cache = catalog.local_cache + cat.esg_dataroot = catalog.esg_dataroot + cat.file_start = catalog.file_start + cat.file_end = catalog.file_end + cat.project = catalog.project + cat.df = catalog.df + return cat + + def to_path_dict( + self, + prefer_streaming: bool = False, + globus_endpoint: str | None = None, + globus_path: Path = Path("/"), + minimal_keys: bool = True, + ignore_facets: None | str | list[str] = None, + separator: str = ".", + quiet: bool = False, + ) -> dict[str, list[str | Path]]: + """Return the current search as a dictionary of paths to files.""" + kwargs = { + "prefer_streaming": prefer_streaming, + "globus_endpoint": globus_endpoint, + "globus_path": globus_path, + "minimal_keys": minimal_keys, + "ignore_facets": ignore_facets, + "separator": separator, + "quiet": quiet, + } + key = tuple((k, v) for k, v in kwargs.items() if k != "quiet") + if key not in self._result: + self._result[key] = super().to_path_dict(**kwargs) + return self._result[key] + + +@dataclass +class IntakeESGFDataset(DataElement): + """A dataset that can be used to load data found using intake-esgf_.""" + + name: str + """A unique name identifying the data.""" + + facets: Facets = field(repr=False) + """Facets are key-value pairs that were used to find this data.""" + + catalog: intake_esgf.ESGFCatalog = field(repr=False) + """The intake-esgf catalog describing this data.""" + + _attributes: dict[str, Any] | None = field( + init=False, + repr=False, + default=None, + ) + + def __hash__(self) -> int: + """Return a number uniquely representing the data element.""" + return hash((self.name, self.facets.get("version"))) + + def prepare(self) -> None: + """Prepare the data for access.""" + self.catalog.to_path_dict(minimal_keys=False) + for index in self.catalog.indices: + # Set the sessions to None to avoid issues with pickling + # requests_cache.CachedSession objects when max_parallel_tasks > 1. + # After the prepare step, the sessions for interacting with the + # search indices are not needed anymore as all file paths required + # to load the data have been found. To make sure we do not + # accidentally use the sessions later on, we set them to None + # instead of e.g. requests.Session objects. + # + # This seems the safest/fastest solution as it avoids accessing the + # sqlite database backing the cached_requests.CachedSession from + # multiple processes on multiple machines. 
+ index.session = None + + @property + def attributes(self) -> dict[str, Any]: + """Attributes are key-value pairs describing the data.""" + if self._attributes is None: + msg = ( + "Attributes have not been read yet. Call the `to_iris` method " + "first to read the attributes from the file." + ) + raise ValueError(msg) + return self._attributes + + @attributes.setter + def attributes(self, value: dict[str, Any]) -> None: + self._attributes = value + + def to_iris(self) -> iris.cube.CubeList: + """Load the data as Iris cubes. + + Returns + ------- + : + The loaded data. + """ + files = self.catalog.to_path_dict( + minimal_keys=False, + quiet=True, + )[self.name] + dataset = self.catalog.to_dataset_dict( + minimal_keys=False, + add_measures=False, + quiet=True, + )[self.name] + # Store the local paths in the attributes for easier debugging. + dataset.attrs["source_file"] = ", ".join(str(f) for f in files) + # Cache the attributes. + self.attributes = copy.deepcopy(dataset.attrs) + return dataset_to_iris(dataset) + + +@dataclass +class IntakeESGFDataSource(DataSource): + """Data source that can be used to find data using intake-esgf.""" + + name: str + """A name identifying the data source.""" + + project: str + """The project that the data source provides data for.""" + + priority: int + """The priority of the data source. Lower values have priority.""" + + facets: dict[str, str] + """Mapping between the ESMValCore and ESGF facet names.""" + + values: dict[str, dict[str, str]] = field(default_factory=dict) + """Mapping between the ESMValCore and ESGF facet values.""" + + debug_info: str = field(init=False, repr=False, default="") + """A string containing debug information when no data is found.""" + + catalog: intake_esgf.ESGFCatalog = field( + init=False, + repr=False, + default_factory=intake_esgf.ESGFCatalog, + ) + """The intake-esgf catalog used to find data.""" + + def __post_init__(self): + self.catalog.project = intake_esgf.projects.projects[ + self.project.lower() + ] + + def find_data(self, **facets: FacetValue) -> list[IntakeESGFDataset]: + """Find data. + + Parameters + ---------- + **facets : + Find data matching these facets. + + Returns + ------- + : + A list of data elements that have been found. + """ + # Select searchable facets and normalize so all values are `list[str]`. + normalized_facets = { + facet: [str(values)] if isinstance(values, str | int) else values + for facet, values in facets.items() + if facet in self.facets + } + # Filter out glob patterns as these are not supported by intake-esgf. + non_glob_facets = { + facet: values + for facet, values in normalized_facets.items() + if not any(_isglob(v) for v in values) + } + # Translate "our" facets to ESGF facets and "our" values to ESGF values. + query = { + their_facet: [ + self.values.get(our_facet, {}).get(v, v) + for v in non_glob_facets[our_facet] + ] + for our_facet, their_facet in self.facets.items() + if our_facet in non_glob_facets + } + if ( + "timerange" in facets and not _isglob(facets["timerange"]) # type: ignore[operator] + ): + start, end = _parse_period(facets["timerange"]) + query["file_start"] = isodate.date_isoformat( + isodate.parse_date(start.split("T")[0]), + ) + query["file_end"] = isodate.date_isoformat( + isodate.parse_date(end.split("T")[0]), + ) + # Search ESGF. 
+        try:
+            self.catalog.search(**query, quiet=True)
+        except intake_esgf.exceptions.NoSearchResults:
+            self.debug_info = (
+                "`intake_esgf.ESGFCatalog().search("
+                + ", ".join(
+                    [
+                        f"{k}={v}" if isinstance(v, list) else f"{k}='{v}'"
+                        for k, v in query.items()
+                    ],
+                )
+                + ")` did not return any results."
+            )
+            return []
+
+        # Return a list of datasets, with one IntakeESGFDataset per dataset_id.
+        result: list[IntakeESGFDataset] = []
+
+        # These are the keys in the dict[str, xarray.Dataset] returned by
+        # `intake_esgf.ESGFCatalog.to_dataset_dict`. Taken from:
+        # https://github.com/esgf2-us/intake-esgf/blob/c34124e54078e70ef271709a6d158edb22bcdb96/intake_esgf/catalog.py#L523-L528
+        self.catalog.df["key"] = self.catalog.df.apply(
+            lambda row: ".".join(
+                [row[f] for f in self.catalog.project.master_id_facets()],
+            ),
+            axis=1,
+        )
+        inverse_values = {
+            our_facet: {
+                their_value: our_value
+                for our_value, their_value in self.values[our_facet].items()
+            }
+            for our_facet in self.values
+        }
+        for _, row in self.catalog.df.iterrows():
+            dataset_id = row["key"]
+            # Use a caching catalog to avoid searching the indices after
+            # calling the IntakeESGFDataset.prepare method.
+            cat = _CachingCatalog.from_catalog(self.catalog)
+            # Subset the catalog to a single dataset.
+            cat.df = cat.df[cat.df.key == dataset_id]
+            # Ensure only the requested variable is included in the dataset.
+            # https://github.com/esgf2-us/intake-esgf/blob/18437bff5ee75acaaceef63093101223b4692259/intake_esgf/catalog.py#L544-L552
+            if "short_name" in normalized_facets:
+                cat.last_search[self.facets["short_name"]] = [
+                    self.values.get("short_name", {}).get(v, v)
+                    for v in normalized_facets["short_name"]
+                ]
+            # Retrieve "our" facets associated with the dataset_id.
+            dataset_facets = {"version": [f"v{row['version']}"]}
+            for our_facet, esgf_facet in self.facets.items():
+                if esgf_facet in row:
+                    esgf_values = row[esgf_facet]
+                    if isinstance(esgf_values, str):
+                        esgf_values = [esgf_values]
+                    our_values = [
+                        inverse_values.get(our_facet, {}).get(v, v)
+                        for v in esgf_values
+                    ]
+                    dataset_facets[our_facet] = our_values
+            # Only return datasets that match the glob patterns.
+            if all(
+                any(
+                    _ismatch(v, p)
+                    for v in dataset_facets[f]
+                    for p in normalized_facets[f]
+                )
+                for f in dataset_facets
+                if f in normalized_facets
+            ):
+                dataset = IntakeESGFDataset(
+                    name=dataset_id,
+                    facets={
+                        k: v[0] if len(v) == 1 else v
+                        for k, v in dataset_facets.items()
+                    },  # type: ignore[arg-type]
+                    catalog=cat,
+                )
+                result.append(dataset)
+        return result
diff --git a/esmvalcore/io/protocol.py b/esmvalcore/io/protocol.py
new file mode 100644
index 0000000000..6f7108c2a7
--- /dev/null
+++ b/esmvalcore/io/protocol.py
@@ -0,0 +1,82 @@
+"""Protocols for accessing data.
+
+This module defines the :class:`DataSource` and :class:`DataElement` protocols
+for finding and loading data. A data source can be used to find data elements
+matching specific facets. A data element represents some data that can be
+loaded as Iris cubes.
+
+To add support for a new data source, write two classes that implement these
+protocols and configure the tool to use the newly implemented data source as
+described in :mod:`esmvalcore.io`.
+
+"""
+
+from collections.abc import Iterable
+from typing import Any, Protocol, runtime_checkable
+
+import iris.cube
+
+from esmvalcore.typing import FacetValue
+
+
+@runtime_checkable
+class DataElement(Protocol):
+    """A data element represents some data that can be loaded.
+
+    An :class:`esmvalcore.local.LocalFile` is an example of a data element.
+    """
+
+    name: str
+    """A unique name identifying the data."""
+
+    facets: dict[str, FacetValue]
+    """Facets are key-value pairs that can be used for searching the data."""
+
+    attributes: dict[str, Any]
+    """Attributes are key-value pairs describing the data."""
+
+    def __hash__(self) -> int:
+        """Return a number uniquely representing the data element."""
+
+    def prepare(self) -> None:
+        """Prepare the data for access."""
+
+    def to_iris(self) -> iris.cube.CubeList:
+        """Load the data as Iris cubes.
+
+        Returns
+        -------
+        iris.cube.CubeList
+            The loaded data.
+        """
+
+
+@runtime_checkable
+class DataSource(Protocol):
+    """A data source can be used to find data."""
+
+    name: str
+    """A name identifying the data source."""
+
+    project: str
+    """The project that the data source provides data for."""
+
+    priority: int
+    """The priority of the data source. Lower values have priority."""
+
+    debug_info: str
+    """A string containing debug information when no data is found."""
+
+    def find_data(self, **facets: FacetValue) -> Iterable[DataElement]:
+        """Find data.
+
+        Parameters
+        ----------
+        **facets :
+            Find data matching these facets.
+
+        Returns
+        -------
+        :obj:`typing.Iterable` of :obj:`esmvalcore.io.protocol.DataElement`
+            The data elements that have been found.
+        """
diff --git a/esmvalcore/local.py b/esmvalcore/local.py
index 70f30adee2..2f8ace30db 100644
--- a/esmvalcore/local.py
+++ b/esmvalcore/local.py
@@ -1,4 +1,42 @@
-"""Find files on the local filesystem."""
+"""Find files on the local filesystem.
+
+Example configuration to find CMIP6 data on a personal computer:
+
+.. literalinclude:: ../configurations/data-local.yml
+    :language: yaml
+    :caption: Contents of ``data-local.yml``
+    :start-at: projects:
+    :end-before: CMIP5:
+
+The module will find files matching the :func:`glob.glob` pattern formed by
+``rootpath/dirname_template/filename_template``, where the facets defined
+inside the curly braces of the templates are replaced by their values
+from the :class:`~esmvalcore.dataset.Dataset` or the :ref:`recipe`
+plus any facet-value pairs that can be automatically added using
+:meth:`~esmvalcore.dataset.Dataset.augment_facets`.
+Note that the name of the data source, ``local-data`` in the example above,
+must be unique within each project but can otherwise be chosen freely.
+
+To start using this module on a personal computer, copy the example
+configuration file into your configuration directory by running the command:
+
+.. code-block:: bash
+
+    esmvaltool config copy data-local.yml
+
+and tailor it for your own system if needed.
+
+Example configuration files for popular HPC systems and some
+:ref:`supported climate models` are also available. View
+the list of available files by running the command:
+
+.. code-block:: bash
+
+    esmvaltool config list
+
+Further information is available in :ref:`config-data-sources`.
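To make the template mechanics concrete, here is a small hypothetical sketch (all values below are illustrative; the real templates come from the configuration files described above):

.. code-block:: python

    from pathlib import Path

    rootpath = Path("~/climate_data").expanduser()
    dirname_template = "{project}/{dataset}"
    filename_template = "{short_name}_{dataset}*.nc"
    facets = {"project": "CMIP6", "dataset": "ACCESS-ESM1-5", "short_name": "tas"}

    # Facets inside curly braces are replaced by their values and the result
    # is passed to glob.glob, e.g.
    # ~/climate_data/CMIP6/ACCESS-ESM1-5/tas_ACCESS-ESM1-5*.nc
    pattern = (
        rootpath
        / dirname_template.format(**facets)
        / filename_template.format(**facets)
    )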
+
+"""
 
 from __future__ import annotations
 
@@ -6,27 +44,29 @@
 import itertools
 import logging
 import os
+import os.path
 import re
-from dataclasses import dataclass
+import warnings
+from dataclasses import dataclass, field
 from glob import glob
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
+import iris.cube
+import iris.fileformats.cf
 import isodate
 from cf_units import Unit
 from netCDF4 import Dataset, Variable
 
+import esmvalcore.io.protocol
 from esmvalcore.config import CFG
-from esmvalcore.config._config import get_project_config
+from esmvalcore.config._config import get_ignored_warnings, get_project_config
 from esmvalcore.exceptions import RecipeError
-from esmvalcore.preprocessor._io import _load_from_file
+from esmvalcore.iris_helpers import ignore_warnings_context
 
 if TYPE_CHECKING:
     from collections.abc import Iterable
 
-    import iris.cube
-
-    from esmvalcore.esgf import ESGFFile
     from esmvalcore.typing import Facets, FacetValue
 
 logger = logging.getLogger(__name__)
@@ -86,9 +126,9 @@ def _get_var_name(variable: Variable) -> str:
     return str(variable.name)
 
 
-def _get_start_end_date(
-    file: str | Path | LocalFile | ESGFFile,
-) -> tuple[str, str]:
+def _get_start_end_date_from_filename(
+    file: str | Path,
+) -> tuple[str | None, str | None]:
     """Get the start and end dates as a string from a file name.
 
     Examples of allowed dates: 1980, 198001, 1980-01, 19801231, 1980-12-31,
@@ -117,13 +157,6 @@
     ValueError
         Start or end date cannot be determined.
     """
-    if hasattr(file, "name"):  # noqa: SIM108
-        # Path, LocalFile, ESGFFile
-        stem = Path(file.name).stem
-    else:
-        # str
-        stem = Path(file).stem
-
     start_date = end_date = None
 
     # Build regex
@@ -151,9 +184,34 @@
     start_date, end_date = _get_from_pattern(
         datetime_pattern,
         date_range_pattern,
-        stem,
+        Path(file).stem,
         "datetime",
     )
+    return start_date, end_date
+
+
+def _get_start_end_date(file: str | Path) -> tuple[str, str]:
+    """Get the start and end dates as a string from a file.
+
+    This function first tries to read the dates from the filename; only if
+    that fails will it try to read them from the content of the file.
+
+    Parameters
+    ----------
+    file:
+        The file to read the start and end dates from.
+
+    Returns
+    -------
+    tuple[str, str]
+        The start and end date.
+
+    Raises
+    ------
+    ValueError
+        Start or end date cannot be determined.
+    """
+    start_date, end_date = _get_start_end_date_from_filename(file)
 
     # As final resort, try to get the dates from the file contents
    if (
@@ -199,17 +257,6 @@
     return start_date, end_date
 
 
-def _get_start_end_year(
-    file: str | Path | LocalFile | ESGFFile,
-) -> tuple[int, int]:
-    """Get the start and end year as int from a file name.
-
-    See :func:`_get_start_end_date`.
-    """
-    (start_date, end_date) = _get_start_end_date(file)
-    return (int(start_date[:4]), int(end_date[:4]))
-
-
 def _dates_to_timerange(start_date: int | str, end_date: int | str) -> str:
     """Convert ``start_date`` and ``end_date`` to ``timerange``.
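As a sketch of the conversion named above, assuming the usual ``start/end`` timerange convention used elsewhere in this patch (the test fixtures further down turn ``1990-1999`` into ``1990/1999``):

.. code-block:: python

    def dates_to_timerange(start_date: int | str, end_date: int | str) -> str:
        """Simplified stand-in for the private helper above."""
        return f"{start_date}/{end_date}"


    assert dates_to_timerange(1980, 2000) == "1980/2000"
    assert dates_to_timerange("198001", "200012") == "198001/200012"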
@@ -467,23 +514,44 @@ def _select_drs(input_type: str, project: str, structure: str) -> list[str]: @dataclass(order=True) -class DataSource: - """Class for storing a data source and finding the associated files.""" +class LocalDataSource(esmvalcore.io.protocol.DataSource): + """Data source for finding files on a local filesystem.""" + + name: str + """A name identifying the data source.""" + + project: str + """The project that the data source provides data for.""" + + priority: int + """The priority of the data source. Lower values have priority.""" + + debug_info: str = field(init=False, repr=False, default="") + """A string containing debug information when no data is found.""" rootpath: Path + """The path where the directories are located.""" + dirname_template: str + """The template for the directory names.""" + filename_template: str + """The template for the file names.""" + + ignore_warnings: list[dict[str, Any]] | None = field(default_factory=list) + """Warnings to ignore when loading the data. + + The list should contain :class:`dict`s with keyword arguments that + will be passed to the :func:`warnings.filterwarnings` function when + calling :meth:`LocalFile.to_iris`. + """ def __post_init__(self) -> None: """Set further attributes.""" + self.rootpath = Path(os.path.expandvars(self.rootpath)).expanduser() self._regex_pattern = self._templates_to_regex() - @property - def regex_pattern(self) -> str: - """Get regex pattern that can be used to extract facets from paths.""" - return self._regex_pattern - - def get_glob_patterns(self, **facets) -> list[Path]: + def _get_glob_patterns(self, **facets) -> list[Path]: """Compose the globs that will be used to look for files.""" dirname_globs = _replace_tags(self.dirname_template, facets) filename_globs = _replace_tags(self.filename_template, facets) @@ -493,32 +561,71 @@ def get_glob_patterns(self, **facets) -> list[Path]: for f in filename_globs ) - def find_files(self, **facets) -> list[LocalFile]: - """Find files.""" - globs = self.get_glob_patterns(**facets) + def find_data(self, **facets) -> list[LocalFile]: + """Find data locally. + + Parameters + ---------- + **facets : + Find data matching these facets. + + Returns + ------- + : + A list of files. 
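A hypothetical usage sketch (the name, templates, and facet values below are illustrative only):

.. code-block:: python

    from esmvalcore.local import LocalDataSource

    source = LocalDataSource(
        name="my-local-data",
        project="CMIP6",
        priority=1,
        rootpath="~/climate_data",
        dirname_template="{project}/{dataset}",
        filename_template="{short_name}_{dataset}*.nc",
    )
    files = source.find_data(short_name="tas", dataset="ACCESS-ESM1-5")
    for file in files:
        print(file, file.facets)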
+
+        """
+        facets = dict(facets)
+        if "original_short_name" in facets:
+            facets["short_name"] = facets["original_short_name"]
+
+        globs = self._get_glob_patterns(**facets)
+        self.debug_info = "No files found matching glob pattern " + "\n".join(
+            str(g) for g in globs
+        )
         logger.debug("Looking for files matching %s", globs)
 
         files: list[LocalFile] = []
         for glob_ in globs:
             for filename in glob(str(glob_)):
                 file = LocalFile(filename)
-                file.facets.update(self.path2facets(file))
+                file.facets.update(
+                    self._path2facets(
+                        file,
+                        add_timerange="timerange" in facets,
+                    ),
+                )
+                file.ignore_warnings = self.ignore_warnings
                 files.append(file)
+
+        files = _filter_versions_called_latest(files)
+
+        if "version" not in facets:
+            files = _select_latest_version(files)
+
         files.sort()  # sorting makes it easier to see what was found
 
         if "timerange" in facets:
             files = _select_files(files, facets["timerange"])
 
         return files
 
-    def path2facets(self, path: Path) -> dict[str, str]:
+    def _path2facets(self, path: Path, add_timerange: bool) -> dict[str, str]:
         """Extract facets from path."""
         facets: dict[str, str] = {}
-        match = re.search(self.regex_pattern, str(path))
-        if match is None:
-            return facets
-        for facet, value in match.groupdict().items():
-            if value:
-                facets[facet] = value
+
+        if (match := re.search(self._regex_pattern, str(path))) is not None:
+            for facet, value in match.groupdict().items():
+                if value:
+                    facets[facet] = value
+
+        if add_timerange:
+            try:
+                start_date, end_date = _get_start_end_date(path)
+            except ValueError:
+                pass
+            else:
+                facets["timerange"] = _dates_to_timerange(start_date, end_date)
+
         return facets
 
     def _templates_to_regex(self) -> str:
@@ -607,12 +714,55 @@
         return pattern
 
 
+class DataSource(LocalDataSource):
+    """Data source for finding files on a local filesystem.
+
+    .. deprecated:: 2.14.0
+        This class is deprecated and will be removed in version 2.16.0.
+        Please use :class:`esmvalcore.local.LocalDataSource` instead.
+    """
+
+    def __init__(self, *args, **kwargs) -> None:
+        msg = (
+            "The 'esmvalcore.local.DataSource' class is deprecated and will be "
+            "removed in version 2.16.0. 
Please use 'esmvalcore.local.LocalDataSource'" + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + super().__init__(*args, **kwargs) + + @property + def regex_pattern(self) -> str: + """Get regex pattern that can be used to extract facets from paths.""" + return self._regex_pattern + + def get_glob_patterns(self, **facets) -> list[Path]: + """Compose the globs that will be used to look for files.""" + return self._get_glob_patterns(**facets) + + def path2facets(self, path: Path, add_timerange: bool) -> dict[str, str]: + """Extract facets from path.""" + return self._path2facets(path, add_timerange) + + def find_files(self, **facets) -> list[LocalFile]: + """Find files.""" + return self.find_data(**facets) + + _ROOTPATH_WARNED: set[tuple[str, tuple[str]]] = set() +_LEGACY_DATA_SOURCES_WARNED: set[str] = set() -def _get_data_sources(project: str) -> list[DataSource]: + +def _get_data_sources(project: str) -> list[LocalDataSource]: """Get a list of data sources.""" rootpaths = CFG["rootpath"] + default_drs = { + "CMIP3": "ESGF", + "CMIP5": "ESGF", + "CMIP6": "ESGF", + "CORDEX": "ESGF", + "obs4MIPs": "ESGF", + } for key in (project, "default"): if key in rootpaths: paths = rootpaths[key] @@ -625,17 +775,38 @@ def _get_data_sources(project: str) -> list[DataSource]: ) _ROOTPATH_WARNED.add((key, nonexistent)) if isinstance(paths, list): - structure = CFG["drs"].get(project, "default") + structure = CFG.get("drs", {}).get( + project, + default_drs.get(project, "default"), + ) paths = dict.fromkeys(paths, structure) - sources: list[DataSource] = [] + sources: list[LocalDataSource] = [] for path, structure in paths.items(): dir_templates = _select_drs("input_dir", project, structure) file_templates = _select_drs("input_file", project, structure) sources.extend( - DataSource(Path(path), d, f) + LocalDataSource( + name="legacy-local", + project=project, + priority=1, + rootpath=Path(path), + dirname_template=d, + filename_template=f, + ignore_warnings=get_ignored_warnings(project, "load"), + ) for d in dir_templates for f in file_templates ) + if project not in _LEGACY_DATA_SOURCES_WARNED: + logger.warning( + ( + "Using legacy data sources for project '%s' using 'rootpath' " + "and 'drs' settings and the path templates from '%s'" + ), + project, + CFG["config_developer_file"], + ) + _LEGACY_DATA_SOURCES_WARNED.add(project) return sources msg = ( @@ -753,6 +924,10 @@ def find_files( ) -> list[LocalFile] | tuple[list[LocalFile], list[Path]]: """Find files on the local filesystem. + .. deprecated:: 2.14.0 + This function is deprecated and will be removed in version 2.16.0. + Please use :meth:`esmvalcore.local.LocalDataSource.find_data` instead. + The directories that are searched for files are defined in :data:`esmvalcore.config.CFG` under the ``'rootpath'`` key using the directory structure defined under the ``'drs'`` key. @@ -810,6 +985,12 @@ def find_files( list[LocalFile] The files that were found. """ + msg = ( + "The function 'esmvalcore.local.find_files' is deprecated and will be removed " + "in version 2.16.0. 
Please use 'esmvalcore.local.LocalDataSource.find_data'" + ) + warnings.warn(msg, DeprecationWarning, stacklevel=2) + facets = dict(facets) if "original_short_name" in facets: facets["short_name"] = facets["original_short_name"] @@ -818,7 +999,7 @@ def find_files( filter_latest = False data_sources = _get_data_sources(facets["project"]) # type: ignore for data_source in data_sources: - for file in data_source.find_files(**facets): + for file in data_source.find_data(**facets): if file.facets.get("version") == "latest": filter_latest = True files.append(file) @@ -834,23 +1015,53 @@ def find_files( if debug: globs = [] for data_source in data_sources: - globs.extend(data_source.get_glob_patterns(**facets)) + globs.extend(data_source._get_glob_patterns(**facets)) # noqa: SLF001 return files, sorted(globs) return files -class LocalFile(type(Path())): # type: ignore +GRIB_FORMATS = (".grib2", ".grib", ".grb2", ".grb", ".gb2", ".gb") +"""GRIB file extensions.""" + + +def _get_attr_from_field_coord( + ncfield: iris.fileformats.cf.CFVariable, + coord_name: str | None, + attr: str, +) -> Any: + """Get attribute from netCDF field coordinate.""" + if coord_name is not None: + attrs = ncfield.cf_group[coord_name].cf_attrs() + attr_val = [value for (key, value) in attrs if key == attr] + if attr_val: + return attr_val[0] + return None + + +def _restore_lat_lon_units( + cube: iris.cube.Cube, + field: iris.fileformats.cf.CFVariable, + filename: str, # noqa: ARG001 +) -> None: # pylint: disable=unused-argument + """Use this callback to restore the original lat/lon units.""" + # Iris chooses to change longitude and latitude units to degrees + # regardless of value in file, so reinstating file value + for coord in cube.coords(): + if coord.standard_name in ["longitude", "latitude"]: + units = _get_attr_from_field_coord(field, coord.var_name, "units") + if units is not None: + coord.units = units + + +class LocalFile(type(Path()), esmvalcore.io.protocol.DataElement): # type: ignore """File on the local filesystem.""" + def prepare(self) -> None: + """Prepare the data for access.""" + @property def facets(self) -> Facets: - """Facets describing the file. - - Note - ---- - When using :func:`find_files`, facets are read from the directory - structure. Facets stored in filenames are not yet supported. - """ + """Facets are key-value pairs that were used to find this data.""" if not hasattr(self, "_facets"): self._facets: Facets = {} return self._facets @@ -861,7 +1072,7 @@ def facets(self, value: Facets) -> None: @property def attributes(self) -> dict[str, Any]: - """Attributes read from the file.""" + """Attributes are key-value pairs describing the data.""" if not hasattr(self, "_attributes"): msg = ( "Attributes have not been read yet. Call the `to_iris` method " @@ -874,10 +1085,23 @@ def attributes(self) -> dict[str, Any]: def attributes(self, value: dict[str, Any]) -> None: self._attributes = value - def to_iris( - self, - ignore_warnings: list[dict[str, Any]] | None = None, - ) -> iris.cube.CubeList: + @property + def ignore_warnings(self) -> list[dict[str, Any]] | None: + """Warnings to ignore when loading the data. + + The list should contain :class:`dict`s with keyword arguments that + will be passed to the :func:`warnings.filterwarnings` function when + calling the ``to_iris`` method. 
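For example (the path and the warning filter below are illustrative):

.. code-block:: python

    from esmvalcore.local import LocalFile

    file = LocalFile("/path/to/tas.nc")
    # Each dict is passed as keyword arguments to warnings.filterwarnings:
    file.ignore_warnings = [
        {"message": "Missing CF-netCDF measure variable .*", "category": UserWarning},
    ]
    cubes = file.to_iris()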
+ """ + if not hasattr(self, "_ignore_warnings"): + self._ignore_warnings: list[dict[str, Any]] | None = None + return self._ignore_warnings + + @ignore_warnings.setter + def ignore_warnings(self, value: list[dict[str, Any]] | None) -> None: + self._ignore_warnings = value + + def to_iris(self) -> iris.cube.CubeList: """Load the data as Iris cubes. Returns @@ -885,7 +1109,20 @@ def to_iris( iris.cube.CubeList The loaded data. """ - cubes = _load_from_file(self, ignore_warnings=ignore_warnings) + file = Path(self) + + with ignore_warnings_context(self.ignore_warnings): + # GRIB files need to be loaded with iris.load, otherwise we will + # get separate (lat, lon) slices for each time step, pressure + # level, etc. + if file.suffix in GRIB_FORMATS: + cubes = iris.load(file, callback=_restore_lat_lon_units) + else: + cubes = iris.load_raw(file, callback=_restore_lat_lon_units) + + for cube in cubes: + cube.attributes.globals["source_file"] = str(file) + # Cache the attributes. self.attributes = copy.deepcopy(dict(cubes[0].attributes.globals)) return cubes diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index ff6f560cac..de4337f948 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -5,7 +5,6 @@ import copy import inspect import logging -from pathlib import Path from pprint import pformat from typing import TYPE_CHECKING, Any, TypeAlias @@ -15,6 +14,7 @@ from esmvalcore._task import BaseTask from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata from esmvalcore.cmor.fix import fix_data, fix_file, fix_metadata +from esmvalcore.io.protocol import DataElement from esmvalcore.preprocessor._area import ( area_statistics, extract_named_regions, @@ -103,11 +103,12 @@ if TYPE_CHECKING: from collections.abc import Callable, Iterable, Sequence + from pathlib import Path import prov.model from dask.delayed import Delayed - from esmvalcore.dataset import Dataset, File + from esmvalcore.dataset import Dataset logger = logging.getLogger(__name__) @@ -374,7 +375,7 @@ def _run_preproc_function( function: Callable, items: PreprocessorItem | Sequence[PreprocessorItem], kwargs: Any, - input_files: Sequence[File] | None = None, + input_files: Sequence[DataElement] | None = None, ) -> PreprocessorItem | Sequence[PreprocessorItem]: """Run preprocessor function.""" kwargs_str = ",\n".join( @@ -410,7 +411,7 @@ def _run_preproc_function( ) # Make sure that the arguments are indexable - if isinstance(items, (PreprocessorFile, Cube, str, Path)): + if isinstance(items, (PreprocessorFile, Cube, DataElement)): items = [items] if isinstance(items, set): items = list(items) @@ -438,7 +439,7 @@ def _run_preproc_function( def preprocess( items: Sequence[PreprocessorItem], step: str, - input_files: list[File] | None = None, + input_files: list[DataElement] | None = None, output_file: Path | None = None, debug: bool = False, **settings: Any, @@ -478,7 +479,7 @@ def preprocess( items = [] for item in result: - if isinstance(item, (PreprocessorFile, Cube, str, Path)): + if isinstance(item, (PreprocessorFile, Cube, DataElement)): items.append(item) else: items.extend(item) @@ -573,7 +574,7 @@ def apply(self, step: str, debug: bool = False) -> None: self.cubes, step, input_files=self._input_files, - output_file=self.filename, + output_file=self.filename, # type: ignore[arg-type] debug=debug, **self.settings[step], ) @@ -646,7 +647,7 @@ def _initialize_entity(self) -> None: settings = { "preprocessor:" + k: str(v) for k, v in 
self.settings.items() } - self.entity.add_attributes(settings) + self.entity.add_attributes(settings) # type: ignore[attr-defined] def group(self, keys: list) -> str: """Generate group keyword. @@ -671,7 +672,7 @@ def group(self, keys: list) -> str: return "_".join(identifier) -PreprocessorItem: TypeAlias = PreprocessorFile | Cube | str | Path +PreprocessorItem: TypeAlias = PreprocessorFile | Cube | DataElement def _apply_multimodel( diff --git a/esmvalcore/preprocessor/_io.py b/esmvalcore/preprocessor/_io.py index f050c4cfa7..229854cd75 100644 --- a/esmvalcore/preprocessor/_io.py +++ b/esmvalcore/preprocessor/_io.py @@ -20,16 +20,14 @@ from esmvalcore._task import write_ncl_settings from esmvalcore.exceptions import ESMValCoreLoadWarning -from esmvalcore.iris_helpers import ( - dataset_to_iris, - ignore_warnings_context, -) +from esmvalcore.io.protocol import DataElement +from esmvalcore.iris_helpers import dataset_to_iris +from esmvalcore.local import LocalFile if TYPE_CHECKING: from collections.abc import Sequence from dask.delayed import Delayed - from iris.fileformats.cf import CFVariable logger = logging.getLogger(__name__) @@ -42,40 +40,16 @@ "reference_dataset", "alternative_dataset", } -GRIB_FORMATS = (".grib2", ".grib", ".grb2", ".grb", ".gb2", ".gb") - - -def _get_attr_from_field_coord( - ncfield: CFVariable, - coord_name: str | None, - attr: str, -) -> Any: - """Get attribute from netCDF field coordinate.""" - if coord_name is not None: - attrs = ncfield.cf_group[coord_name].cf_attrs() - attr_val = [value for (key, value) in attrs if key == attr] - if attr_val: - return attr_val[0] - return None - - -def _restore_lat_lon_units( - cube: Cube, - field: CFVariable, - filename: str, # noqa: ARG001 -) -> None: # pylint: disable=unused-argument - """Use this callback to restore the original lat/lon units.""" - # Iris chooses to change longitude and latitude units to degrees - # regardless of value in file, so reinstating file value - for coord in cube.coords(): - if coord.standard_name in ["longitude", "latitude"]: - units = _get_attr_from_field_coord(field, coord.var_name, "units") - if units is not None: - coord.units = units def load( - file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, + file: str + | Path + | DataElement + | Cube + | CubeList + | xr.Dataset + | ncdata.NcData, ignore_warnings: list[dict[str, Any]] | None = None, backend_kwargs: dict[str, Any] | None = None, ) -> CubeList: @@ -113,8 +87,8 @@ def load( Invalid type for ``file``. 
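A brief usage sketch (hypothetical path; this assumes ``load`` is re-exported by :mod:`esmvalcore.preprocessor`):

.. code-block:: python

    from esmvalcore.preprocessor import load

    # Load from a plain path; a LocalFile is created internally.
    cubes = load("/path/to/tas.nc")

    # Suppress selected warnings while loading:
    cubes = load(
        "/path/to/tas.nc",
        ignore_warnings=[{"message": ".*invalid units.*", "category": UserWarning}],
    )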
""" - if hasattr(file, "to_iris"): - cubes = file.to_iris(ignore_warnings=ignore_warnings) + if isinstance(file, DataElement): + cubes = file.to_iris() elif isinstance(file, (str, Path)): extension = ( file.suffix @@ -122,7 +96,9 @@ def load( else os.path.splitext(file)[1] ) if "zarr" not in extension: - cubes = _load_from_file(file, ignore_warnings=ignore_warnings) + local_file = LocalFile(file) + local_file.ignore_warnings = ignore_warnings + cubes = local_file.to_iris() else: cubes = _load_zarr( file, @@ -161,7 +137,7 @@ def load( def _load_zarr( - file: str | Path | Cube | CubeList | xr.Dataset | ncdata.NcData, + file: str | Path, ignore_warnings: list[dict[str, Any]] | None = None, backend_kwargs: dict[str, Any] | None = None, ) -> CubeList: @@ -222,30 +198,6 @@ def _load_zarr( return dataset_to_iris(zarr_xr, ignore_warnings=ignore_warnings) -def _load_from_file( - file: str | Path, - ignore_warnings: list[dict[str, Any]] | None = None, -) -> CubeList: - """Load data from file.""" - file = Path(file) - logger.debug("Loading:\n%s", file) - - with ignore_warnings_context(ignore_warnings): - # GRIB files need to be loaded with iris.load, otherwise we will - # get separate (lat, lon) slices for each time step, pressure - # level, etc. - if file.suffix in GRIB_FORMATS: - cubes = iris.load(file, callback=_restore_lat_lon_units) - else: - cubes = iris.load_raw(file, callback=_restore_lat_lon_units) - logger.debug("Done with loading %s", file) - - for cube in cubes: - cube.attributes.globals["source_file"] = str(file) - - return cubes - - def save( # noqa: C901 cubes: Sequence[Cube], filename: Path | str, diff --git a/esmvalcore/typing.py b/esmvalcore/typing.py index 7880bdac1b..1e3735d4f2 100644 --- a/esmvalcore/typing.py +++ b/esmvalcore/typing.py @@ -3,19 +3,18 @@ from __future__ import annotations from collections.abc import Iterable, Sequence -from numbers import Number import dask.array as da import numpy as np from iris.cube import Cube -FacetValue = str | Sequence[str] | Number | bool +FacetValue = str | Sequence[str] | int """Type describing a single facet.""" Facets = dict[str, FacetValue] """Type describing a collection of facets.""" -NetCDFAttr = str | Number | Iterable +NetCDFAttr = str | int | float | Iterable """Type describing netCDF attributes. 
`NetCDF attributes diff --git a/pyproject.toml b/pyproject.toml index 0f9b12a331..22721ed465 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "fire", "geopy", "humanfriendly", + "intake-esgf>=2025.10.22", "intake-esm", "iris-grib>=0.20.0", # github.com/ESMValGroup/ESMValCore/issues/2535 "isodate>=0.7.0", @@ -142,6 +143,7 @@ minversion = "6" markers = [ "installation: Test requires installation of dependencies", "use_sample_data: Run functional tests using real data", + "online: Run tests that require internet access", ] testpaths = ["tests"] xfail_strict = true @@ -219,6 +221,7 @@ ignore = [ "D102", # Missing docstring in public method "D103", # Missing docstring in public function "D104", # Missing docstring in public package + "PT013", # Allow importing fixtures from pytest to avoid repeating 'pytest' many times ] "doc/gensidebar.py" = [ "INP001", # File is part of an implicit namespace package diff --git a/tests/conftest.py b/tests/conftest.py index aaea60b9be..1cc8630b88 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -26,7 +26,6 @@ def _load_default_config(): "ignore", message="Do not instantiate `Config` objects directly", category=UserWarning, - module="esmvalcore", ) cfg = Config() cfg.load_from_dirs([]) @@ -49,7 +48,6 @@ def ignore_existing_user_config(monkeypatch, cfg_default): @pytest.fixture def session(tmp_path: Path, ignore_existing_user_config, monkeypatch): """Session object with default settings.""" - monkeypatch.setitem(CFG, "rootpath", {"default": {tmp_path: "default"}}) monkeypatch.setitem(CFG, "output_dir", tmp_path / "esmvaltool_output") return CFG.start_session("recipe_test") diff --git a/tests/integration/cmor/_fixes/icon/conftest.py b/tests/integration/cmor/_fixes/icon/conftest.py new file mode 100644 index 0000000000..e8f323c175 --- /dev/null +++ b/tests/integration/cmor/_fixes/icon/conftest.py @@ -0,0 +1,36 @@ +"""Fixtures for ICON fixes tests.""" + +import importlib.resources +from pathlib import Path + +import pytest +import yaml + +import esmvalcore.config +from esmvalcore.cmor._fixes.icon._base_fixes import IconFix + + +@pytest.fixture(autouse=True) +def tmp_cache_dir(monkeypatch, tmp_path): + """Use temporary path as cache directory for all tests in this module.""" + monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path) + + +@pytest.fixture +def session( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, + session: esmvalcore.config.Session, +) -> esmvalcore.config.Session: + """Configure ICON data source for all tests in this module.""" + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / "data-native-icon.yml", + ) as config_file: + cfg = yaml.safe_load(config_file.read_text(encoding="utf-8")) + for data_source in cfg["projects"]["ICON"]["data"]: + cfg["projects"]["ICON"]["data"][data_source]["rootpath"] = tmp_path + session["projects"]["ICON"]["data"] = cfg["projects"]["ICON"]["data"] + session["auxiliary_data_dir"] = tmp_path + return session diff --git a/tests/integration/cmor/_fixes/icon/test_icon.py b/tests/integration/cmor/_fixes/icon/test_icon.py index ce7cd6317a..de6e205f52 100644 --- a/tests/integration/cmor/_fixes/icon/test_icon.py +++ b/tests/integration/cmor/_fixes/icon/test_icon.py @@ -1,6 +1,5 @@ """Test the ICON on-the-fly CMORizer.""" -from copy import deepcopy from datetime import datetime from pathlib import Path from unittest import mock @@ -26,7 +25,6 @@ ) from esmvalcore.cmor.fix import Fix from esmvalcore.cmor.table import 
CoordinateInfo, get_var_info -from esmvalcore.config import CFG from esmvalcore.dataset import Dataset TEST_GRID_FILE_URI = ( @@ -36,12 +34,6 @@ TEST_GRID_FILE_NAME = "icon_grid.nc" -@pytest.fixture(autouse=True) -def tmp_cache_dir(monkeypatch, tmp_path): - """Use temporary path as cache directory for all tests in this module.""" - monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path) - - @pytest.fixture def cubes_atm_2d(test_data_path): """2D sample cubes.""" @@ -572,9 +564,10 @@ def test_get_areacella_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_areacella_fix(cubes_grid): +@pytest.mark.online +def test_areacella_fix(cubes_grid, session): """Test fix.""" - fix = get_allvars_fix("fx", "areacella") + fix = get_allvars_fix("fx", "areacella", session=session) fix.extra_facets["var_type"] = "fx" fixed_cubes = fix.fix_metadata(cubes_grid) @@ -595,9 +588,10 @@ def test_get_areacello_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_areacello_fix(cubes_grid): +@pytest.mark.online +def test_areacello_fix(cubes_grid, session): """Test fix.""" - fix = get_allvars_fix("Ofx", "areacello") + fix = get_allvars_fix("Ofx", "areacello", session=session) fix.extra_facets["var_type"] = "fx" fixed_cubes = fix.fix_metadata(cubes_grid) @@ -655,9 +649,10 @@ def test_get_lwp_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_lwp_fix(cubes_atm_2d): +@pytest.mark.online +def test_lwp_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("AERmon", "lwp") + fix = get_allvars_fix("AERmon", "lwp", session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -683,9 +678,10 @@ def test_get_rsdt_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsdt_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsdt_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsdt") + fix = get_allvars_fix("Amon", "rsdt", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -706,9 +702,10 @@ def test_get_rsut_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_rsut_fix(cubes_atm_2d): +@pytest.mark.online +def test_rsut_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "rsut") + fix = get_allvars_fix("Amon", "rsut", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -732,9 +729,10 @@ def test_get_siconc_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconc_fix(cubes_atm_2d): +@pytest.mark.online +def test_siconc_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("SImon", "siconc") + fix = get_allvars_fix("SImon", "siconc", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_siconc_metadata( @@ -758,9 +756,10 @@ def test_get_siconca_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_siconca_fix(cubes_atm_2d): +@pytest.mark.online +def test_siconca_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("SImon", "siconca") + fix = get_allvars_fix("SImon", "siconca", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_siconc_metadata( @@ -787,9 +786,10 @@ def test_get_ta_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_ta_fix(cubes_atm_3d): +@pytest.mark.online +def test_ta_fix(cubes_atm_3d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "ta") + fix = get_allvars_fix("Amon", "ta", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_3d) cube = 
check_ta_metadata(fixed_cubes) @@ -798,9 +798,10 @@ def test_ta_fix(cubes_atm_3d): check_lat_lon(cube) -def test_ta_fix_no_plev_bounds(cubes_atm_3d): +@pytest.mark.online +def test_ta_fix_no_plev_bounds(cubes_atm_3d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "ta") + fix = get_allvars_fix("Amon", "ta", session=session) cubes = CubeList( [ cubes_atm_3d.extract_cube(NameConstraint(var_name="ta")), @@ -824,9 +825,10 @@ def test_get_tas_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_tas_fix(cubes_atm_2d): +@pytest.mark.online +def test_tas_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) cube = check_tas_metadata(fixed_cubes) @@ -835,9 +837,10 @@ def test_tas_fix(cubes_atm_2d): check_heightxm(cube, 2.0) -def test_tas_spatial_index_coord_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_spatial_index_coord_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) index_coord = DimCoord(np.arange(8), var_name="ncells") cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -849,9 +852,10 @@ def test_tas_spatial_index_coord_already_present(cubes_atm_2d): check_lat_lon(cube) -def test_tas_scalar_height2m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_scalar_height2m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # Scalar height (with wrong metadata) already present height_coord = AuxCoord(2.0, var_name="h", standard_name="height") @@ -901,9 +905,10 @@ def test_tas_no_mesh(cubes_atm_2d): assert cube.coord_dims(lat) == cube.coord_dims(i_coord) -def test_tas_dim_height2m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_tas_dim_height2m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # Dimensional coordinate height (with wrong metadata) already present height_coord = AuxCoord(2.0, var_name="h", standard_name="height") @@ -920,9 +925,10 @@ def test_tas_dim_height2m_already_present(cubes_atm_2d): check_heightxm(cube, 2.0) -def test_tas_no_shift_time(cubes_atm_2d): +@pytest.mark.online +def test_tas_no_shift_time(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["shift_time"] = False fixed_cubes = fix.fix_metadata(cubes_atm_2d) @@ -944,9 +950,10 @@ def test_tas_no_shift_time(cubes_atm_2d): assert time.attributes == {} -def test_fix_does_not_change_cached_grid(cubes_atm_2d): +@pytest.mark.online +def test_fix_does_not_change_cached_grid(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert not fix._horizontal_grids assert not fix._meshes @@ -975,9 +982,10 @@ def test_get_uas_fix(): assert fix == [AllVars(None), GenericFix(None)] -def test_uas_fix(cubes_atm_2d): +@pytest.mark.online +def test_uas_fix(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) fixed_cubes = fix.fix_metadata(cubes_atm_2d) assert len(fixed_cubes) == 1 @@ -1001,9 +1009,10 @@ def test_uas_fix(cubes_atm_2d): assert height.bounds is None -def 
test_uas_scalar_height10m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_uas_scalar_height10m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) # Scalar height (with wrong metadata) already present height_coord = AuxCoord(10.0, var_name="h", standard_name="height") @@ -1017,9 +1026,10 @@ def test_uas_scalar_height10m_already_present(cubes_atm_2d): check_heightxm(cube, 10.0) -def test_uas_dim_height10m_already_present(cubes_atm_2d): +@pytest.mark.online +def test_uas_dim_height10m_already_present(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "uas") + fix = get_allvars_fix("Amon", "uas", session=session) # Dimensional coordinate height (with wrong metadata) already present height_coord = AuxCoord(10.0, var_name="h", standard_name="height") @@ -1108,9 +1118,10 @@ def test_ch4clim_fix(cubes_regular_grid): # Test fix with empty standard_name -def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch): +@pytest.mark.online +def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) # We know that tas has a standard name, but this being native model output # there may be variables with no standard name. The code is designed to # handle this gracefully and here we test it with an artificial, but @@ -1130,7 +1141,8 @@ def test_empty_standard_name_fix(cubes_atm_2d, monkeypatch): # Test automatic addition of missing coordinates -def test_add_time(cubes_atm_2d): +@pytest.mark.online +def test_add_time(cubes_atm_2d, session): """Test fix.""" # Remove time from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -1139,7 +1151,7 @@ def test_add_time(cubes_atm_2d): tas_cube.remove_coord("time") cubes = CubeList([tas_cube, uas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = check_tas_metadata(fixed_cubes) @@ -1162,13 +1174,14 @@ def test_add_time_fail(): fix._add_time(cube, cubes) -def test_add_latitude(cubes_atm_2d): +@pytest.mark.online +def test_add_latitude(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1180,13 +1193,14 @@ def test_add_latitude(cubes_atm_2d): assert TEST_GRID_FILE_NAME in fix._horizontal_grids -def test_add_longitude(cubes_atm_2d): +@pytest.mark.online +def test_add_longitude(cubes_atm_2d, session): """Test fix.""" # Remove longitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("longitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1198,14 +1212,15 @@ def test_add_longitude(cubes_atm_2d): assert TEST_GRID_FILE_NAME in fix._horizontal_grids -def test_add_latitude_longitude(cubes_atm_2d): +@pytest.mark.online +def test_add_latitude_longitude(cubes_atm_2d, session): """Test 
fix.""" # Remove latitude and longitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") tas_cube.remove_coord("longitude") cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 fixed_cubes = fix.fix_metadata(cubes) @@ -1259,14 +1274,15 @@ def test_add_coord_from_grid_file_fail_no_url(): fix._add_coord_from_grid_file(Cube(0), "clat") -def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d): +@pytest.mark.online +def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") index_coord = DimCoord(np.arange(8), var_name="ncells") tas_cube.add_dim_coord(index_coord, 1) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) msg = ( "Cannot determine coordinate dimension for coordinate 'clat', " @@ -1276,13 +1292,14 @@ def test_add_coord_from_grid_fail_no_unnamed_dim(cubes_atm_2d): fix._add_coord_from_grid_file(tas_cube, "clat") -def test_add_coord_from_grid_fail_two_unnamed_dims(cubes_atm_2d): +@pytest.mark.online +def test_add_coord_from_grid_fail_two_unnamed_dims(cubes_atm_2d, session): """Test fix.""" # Remove latitude from tas cube to test automatic addition tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) tas_cube.remove_coord("latitude") tas_cube = iris.util.new_axis(tas_cube) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) msg = ( "Cannot determine coordinate dimension for coordinate 'clat', " @@ -1321,19 +1338,16 @@ def test_get_horizontal_grid_from_attr_cached_in_dict( @mock.patch.object(IconFix, "_get_grid_from_facet", autospec=True) def test_get_horizontal_grid_from_attr_rootpath( mock_get_grid_from_facet, - monkeypatch, tmp_path, + session, ): """Test fix.""" - rootpath = deepcopy(CFG["rootpath"]) - rootpath["ICON"] = str(tmp_path) - monkeypatch.setitem(CFG, "rootpath", rootpath) cube = Cube(0, attributes={"grid_file_uri": "grid.nc"}) grid_cube = Cube(0, var_name="test_grid_cube") (tmp_path / "amip").mkdir(parents=True, exist_ok=True) iris.save(grid_cube, tmp_path / "amip" / "grid.nc") - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix._horizontal_grids["grid_from_facet.nc"] = mock.sentinel.wrong_grid grid = fix.get_horizontal_grid(cube) @@ -1353,6 +1367,7 @@ def test_get_horizontal_grid_from_attr_cached_in_file( mock_requests, mock_get_grid_from_facet, tmp_path, + session, ): """Test fix.""" cube = Cube( @@ -1361,7 +1376,7 @@ def test_get_horizontal_grid_from_attr_cached_in_file( "grid_file_uri": "https://temporary.url/this/is/the/grid_file.nc", }, ) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 # Save temporary grid file @@ -1380,15 +1395,17 @@ def test_get_horizontal_grid_from_attr_cached_in_file( mock_get_grid_from_facet.assert_not_called() +@pytest.mark.online @mock.patch.object(IconFix, "_get_grid_from_facet", autospec=True) def test_get_horizontal_grid_from_attr_cache_file_too_old( mock_get_grid_from_facet, tmp_path, monkeypatch, + session, ): """Test fix.""" cube = Cube(0, attributes={"grid_file_uri": TEST_GRID_FILE_URI}) - fix = 
get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) assert len(fix._horizontal_grids) == 0 # Save temporary grid file @@ -1417,11 +1434,9 @@ def test_get_horizontal_grid_from_attr_cache_file_too_old( def test_get_horizontal_grid_from_facet_cached_in_dict( mock_get_grid_from_cube_attr, tmp_path, + session, ): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Save temporary grid file (this will not be used; however, it is necessary # to not raise a FileNotFoundError) grid_path = "grid.nc" @@ -1451,11 +1466,9 @@ def test_get_horizontal_grid_from_facet( mock_get_grid_from_cube_attr, grid_path, tmp_path, + session, ): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - # Make sure that grid specified by cube attribute is NOT used cube = Cube(0, attributes={"grid_file_uri": "cached_grid_url.nc"}) @@ -1479,11 +1492,8 @@ def test_get_horizontal_grid_from_facet( mock_get_grid_from_cube_attr.assert_not_called() -def test_get_horizontal_grid_from_facet_fail(tmp_path): +def test_get_horizontal_grid_from_facet_fail(session): """Test fix.""" - session = CFG.start_session("my session") - session["auxiliary_data_dir"] = tmp_path - cube = Cube(0) fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["horizontal_grid"] = "/this/does/not/exist.nc" @@ -1742,9 +1752,10 @@ def test_invalid_time_units(cubes_atm_2d): # Test fix with (sub-)hourly data -def test_hourly_data(cubes_atm_2d): +@pytest.mark.online +def test_hourly_data(cubes_atm_2d, session): """Test fix.""" - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fix.extra_facets["frequency"] = "1hr" for cube in cubes_atm_2d: cube.coord("time").points = [20041104.5833333] @@ -2093,8 +2104,9 @@ def test_get_previous_timestep(frequency, datetime_in, datetime_out): # Test mesh creation raises warning because bounds do not match vertices +@pytest.mark.online @mock.patch("esmvalcore.cmor._fixes.icon._base_fixes.logger", autospec=True) -def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): +def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d, session): """Test fix.""" # Slightly modify latitude bounds from tas cube to make mesh creation fail tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -2102,7 +2114,7 @@ def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): lat_bnds[0, 0] = 40.0 tas_cube.coord("latitude").bounds = lat_bnds cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", session=session) fixed_cubes = fix.fix_metadata(cubes) cube = check_tas_metadata(fixed_cubes) @@ -2117,8 +2129,9 @@ def test_get_mesh_fail_invalid_clat_bounds(mock_logger, cubes_atm_2d): ) +@pytest.mark.online @mock.patch("esmvalcore.cmor._fixes.icon._base_fixes.logger", autospec=True) -def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d): +def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d, session): """Test fix.""" # Slightly modify longitude bounds from tas cube to make mesh creation fail tas_cube = cubes_atm_2d.extract_cube(NameConstraint(var_name="tas")) @@ -2126,7 +2139,7 @@ def test_get_mesh_fail_invalid_clon_bounds(mock_logger, cubes_atm_2d): lon_bnds[0, 1] = 40.0 tas_cube.coord("longitude").bounds = lon_bnds cubes = CubeList([tas_cube]) - fix = get_allvars_fix("Amon", "tas") + fix = get_allvars_fix("Amon", "tas", 
session=session)
     fixed_cubes = fix.fix_metadata(cubes)
     cube = check_tas_metadata(fixed_cubes)
@@ -2189,11 +2202,8 @@ def test_get_mesh_not_cached_from_attr(monkeypatch):
     fix._create_mesh.assert_called_once_with(cube)


-def test_get_mesh_cached_from_facet(monkeypatch, tmp_path):
+def test_get_mesh_cached_from_facet(monkeypatch, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
-
     # Save temporary grid file (this will not be used; however, it is necessary
     # to not raise a FileNotFoundError)
     grid_path = "grid.nc"
@@ -2213,11 +2223,8 @@ def test_get_mesh_cached_from_facet(monkeypatch, tmp_path):
     fix._create_mesh.assert_not_called()


-def test_get_mesh_not_cached_from_facet(monkeypatch, tmp_path):
+def test_get_mesh_not_cached_from_facet(monkeypatch, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
-
     # Save temporary grid file (this will not be used; however, it is necessary
     # to not raise a FileNotFoundError)
     grid_path = "grid.nc"
@@ -2245,10 +2252,8 @@
         ("b.nc", "Grid file", "{tmp_path}/b.nc"),
     ],
 )
-def test_get_path_from_facet(path, description, output, tmp_path):
+def test_get_path_from_facet(path, description, output, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
     path = path.format(tmp_path=tmp_path)
     fix = get_allvars_fix("Amon", "tas", session=session)
     fix.extra_facets["test_path"] = path
@@ -2271,10 +2276,8 @@
         ("b.nc", "Grid file"),
     ],
 )
-def test_get_path_from_facet_fail(path, description, tmp_path):
+def test_get_path_from_facet_fail(path, description, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
     path = path.format(tmp_path=tmp_path)
     fix = get_allvars_fix("Amon", "tas", session=session)
     fix.extra_facets["test_path"] = path
@@ -2288,10 +2291,8 @@

 @pytest.mark.parametrize("facet", ["zg_file", "zghalf_file"])
 @pytest.mark.parametrize("path", ["{tmp_path}/a.nc", "a.nc"])
-def test_add_additional_cubes(path, facet, tmp_path):
+def test_add_additional_cubes(path, facet, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
     path = path.format(tmp_path=tmp_path)
     fix = get_allvars_fix("Amon", "tas", session=session)
     fix.extra_facets[facet] = path
@@ -2310,10 +2311,8 @@

 @pytest.mark.parametrize("facet", ["zg_file", "zghalf_file"])
 @pytest.mark.parametrize("path", ["{tmp_path}/a.nc", "a.nc"])
-def test_add_additional_cubes_fail(path, facet, tmp_path):
+def test_add_additional_cubes_fail(path, facet, tmp_path, session):
     """Test fix."""
-    session = CFG.start_session("my session")
-    session["auxiliary_data_dir"] = tmp_path
     path = path.format(tmp_path=tmp_path)
     fix = get_allvars_fix("Amon", "tas", session=session)
     fix.extra_facets[facet] = path
diff --git a/tests/integration/cmor/_fixes/icon/test_icon_xpp.py b/tests/integration/cmor/_fixes/icon/test_icon_xpp.py
index 42d711dd43..a089eba095 100644
--- a/tests/integration/cmor/_fixes/icon/test_icon_xpp.py
+++ b/tests/integration/cmor/_fixes/icon/test_icon_xpp.py
@@ -10,7 +10,7 @@ import esmvalcore.cmor._fixes.icon.icon_xpp
 from esmvalcore.cmor._fixes.fix import GenericFix
-from esmvalcore.cmor._fixes.icon._base_fixes import AllVarsBase, IconFix
+from esmvalcore.cmor._fixes.icon._base_fixes import AllVarsBase
 from esmvalcore.cmor._fixes.icon.icon_xpp import (
     AllVars,
     Clwvi,
@@ -30,12 +30,6 @@
 from esmvalcore.dataset import Dataset


-@pytest.fixture(autouse=True)
-def tmp_cache_dir(monkeypatch, tmp_path):
-    """Use temporary path as cache directory for all tests in this module."""
-    monkeypatch.setattr(IconFix, "CACHE_DIR", tmp_path)
-
-
 @pytest.fixture
 def cubes_atm_2d(test_data_path):
     """2D sample cubes."""
@@ -732,7 +726,8 @@ def test_get_rlutcs_fix():
     assert fix == [Rlutcs(None), AllVars(None), GenericFix(None)]


-def test_rlutcs_fix(cubes_atm_3d):
+@pytest.mark.online
+def test_rlutcs_fix(cubes_atm_3d, session):
     """Test fix."""
     cube = cubes_atm_3d.extract_cube(NameConstraint(var_name="temp"))
     cube.var_name = "lwflx_up_clr"
@@ -740,7 +735,7 @@
     cube.data = np.arange(1 * 47 * 8, dtype=np.float32).reshape(1, 47, 8)
     cubes = CubeList([cube])

-    fixed_cubes = fix_metadata(cubes, "Amon", "rlutcs")
+    fixed_cubes = fix_metadata(cubes, "Amon", "rlutcs", session=session)

     assert len(fixed_cubes) == 1
     cube = fixed_cubes[0]
@@ -770,9 +765,10 @@ def test_get_rsdt_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_rsdt_fix(cubes_atm_2d):
+@pytest.mark.online
+def test_rsdt_fix(cubes_atm_2d, session):
     """Test fix."""
-    fix = get_allvars_fix("Amon", "rsdt")
+    fix = get_allvars_fix("Amon", "rsdt", session=session)
     fixed_cubes = fix.fix_metadata(cubes_atm_2d)

     assert len(fixed_cubes) == 1
@@ -793,9 +789,10 @@ def test_get_rsut_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_rsut_fix(cubes_atm_2d):
+@pytest.mark.online
+def test_rsut_fix(cubes_atm_2d, session):
     """Test fix."""
-    fix = get_allvars_fix("Amon", "rsut")
+    fix = get_allvars_fix("Amon", "rsut", session=session)
     fixed_cubes = fix.fix_metadata(cubes_atm_2d)

     assert len(fixed_cubes) == 1
@@ -819,7 +816,8 @@ def test_get_rsutcs_fix():
     assert fix == [Rsutcs(None), AllVars(None), GenericFix(None)]


-def test_rsutcs_fix(cubes_atm_3d):
+@pytest.mark.online
+def test_rsutcs_fix(cubes_atm_3d, session):
     """Test fix."""
     cube = cubes_atm_3d.extract_cube(NameConstraint(var_name="temp"))
     cube.var_name = "swflx_up_clr"
@@ -827,7 +825,7 @@
     cube.data = np.arange(1 * 47 * 8, dtype=np.float32).reshape(1, 47, 8)
     cubes = CubeList([cube])

-    fixed_cubes = fix_metadata(cubes, "Amon", "rsutcs")
+    fixed_cubes = fix_metadata(cubes, "Amon", "rsutcs", session=session)

     assert len(fixed_cubes) == 1
     cube = fixed_cubes[0]
@@ -923,7 +921,8 @@ def test_get_siconc_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_siconc_fix(cubes_ocean_3d):
+@pytest.mark.online
+def test_siconc_fix(cubes_ocean_3d, session):
     """Test fix."""
     cubes = CubeList(
         [cubes_ocean_3d.extract_cube(NameConstraint(var_name="to")).copy()],
@@ -936,7 +935,7 @@
     cubes[0].remove_coord("depth")
     cubes[0].add_dim_coord(DimCoord(0.0, var_name="lev"), 1)

-    fix = get_allvars_fix("SImon", "siconc")
+    fix = get_allvars_fix("SImon", "siconc", session=session)
     fixed_cubes = fix.fix_metadata(cubes)

     cube = check_siconc_metadata(
@@ -978,9 +977,10 @@ def test_get_siconca_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_siconca_fix(cubes_atm_2d):
+@pytest.mark.online
+def test_siconca_fix(cubes_atm_2d, session):
     """Test fix."""
-    fix = get_allvars_fix("SImon", "siconca")
+    fix = get_allvars_fix("SImon", "siconca", session=session)
     fixed_cubes = fix.fix_metadata(cubes_atm_2d)

     cube = check_siconc_metadata(
@@ -1007,9 +1007,10 @@ def test_get_ta_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_ta_fix(cubes_atm_3d):
+@pytest.mark.online
+def test_ta_fix(cubes_atm_3d, session):
     """Test fix."""
-    fix = get_allvars_fix("Amon", "ta")
+    fix = get_allvars_fix("Amon", "ta", session=session)
     fixed_cubes = fix.fix_metadata(cubes_atm_3d)

     cube = check_ta_metadata(fixed_cubes)
@@ -1030,9 +1031,10 @@ def test_get_tas_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_tas_fix(cubes_atm_2d):
+@pytest.mark.online
+def test_tas_fix(cubes_atm_2d, session):
     """Test fix."""
-    fix = get_allvars_fix("Amon", "tas")
+    fix = get_allvars_fix("Amon", "tas", session=session)
     fixed_cubes = fix.fix_metadata(cubes_atm_2d)

     cube = check_tas_metadata(fixed_cubes)
@@ -1068,9 +1070,10 @@ def test_get_thetao_fix():
     assert fix == [AllVars(None), GenericFix(None)]


-def test_thetao_fix(cubes_ocean_3d):
+@pytest.mark.online
+def test_thetao_fix(cubes_ocean_3d, session):
     """Test fix."""
-    fix = get_allvars_fix("Omon", "thetao")
+    fix = get_allvars_fix("Omon", "thetao", session=session)

     fixed_cubes = fix.fix_metadata(cubes_ocean_3d)

@@ -1089,7 +1092,8 @@ def test_thetao_fix(cubes_ocean_3d):
     assert cube.shape == (1, 47, 8)


-def test_thetao_fix_already_bounds(cubes_ocean_3d):
+@pytest.mark.online
+def test_thetao_fix_already_bounds(cubes_ocean_3d, session):
     """Test fix."""
     cube = cubes_ocean_3d.extract_cube(NameConstraint(var_name="to"))
     cube.coord("depth").guess_bounds()
@@ -1098,7 +1102,7 @@
     cube.coord("depth").bounds = bounds
     cubes = CubeList([cube])

-    fix = get_allvars_fix("Omon", "thetao")
+    fix = get_allvars_fix("Omon", "thetao", session=session)

     fixed_cubes = fix.fix_metadata(cubes)

@@ -1118,12 +1122,13 @@
     assert cube.shape == (1, 47, 8)


-def test_thetao_fix_no_bounds(cubes_ocean_3d):
+@pytest.mark.online
+def test_thetao_fix_no_bounds(cubes_ocean_3d, session):
     """Test fix."""
     cube = cubes_ocean_3d.extract_cube(NameConstraint(var_name="to"))
     cubes = CubeList([cube])

-    fix = get_allvars_fix("Omon", "thetao")
+    fix = get_allvars_fix("Omon", "thetao", session=session)

     fixed_cubes = fix.fix_metadata(cubes)
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 391d2ab258..85b1505866 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -1,4 +1,5 @@
 import os
+from collections.abc import Iterator
 from pathlib import Path

 import iris
@@ -11,6 +12,7 @@
     _select_drs,
     _select_files,
 )
+from esmvalcore.typing import Facets


 def create_test_file(filename, tracking_id=None):
@@ -27,7 +29,12 @@ def create_test_file(filename, tracking_id=None):
     iris.save(cube, filename)


-def _get_files(root_path, facets, tracking_id):
+def _get_files(  # noqa: C901,PLR0912
+    root_path: Path,
+    facets: Facets,
+    tracking_id: Iterator[int],
+    suffix: str = "nc",
+) -> tuple[list[LocalFile], list[Path]]:
     """Return dummy files.

     Wildcards are only supported for `dataset` and `institute`; in this case
@@ -43,8 +50,8 @@ def _get_files(root_path, facets, tracking_id):
     all_facets = [facets]

     # Globs without expanded facets
-    dir_template = _select_drs("input_dir", facets["project"], "default")
-    file_template = _select_drs("input_file", facets["project"], "default")
+    dir_template = _select_drs("input_dir", facets["project"], "default")  # type: ignore[arg-type]
+    file_template = _select_drs("input_file", facets["project"], "default")  # type: ignore[arg-type]
     dir_globs = _replace_tags(dir_template, facets)
     file_globs = _replace_tags(file_template, facets)
     globs = sorted(
@@ -56,49 +63,57 @@
         filenames = []
         dir_template = _select_drs(
             "input_dir",
-            expanded_facets["project"],
+            expanded_facets["project"],  # type: ignore[arg-type]
             "default",
         )
         file_template = _select_drs(
             "input_file",
-            expanded_facets["project"],
+            expanded_facets["project"],  # type: ignore[arg-type]
             "default",
         )
+
         dir_globs = _replace_tags(dir_template, expanded_facets)
         file_globs = _replace_tags(file_template, expanded_facets)
         filename = str(
             root_path / "input" / dir_globs[0] / Path(file_globs[0]).name,
         )
+        if filename.endswith("nc"):
+            filename = f"{filename[:-2]}{suffix}"
+
+        if filename.endswith(f"[_.]*{suffix}"):
+            filename = filename.replace(f"[_.]*{suffix}", f"_*.{suffix}")

-        if filename.endswith("[_.]*nc"):
-            filename = filename.replace("[_.]*nc", "_*.nc")
-
-        if filename.endswith("*.nc"):
-            filename = filename[: -len("*.nc")] + "_"
-            if facets["frequency"] == "fx":
-                intervals = [""]
-            else:
-                intervals = [
-                    "1990_1999",
-                    "2000_2009",
-                    "2010_2019",
-                ]
+        if facets["frequency"] == "fx":
+            intervals = [""]
+        else:
+            intervals = [
+                "1990-1999",
+                "2000-2009",
+                "2010-2019",
+            ]
+        if filename.endswith(f"*.{suffix}"):
+            filename = filename[: -len(f"*.{suffix}")]
             for interval in intervals:
-                filenames.append(filename + interval + ".nc")
+                filenames.append(f"{filename}_{interval}.{suffix}")
         else:
             filenames.append(filename)

-        if "timerange" in facets:
-            filenames = _select_files(filenames, facets["timerange"])
-
-        for filename in filenames:
-            create_test_file(filename, next(tracking_id))
+        if suffix == "nc":
+            for filename in filenames:
+                create_test_file(filename, next(tracking_id))

         for filename in filenames:
             file = LocalFile(filename)
-            file.facets = expanded_facets
+            file.facets = dict(expanded_facets)
+            if facets["frequency"] != "fx":
+                for interval in intervals:
+                    if interval in filename:
+                        file.facets["timerange"] = interval.replace("-", "/")
             files.append(file)

+    if "timerange" in facets:
+        files = _select_files(files, facets["timerange"])
+
     return files, globs

@@ -108,13 +123,11 @@ def _tracking_ids(i=0):
         i += 1


-def _get_find_files_func(path: Path, suffix: str = ".nc"):
+def _get_find_files_func(path: Path, suffix: str = "nc"):
     tracking_id = _tracking_ids()

-    def find_files(*, debug: bool = False, **facets):
-        files, file_globs = _get_files(path, facets, tracking_id)
-        files = [f.with_suffix(suffix) for f in files]
-        file_globs = [g.with_suffix(suffix) for g in file_globs]
+    def find_files(self, *, debug: bool = False, **facets):
+        files, file_globs = _get_files(path, facets, tracking_id, suffix)
         if debug:
             return files, file_globs
         return files
@@ -125,13 +138,21 @@ def find_files(*, debug: bool = False, **facets):
 @pytest.fixture
 def patched_datafinder(tmp_path, monkeypatch):
     find_files = _get_find_files_func(tmp_path)
-    monkeypatch.setattr(esmvalcore.local, "find_files", find_files)
+    monkeypatch.setattr(
+        esmvalcore.local.LocalDataSource,
+        "find_data",
+        find_files,
+    )


 @pytest.fixture
 def patched_datafinder_grib(tmp_path, monkeypatch):
-    find_files = _get_find_files_func(tmp_path, suffix=".grib")
-    monkeypatch.setattr(esmvalcore.local, "find_files", find_files)
+    find_files = _get_find_files_func(tmp_path, suffix="grib")
+    monkeypatch.setattr(
+        esmvalcore.local.LocalDataSource,
+        "find_data",
+        find_files,
+    )


 @pytest.fixture
@@ -147,7 +168,7 @@ def patched_failing_datafinder(tmp_path, monkeypatch):
     """
     tracking_id = _tracking_ids()

-    def find_files(*, debug: bool = False, **facets):
+    def find_files(self, *, debug: bool = False, **facets):
         files, file_globs = _get_files(tmp_path, facets, tracking_id)
         if facets["frequency"] == "fx":
             files = []
@@ -159,4 +180,8 @@
             return returned_files, file_globs
         return returned_files

-    monkeypatch.setattr(esmvalcore.local, "find_files", find_files)
+    monkeypatch.setattr(
+        esmvalcore.local.LocalDataSource,
+        "find_data",
+        find_files,
+    )
diff --git a/tests/integration/dataset/test_dataset.py b/tests/integration/dataset/test_dataset.py
index cdc8310ea0..8558e1de28 100644
--- a/tests/integration/dataset/test_dataset.py
+++ b/tests/integration/dataset/test_dataset.py
@@ -1,15 +1,19 @@
+from __future__ import annotations
+
 from pathlib import Path
+from typing import TYPE_CHECKING

-import iris.coords
 import iris.cube
 import pytest

-from esmvalcore.config import CFG
 from esmvalcore.dataset import Dataset

+if TYPE_CHECKING:
+    from esmvalcore.config import Session
+

 @pytest.fixture
-def example_data(tmp_path, monkeypatch):
+def example_data_source(tmp_path: Path) -> dict[str, str]:
     cwd = Path(__file__).parent
     tas_src = cwd / "tas.nc"
     areacella_src = cwd / "areacella.nc"
@@ -49,13 +53,15 @@
     areacella_tgt.parent.mkdir(parents=True, exist_ok=True)
     areacella_tgt.symlink_to(areacella_src)
-
-    monkeypatch.setitem(CFG, "rootpath", {"CMIP5": str(rootpath)})
-    monkeypatch.setitem(CFG, "drs", {"CMIP5": "ESGF"})
-    monkeypatch.setitem(CFG, "output_dir", tmp_path / "output_dir")
+    return {
+        "type": "esmvalcore.local.LocalDataSource",
+        "rootpath": str(rootpath),
+        "dirname_template": "{project.lower}/{product}/{institute}/{dataset}/{exp}/{frequency}/{modeling_realm}/{mip}/{ensemble}/{version}",
+        "filename_template": "{short_name}_{mip}_{dataset}_{exp}_{ensemble}*.nc",
+    }


-def test_load(example_data):
+def test_load(example_data_source: dict[str, str], session: Session) -> None:
     tas = Dataset(
         short_name="tas",
         mip="Amon",
@@ -66,6 +72,10 @@
         timerange="1850/185002",
     )
     tas.add_supplementary(short_name="areacella", mip="fx", ensemble="r0i0p0")
+    tas.session = session
+    tas.session["projects"]["CMIP5"]["data"] = {
+        "example-data-source": example_data_source,
+    }

     tas.augment_facets()
diff --git a/tests/integration/esgf/search_results/expected.yml b/tests/integration/esgf/search_results/expected.yml
index 24f02b9181..11f3f423e8 100644
--- a/tests/integration/esgf/search_results/expected.yml
+++ b/tests/integration/esgf/search_results/expected.yml
@@ -20,6 +20,7 @@ Amon_r1i1p1_historical,rcp85_INM-CM4_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "185001/200512"
     version: v20130207
   local_file: cmip5/output1/INM/inmcm4/historical/mon/atmos/Amon/r1i1p1/v20130207/tas_Amon_inmcm4_historical_r1i1p1_185001-200512.nc
   name: tas_Amon_inmcm4_historical_r1i1p1_185001-200512.nc
@@ -50,6 +51,7 @@ Amon_r1i1p1_historical,rcp85_INM-CM4_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "200601/210012"
    version: v20130207
   local_file: cmip5/output1/INM/inmcm4/rcp85/mon/atmos/Amon/r1i1p1/v20130207/tas_Amon_inmcm4_rcp85_r1i1p1_200601-210012.nc
   name: tas_Amon_inmcm4_rcp85_r1i1p1_200601-210012.nc
@@ -81,6 +83,7 @@ Amon_r1i1p1_historical_FIO-ESM_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "185001/200512"
     version: v20121010
   local_file: cmip5/output1/FIO/FIO-ESM/historical/mon/atmos/Amon/r1i1p1/v20121010/tas_Amon_FIO-ESM_historical_r1i1p1_185001-200512.nc
   name: tas_Amon_FIO-ESM_historical_r1i1p1_185001-200512.nc
@@ -108,6 +111,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "205512/208011"
     version: v20120531
   local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_205512-208011.nc
   name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_205512-208011.nc
@@ -132,6 +136,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "208012/209912"
     version: v20120531
   local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_208012-209912.nc
   name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_208012-209912.nc
@@ -156,6 +161,7 @@ Amon_r1i1p1_rcp85_HadGEM2-CC_CMIP5_tas.json:
     project: CMIP5
     modeling_realm: atmos
     short_name: tas
+    timerange: "210001/210012"
     version: v20120531
   local_file: cmip5/output1/MOHC/HadGEM2-CC/rcp85/mon/atmos/Amon/r1i1p1/v20120531/tas_Amon_HadGEM2-CC_rcp85_r1i1p1_210001-210012.nc
   name: tas_Amon_HadGEM2-CC_rcp85_r1i1p1_210001-210012.nc
@@ -180,6 +186,7 @@ EUR-11_MOHC-HadGEM2-ES_r1i1p1_historical_CORDEX_RACMO22E_mon_tas.json:
     project: CORDEX
     rcm_version: v2
     short_name: tas
+    timerange: "195001/195012"
     version: v20160620
   local_file: cordex/output/EUR-11/KNMI/MOHC-HadGEM2-ES/historical/r1i1p1/RACMO22E/v2/mon/tas/v20160620/tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195001-195012.nc
   name: tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195001-195012.nc
@@ -202,6 +209,7 @@ EUR-11_MOHC-HadGEM2-ES_r1i1p1_historical_CORDEX_RACMO22E_mon_tas.json:
     project: CORDEX
     rcm_version: v2
     short_name: tas
+    timerange: "195101/196012"
     version: v20160620
   local_file: cordex/output/EUR-11/KNMI/MOHC-HadGEM2-ES/historical/r1i1p1/RACMO22E/v2/mon/tas/v20160620/tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195101-196012.nc
   name: tas_EUR-11_MOHC-HadGEM2-ES_historical_r1i1p1_KNMI-RACMO22E_v2_mon_195101-196012.nc
@@ -233,6 +241,7 @@ historical_gn_r4i1p1f1_CMIP6_CESM2_Amon_tas.json:
     mip: Amon
     project: CMIP6
     short_name: tas
+    timerange: "185001/201412"
     version: v20190308
   local_file: CMIP6/CMIP/NCAR/CESM2/historical/r4i1p1f1/Amon/tas/gn/v20190308/tas_Amon_CESM2_historical_r4i1p1f1_gn_185001-201412.nc
   name: tas_Amon_CESM2_historical_r4i1p1f1_gn_185001-201412.nc
@@ -256,6 +265,7 @@ obs4MIPs_CERES-EBAF_mon_rsutcs.json:
     project: obs4MIPs
     modeling_realm: atmos
     short_name: rsutcs
+    timerange: "200003/201404"
     version: v20160610
   local_file: obs4MIPs/CERES-EBAF/v20160610/rsutcs_CERES-EBAF_L3B_Ed2-8_200003-201404.nc
   name: rsutcs_CERES-EBAF_L3B_Ed2-8_200003-201404.nc
@@ -273,6 +283,7 @@ obs4MIPs_GPCP-V2.3_pr.json:
     institute: NASA-GSFC
     project: obs4MIPs
     short_name: pr
+    timerange: "197901/201710"
     version: v20180519
   local_file: obs4MIPs/GPCP-V2.3/v20180519/pr_GPCP-SG_L3_v2.3_197901-201710.nc
   name: pr_GPCP-SG_L3_v2.3_197901-201710.nc
@@ -293,6 +304,7 @@ run1_historical_cccma_cgcm3_1_CMIP3_mon_tas.json:
     project: CMIP3
     modeling_realm: atmos
     short_name: tas
+    timerange: "1850/2000"
     version: v1
   local_file: cmip3/CCCma/cccma_cgcm3_1/historical/mon/atmos/run1/tas/v1/tas_a1_20c3m_1_cgcm3.1_t47_1850_2000.nc
   name: tas_a1_20c3m_1_cgcm3.1_t47_1850_2000.nc
diff --git a/tests/integration/esgf/test_search_download.py b/tests/integration/esgf/test_search_download.py
index 33680a42b3..685e55c937 100644
--- a/tests/integration/esgf/test_search_download.py
+++ b/tests/integration/esgf/test_search_download.py
@@ -183,6 +183,7 @@ def test_mock_search(variable, mocker):
     ]


+@pytest.mark.online
 def test_real_search():
     """Test a real search for a single file."""
     variable = {
diff --git a/tests/integration/preprocessor/_io/test_load.py b/tests/integration/preprocessor/_io/test_load.py
index 59fbe09d78..1a9e747f4a 100644
--- a/tests/integration/preprocessor/_io/test_load.py
+++ b/tests/integration/preprocessor/_io/test_load.py
@@ -13,7 +13,7 @@
 from iris.cube import Cube, CubeList

 from esmvalcore.exceptions import ESMValCoreLoadWarning
-from esmvalcore.preprocessor._io import _get_attr_from_field_coord, load
+from esmvalcore.preprocessor._io import load
 from tests import assert_array_equal

@@ -141,15 +141,13 @@ def test_callback_fix_lat_units(tmp_path, sample_cube):
     assert str(sample_cube.coord("latitude").units) == "degrees_north"


-def test_get_attr_from_field_coord_none(mocker):
-    """Test ``_get_attr_from_field_coord``."""
-    attr = _get_attr_from_field_coord(mocker.sentinel.ncfield, None, "attr")
-    assert attr is None
-
-
 def test_fail_empty_cubes(mocker):
     """Test that ValueError is raised when cubes are empty."""
-    mocker.patch("iris.load_raw", autospec=True, return_value=CubeList([]))
+    mocker.patch(
+        "esmvalcore.preprocessor._io.LocalFile.to_iris",
+        autospec=True,
+        return_value=CubeList([]),
+    )
     msg = "myfilename does not contain any data"
     with pytest.raises(ValueError, match=msg):
         load("myfilename")
diff --git a/tests/integration/preprocessor/_io/test_zarr.py b/tests/integration/preprocessor/_io/test_zarr.py
index fc5684c967..7899a107a9 100644
--- a/tests/integration/preprocessor/_io/test_zarr.py
+++ b/tests/integration/preprocessor/_io/test_zarr.py
@@ -48,6 +48,7 @@ def test_load_zarr2_local(input_type):
     assert "latitude" in coord_names


+@pytest.mark.online
 def test_load_zarr2_remote():
     """Test loading a Zarr2 store from a https Object Store."""
     zarr_path = (
@@ -88,6 +89,7 @@
     assert "latitude" in coord_names


+@pytest.mark.online
 def test_load_zarr3_remote():
     """Test loading a Zarr3 store from a https Object Store."""
     zarr_path = (
@@ -114,6 +116,7 @@
     assert "latitude" in coord_names


+@pytest.mark.online
 def test_load_zarr3_cmip6_metadata():
     """
     Test loading a Zarr3 store from a https Object Store.
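Note: the `@pytest.mark.online` marker applied throughout this patch must also be registered with pytest, otherwise test runs emit `PytestUnknownMarkWarning`. A minimal sketch of such a registration, assuming the marker is not already declared in the repository's pytest configuration (the `pytest_configure` hook and `addinivalue_line` API are standard pytest; the description text is hypothetical):

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom "online" marker so pytest recognises it and
    # marker expressions can select or deselect internet-dependent tests.
    config.addinivalue_line(
        "markers",
        "online: test requires an internet connection",
    )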
diff --git a/tests/integration/recipe/test_check.py b/tests/integration/recipe/test_check.py
index 8c6c7009ce..3879b46724 100644
--- a/tests/integration/recipe/test_check.py
+++ b/tests/integration/recipe/test_check.py
@@ -1,19 +1,19 @@
 """Integration tests for :mod:`esmvalcore._recipe.check`."""

-import os.path
 import subprocess
 from pathlib import Path
-from typing import Any
 from unittest import mock

 import pyesgf.search.results
 import pytest
+import pytest_mock

 import esmvalcore._recipe.check
 import esmvalcore.esgf
 from esmvalcore._recipe import check
 from esmvalcore.dataset import Dataset
 from esmvalcore.exceptions import RecipeError
+from esmvalcore.local import LocalFile
 from esmvalcore.preprocessor import PreprocessorFile

@@ -142,7 +142,12 @@
 def test_data_availability_data(mock_logger, input_files, var, error):
     """Test check for data when data is present."""
     dataset = Dataset(**var)
-    dataset.files = [Path(f) for f in input_files]
+    files = []
+    for filename in input_files:
+        file = LocalFile(filename)
+        file.facets["timerange"] = filename.split("_")[-1].replace("-", "/")
+        files.append(file)
+    dataset.files = files
     if error is None:
         check.data_availability(dataset)
         mock_logger.error.assert_not_called()
@@ -153,51 +158,39 @@
     assert dataset.facets == var


-DATA_AVAILABILITY_NO_DATA: list[Any] = [
-    ([], [], None),
-    ([""], ["a*.nc"], (ERR_ALL, ": a*.nc")),
-    ([""], ["a*.nc", "b*.nc"], (ERR_ALL, "\na*.nc\nb*.nc")),
-    (["1"], ["a"], (ERR_ALL, ": 1/a")),
-    (["1"], ["a", "b"], (ERR_ALL, "\n1/a\n1/b")),
-    (["1", "2"], ["a"], (ERR_ALL, "\n1/a\n2/a")),
-    (["1", "2"], ["a", "b"], (ERR_ALL, "\n1/a\n1/b\n2/a\n2/b")),
-]
-
-
-@pytest.mark.parametrize(
-    ("dirnames", "filenames", "error"),
-    DATA_AVAILABILITY_NO_DATA,
-)
-@mock.patch("esmvalcore._recipe.check.logger", autospec=True)
-def test_data_availability_no_data(mock_logger, dirnames, filenames, error):
+def test_data_availability_no_data(
+    caplog: pytest.LogCaptureFixture,
+    mocker: pytest_mock.MockerFixture,
+) -> None:
     """Test check for data when no data is present."""
-    facets = {
-        "frequency": "mon",
-        "short_name": "tas",
-        "timerange": "2020/2025",
-        "alias": "alias",
-        "start_year": 2020,
-        "end_year": 2025,
-    }
-    dataset = Dataset(**facets)
+    dataset = Dataset(
+        frequency="mon",
+        short_name="tas",
+        timerange="2020/2025",
+        alias="alias",
+        start_year=2020,
+        end_year=2025,
+    )
     dataset.files = []
-    dataset._file_globs = [
-        os.path.join(d, f) for d in dirnames for f in filenames
-    ]
-    error_first = ("No input files found for %s", dataset)
-    error_last = ("Set 'log_level' to 'debug' to get more information",)
-    with pytest.raises(RecipeError) as rec_err:
+    mock_data_source = mocker.Mock()
+    mock_data_source.debug_info = "debug info"
+    dataset._used_data_sources = [mock_data_source]
+    with pytest.raises(RecipeError) as exc:
         check.data_availability(dataset)
-    assert str(rec_err.value) == "Missing data for Dataset: tas"
-    if error is None:
-        assert mock_logger.error.call_count == 2
-        errors = [error_first, error_last]
-    else:
-        assert mock_logger.error.call_count == 3
-        errors = [error_first, error, error_last]
-    calls = [mock.call(*e) for e in errors]
-    assert mock_logger.error.call_args_list == calls
-    assert dataset.facets == facets
+    assert str(exc.value) == "Missing data for Dataset: tas"
+    assert len(caplog.records) == 2
+    assert caplog.records[0].message == "\n".join(
+        [
+            f"No files were found for {dataset},",
+            "using data sources:",
+            f"- data source: {mock_data_source}",
+            "  message: debug info",
+        ],
+    )
+    assert (
+        caplog.records[1].message
+        == "Set 'log_level' to 'debug' to get more information"
+    )


 GOOD_TIMERANGES = [
@@ -324,9 +317,9 @@ def test_data_availability_nonexistent(tmp_path):
 def test_reference_for_bias_preproc_empty():
     """Test ``reference_for_bias_preproc``."""
     products = {
-        PreprocessorFile(filename=10),
-        PreprocessorFile(filename=20),
-        PreprocessorFile(filename=30),
+        PreprocessorFile(filename=Path("10")),
+        PreprocessorFile(filename=Path("20")),
+        PreprocessorFile(filename=Path("30")),
     }
     check.reference_for_bias_preproc(products)

@@ -334,11 +327,11 @@ def test_reference_for_bias_preproc_empty():
 def test_reference_for_bias_preproc_one_ref():
     """Test ``reference_for_bias_preproc`` with one reference."""
     products = {
-        PreprocessorFile(filename=90),
-        PreprocessorFile(filename=10, settings={"bias": {}}),
-        PreprocessorFile(filename=20, settings={"bias": {}}),
+        PreprocessorFile(filename=Path("90")),
+        PreprocessorFile(filename=Path("10"), settings={"bias": {}}),
+        PreprocessorFile(filename=Path("20"), settings={"bias": {}}),
         PreprocessorFile(
-            filename=30,
+            filename=Path("30"),
             settings={"bias": {}},
             attributes={"reference_for_bias": True},
         ),
@@ -349,10 +342,10 @@ def test_reference_for_bias_preproc_one_ref():
 def test_reference_for_bias_preproc_no_ref():
     """Test ``reference_for_bias_preproc`` with no reference."""
     products = {
-        PreprocessorFile(filename=90),
-        PreprocessorFile(filename=10, settings={"bias": {}}),
-        PreprocessorFile(filename=20, settings={"bias": {}}),
-        PreprocessorFile(filename=30, settings={"bias": {}}),
+        PreprocessorFile(filename=Path("90")),
+        PreprocessorFile(filename=Path("10"), settings={"bias": {}}),
+        PreprocessorFile(filename=Path("20"), settings={"bias": {}}),
+        PreprocessorFile(filename=Path("30"), settings={"bias": {}}),
     }
     with pytest.raises(RecipeError) as rec_err:
         check.reference_for_bias_preproc(products)
@@ -376,15 +369,15 @@ def test_reference_for_bias_preproc_no_ref():
 def test_reference_for_bias_preproc_two_refs():
     """Test ``reference_for_bias_preproc`` with two references."""
     products = {
-        PreprocessorFile(filename=90),
-        PreprocessorFile(filename=10, settings={"bias": {}}),
+        PreprocessorFile(filename=Path("90")),
+        PreprocessorFile(filename=Path("10"), settings={"bias": {}}),
         PreprocessorFile(
-            filename=20,
+            filename=Path("20"),
             attributes={"reference_for_bias": True},
             settings={"bias": {}},
         ),
         PreprocessorFile(
-            filename=30,
+            filename=Path("30"),
             attributes={"reference_for_bias": True},
             settings={"bias": {}},
         ),
diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py
index adeb030ea3..6b8b788173 100644
--- a/tests/integration/recipe/test_recipe.py
+++ b/tests/integration/recipe/test_recipe.py
@@ -1,10 +1,13 @@
+import importlib.resources
 import inspect
 import os
 import re
 from collections import defaultdict
+from functools import lru_cache
 from pathlib import Path
 from pprint import pformat
 from textwrap import dedent
+from typing import TYPE_CHECKING
 from unittest.mock import create_autospec

 import iris
@@ -30,6 +33,49 @@
 from esmvalcore.preprocessor import DEFAULT_ORDER, PreprocessingTask
 from tests.integration.test_provenance import check_provenance

+if TYPE_CHECKING:
+    from esmvalcore.typing import Facets
+
+
+@lru_cache
+def _load_data_sources(
+    filename,
+) -> dict[
+    str,
+    dict[str, dict[str, dict[str, dict[str, str]]]],
+]:
+    """Load data source configurations."""
+    with importlib.resources.as_file(
+        importlib.resources.files(esmvalcore.config)
+        / "configurations"
+        / filename,
+    ) as config_file:
+        return yaml.safe_load(config_file.read_text(encoding="utf-8"))
+
+
+def update_data_sources(
+    session: Session,
+    filename: str,
+    rootpath: Path,
+) -> None:
+    """Update the data sources in `session` using config file `filename`."""
+    cfg = _load_data_sources(filename)
+    projects = cfg["projects"]
+    for project in projects:
+        data_sources = projects[project]["data"]
+        for data_source in data_sources.values():
+            data_source["rootpath"] = str(rootpath)
+        session["projects"][project]["data"] = data_sources
+
+
+@pytest.fixture
+def session(tmp_path: Path, session: Session) -> Session:
+    """Session fixture with default data sources."""
+    update_data_sources(session, "data-local.yml", tmp_path)
+    update_data_sources(session, "data-local-esmvaltool.yml", tmp_path)
+    return session
+
+
 TAGS_FOR_TESTING = {
     "authors": {
         "andela_bouwe": {
@@ -692,7 +738,7 @@ def test_default_fx_preprocessor(tmp_path, patched_datafinder, session):
         "remove_supplementary_variables": {},
         "save": {
             "compress": False,
-            "filename": product.filename,
+            "filename": Path(product.filename),
             "compute": False,
         },
     }
@@ -1539,7 +1585,7 @@ def test_diagnostic_task_provenance(
     # Test that provenance was saved to xml and info embedded in netcdf
     product = next(
         iter(
-            p for p in diagnostic_task.products if p.filename.endswith(".nc")
+            p for p in diagnostic_task.products if p.filename.suffix == ".nc"
         ),
     )
     cube = iris.load_cube(product.filename)
@@ -2460,14 +2506,17 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker):
               - {dataset: BNU-ESM}
             scripts: null
         """)
-    session["download_dir"] = tmp_path / "download_dir"
-    session["search_esgf"] = "when_missing"

     mocker.patch.object(
-        esmvalcore._recipe.recipe.esgf,
+        esmvalcore.esgf,
         "download",
         create_autospec=True,
     )
+    mocker.patch.object(
+        esmvalcore.local.LocalFile,
+        "prepare",
+        create_autospec=True,
+    )

     recipe = get_recipe(tmp_path, content, session)

@@ -2476,10 +2525,8 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker):
     recipe.write_html_summary = mocker.Mock()
     recipe.run()

-    esmvalcore._recipe.recipe.esgf.download.assert_called_once_with(
-        set(),
-        session["download_dir"],
-    )
+    esmvalcore.esgf.download.assert_called()
+    esmvalcore.local.LocalFile.prepare.assert_called()
     recipe.tasks.run.assert_called_once_with(
         max_parallel_tasks=session["max_parallel_tasks"],
    )
@@ -2487,8 +2534,14 @@ def test_recipe_run(tmp_path, patched_datafinder, session, mocker):
     recipe.write_html_summary.assert_called_once()


-def test_representative_dataset_regular_var(patched_datafinder, session):
+def test_representative_dataset_regular_var(
+    tmp_path: Path,
+    patched_datafinder: None,
+    session: Session,
+):
     """Test ``_representative_dataset`` with regular variable."""
+    update_data_sources(session, "data-native-icon.yml", tmp_path)
+
     variable = {
         "dataset": "ICON",
         "exp": "atm_amip-rad_R2B4_r1i1p1f1",
@@ -2505,18 +2558,20 @@
     datasets = _representative_datasets(dataset)
     assert len(datasets) == 1
     filename = datasets[0].files[0]
-    path = Path(filename)
-    assert path.name == "atm_amip-rad_R2B4_r1i1p1f1_atm_2d_ml_1990_1999.nc"
+    assert filename.name == "atm_amip-rad_R2B4_r1i1p1f1_atm_2d_ml_1990-1999.nc"


 @pytest.mark.parametrize("force_derivation", [True, False])
 def test_representative_dataset_derived_var(
-    patched_datafinder,
-    session,
-    force_derivation,
+    tmp_path: Path,
+    patched_datafinder: None,
+    session: Session,
+    force_derivation: bool,
 ):
     """Test ``_representative_dataset`` with derived variable."""
-    variable = {
+    update_data_sources(session, "data-native-icon.yml", tmp_path)
+
+    variable: Facets = {
         "dataset": "ICON",
         "derive": True,
         "exp": "atm_amip-rad_R2B4_r1i1p1f1",
@@ -2533,7 +2588,7 @@ def test_representative_dataset_derived_var(
     dataset.session = session
     representative_datasets = _representative_datasets(dataset)

-    expected_facets = {
+    expected_facets: Facets = {
         # Already present in variable
         "dataset": "ICON",
         "derive": True,
diff --git a/tests/integration/test_main.py b/tests/integration/test_main.py
index a9cfce6366..c701e35af9 100644
--- a/tests/integration/test_main.py
+++ b/tests/integration/test_main.py
@@ -13,8 +13,11 @@
 from unittest.mock import patch

 import pytest
+import yaml
 from fire.core import FireExit

+import esmvalcore._main
+import esmvalcore.config
 from esmvalcore._main import Config, ESMValTool, Recipes, run
 from esmvalcore.exceptions import RecipeError

@@ -102,7 +105,7 @@ def test_empty_run(tmp_path):

 def test_recipes_get(tmp_path, monkeypatch):
-    """Test version command."""
+    """Test esmvaltool recipes get command."""
     src_recipe = tmp_path / "recipe.yml"
     src_recipe.touch()
     tgt_dir = tmp_path / "test"
@@ -115,39 +118,139 @@ def test_recipes_get(tmp_path, monkeypatch):

 @patch("esmvalcore._main.Recipes.list", new=wrapper(Recipes.list))
 def test_recipes_list():
-    """Test version command."""
+    """Test esmvaltool recipes list command."""
     with arguments("esmvaltool", "recipes", "list"):
         run()


 @patch("esmvalcore._main.Recipes.list", new=wrapper(Recipes.list))
 def test_recipes_list_do_not_admit_parameters():
-    """Test version command."""
+    """Test esmvaltool recipes list command."""
     with arguments("esmvaltool", "recipes", "list", "parameter"):
         with pytest.raises(FireExit):
             run()


+def test_config_copy(tmp_path: Path) -> None:
+    """Test esmvaltool config copy command."""
+    tgt_file = tmp_path / "test.yml"
+    with arguments(
+        "esmvaltool",
+        "config",
+        "copy",
+        "defaults/config-user.yml",
+        f"--target-file={tgt_file}",
+    ):
+        run()
+    assert tgt_file.is_file()
+
+
+def test_config_copy_nonexistent_file(
+    capsys: pytest.CaptureFixture,
+) -> None:
+    """Test `esmvaltool config copy` fails if source file does not exist."""
+    with pytest.raises(SystemExit):
+        with arguments(
+            "esmvaltool",
+            "config",
+            "copy",
+            "test-file-that-does-not-exist.yml",
+        ):
+            run()
+    assert (
+        "Configuration file 'test-file-that-does-not-exist.yml' not found"
+        in capsys.readouterr().out
+    )
+
+
+def test_config_list(capsys: pytest.CaptureFixture) -> None:
+    """Test esmvaltool config list command."""
+    with arguments("esmvaltool", "config", "list"):
+        run()
+    stdout = capsys.readouterr().out
+    assert "Defaults" in stdout
+    assert "defaults/config-user.yml: " in stdout
+    assert "Data Sources" in stdout
+    assert (
+        "data-local.yml: Read CMIP, CORDEX, and obs4MIPs data from the filesystem"
+        in stdout
+    )
+    assert len(stdout.split("\n")) > 20
+
+
+def test_config_list_no_description(
+    capsys: pytest.CaptureFixture,
+    monkeypatch: pytest.MonkeyPatch,
+    tmp_path: Path,
+) -> None:
+    """Test `esmvaltool config list` works when a config file provides no description."""
+    config_dir = tmp_path / "configurations"
+    config_dir.mkdir()
+    monkeypatch.setattr(esmvalcore.config, "__file__", config_dir)
+    tmp_path.joinpath("configurations", "data-config.yml").touch()
+    with arguments("esmvaltool", "config", "list"):
+        run()
+    stdout = capsys.readouterr().out
+    assert "Data Sources" in stdout
+    assert "data-config.yml" in stdout
+
+
+def test_config_show(
+    capsys: pytest.CaptureFixture,
+    cfg_default: Config,
+) -> None:
+    """Test esmvaltool config show command."""
+    with arguments("esmvaltool", "config", "show", "--filter=None"):
+        run()
+    stdout = capsys.readouterr().out
+    expected_header = "# Current configuration:\n"
+    assert expected_header in stdout
+    cfg_txt = stdout.split(expected_header)[1]
+    cfg = yaml.safe_load(cfg_txt)
+    reference = yaml.safe_load(yaml.safe_dump(dict(cfg_default)))  # type: ignore[call-overload]
+    assert cfg == reference
+
+
+def test_config_show_brief_by_default(capsys: pytest.CaptureFixture) -> None:
+    """Test that the `esmvaltool config show` command produces readable results."""
+    with arguments("esmvaltool", "config", "show"):
+        run()
+    stdout = capsys.readouterr().out
+    expected_header = (
+        "# Current configuration, excluding the keys 'extra_facets':\n"
+    )
+    assert expected_header in stdout
+    # Check that the configuration that is listed by default is sufficiently
+    # brief for easy reading by a human.
+    assert len(stdout.split("\n")) < 200
+    cfg_txt = stdout.split(expected_header)[1]
+    cfg = yaml.safe_load(cfg_txt)
+    assert "projects" in cfg
+    for project in cfg["projects"]:
+        assert "extra_facets" not in cfg["projects"][project]
+
+
 @patch(
     "esmvalcore._main.Config.get_config_developer",
     new=wrapper(Config.get_config_developer),
 )
 def test_get_config_developer():
-    """Test version command."""
+    """Test esmvaltool config get_config_developer command."""
     with arguments("esmvaltool", "config", "get_config_developer"):
         run()


-def test_get_config_developer_no_path():
-    """Test version command."""
+def test_get_config_developer_no_path(mocker, tmp_path):
+    """Test esmvaltool config get_config_developer command."""
+    mocker.patch.object(esmvalcore._main.Path, "home", return_value=tmp_path)
     with arguments("esmvaltool", "config", "get_config_developer"):
         run()
-    config_file = Path.home() / ".esmvaltool" / "config-developer.yml"
+    config_file = tmp_path / ".esmvaltool" / "config-developer.yml"
     assert config_file.is_file()


 def test_get_config_developer_path(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_developer command."""
     new_path = tmp_path / "subdir"
     with arguments(
         "esmvaltool",
@@ -160,7 +263,7 @@ def test_get_config_developer_path(tmp_path):


 def test_get_config_developer_overwrite(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_developer command."""
     config_developer = tmp_path / "config-developer.yml"
     config_developer.write_text("old text")
     with arguments(
@@ -175,7 +278,7 @@ def test_get_config_developer_overwrite(tmp_path):


 def test_get_config_developer_no_overwrite(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_developer command."""
     config_developer = tmp_path / "configuration_file.yml"
     config_developer.write_text("old text")
     with arguments(
@@ -184,7 +287,8 @@ def test_get_config_developer_no_overwrite(tmp_path):
         "get_config_developer",
         f"--path={config_developer}",
     ):
-        run()
+        with pytest.raises(SystemExit):
+            run()
     assert config_developer.read_text() == "old text"


@@ -193,7 +297,7 @@
     new=wrapper(Config.get_config_developer),
 )
 def test_get_config_developer_bad_option_fails():
-    """Test version command."""
+    """Test esmvaltool config get_config_developer command."""
     with arguments(
         "esmvaltool",
         "config",
@@ -209,21 +313,22 @@ def test_get_config_developer_bad_option_fails():
     new=wrapper(Config.get_config_user),
 )
 def test_get_config_user():
-    """Test version command."""
+    """Test esmvaltool config get_config_user command."""
     with arguments("esmvaltool", "config", "get_config_user"):
         run()


-def test_get_config_user_no_path():
-    """Test version command."""
+def test_get_config_user_no_path(mocker, tmp_path):
+    """Test esmvaltool config get_config_user command."""
+    mocker.patch.object(esmvalcore._main.Path, "home", return_value=tmp_path)
     with arguments("esmvaltool", "config", "get_config_user"):
         run()
-    config_file = Path.home() / ".config" / "esmvaltool" / "config-user.yml"
+    config_file = tmp_path / ".config" / "esmvaltool" / "config-user.yml"
     assert config_file.is_file()


 def test_get_config_user_path(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_user command."""
     new_path = tmp_path / "subdir"
     with arguments(
         "esmvaltool",
@@ -236,7 +341,7 @@ def test_get_config_user_path(tmp_path):


 def test_get_config_user_overwrite(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_user command."""
     config_user = tmp_path / "config-user.yml"
     config_user.write_text("old text")
     with arguments(
@@ -251,7 +356,7 @@ def test_get_config_user_overwrite(tmp_path):


 def test_get_config_user_no_overwrite(tmp_path):
-    """Test version command."""
+    """Test esmvaltool config get_config_user command."""
     config_user = tmp_path / "configuration_file.yml"
     config_user.write_text("old text")
     with arguments(
@@ -260,7 +365,8 @@ def test_get_config_user_no_overwrite(tmp_path):
         "get_config_user",
         f"--path={config_user}",
     ):
-        run()
+        with pytest.raises(SystemExit):
+            run()
     assert config_user.read_text() == "old text"


@@ -269,7 +375,7 @@
     new=wrapper(Config.get_config_user),
 )
 def test_get_config_user_bad_option_fails():
-    """Test version command."""
+    """Test esmvaltool config get_config_user command."""
     with arguments(
         "esmvaltool",
         "config",
diff --git a/tests/sample_data/experimental/test_run_recipe.py b/tests/sample_data/experimental/test_run_recipe.py
index 286751122a..7259526815 100644
--- a/tests/sample_data/experimental/test_run_recipe.py
+++ b/tests/sample_data/experimental/test_run_recipe.py
@@ -83,13 +83,18 @@ def test_run_recipe(
     assert isinstance(recipe._repr_html_(), str)

     sample_data_config = esmvaltool_sample_data.get_rootpaths()
-    monkeypatch.setitem(CFG, "rootpath", sample_data_config["rootpath"])
-    monkeypatch.setitem(CFG, "drs", {"CMIP6": "SYNDA"})
     session = cfg_default.start_session(recipe.path.stem)
     session["output_dir"] = tmp_path / "esmvaltool_output"
     session["max_parallel_tasks"] = 1
     session["remove_preproc_dir"] = False
-
+    session["projects"]["CMIP6"]["data"] = {
+        "local": {
+            "type": "esmvalcore.local.LocalDataSource",
+            "rootpath": sample_data_config["rootpath"]["CMIP6"][0],
+            "dirname_template": "{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}",
+            "filename_template": "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc",
+        },
+    }
     output = recipe.run(task=task, session=session)

     assert len(output) > 0
diff --git a/tests/unit/config/test_config.py b/tests/unit/config/test_config.py
index 7c5ed0e5c3..266795c616 100644
--- a/tests/unit/config/test_config.py
+++ b/tests/unit/config/test_config.py
@@ -1,3 +1,4 @@
+import re
 from importlib.resources import files as importlib_files
 from pathlib import Path

@@ -15,6 +16,37 @@
 )
 from esmvalcore.exceptions import ESMValCoreDeprecationWarning, RecipeError

+BUILTIN_CONFIG_DIR = Path(esmvalcore.config.__file__).parent.joinpath(
+    "configurations",
+)
+
+
+@pytest.mark.parametrize(
+    "config_file",
+    [
+        pytest.param(f, id=f.relative_to(BUILTIN_CONFIG_DIR).as_posix())
+        for f in BUILTIN_CONFIG_DIR.rglob("*.yml")
+    ],
+)
+def test_builtin_config_files_have_description(config_file) -> None:
+    """Test that all built-in config files have a description."""
+    # Use the same code to find the description as in the
+    # `esmvaltool config list` command.
+    first_comment = re.search(
+        r"\A((?: *#.*\r?\n)+)",
+        config_file.read_text(encoding="utf-8"),
+        flags=re.MULTILINE,
+    )
+    assert first_comment
+    description = " ".join(
+        line.lstrip(" #").strip()
+        for line in first_comment.group(1).split("\n")
+    ).strip()
+    # Add a basic check that the description is meaningful
+    assert len(description) > 15
+    assert description.endswith(".")
+
+
 TEST_DEEP_UPDATE = [
     ([{}], {}),
     ([{"a": 1, "b": 2}, {"a": 3}], {"a": 3, "b": 2}),
@@ -138,6 +170,14 @@ def test_load_default_config(cfg_default, monkeypatch):
         paths=[str(p) for p in config_dir.glob("extra_facets_*.yml")],
         env={},
     )["projects"]
+    # Add in projects without extra facets from the config developer file
+    # until we have transitioned all of its content to the new configuration
+    # system.
+    for project in yaml.safe_load(
+        default_dev_file.read_text(encoding="utf-8"),
+    ):
+        if project not in default_project_settings:
+            default_project_settings[project] = {}

     session = cfg_default.start_session("recipe_example")

@@ -163,14 +203,6 @@
             "use": "local_threaded",
         },
         "diagnostics": None,
-        "download_dir": Path.home() / "climate_data",
-        "drs": {
-            "CMIP3": "ESGF",
-            "CMIP5": "ESGF",
-            "CMIP6": "ESGF",
-            "CORDEX": "ESGF",
-            "obs4MIPs": "ESGF",
-        },
         "exit_on_warning": False,
         "log_level": "info",
         "logging": {"log_progress_interval": 0.0},
@@ -183,9 +215,8 @@
         "projects": default_project_settings,
         "remove_preproc_dir": True,
         "resume_from": [],
-        "rootpath": {"default": [Path.home() / "climate_data"]},
         "run_diagnostic": True,
-        "search_esgf": "never",
+        "search_data": "quick",
         "skip_nonexistent": False,
         "save_intermediary_cubes": False,
     }
diff --git a/tests/unit/config/test_config_object.py b/tests/unit/config/test_config_object.py
index f974ea574d..df3a923069 100644
--- a/tests/unit/config/test_config_object.py
+++ b/tests/unit/config/test_config_object.py
@@ -225,13 +225,15 @@ def test_load_from_dirs(dirs, output_file_type, rootpath, tmp_path):
     cfg = Config()
     assert not cfg
     cfg["rootpath"] = {"X": "x"}
-    cfg["search_esgf"] = "when_missing"
+    cfg["search_data"] = "complete"

     cfg.load_from_dirs(config_dirs)

     assert cfg["output_file_type"] == output_file_type
-    assert cfg["rootpath"] == rootpath
-    assert cfg["search_esgf"] == "never"
+    if any(Path(d).exists() for d in config_dirs):
+        # Legacy setting "rootpath" is not available in default config.
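+        # It can therefore only be checked when one of the config dirs exists.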
+ assert cfg["rootpath"] == rootpath + assert cfg["search_data"] == "quick" @pytest.mark.parametrize( @@ -315,7 +317,7 @@ def test_update_from_dirs(dirs, output_file_type, rootpath, tmp_path): cfg = Config() assert not cfg cfg["rootpath"] = {"X": "x"} - cfg["search_esgf"] = "when_missing" + cfg["search_data"] = "quick" cfg.update_from_dirs(config_dirs) @@ -324,7 +326,7 @@ def test_update_from_dirs(dirs, output_file_type, rootpath, tmp_path): else: assert cfg["output_file_type"] == output_file_type assert cfg["rootpath"] == rootpath - assert cfg["search_esgf"] == "when_missing" + assert cfg["search_data"] == "quick" def test_nested_update(): @@ -333,13 +335,13 @@ def test_nested_update(): assert not cfg cfg["drs"] = {"X": "x", "Z": "z"} - cfg["search_esgf"] = "when_missing" + cfg["search_data"] = "quick" cfg.nested_update({"drs": {"Y": "y", "X": "xx"}, "max_years": 1}) assert len(cfg) == 3 assert cfg["drs"] == {"Y": "y", "X": "xx", "Z": "z"} - assert cfg["search_esgf"] == "when_missing" + assert cfg["search_data"] == "quick" assert cfg["max_years"] == 1 diff --git a/tests/unit/config/test_config_validator.py b/tests/unit/config/test_config_validator.py index 7b4ddde924..eed9b19bd5 100644 --- a/tests/unit/config/test_config_validator.py +++ b/tests/unit/config/test_config_validator.py @@ -26,6 +26,7 @@ validate_positive, validate_projects, validate_rootpath, + validate_search_data, validate_search_esgf, validate_string, validate_string_or_none, @@ -202,6 +203,16 @@ def generate_validator_testcases(valid): "success": ((None, None),), "fail": (), }, + { + "validator": validate_search_data, + "success": ( + ("quick", "quick"), + ("QUICK", "quick"), + ("complete", "complete"), + ("Complete", "complete"), + ), + "fail": (0, 3.14, True, "fail"), + }, { "validator": validate_search_esgf, "success": ( diff --git a/tests/unit/config/test_data_sources.py b/tests/unit/config/test_data_sources.py new file mode 100644 index 0000000000..67b364018c --- /dev/null +++ b/tests/unit/config/test_data_sources.py @@ -0,0 +1,43 @@ +import pytest + +import esmvalcore.config._data_sources +import esmvalcore.local +from esmvalcore.config import Session +from esmvalcore.exceptions import InvalidConfigParameter + + +def test_load_data_sources_no_project_data_sources_configured( + session: Session, +) -> None: + """Test that loading data sources when no data sources are configured raises.""" + with pytest.raises( + InvalidConfigParameter, + match=r"No data sources found for project 'test'.*", + ): + esmvalcore.config._data_sources._get_data_sources( + session, + project="test", + ) + + +@pytest.mark.parametrize("search_esgf", ["never", "when_missing", "always"]) +def test_load_legacy_data_sources( + monkeypatch: pytest.MonkeyPatch, + session: Session, + search_esgf: str, +) -> None: + """Test that loading legacy data sources works.""" + for project in session["projects"]: + session["projects"][project].pop("data", None) + session["search_esgf"] = search_esgf + session["download_dir"] = "~/climate_data" + monkeypatch.setitem( + esmvalcore.local.CFG, + "rootpath", + {"default": "~/climate_data"}, + ) + data_sources = esmvalcore.config._data_sources._get_data_sources( + session, + project="CMIP6", + ) + assert len(data_sources) == 1 if search_esgf == "never" else 2 diff --git a/tests/unit/esgf/test_download.py b/tests/unit/esgf/test_download.py index 85b5cbae3e..9c36477b76 100644 --- a/tests/unit/esgf/test_download.py +++ b/tests/unit/esgf/test_download.py @@ -11,6 +11,7 @@ import requests import yaml from 
pyesgf.search.results import FileResult +from pytest_mock import MockerFixture import esmvalcore.esgf from esmvalcore.esgf import _download @@ -241,6 +242,7 @@ def test_init(): "dataset": "ABC", "project": "CMIP6", "short_name": "tas", + "timerange": "2000/2001", "version": "v1", } txt = f"ESGFFile:CMIP6/ABC/v1/{filename} on hosts ['something.org']" @@ -248,6 +250,60 @@ def test_init(): assert hash(file) == hash(("CMIP6.ABC.v1", filename)) +@pytest.fixture +def esgf_file() -> _download.ESGFFile: + """ESGFFile fixture.""" + json = { + "dataset_id": "CMIP6.dataset.v1|something.org", + "dataset_id_template_": ["%(mip_era)s.%(source_id)s"], + "project": ["CMIP6"], + "size": 12, + "title": "test.nc", + } + return _download.ESGFFile( + [FileResult(json=json, context=None)], + dest_folder=Path("/path/to/climate_data"), + ) + + +def test_prepare(mocker: MockerFixture, esgf_file: _download.ESGFFile) -> None: + """Test `ESGFFile.prepare`.""" + download = mocker.patch.object(_download.ESGFFile, "download") + esgf_file.prepare() + download.assert_called_once_with(esgf_file.dest_folder) + + +def test_attribute_not_set(esgf_file: _download.ESGFFile) -> None: + """Test accessing `ESGFFile.attributes` before calling to_iris.""" + with pytest.raises( + ValueError, + match=r"Attributes have not been read yet. Call the `to_iris` method .*", + ): + _ = esgf_file.attributes + + +def test_to_iris(mocker: MockerFixture, esgf_file: _download.ESGFFile) -> None: + """Test `ESGFFile.prepare`.""" + prepare = mocker.patch.object(_download.ESGFFile, "prepare") + local_file_to_iris = mocker.patch.object( + esmvalcore.esgf._download.LocalFile, + "to_iris", + return_value=mocker.sentinel.iris_cubes, + ) + mocker.patch.object( + esmvalcore.esgf._download.LocalFile, + "attributes", + new_callable=mocker.PropertyMock, + return_value={"attribute": "value"}, + ) + cubes = esgf_file.to_iris() + + assert cubes == mocker.sentinel.iris_cubes + assert esgf_file.attributes == {"attribute": "value"} + prepare.assert_called_once() + local_file_to_iris.assert_called_once() + + def test_from_results(): """Test ESGFFile._from_results().""" facets = { @@ -478,7 +534,7 @@ def test_single_download(mocker, tmp_path, checksum): response.iter_content.assert_called_with(chunk_size=2**20) -def test_download_skip_existing(tmp_path, caplog): +def test_download_skip_existing(tmp_path: Path, mocker: MockerFixture) -> None: filename = "test.nc" dataset = "dataset" dest_folder = tmp_path @@ -496,12 +552,9 @@ def test_download_skip_existing(tmp_path, caplog): local_file = file.local_file(dest_folder) local_file.parent.mkdir(parents=True) local_file.touch() - - caplog.set_level(logging.DEBUG) - + mock_download = mocker.patch.object(_download.ESGFFile, "_download") local_file = file.download(dest_folder) - - assert f"Skipping download of existing file {local_file}" in caplog.text + mock_download.assert_not_called() def test_single_download_fail(mocker, tmp_path): @@ -632,10 +685,8 @@ def test_download_fail(mocker, tmp_path, caplog): file.download.assert_called_with(dest_folder) -def test_download_noop(caplog): +def test_download_noop(mocker: MockerFixture) -> None: """Test downloading no files.""" - caplog.set_level("DEBUG") + mock_download = mocker.patch.object(_download.ESGFFile, "_download") esmvalcore.esgf.download([], dest_folder="/does/not/exist") - - msg = "All required data is available locally, not downloading anything." 
- assert msg in caplog.text + mock_download.assert_not_called() diff --git a/tests/unit/esgf/test_search.py b/tests/unit/esgf/test_search.py index 11b582fffb..5949cc5792 100644 --- a/tests/unit/esgf/test_search.py +++ b/tests/unit/esgf/test_search.py @@ -2,13 +2,16 @@ import copy import textwrap +from pathlib import Path import pyesgf.search import pytest import requests.exceptions from pyesgf.search.results import FileResult +from pytest_mock import MockerFixture -from esmvalcore.esgf import ESGFFile, _search, find_files +import esmvalcore.io.protocol +from esmvalcore.esgf import ESGFDataSource, ESGFFile, _search, find_files OUR_FACETS = ( { @@ -433,3 +436,39 @@ def test_search_unknown_project(): ) with pytest.raises(ValueError, match=msg): find_files(project=project, dataset="", short_name="") + + +class TestESGFDataSource: + """Test `esmvalcore.esgf.ESGFDataSource`.""" + + def test_init(self) -> None: + """Test initialization.""" + data_source = ESGFDataSource( + name="esgf-cmip6", + project="CMIP6", + priority=1, + download_dir=Path("/path/to/climate_data"), + ) + assert isinstance(data_source, esmvalcore.io.protocol.DataSource) + + def test_find_data(self, mocker: MockerFixture) -> None: + """Test find_data method.""" + data_source = ESGFDataSource( + name="esgf-cmip6", + project="CMIP6", + priority=1, + download_dir=Path("/path/to/climate_data"), + ) + + mock_result = [mocker.create_autospec(ESGFFile, instance=True)] + mock_find_files = mocker.patch( + "esmvalcore.esgf._search.find_files", + return_value=mock_result, + ) + + facets = {"short_name": "tas", "dataset": "A", "project": "CMIP6"} + result = data_source.find_data(**facets) + + mock_find_files.assert_called_once_with(**facets) + assert result is mock_result + assert result[0].dest_folder == Path("/path/to/climate_data") diff --git a/tests/unit/io/__init__.py b/tests/unit/io/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/unit/io/test_intake_esgf.py b/tests/unit/io/test_intake_esgf.py new file mode 100644 index 0000000000..a72e145eae --- /dev/null +++ b/tests/unit/io/test_intake_esgf.py @@ -0,0 +1,347 @@ +"""Unit tests for esmvalcore.io.intake_esgf.""" + +import importlib.resources + +import intake_esgf +import iris.cube +import pandas as pd +import pytest +import xarray as xr +import yaml +from pytest import MonkeyPatch +from pytest_mock import MockerFixture + +import esmvalcore.io.intake_esgf +from esmvalcore.config import Session +from esmvalcore.io.intake_esgf import IntakeESGFDataset, IntakeESGFDataSource + + +def test_intakeesgfdataset_repr() -> None: + cat = intake_esgf.ESGFCatalog() + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + assert repr(dataset) == "IntakeESGFDataset(name='id')" + + +def test_prepare(mocker: MockerFixture) -> None: + """IntakeESGFDataset.prepare should call the catalog.to_path_dict method.""" + cat = intake_esgf.ESGFCatalog() + to_path_mock = mocker.patch.object(cat, "to_path_dict", autospec=True) + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + + dataset.prepare() + to_path_mock.assert_called_once_with(minimal_keys=False) + + +def test_attributes_raises_before_to_iris() -> None: + """Accessing attributes before to_iris should raise ValueError.""" + cat = intake_esgf.ESGFCatalog() + dataset = IntakeESGFDataset(name="id", facets={}, catalog=cat) + with pytest.raises(ValueError, match="Attributes have not been read yet"): + _ = dataset.attributes + + +def test_to_iris(mocker: MockerFixture) -> None: + """`to_iris` should load the 
data and cache attributes.""" + cat = intake_esgf.ESGFCatalog() + key = "my.dataset.1" + mocker.patch.object( + cat, + "to_path_dict", + return_value={key: ["/path/to/file.nc"]}, + ) + ds = xr.Dataset(attrs={"attr": "value"}) + mocker.patch.object(cat, "to_dataset_dict", return_value={key: ds}) + + cubes = mocker.sentinel.cubes + mocker.patch.object( + esmvalcore.io.intake_esgf, + "dataset_to_iris", + return_value=cubes, + ) + + dataset = IntakeESGFDataset(name=key, facets={}, catalog=cat) + result = dataset.to_iris() + assert result is cubes + + assert dataset.attributes == { + "attr": "value", + "source_file": "/path/to/file.nc", + } + + +@pytest.mark.online +def test_to_iris_online(): + """`to_iris` should load data from a real ESGF catalog.""" + data_source = IntakeESGFDataSource( + name="src", + project="CMIP6", + priority=1, + facets={ + "activity": "activity_drs", + "dataset": "source_id", + "ensemble": "member_id", + "exp": "experiment_id", + "grid": "grid_label", + "institute": "institution_id", + "mip": "table_id", + "project": "project", + "short_name": "variable_id", + }, + values={}, + ) + results = data_source.find_data( + dataset="CanESM5", + ensemble="r1i1p1f1", + exp="historical", + grid="gn", + mip="fx", + project="CMIP6", + short_name="areacella", + ) + assert len(results) == 1 + dataset = results[0] + assert isinstance(dataset, IntakeESGFDataset) + cubes = dataset.to_iris() + assert len(cubes) == 1 + assert isinstance(cubes[0], iris.cube.Cube) + # Check that the "source_file" attributes is present for debugging. + assert "source_file" in dataset.attributes + assert dataset.attributes["source_file"].endswith(".nc") + + +def test_find_data_no_results_sets_debug_info(mocker: MockerFixture) -> None: + """When catalog.search raises NoSearchResults, find_data should return empty list and set debug_info.""" + data_source = IntakeESGFDataSource( + name="src", + project="CMIP6", + priority=1, + facets={"short_name": "variable_id"}, + ) + + cat = intake_esgf.ESGFCatalog() + # Ensure last_search is present so debug_info can be constructed + cat.last_search = {"variable_id": "tas"} + mocker.patch.object( + cat, + "search", + side_effect=intake_esgf.exceptions.NoSearchResults("no results"), + ) + data_source.catalog = cat + + result = data_source.find_data(short_name="tas") + assert result == [] + expected_debug_info = "`intake_esgf.ESGFCatalog().search(variable_id=['tas'])` did not return any results." 
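+    # The debug info is included in the "No files were found" message
+    # that `check.data_availability` logs (see test_check.py above).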
+ assert data_source.debug_info == expected_debug_info + + +def test_find_data(mocker: MockerFixture, monkeypatch: MonkeyPatch): + """find_data should convert catalog.df rows into IntakeESGFDataset instances.""" + cat = intake_esgf.ESGFCatalog() + cat.project = intake_esgf.projects.projects["cmip6"] + cat.df = pd.DataFrame.from_dict( + { + "project": ["CMIP6", "CMIP6"], + "mip_era": ["CMIP6", "CMIP6"], + "activity_drs": ["CMIP", "ScenarioMIP"], + "institution_id": ["CCCma", "CCCma"], + "source_id": ["CanESM5", "CanESM5"], + "experiment_id": ["historical", "ssp585"], + "member_id": ["r1i1p1f1", "r1i1p1f1"], + "table_id": ["Amon", "Amon"], + "variable_id": ["tas", "tas"], + "grid_label": ["gn", "gn"], + "version": ["20190429", "20190429"], + "id": [ + [ + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|crd-esgf-drc.ec.gc.ca", + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|eagle.alcf.anl.gov", + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|esgf-data04.diasjp.net", + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn.v20190429|esgf-node.ornl.gov", + ], + [ + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|crd-esgf-drc.ec.gc.ca", + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|eagle.alcf.anl.gov", + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|esgf-data04.diasjp.net", + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn.v20190429|esgf-node.ornl.gov", + ], + ], + }, + ) + + # Patch search to just record last_search + def fake_search(**kwargs): + cat.last_search = kwargs + + mocker.patch.object(cat, "search", side_effect=fake_search) + + data_source = IntakeESGFDataSource( + name="src", + project="CMIP6", + priority=1, + facets={ + "activity": "activity_drs", + "dataset": "source_id", + "ensemble": "member_id", + "exp": "experiment_id", + "institute": "institution_id", + "grid": "grid_label", + "mip": "table_id", + "project": "project", + "short_name": "variable_id", + }, + values={}, + ) + data_source.catalog = cat + + # Call find_data - it should use the df we set and return one dataset + results = data_source.find_data(short_name="tas") + assert isinstance(results, list) + assert len(results) == 2 + + dataset = results[0] + assert isinstance(dataset, IntakeESGFDataset) + assert ( + dataset.name + == "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn" + ) + assert hash(dataset) == hash((dataset.name, "v20190429")) + + assert dataset.facets == { + "activity": "CMIP", + "dataset": "CanESM5", + "ensemble": "r1i1p1f1", + "exp": "historical", + "grid": "gn", + "institute": "CCCma", + "mip": "Amon", + "project": "CMIP6", + "short_name": "tas", + "version": "v20190429", + } + dataset = results[1] + assert ( + dataset.name + == "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn" + ) + assert dataset.facets == { + "activity": "ScenarioMIP", + "dataset": "CanESM5", + "ensemble": "r1i1p1f1", + "exp": "ssp585", + "grid": "gn", + "institute": "CCCma", + "mip": "Amon", + "project": "CMIP6", + "short_name": "tas", + "version": "v20190429", + } + + +@pytest.fixture +def data_sources(session: Session) -> list[esmvalcore.io.protocol.DataSource]: + """Fixture providing the default list of IntakeESGFDataSource data sources.""" + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / "data-intake-esgf.yml", + ) as config_file: + cfg = yaml.safe_load(config_file.read_text(encoding="utf-8")) + 
session["projects"] = cfg["projects"] + return esmvalcore.io.load_data_sources(session) + + +@pytest.mark.online +@pytest.mark.parametrize( + ("facets", "expected_names"), + [ + pytest.param( + { + "dataset": "CanESM5", + "ensemble": "r1i1p1f1", + "exp": ["historical", "ssp585"], + "grid": "gn", + "mip": "Amon", + "project": "CMIP6", + "short_name": "tas", + "timerange": "1850/2100", + }, + { + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn", + "CMIP6.ScenarioMIP.CCCma.CanESM5.ssp585.r1i1p1f1.Amon.tas.gn", + }, + id="CMIP6", + ), + pytest.param( + { + "dataset": "CanESM5", + "ensemble": "r[1-3]i1p1f1", + "exp": "historical", + "grid": "gn", + "mip": "Amon", + "project": "CMIP6", + "short_name": "tas", + "timerange": "1850/2100", + }, + { + "CMIP6.CMIP.CCCma.CanESM5.historical.r1i1p1f1.Amon.tas.gn", + "CMIP6.CMIP.CCCma.CanESM5.historical.r2i1p1f1.Amon.tas.gn", + "CMIP6.CMIP.CCCma.CanESM5.historical.r3i1p1f1.Amon.tas.gn", + }, + id="CMIP6-with-glob-pattern", + ), + pytest.param( + { + "dataset": "ACCESS1-0", + "ensemble": "r1i1p1", + "exp": ["historical", "rcp85"], + "mip": "Amon", + "project": "CMIP5", + "short_name": "tas", + }, + { + "CSIRO-BOM.ACCESS1.0.historical.mon.atmos.Amon.r1i1p1.tas", + "CSIRO-BOM.ACCESS1.0.rcp85.mon.atmos.Amon.r1i1p1.tas", + }, + id="CMIP5", + ), + pytest.param( + { + "dataset": "cccma_cgcm3_1", + "ensemble": "run1", + "exp": "historical", + "mip": "A1", + "project": "CMIP3", + "short_name": "tas", + }, + { + "CMIP3.CCCMA.cccma_cgcm3_1.historical.day.atmos.run1.tas", + "CMIP3.CCCMA.cccma_cgcm3_1.historical.mon.atmos.run1.tas", + }, + id="CMIP3", + ), + pytest.param( + { + "dataset": "ERA-5", + "project": "obs4MIPs", + "short_name": "tas", + }, + { + "obs4MIPs.ECMWF.ERA-5.mon.tas.gn", + }, + id="obs4MIPs", + ), + ], +) +def test_find_data_online( + data_sources: list[IntakeESGFDataSource], + facets: dict[str, str | list[str]], + expected_names: list[str], +) -> None: + """Test finding data from a real ESGF catalog.""" + data_source = next( + ds for ds in data_sources if ds.project == facets["project"] + ) + result = data_source.find_data(**facets) + assert len(result) > 0 + result_names = {ds.name for ds in result} + assert result_names == expected_names diff --git a/tests/unit/io/test_load_data_sources.py b/tests/unit/io/test_load_data_sources.py new file mode 100644 index 0000000000..de1f7bad23 --- /dev/null +++ b/tests/unit/io/test_load_data_sources.py @@ -0,0 +1,82 @@ +"""Tests for :func:`esmvalcore.io.load_data_sources`.""" + +import importlib.resources +from dataclasses import dataclass + +import pytest + +import esmvalcore.config +import esmvalcore.io + + +def test_configurations_valid(cfg_default: esmvalcore.config.Config) -> None: + """Test that the data sources configuration in esmvalcore/config/configurations are valid.""" + configurations = ( + importlib.resources.files(esmvalcore.config) / "configurations" + ) + with importlib.resources.as_file(configurations) as config_dir: + cfg_default.load_from_dirs([config_dir]) + session = cfg_default.start_session("test") + data_sources = esmvalcore.io.load_data_sources(session) + for data_source in data_sources: + assert isinstance(data_source, esmvalcore.io.DataSource) + + +def test_load_data_sources_unknown_project( + session: esmvalcore.config.Session, +) -> None: + """Test that loading data sources for an unknown project raises.""" + with pytest.raises(ValueError, match=r"Unknown project 'unknown'.*"): + esmvalcore.io.load_data_sources(session, project="unknown") + + +def 
+def test_load_data_sources_no_data_sources_configured(
+    session: esmvalcore.config.Session,
+) -> None:
+    """Test that loading data sources when no data sources are configured raises."""
+    session["projects"].clear()
+    with pytest.raises(
+        ValueError,
+        match=r"No data sources found. Check your configuration under 'projects'",
+    ):
+        esmvalcore.io.load_data_sources(session)
+
+
+def test_load_data_sources_no_project_data_sources_configured(
+    session: esmvalcore.config.Session,
+) -> None:
+    """Test that loading data sources raises when no data sources are configured for a project."""
+    session["projects"]["test"] = {}
+    with pytest.raises(
+        ValueError,
+        match=r"No data sources found for project 'test'.*",
+    ):
+        esmvalcore.io.load_data_sources(session, project="test")
+
+
+@dataclass
+class IncompleteDataSource:
+    """An incomplete data source class for testing."""
+
+    name: str
+    project: str
+    priority: int
+    # Note the missing implementation of DataSource methods.
+
+
+def test_load_data_sources_invalid_data_source_type(
+    session: esmvalcore.config.Session,
+) -> None:
+    """Test that loading data sources with an invalid data source type raises."""
+    session["projects"]["test"] = {
+        "data": {
+            "invalid_source": {
+                "type": "tests.unit.io.test_load_data_sources.IncompleteDataSource",
+            },
+        },
+    }
+    with pytest.raises(
+        TypeError,
+        match=r"Expected a data source of type `esmvalcore.io.protocol.DataSource`.*",
+    ):
+        esmvalcore.io.load_data_sources(session, project="test")
diff --git a/tests/unit/local/test_facets.py b/tests/unit/local/test_facets.py
index 1373b961c6..0332f27693 100644
--- a/tests/unit/local/test_facets.py
+++ b/tests/unit/local/test_facets.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from esmvalcore.local import DataSource, LocalFile
+from esmvalcore.local import LocalDataSource, LocalFile
 
 
@@ -25,6 +25,28 @@
             "facet2": "filename",
         },
     ),
+        (
+            "/climate_data/value1/filename_2000-2001.nc",
+            "/climate_data",
+            "{facet1}",
+            "{facet2}[_.]*nc",
+            {
+                "facet1": "value1",
+                "facet2": "filename",
+                "timerange": "2000/2001",
+            },
+        ),
+        (
+            "/climate_data/value1/filename_20001201-20011231.nc",
+            "/climate_data",
+            "{facet1}",
+            "{facet2}[_.]*nc",
+            {
+                "facet1": "value1",
+                "facet2": "filename",
+                "timerange": "20001201/20011231",
+            },
+        ),
         (
             "/climate_data/value1/xyz/filename.nc",
             "/climate_data",
@@ -125,6 +147,7 @@
            {
                "tier": "3",
                "dataset": "ds",
+                "timerange": "1993/1993",
            },
        ),
        (
@@ -136,6 +159,7 @@
                "tier": "3",
                "dataset": "ds",
                "short_name": "tas",
+                "timerange": "1993/1993",
            },
        ),
        (
@@ -145,6 +169,7 @@
            "{short_name}_*",
            {
                "short_name": "tas",
+                "timerange": "1993/1993",
            },
        ),
        (
@@ -165,6 +190,7 @@
            {
                "short_name": "tas",
                "dataset": "ds",
+                "timerange": "1993/1993",
            },
        ),
        (
@@ -258,14 +284,44 @@ def test_path2facets(
     filename_template,
     facets,
 ):
-    """Test `DataSource.path2facets."""
+    """Test `LocalDataSource._path2facets`."""
     path = Path(path)
     rootpath = Path(rootpath)
-    data_source = DataSource(rootpath, dirname_template, filename_template)
-    result = data_source.path2facets(path)
+    data_source = LocalDataSource(
+        name="test-source",
+        project="test-project",
+        priority=1,
+        rootpath=rootpath,
+        dirname_template=dirname_template,
+        filename_template=filename_template,
+    )
+    add_timerange = "timerange" in facets
+    result = data_source._path2facets(path, add_timerange=add_timerange)
     assert result == facets
 
 
+def test_path2facets_no_timerange():
+    # Test that `LocalDataSource._path2facets` does not add "timerange"
+    # if it cannot determine the timerange.
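+    # The filename below contains no date pattern, so there is no
+    # timerange to deduce.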
+ path = Path("/climate_data/value1/filename.nc") + rootpath = Path("/climate_data") + data_source = LocalDataSource( + name="test-source", + project="test-project", + priority=1, + rootpath=rootpath, + dirname_template="{facet1}", + filename_template="{facet2}[_.]*nc", + ) + result = data_source._path2facets(path, add_timerange=True) + assert result == { + "facet1": "value1", + "facet2": "filename", + } + + def test_localfile(): file = LocalFile("/a/b.nc") file.facets = {"a": "A"} diff --git a/tests/unit/local/test_get_data_sources.py b/tests/unit/local/test_get_data_sources.py index cef6d49891..4c0e7be5d5 100644 --- a/tests/unit/local/test_get_data_sources.py +++ b/tests/unit/local/test_get_data_sources.py @@ -1,10 +1,11 @@ from pathlib import Path import pytest +import pytest_mock from esmvalcore.config import CFG from esmvalcore.config._config_validators import validate_config_developer -from esmvalcore.local import DataSource, _get_data_sources +from esmvalcore.local import DataSource, LocalDataSource, _get_data_sources @pytest.mark.parametrize( @@ -33,7 +34,7 @@ def test_get_data_sources(monkeypatch, rootpath_drs): monkeypatch.setitem(CFG, "drs", drs) sources = _get_data_sources("CMIP6") source = sources[0] - assert isinstance(source, DataSource) + assert isinstance(source, LocalDataSource) assert source.rootpath == Path("/climate_data") assert "{project}" in source.dirname_template assert "{short_name}" in source.filename_template @@ -52,3 +53,25 @@ def test_get_data_sources_nodefault(monkeypatch): ) with pytest.raises(KeyError): _get_data_sources("CMIP6") + + +def test_data_source_deprecated(mocker: pytest_mock.MockerFixture) -> None: + """Test that DataSource is deprecated.""" + mocker.patch.object(DataSource, "_path2facets") + mocker.patch.object(DataSource, "find_data") + with pytest.deprecated_call(): + data_source = DataSource( + name="test", + project="CMIP6", + priority=1, + rootpath=Path("/climate_data"), + dirname_template="/", + filename_template="*.nc", + ) + + assert data_source.regex_pattern + assert data_source.get_glob_patterns() == [Path("/climate_data/*.nc")] + data_source.path2facets(Path("/climate_data/some_file.nc"), False) + data_source._path2facets.assert_called() # type: ignore[attr-defined] + data_source.find_files(dataset="a") + data_source.find_data.assert_called() # type: ignore[attr-defined] diff --git a/tests/unit/local/test_time.py b/tests/unit/local/test_time.py index 30d5d1ea97..5548dfe254 100644 --- a/tests/unit/local/test_time.py +++ b/tests/unit/local/test_time.py @@ -12,7 +12,6 @@ LocalFile, _dates_to_timerange, _get_start_end_date, - _get_start_end_year, _replace_years_with_timerange, _truncate_dates, ) @@ -33,104 +32,46 @@ def _get_esgf_file(path): return ESGFFile([result]) -FILENAME_CASES = [ - ["var_whatever_1980-1981", 1980, 1981], - ["var_whatever_1980.nc", 1980, 1980], - ["a.b.x_yz_185001-200512.nc", 1850, 2005], - ["var_whatever_19800101-19811231.nc1", 1980, 1981], - ["var_whatever_19800101.nc", 1980, 1980], - ["1980-1981_var_whatever.nc", 1980, 1981], - ["1980_var_whatever.nc", 1980, 1980], - ["var_control-1980_whatever.nc", 1980, 1980], - ["19800101-19811231_var_whatever.nc", 1980, 1981], - ["19800101_var_whatever.nc", 1980, 1980], - ["var_control-19800101_whatever.nc", 1980, 1980], - ["19800101_var_control-1950_whatever.nc", 1980, 1980], - ["var_control-1950_whatever_19800101.nc", 1980, 1980], - ["CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", 1850, 1949], - [ - 
"icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", - 2015, - 2015, - ], - ["pr_A1.186101-200012.nc", 1861, 2000], - ["tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", 1990, 1999], - ["E5sf00_1M_1940_032.grb", 1940, 1940], - ["E5sf00_1D_1998-04_167.grb", 1998, 1998], - ["E5sf00_1H_1986-04-11_167.grb", 1986, 1986], - ["E5sf00_1M_1940-1941_032.grb", 1940, 1941], - ["E5sf00_1D_1998-01_1999-12_167.grb", 1998, 1999], - ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", 2000, 2001], -] - -FILENAME_DATE_CASES = [ - ["var_whatever_1980-1981", "1980", "1981"], - ["var_whatever_1980.nc", "1980", "1980"], - ["a.b.x_yz_185001-200512.nc", "185001", "200512"], - ["var_whatever_19800101-19811231.nc1", "19800101", "19811231"], - ["var_whatever_19800101.nc", "19800101", "19800101"], - ["1980-1981_var_whatever.nc", "1980", "1981"], - ["1980_var_whatever.nc", "1980", "1980"], - ["var_control-1980_whatever.nc", "1980", "1980"], - ["19800101-19811231_var_whatever.nc", "19800101", "19811231"], - ["19800101_var_whatever.nc", "19800101", "19800101"], - ["var_control-19800101_whatever.nc", "19800101", "19800101"], - ["19800101_var_control-1950_whatever.nc", "19800101", "19800101"], - ["var_control-1950_whatever_19800101.nc", "19800101", "19800101"], - [ - "CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", - "18500101", - "19491231", - ], +@pytest.mark.parametrize( + "case", [ - "icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", - "20150101T000000Z", - "20150101T000000Z", + ["var_whatever_1980-1981", "1980", "1981"], + ["var_whatever_1980.nc", "1980", "1980"], + ["a.b.x_yz_185001-200512.nc", "185001", "200512"], + ["var_whatever_19800101-19811231.nc1", "19800101", "19811231"], + ["var_whatever_19800101.nc", "19800101", "19800101"], + ["1980-1981_var_whatever.nc", "1980", "1981"], + ["1980_var_whatever.nc", "1980", "1980"], + ["var_control-1980_whatever.nc", "1980", "1980"], + ["19800101-19811231_var_whatever.nc", "19800101", "19811231"], + ["19800101_var_whatever.nc", "19800101", "19800101"], + ["var_control-19800101_whatever.nc", "19800101", "19800101"], + ["19800101_var_control-1950_whatever.nc", "19800101", "19800101"], + ["var_control-1950_whatever_19800101.nc", "19800101", "19800101"], + [ + "CM61-LR-hist-03.1950_18500101_19491231_1M_concbc.nc", + "18500101", + "19491231", + ], + [ + "icon-2.6.1_atm_amip_R2B5_r1v1i1p1l1f1_phy_3d_ml_20150101T000000Z.nc", + "20150101T000000Z", + "20150101T000000Z", + ], + ["pr_A1.186101-200012.nc", "186101", "200012"], + [ + "tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", + "199001", + "199912", + ], + ["E5sf00_1M_1940_032.grb", "1940", "1940"], + ["E5sf00_1D_1998-04_167.grb", "199804", "199804"], + ["E5sf00_1H_1986-04-11_167.grb", "19860411", "19860411"], + ["E5sf00_1M_1940-1941_032.grb", "1940", "1941"], + ["E5sf00_1D_1998-01_1999-12_167.grb", "199801", "199912"], + ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", "20000101", "20011231"], ], - ["pr_A1.186101-200012.nc", "186101", "200012"], - ["tas_A1.20C3M_1.CCSM.atmm.1990-01_cat_1999-12.nc", "199001", "199912"], - ["E5sf00_1M_1940_032.grb", "1940", "1940"], - ["E5sf00_1D_1998-04_167.grb", "199804", "199804"], - ["E5sf00_1H_1986-04-11_167.grb", "19860411", "19860411"], - ["E5sf00_1M_1940-1941_032.grb", "1940", "1941"], - ["E5sf00_1D_1998-01_1999-12_167.grb", "199801", "199912"], - ["E5sf00_1H_2000-01-01_2001-12-31_167.grb", "20000101", "20011231"], -] - - -@pytest.mark.parametrize("case", FILENAME_CASES) -def test_get_start_end_year(case): - """Tests for _get_start_end_year function.""" - 
filename, case_start, case_end = case - - # If the filename is inconclusive or too difficult we resort to reading the - # file, which fails here because the file is not there. - if case_start is None and case_end is None: - with pytest.raises(ValueError): - _get_start_end_year(filename) - with pytest.raises(ValueError): - _get_start_end_year(Path(filename)) - with pytest.raises(ValueError): - _get_start_end_year(LocalFile(filename)) - with pytest.raises(ValueError): - _get_start_end_year(_get_esgf_file(filename)) - - else: - start, end = _get_start_end_year(filename) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(Path(filename)) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(LocalFile(filename)) - assert case_start == start - assert case_end == end - start, end = _get_start_end_year(_get_esgf_file(filename)) - assert case_start == start - assert case_end == end - - -@pytest.mark.parametrize("case", FILENAME_DATE_CASES) +) def test_get_start_end_date(case): """Tests for _get_start_end_date function.""" filename, case_start, case_end = case @@ -145,7 +86,7 @@ def test_get_start_end_date(case): with pytest.raises(ValueError): _get_start_end_date(LocalFile(filename)) with pytest.raises(ValueError): - _get_start_end_date(_get_esgf_file(filename)) + _get_start_end_date(_get_esgf_file(filename).name) else: start, end = _get_start_end_date(filename) @@ -157,7 +98,7 @@ def test_get_start_end_date(case): start, end = _get_start_end_date(LocalFile(filename)) assert case_start == start assert case_end == end - start, end = _get_start_end_date(_get_esgf_file(filename)) + start, end = _get_start_end_date(_get_esgf_file(filename).name) assert case_start == start assert case_end == end @@ -173,9 +114,9 @@ def test_read_years_from_cube(tmp_path): ) cube.add_dim_coord(time, 0) iris.save(cube, temp_file) - start, end = _get_start_end_year(temp_file) - assert start == 1990 - assert end == 1991 + start, end = _get_start_end_date(temp_file) + assert int(start[:4]) == 1990 + assert int(end[:4]) == 1991 def test_read_datetime_from_cube(tmp_path): @@ -210,8 +151,6 @@ def test_raises_if_unable_to_deduce_no_time(tmp_path): iris.save(cube, temp_file) with pytest.raises(ValueError): _get_start_end_date(temp_file) - with pytest.raises(ValueError): - _get_start_end_year(temp_file) def test_raises_if_unable_to_deduce_no_time_units(tmp_path): @@ -223,16 +162,12 @@ def test_raises_if_unable_to_deduce_no_time_units(tmp_path): iris.save(cube, temp_file) with pytest.raises(ValueError): _get_start_end_date(temp_file) - with pytest.raises(ValueError): - _get_start_end_year(temp_file) def test_fails_if_no_date_present(): """Test raises if no date is present.""" with pytest.raises(ValueError): _get_start_end_date("var_whatever") - with pytest.raises(ValueError): - _get_start_end_year("var_whatever") def test_get_timerange_from_years(): diff --git a/tests/unit/local/test_to_iris.py b/tests/unit/local/test_to_iris.py index 15a50729ac..44a6a881d3 100644 --- a/tests/unit/local/test_to_iris.py +++ b/tests/unit/local/test_to_iris.py @@ -1,11 +1,14 @@ +from pathlib import Path + import iris.cube import pytest +from pytest_mock import MockerFixture -from esmvalcore.local import LocalFile +from esmvalcore.local import LocalFile, _get_attr_from_field_coord @pytest.fixture -def local_file(tmp_path): +def local_file(tmp_path: Path) -> LocalFile: cube = iris.cube.Cube([0]) cube.attributes.globals["attribute"] = "value" file = tmp_path / "test.nc" @@ -13,21 
+16,27 @@ def local_file(tmp_path): return LocalFile(file) -def test_to_iris(local_file): +def test_to_iris(local_file: LocalFile) -> None: cubes = local_file.to_iris() assert len(cubes) == 1 -def test_attributes(local_file): +def test_attributes(local_file: LocalFile) -> None: local_file.to_iris() # Load the file to populate attributes attrs = local_file.attributes assert attrs["attribute"] == "value" -def test_attributes_without_loading(local_file): +def test_attributes_without_loading(local_file: LocalFile) -> None: """Test that accessing attributes without loading the file first raises.""" with pytest.raises( ValueError, match=r"Attributes have not been read yet.*", ): local_file.attributes # noqa: B018 + + +def test_get_attr_from_field_coord_none(mocker: MockerFixture) -> None: + """Test ``_get_attr_from_field_coord``.""" + attr = _get_attr_from_field_coord(mocker.sentinel.ncfield, None, "attr") + assert attr is None diff --git a/tests/unit/main/test_esmvaltool.py b/tests/unit/main/test_esmvaltool.py index b329998ee1..087c9b091b 100644 --- a/tests/unit/main/test_esmvaltool.py +++ b/tests/unit/main/test_esmvaltool.py @@ -61,7 +61,7 @@ def session(cfg): ("max_datasets", 2), ("max_years", 2), ("skip_nonexistent", True), - ("search_esgf", "when_missing"), + ("search_data", "complete"), ("diagnostics", "diagnostic_name/group_name"), ("check_level", "strict"), ], @@ -96,9 +96,9 @@ def test_run_command_line_config(mocker, cfg, argument, value, tmp_path): assert session[argument] == value -@pytest.mark.parametrize("search_esgf", ["never", "when_missing", "always"]) -def test_run(mocker, session, search_esgf): - session["search_esgf"] = search_esgf +@pytest.mark.parametrize("search_data", ["quick", "complete"]) +def test_run(mocker, session, search_data): + session["search_data"] = search_data session["log_level"] = "default" session["remove_preproc_dir"] = True session["save_intermediary_cubes"] = False @@ -255,7 +255,7 @@ def test_header( cli_config_dir, ) - assert len(caplog.messages) == 8 + assert len(caplog.messages) in [8, 9] assert caplog.messages[0] == HEADER assert caplog.messages[1] == "Package versions" assert caplog.messages[2] == "----------------" @@ -270,7 +270,8 @@ def test_header( f"{cli_config_dir} [NOT AN EXISTING DIRECTORY] (command line argument)", ], ) - assert caplog.messages[7] == ( + # There might be a warning about ~/.esmvaltool/config-user.yml here. 
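+    # Index from the end so the assertion holds with or without that warning.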
+ assert caplog.messages[-1] == ( "Writing program log files to:\npath_to_log_file1\npath_to_log_file2" ) diff --git a/tests/unit/preprocessor/test_shared.py b/tests/unit/preprocessor/test_shared.py index 773f380794..a1860ecde2 100644 --- a/tests/unit/preprocessor/test_shared.py +++ b/tests/unit/preprocessor/test_shared.py @@ -2,6 +2,7 @@ import inspect import warnings +from pathlib import Path import dask.array as da import iris.analysis @@ -384,17 +385,17 @@ def test_compute_area_weights(lazy): ) -def test_group_products_string_list(): +def test_group_products_string_list() -> None: products = [ PreprocessorFile( - filename="A_B.nc", + filename=Path("A_B.nc"), attributes={ "project": "A", "dataset": "B", }, ), PreprocessorFile( - filename="A_C.nc", + filename=Path("A_C.nc"), attributes={ "project": "A", "dataset": "C", diff --git a/tests/unit/provenance/test_trackedfile.py b/tests/unit/provenance/test_trackedfile.py index 9e22ca461b..16290ec72b 100644 --- a/tests/unit/provenance/test_trackedfile.py +++ b/tests/unit/provenance/test_trackedfile.py @@ -1,21 +1,48 @@ +from dataclasses import dataclass from pathlib import Path +from typing import Any +import iris.cube +import prov.model import pytest from prov.model import ProvDocument from esmvalcore._provenance import ESMVALTOOL_URI_PREFIX, TrackedFile +from esmvalcore.io.protocol import DataElement from esmvalcore.local import LocalFile +def test_set() -> None: + assert { + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file2.nc"), attributes={}), + } == { + TrackedFile(Path("file1.nc"), attributes={}), + TrackedFile(Path("file2.nc"), attributes={}), + } + + +def test_sort() -> None: + file1 = TrackedFile(Path("file1.nc"), attributes={}) + file2 = TrackedFile(Path("file2.nc"), attributes={}) + assert sorted([file2, file1]) == [file1, file2] + + +def test_equals() -> None: + file = TrackedFile(Path("file.nc"), attributes={}) + assert file == TrackedFile(Path("file.nc"), attributes={}) + + @pytest.fixture -def tracked_input_file_nc(): +def tracked_input_file_nc() -> TrackedFile: input_file_nc = LocalFile("/path/to/file.nc") input_file_nc.attributes = {"a": "A"} return TrackedFile(filename=input_file_nc) @pytest.fixture -def tracked_output_file_nc(): +def tracked_output_file_nc() -> TrackedFile: return TrackedFile( filename=Path("/path/to/file.nc"), attributes={"a": "A"}, @@ -23,41 +50,56 @@ def tracked_output_file_nc(): @pytest.fixture -def tracked_input_file_grb(): +def tracked_input_file_grb() -> TrackedFile: input_file_grb = LocalFile("/path/to/file.grb") input_file_grb.attributes = {"a": "A"} return TrackedFile(filename=input_file_grb) -def test_init_input_nc(tracked_input_file_nc): +def test_init_input_nc(tracked_input_file_nc: TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert tracked_input_file_nc.filename == LocalFile("/path/to/file.nc") - assert tracked_input_file_nc.attributes is None + with pytest.raises( + ValueError, + match=r"Call TrackedFile.initialize_provenance before accessing attributes", + ): + tracked_input_file_nc.attributes # noqa: B018 -def test_init_output_nc(tracked_output_file_nc): +def test_init_output_nc(tracked_output_file_nc: TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert tracked_output_file_nc.filename == Path("/path/to/file.nc") assert tracked_output_file_nc.attributes == {"a": "A"} -def test_init_grb(tracked_input_file_grb): +def test_init_grb(tracked_input_file_grb: 
TrackedFile) -> None: """Test `esmvalcore._provenance.TrackedFile.__init__`.""" assert tracked_input_file_grb.filename == LocalFile("/path/to/file.grb") - assert tracked_input_file_grb.attributes is None + with pytest.raises( + ValueError, + match=r"Call TrackedFile.initialize_provenance before accessing attributes", + ): + tracked_input_file_grb.attributes # noqa: B018 + + +@pytest.fixture +def activity() -> prov.model.ProvActivity: + provenance = ProvDocument() + provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") + return provenance.activity("task:test-task-name") @pytest.mark.parametrize( "fixture_name", ["tracked_input_file_nc", "tracked_output_file_nc"], ) -def test_initialize_provenance_nc(fixture_name, request): +def test_initialize_provenance_nc( + fixture_name: str, + request: pytest.FixtureRequest, + activity: prov.model.ProvActivity, +) -> None: """Test `esmvalcore._provenance.TrackedFile.initialize_provenance`.""" tracked_file_nc = request.getfixturevalue(fixture_name) - provenance = ProvDocument() - provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") - activity = provenance.activity("task:test-task-name") - tracked_file_nc.initialize_provenance(activity) assert isinstance(tracked_file_nc.provenance, ProvDocument) assert tracked_file_nc.activity == activity @@ -65,33 +107,59 @@ def test_initialize_provenance_nc(fixture_name, request): assert tracked_file_nc.attributes == {"a": "A"} -def test_initialize_provenance_grb(tracked_input_file_grb): +def test_initialize_provenance_grb( + tracked_input_file_grb: TrackedFile, + activity: prov.model.ProvActivity, +) -> None: """Test `esmvalcore._provenance.TrackedFile.initialize_provenance`.""" - provenance = ProvDocument() - provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task") - activity = provenance.activity("task:test-task-name") - tracked_input_file_grb.initialize_provenance(activity) assert isinstance(tracked_input_file_grb.provenance, ProvDocument) assert tracked_input_file_grb.activity == activity assert ( - str(tracked_input_file_grb.entity.identifier) + str(tracked_input_file_grb.entity.identifier) # type: ignore[attr-defined] == "file:/path/to/file.grb" ) assert tracked_input_file_grb.attributes == {"a": "A"} +def test_initialize_provenance_twice_raises( + tracked_output_file_nc: TrackedFile, + activity: prov.model.ProvActivity, +) -> None: + """Test `esmvalcore._provenance.TrackedFile.initialize_provenance` raises if called twice.""" + tracked_output_file_nc.initialize_provenance(activity) + + with pytest.raises( + ValueError, + match=r"Provenance of TrackedFile: /path/to/file.nc already initialized", + ): + tracked_output_file_nc.initialize_provenance(activity) + + +def test_initialize_provenance_no_attributes_raises( + activity: prov.model.ProvActivity, +) -> None: + """Test `esmvalcore._provenance.TrackedFile.initialize_provenance` with no attributes.""" + tracked_file = TrackedFile(filename=Path("/path/to/file.nc")) + + with pytest.raises( + TypeError, + match=r"Delayed reading of attributes is only supported for `DataElement`s", + ): + tracked_file.initialize_provenance(activity) + + @pytest.mark.parametrize( "fixture_name", ["tracked_input_file_nc", "tracked_output_file_nc"], ) -def test_copy_provenance(fixture_name, request): +def test_copy_provenance( + fixture_name: str, + request: pytest.FixtureRequest, + activity: prov.model.ProvActivity, +) -> None: """Test `esmvalcore._provenance.TrackedFile.copy_provenance`.""" tracked_file_nc = 
request.getfixturevalue(fixture_name)
-    provenance = ProvDocument()
-    provenance.add_namespace("task", uri=ESMVALTOOL_URI_PREFIX + "task")
-    activity = provenance.activity("task:test-task-name")
-
     tracked_file_nc.initialize_provenance(activity)
 
     copied_file = tracked_file_nc.copy_provenance()
@@ -99,3 +167,80 @@
     assert copied_file.entity == tracked_file_nc.entity
     assert copied_file.provenance == tracked_file_nc.provenance
     assert copied_file.provenance is not tracked_file_nc.provenance
+
+
+def test_copy_provenance_not_initialized() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.copy_provenance` raises if provenance not initialized."""
+    tracked_file = TrackedFile(filename=Path("/path/to/file.nc"))
+
+    with pytest.raises(
+        ValueError,
+        match=r"Provenance of TrackedFile: /path/to/file.nc not initialized",
+    ):
+        tracked_file.copy_provenance()
+
+
+def test_wasderivedfrom_not_initialized() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.wasderivedfrom` raises if provenance not initialized."""
+    tracked_file = TrackedFile(filename=Path("/path/to/file.nc"))
+    other_tracked_file = TrackedFile(filename=Path("/path/to/other_file.nc"))
+
+    with pytest.raises(
+        ValueError,
+        match=r"Provenance of TrackedFile: /path/to/file.nc not initialized",
+    ):
+        tracked_file.wasderivedfrom(other_tracked_file)
+
+
+@dataclass
+class MockDataElement(DataElement):
+    """Mock DataElement for testing purposes."""
+
+    name: str
+    facets: dict[str, Any]
+    attributes: dict[str, Any]
+
+    def prepare(self) -> None:
+        pass
+
+    def __hash__(self) -> int:
+        return hash(self.name)
+
+    def to_iris(self) -> iris.cube.CubeList:
+        return iris.cube.CubeList()
+
+
+def test_provenance_file_nonpath_notimplemented() -> None:
+    """Test `esmvalcore._provenance.TrackedFile.provenance_file` with a DataElement."""
+    input_file = MockDataElement(
+        name="/path/to/input_file.nc",
+        facets={},
+        attributes={},
+    )
+    tracked_file = TrackedFile(filename=input_file)
+
+    assert tracked_file.filename == input_file
+    with pytest.raises(
+        NotImplementedError,
+        match=r"Saving provenance is only supported for pathlib.Path.*",
+    ):
+        _ = tracked_file.provenance_file
+
+
+def test_save_provenance_notimplemented(
+    activity: prov.model.ProvActivity,
+) -> None:
+    """Test `esmvalcore._provenance.TrackedFile.save_provenance` with a DataElement."""
+    input_file = MockDataElement(
+        name="/path/to/input_file.nc",
+        facets={},
+        attributes={},
+    )
+    tracked_file = TrackedFile(filename=input_file)
+    tracked_file.initialize_provenance(activity)
+
+    with pytest.raises(
+        NotImplementedError,
+        match=r"Writing attributes is only supported for pathlib.Path.*",
+    ):
+        tracked_file.save_provenance()
diff --git a/tests/unit/recipe/test_recipe.py b/tests/unit/recipe/test_recipe.py
index 367eff6e17..87c3884846 100644
--- a/tests/unit/recipe/test_recipe.py
+++ b/tests/unit/recipe/test_recipe.py
@@ -148,53 +148,6 @@ def create_esgf_search_results():
     return [file0, file1]
 
 
-@pytest.mark.parametrize("local_availability", ["all", "partial", "none"])
-def test_schedule_for_download(monkeypatch, tmp_path, local_availability):
-    """Test that `_schedule_for_download` updates DOWNLOAD_FILES."""
-    esgf_files = create_esgf_search_results()
-    download_dir = tmp_path / "download_dir"
-    local_dir = Path("/local_dir")
-
-    # Local files can cover the entire period, part of it, or nothing
-    local_file_options = {
-        "all": [f.local_file(local_dir) for f in esgf_files],
-        "partial": [esgf_files[1].local_file(local_dir)],
-        "none": [],
-    }
- local_files = local_file_options[local_availability] - - variable = { - "project": "CMIP6", - "mip": "Amon", - "frequency": "mon", - "short_name": "tas", - "dataset": "EC.-Earth3", - "exp": "historical", - "ensemble": "r1i1p1f1", - "grid": "gr", - "timerange": "1850/1851", - "alias": "CMIP6_EC-Eeath3_tas", - } - dataset = Dataset(**variable) - files = { - "all": local_files, - "partial": local_files + esgf_files[:1], - "none": esgf_files, - } - dataset.session = {"download_dir": download_dir} - dataset.files = list(files[local_availability]) - - monkeypatch.setattr(_recipe, "DOWNLOAD_FILES", set()) - _recipe._schedule_for_download([dataset]) - print(esgf_files) - expected = { - "all": set(), - "partial": set(esgf_files[:1]), - "none": set(esgf_files), - } - assert expected[local_availability] == _recipe.DOWNLOAD_FILES - - def test_write_html_summary(mocker, caplog): """Test `Recipe.write_html_summary` failing and logging a message.""" message = "Failed to look up references." diff --git a/tests/unit/recipe/test_to_datasets.py b/tests/unit/recipe/test_to_datasets.py index 88ffc4d9aa..675c538e69 100644 --- a/tests/unit/recipe/test_to_datasets.py +++ b/tests/unit/recipe/test_to_datasets.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest +import pytest_mock import yaml from esmvalcore._recipe import to_datasets @@ -324,7 +325,10 @@ def test_max_years(session): @pytest.mark.parametrize("found_files", [True, False]) -def test_dataset_from_files_fails(monkeypatch, found_files): +def test_dataset_from_files_fails( + monkeypatch: pytest.MonkeyPatch, + found_files: bool, +) -> None: def from_files(_): file = LocalFile("/path/to/file") file.facets = {"facets1": "value1"} @@ -333,7 +337,6 @@ def from_files(_): short_name="tas", ) dataset.files = [file] if found_files else [] - dataset._file_globs = ["/path/to/tas_*.nc"] return [dataset] monkeypatch.setattr(Dataset, "from_files", from_files) @@ -413,7 +416,11 @@ def test_append_missing_supplementaries(): ) # dataset will be inherited from the main variable -def test_report_unexpanded_globs(mocker): +@pytest.mark.parametrize("files", [False, True]) +def test_report_unexpanded_globs( + mocker: pytest_mock.MockFixture, + files: bool, +) -> None: dataset = Dataset( alias="CMIP5", dataset="*", @@ -425,10 +432,11 @@ def test_report_unexpanded_globs(mocker): project="CMIP5", recipe_dataset_index=1, short_name="ta", + timerange="2000/2014", variable_group="ta850", ) - file = mocker.Mock(facets={"dataset": "*"}) - dataset.files = [file] + dataset.add_supplementary(short_name="areacella", mip="fx") + dataset.files = [mocker.Mock(facets={"dataset": "*"})] if files else [] unexpanded_globs = {"dataset": "*"} msg = to_datasets._report_unexpanded_globs( @@ -436,5 +444,11 @@ def test_report_unexpanded_globs(mocker): dataset, unexpanded_globs, ) - + print(msg) assert "paths to the" not in msg + assert "Unable to replace dataset=* by a value" in msg + if not files: + main_dataset = dataset.copy() + main_dataset.supplementaries = [] + assert f"because no files were found for {main_dataset}" in msg + assert "within the requested timerange 2000/2014" in msg diff --git a/tests/unit/task/test_diagnostic_task.py b/tests/unit/task/test_diagnostic_task.py index 15517187bb..cb6047bc11 100644 --- a/tests/unit/task/test_diagnostic_task.py +++ b/tests/unit/task/test_diagnostic_task.py @@ -228,7 +228,7 @@ def test_collect_provenance(mocker, diagnostic_task): diagnostic_task._collect_provenance() tracked_file_class.assert_called_once_with( - "test.png", + Path("test.png"), 
{ "caption": "Some figure", "plot_type": ("tag_value",), diff --git a/tests/unit/task/test_print.py b/tests/unit/task/test_print.py index 0ed1352f68..53aad046d3 100644 --- a/tests/unit/task/test_print.py +++ b/tests/unit/task/test_print.py @@ -2,20 +2,22 @@ import copy import textwrap +from pathlib import Path import pytest from esmvalcore._task import DiagnosticTask from esmvalcore.dataset import Dataset +from esmvalcore.local import LocalFile from esmvalcore.preprocessor import PreprocessingTask, PreprocessorFile @pytest.fixture def preproc_file(): dataset = Dataset(short_name="tas") - dataset.files = ["/path/to/input_file.nc"] + dataset.files = [LocalFile("/path/to/input_file.nc")] return PreprocessorFile( - filename="/output/preproc/file.nc", + filename=Path("/output/preproc/file.nc"), attributes={"short_name": "tas"}, settings={ "extract_levels": {"scheme": "linear", "levels": [95000]}, @@ -52,9 +54,9 @@ def test_repr_preproc_task(preproc_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) @@ -97,9 +99,9 @@ def test_repr_simple_tree(preproc_task, diagnostic_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) @@ -141,25 +143,25 @@ def test_repr_full_tree(preproc_task, diagnostic_task): PreprocessingTask: diag_1/tas order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: PreprocessingTask: diag_1/tas_derive_input_1 order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None PreprocessingTask: diag_1/tas_derive_input_2 order: ['extract_levels', 'save'] PreprocessorFile: /output/preproc/file.nc - input files: ['/path/to/input_file.nc'] + input files: [LocalFile('/path/to/input_file.nc')] settings: {'extract_levels': {'levels': [95000], 'scheme': 'linear'}, - 'save': {'filename': '/output/preproc/file.nc'}} + 'save': {'filename': PosixPath('/output/preproc/file.nc')}} ancestors: None """) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 68e8ceed05..c1d320b258 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1,18 +1,61 @@ +import importlib.resources import textwrap from collections import defaultdict +from functools import lru_cache from pathlib import Path -from unittest import mock import pyesgf import pytest +import yaml 
import esmvalcore.dataset +import esmvalcore.esgf import esmvalcore.local from esmvalcore.cmor.check import CheckLevels from esmvalcore.config import CFG, Session from esmvalcore.dataset import Dataset from esmvalcore.esgf import ESGFFile from esmvalcore.exceptions import InputFilesNotFound, RecipeError +from esmvalcore.typing import Facets + + +@lru_cache +def _load_default_data_sources() -> dict[ + str, + dict[str, dict[str, dict[str, dict[str, str]]]], +]: + """Load default data sources for local users.""" + cfg: dict[str, dict[str, dict[str, dict[str, dict[str, str]]]]] = { + "projects": {}, + } + for file in ( + "data-local.yml", + "data-local-esmvaltool.yml", + "data-native-cesm.yml", + "data-native-emac.yml", + "data-native-icon.yml", + "data-native-ipslcm.yml", + ): + with importlib.resources.as_file( + importlib.resources.files(esmvalcore.config) + / "configurations" + / file, + ) as config_file: + content = config_file.read_text(encoding="utf-8") + cfg["projects"].update(yaml.safe_load(content)["projects"]) + return cfg + + +@pytest.fixture +def session(tmp_path: Path, session: Session) -> Session: + """Session fixture with default local data sources.""" + projects = _load_default_data_sources()["projects"] + for project in projects: + data_sources = projects[project]["data"] + for data_source in data_sources.values(): + data_source["rootpath"] = str(tmp_path) + session["projects"][project]["data"] = data_sources + return session def test_repr(): @@ -873,6 +916,7 @@ def test_from_files_with_globs(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } file2 = esmvalcore.local.LocalFile( @@ -984,6 +1028,7 @@ def test_from_files_with_globs_and_missing_facets(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } file2 = esmvalcore.local.LocalFile( @@ -1030,7 +1075,6 @@ def test_from_files_with_globs_and_missing_facets(monkeypatch, session): mip="Amon", project="CMIP6", short_name="tas", - timerange="185001/201412", ) expected.session = session @@ -1065,6 +1109,7 @@ def test_from_files_with_globs_and_automatic_missing(monkeypatch, session): "mip": "Amon", "project": "CMIP6", "short_name": "tas", + "timerange": "185001/201412", "version": "v20181126", } @@ -1250,7 +1295,7 @@ def test_concatenating_historical_and_future_exps(mocker): assert dataset.supplementaries[0].facets["exp"] == "historical" -def test_from_recipe_with_glob(tmp_path, session, mocker): +def test_from_recipe_with_glob(tmp_path: Path, session: Session) -> None: recipe_txt = textwrap.dedent(""" diagnostics: @@ -1267,8 +1312,6 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): recipe = tmp_path / "recipe_test.yml" recipe.write_text(recipe_txt, encoding="utf-8") - session["drs"]["CMIP5"] = "ESGF" - CFG["rootpath"]["CMIP5"] = [tmp_path] filenames = [ "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", @@ -1280,7 +1323,7 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): path.parent.mkdir(parents=True, exist_ok=True) path.write_text("") - definitions = [ + definitions: list[Facets] = [ { "diagnostic": "diagnostic1", "variable_group": "tas", @@ -1420,18 +1463,34 @@ def dataset(): mip="Amon", frequency="mon", short_name="tas", - dataset="EC.-Earth3", + dataset="EC-Earth3", exp="historical", ensemble="r1i1p1f1", grid="gr", timerange="1850/1851", - 
alias="CMIP6_EC-Eeath3_tas", + alias="CMIP6_EC-Earth3_tas", ) dataset.session = { - "search_esgf": "when_missing", + "search_data": "complete", "download_dir": Path("/download_dir"), - "rootpath": None, - "drs": {}, + "projects": { + "CMIP6": { + "data": { + "local": { + "type": "esmvalcore.local.LocalDataSource", + "rootpath": Path("/local_dir"), + "dirname_template": "{project}/{activity}/{institute}/{dataset}/{exp}/{ensemble}/{mip}/{short_name}/{grid}/{version}", + "filename_template": "{short_name}_{mip}_{dataset}_{exp}_{ensemble}_{grid}*.nc", + "priority": 1, + }, + "esgf": { + "type": "esmvalcore.esgf.ESGFDataSource", + "download_dir": Path("/download_dir"), + "priority": 2, + }, + }, + }, + }, } return dataset @@ -1461,14 +1520,14 @@ def test_find_files(mocker, dataset, local_availability): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(list(local_files), []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1498,14 +1557,14 @@ def test_find_files_wildcard_timerange(mocker, dataset): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(local_files, []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1535,14 +1594,14 @@ def test_find_files_outdated_local(mocker, dataset): ) mocker.patch.object( - esmvalcore.dataset.local, - "find_files", + esmvalcore.local.LocalDataSource, + "find_data", autospec=True, - return_value=(local_files, []), + return_value=list(local_files), ) mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", + esmvalcore.esgf.ESGFDataSource, + "find_data", autospec=True, return_value=list(esgf_files), ) @@ -1550,65 +1609,6 @@ def test_find_files_outdated_local(mocker, dataset): assert dataset.files == esgf_files -@pytest.mark.parametrize( - "project", - ["CESM", "EMAC", "ICON", "IPSLCM", "OBS", "OBS6", "ana4mips", "native6"], -) -def test_find_files_non_esgf_projects(mocker, project, monkeypatch): - """Test that find_files does never download files for non-ESGF projects.""" - monkeypatch.setitem(CFG, "search_esgf", "always") - mock_local_find_files = mocker.patch.object( - esmvalcore.dataset.local, - "find_files", - autospec=True, - return_value=(mock.sentinel.files, mock.sentinel.file_globs), - ) - mock_esgf_find_files = mocker.patch.object( - esmvalcore.dataset.esgf, - "find_files", - autospec=True, - ) - - tas = Dataset( - short_name="tas", - mip="Amon", - project=project, - dataset="MY_DATASET", - timerange="2000/2000", - account="account", - case="case", - channel="channel", - dir="dir", - exp="amip", - freq="freq", - gcomp="gcomp", - group="group", - ipsl_varname="ipsl_varname", - model="model", - out="out", - root="root", - scomp="scomp", - simulation="simulation", - status="status", - string="string", - tag="tag", - tdir="tdir", - tier=3, - tperiod="tperiod", - type="sat", - var_type="var_type", - version=1, - ) - tas.augment_facets() - tas.find_files() - - mock_local_find_files.assert_called_once() - mock_esgf_find_files.assert_not_called() - - assert tas.files == mock.sentinel.files - assert tas._file_globs == mock.sentinel.file_globs - - def 
test_set_version(): dataset = Dataset(short_name="tas") dataset.add_supplementary(short_name="areacella") @@ -1679,9 +1679,9 @@ def test_update_timerange_year_format(session, input_time, output_time): assert dataset["timerange"] == output_time -@pytest.mark.parametrize("search_esgf", ["never", "when_missing", "always"]) -def test_update_timerange_no_files(session, search_esgf): - session["search_esgf"] = search_esgf +@pytest.mark.parametrize("search_data", ["quick", "complete"]) +def test_update_timerange_no_files(session, search_data): + session["search_data"] = search_data variable = { "alias": "CMIP6", "project": "CMIP6", @@ -1755,7 +1755,7 @@ def mock_preprocess( mocker.patch.object(esmvalcore.dataset, "preprocess", mock_preprocess) - items = [mocker.sentinel.file] + items = [mocker.create_autospec(esmvalcore.local.LocalFile, instance=True)] dataset.files = items cube = dataset.load() @@ -1776,9 +1776,7 @@ def mock_preprocess( assert order == load_order load_args = { - "load": { - "ignore_warnings": None, - }, + "load": {}, "fix_file": { "add_unique_suffix": True, "dataset": "CanESM2", @@ -1842,12 +1840,12 @@ def mock_preprocess( assert args == load_args _get_output_file.assert_called_with(dataset.facets, session.preproc_dir) + items[0].prepare.assert_called_once() def test_load_fail(session): dataset = Dataset() dataset.session = session - dataset.session["search_esgf"] = "when_missing" dataset.files = [] with pytest.raises(InputFilesNotFound): dataset.load() @@ -2137,7 +2135,7 @@ def test_get_extra_facets_native6(): } -OBS6_SAT_FACETS = { +OBS6_SAT_FACETS: Facets = { "project": "OBS6", "dataset": "SAT", "mip": "Amon", @@ -2191,8 +2189,11 @@ def test_derivation_necessary_no_derivation(): assert dataset._derivation_necessary() is False -def test_derivation_necessary_no_force_derivation_no_files(): +def test_derivation_necessary_no_force_derivation_no_files( + session: Session, +) -> None: dataset = Dataset(**OBS6_SAT_FACETS, short_name="lwcre", derive=True) + dataset.session = session assert dataset._derivation_necessary() is True diff --git a/tests/unit/test_provenance.py b/tests/unit/test_provenance.py deleted file mode 100644 index b6c20dbc2e..0000000000 --- a/tests/unit/test_provenance.py +++ /dev/null @@ -1,25 +0,0 @@ -"""Test `esmvalcore._provenance`.""" - -from esmvalcore._provenance import TrackedFile - - -def test_set(): - assert { - TrackedFile("file1.nc", attributes={}), - TrackedFile("file1.nc", attributes={}), - TrackedFile("file2.nc", attributes={}), - } == { - TrackedFile("file1.nc", attributes={}), - TrackedFile("file2.nc", attributes={}), - } - - -def test_sort(): - file1 = TrackedFile("file1.nc", attributes={}) - file2 = TrackedFile("file2.nc", attributes={}) - assert sorted([file2, file1]) == [file1, file2] - - -def test_equals(): - file = TrackedFile("file.nc", attributes={}) - assert file == TrackedFile("file.nc", attributes={})