diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst index a1d603f289..a53d495bd5 100644 --- a/doc/recipe/preprocessor.rst +++ b/doc/recipe/preprocessor.rst @@ -2911,53 +2911,56 @@ See also :func:`esmvalcore.preprocessor.distance_metric`. .. _Weighted Earth mover's distance: https://pythonot.github.io/ quickstart.html#computing-wasserstein-distance - -.. _Memory use: - -Information on maximum memory required -====================================== -In the most general case, we can set upper limits on the maximum memory the -analysis will require: +.. _Other: -``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed; +Other +===== -``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed; +Miscellaneous functions that do not belong to any of the other categories. -where +.. _align_metadata: -* ``Ms``: maximum memory for non-multimodel module -* ``Mm``: maximum memory for multi-model module -* ``R``: computational efficiency of module; `R` is typically 2-3 -* ``N``: number of datasets -* ``F_eff``: average size of data per dataset where ``F_eff = e x f x F`` - where ``e`` is the factor that describes how lazy the data is (``e = 1`` for - fully realized data) and ``f`` describes how much the data was shrunk by the - immediately previous module, e.g. time extraction, area selection or level - extraction; note that for fix_data ``f`` relates only to the time extraction, - if data is exact in time (no time selection) ``f = 1`` for fix_data so for - cases when we deal with a lot of datasets ``R + N \approx N``, data is fully - realized, assuming an average size of 1.5GB for 10 years of `3D` netCDF data, - ``N`` datasets will require: +``align_metadata`` +------------------ +This function sets cube metadata to entries from a specific target project. +This is useful to align variable metadata of different projects prior to +performing multi-model operations (e.g., :ref:`multi-model statistics`). 
+For example, standard names differ for some variables between CMIP5 and CMIP6 +which would prevent the calculation of multi-model statistics between CMIP5 and +CMIP6 data. -``Ms = 1.5 x (N - 1)`` GB +The ``align_metadata`` preprocessor supports the following arguments in the +recipe: -``Mm = 1.5 x (N - 2)`` GB +* ``target_project`` (:obj:`str`): Project from which target metadata is read. +* ``target_mip`` (:obj:`str`; optional): MIP table from which target metadata + is read. + If not given, use the MIP tables of the corresponding variables defined in + the recipe. +* ``target_short_name`` (:obj:`str`; optional): Variable short name from which + target metadata is read. + If not given, use the short names of the corresponding variables defined in + the recipe. +* ``strict`` (:obj:`bool`; optional, default: ``True``): If ``True``, raise an + error if desired metadata cannot be read for variable ``target_short_name`` + of MIP table ``target_mip`` and project ``target_project``. + If ``False``, no error is raised. -As a rule of thumb, the maximum required memory at a certain time for -multi-model analysis could be estimated by multiplying the number of datasets by -the average file size of all the datasets; this memory intake is high but also -assumes that all data is fully realized in memory; this aspect will gradually -change and the amount of realized data will decrease with the increase of -``dask`` use. +Example: -.. _Other: +.. code-block:: yaml -Other -===== + preprocessors: + calculate_multi_model_statistics: + align_metadata: + target_project: CMIP6 + multi_model_statistics: + span: overlap + statistics: [mean, median] -Miscellaneous functions that do not belong to any of the other categories. +See also :func:`esmvalcore.preprocessor.align_metadata`. .. _cumulative_sum: @@ -3085,3 +3088,44 @@ Example: normalization: sum See also :func:`esmvalcore.preprocessor.histogram`. + + +.. 
_Memory use: + +Information on maximum memory required +====================================== +In the most general case, we can set upper limits on the maximum memory the +analysis will require: + + +``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed; + +``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed; + +where + +* ``Ms``: maximum memory for non-multimodel module +* ``Mm``: maximum memory for multi-model module +* ``R``: computational efficiency of module; `R` is typically 2-3 +* ``N``: number of datasets +* ``F_eff``: average size of data per dataset where ``F_eff = e x f x F`` + where ``e`` is the factor that describes how lazy the data is (``e = 1`` for + fully realized data) and ``f`` describes how much the data was shrunk by the + immediately previous module, e.g. time extraction, area selection or level + extraction; note that for fix_data ``f`` relates only to the time extraction, + if data is exact in time (no time selection) ``f = 1`` for fix_data so for + cases when we deal with a lot of datasets ``R + N \approx N``, data is fully + realized, assuming an average size of 1.5GB for 10 years of `3D` netCDF data, + ``N`` datasets will require: + + +``Ms = 1.5 x (N - 1)`` GB + +``Mm = 1.5 x (N - 2)`` GB + +As a rule of thumb, the maximum required memory at a certain time for +multi-model analysis could be estimated by multiplying the number of datasets by +the average file size of all the datasets; this memory intake is high but also +assumes that all data is fully realized in memory; this aspect will gradually +change and the amount of realized data will decrease with the increase of +``dask`` use. 
diff --git a/esmvalcore/_recipe/check.py b/esmvalcore/_recipe/check.py index a33868da74..9447d36c25 100644 --- a/esmvalcore/_recipe/check.py +++ b/esmvalcore/_recipe/check.py @@ -19,6 +19,7 @@ from esmvalcore.local import _get_start_end_year, _parse_period from esmvalcore.preprocessor import TIME_PREPROCESSORS, PreprocessingTask from esmvalcore.preprocessor._multimodel import _get_operator_and_kwargs +from esmvalcore.preprocessor._other import _get_var_info from esmvalcore.preprocessor._regrid import ( HORIZONTAL_SCHEMES_IRREGULAR, HORIZONTAL_SCHEMES_REGULAR, @@ -42,6 +43,31 @@ logger = logging.getLogger(__name__) +def align_metadata(step_settings: dict[str, Any]) -> None: + """Check settings of preprocessor ``align_metadata``.""" + project = step_settings.get("target_project") + mip = step_settings.get("target_mip") + short_name = step_settings.get("target_short_name") + strict = step_settings.get("strict", True) + + # Any missing arguments will be reported later + if project is None or mip is None or short_name is None: + return + + try: + _get_var_info(project, mip, short_name) + except ValueError as exc: + if strict: + msg = ( + f"align_metadata failed: {exc}. Set `strict=False` to ignore " + f"this." 
+ ) + raise RecipeError(msg) from exc + except KeyError as exc: + msg = f"align_metadata failed: {exc}" + raise RecipeError(msg) from exc + + def ncl_version() -> None: """Check the NCL version.""" ncl = which("ncl") diff --git a/esmvalcore/_recipe/recipe.py b/esmvalcore/_recipe/recipe.py index c749f4aff1..d8738e0b85 100644 --- a/esmvalcore/_recipe/recipe.py +++ b/esmvalcore/_recipe/recipe.py @@ -592,10 +592,28 @@ def update_ancestors( settings[key] = value +def _update_align_metadata( + settings: PreprocessorSettings, + dataset: Dataset, +) -> None: + """Update settings for ``align_metadata``.""" + if "align_metadata" in settings: + settings["align_metadata"].setdefault( + "target_mip", + dataset.facets["mip"], + ) + settings["align_metadata"].setdefault( + "target_short_name", + dataset.facets["short_name"], + ) + check.align_metadata(settings["align_metadata"]) + + def _update_extract_shape( settings: PreprocessorSettings, session: Session, ) -> None: + """Update settings for ``extract_shape``.""" if "extract_shape" in settings: shapefile = settings["extract_shape"].get("shapefile") if shapefile: @@ -785,6 +803,7 @@ def _update_preproc_functions( missing_vars: set[str], ) -> None: session = dataset.session + _update_align_metadata(settings, dataset) _update_extract_shape(settings, session) _update_weighting_settings(settings, dataset.facets) try: diff --git a/esmvalcore/cmor/table.py b/esmvalcore/cmor/table.py index f80f339584..58ed7cead4 100644 --- a/esmvalcore/cmor/table.py +++ b/esmvalcore/cmor/table.py @@ -53,9 +53,7 @@ def _update_cmor_facets(facets): f"Unable to load CMOR table (project) '{project}' for variable " f"'{short_name}' with mip '{mip}'" ) - raise RecipeError( - msg, - ) + raise RecipeError(msg) facets["original_short_name"] = table_entry.short_name for key in _CMOR_KEYS: if key not in facets: @@ -115,9 +113,7 @@ def get_var_info( f"No CMOR tables available for project '{project}'. 
The following " f"tables are available: {', '.join(CMOR_TABLES)}." ) - raise KeyError( - msg, - ) + raise KeyError(msg) # CORDEX X-hourly tables define the mip as ending in 'h' instead of 'hr' if project == "CORDEX" and mip.endswith("hr"): @@ -444,9 +440,7 @@ def _get_cmor_path(cmor_tables_path): if os.path.isdir(cmor_tables_path): return cmor_tables_path msg = f"CMOR tables not found in {cmor_tables_path}" - raise ValueError( - msg, - ) + raise ValueError(msg) def _load_table(self, json_file): with open(json_file, encoding="utf-8") as inf: @@ -1060,9 +1054,7 @@ def __init__(self, cmor_tables_path: str | Path | None = None) -> None: f"Custom CMOR tables path {self._user_table_folder} is " f"not a directory" ) - raise ValueError( - msg, - ) + raise ValueError(msg) self._read_table_dir(self._user_table_folder) else: self._user_table_folder = None diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 86552814a5..1aea9aa83a 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -51,7 +51,12 @@ ensemble_statistics, multi_model_statistics, ) -from esmvalcore.preprocessor._other import clip, cumulative_sum, histogram +from esmvalcore.preprocessor._other import ( + align_metadata, + clip, + cumulative_sum, + histogram, +) from esmvalcore.preprocessor._regrid import ( extract_coordinate_points, extract_levels, @@ -115,7 +120,7 @@ # Concatenate all cubes in one "concatenate", "cmor_check_metadata", - # Extract years given by dataset keys (start_year and end_year) + # Extract years given by dataset keys (timerange/start_year and end_year) "clip_timerange", # Data reformatting/CMORization "fix_data", @@ -124,6 +129,8 @@ "add_supplementary_variables", # Derive variable "derive", + # Align metadata + "align_metadata", # Time extraction (as defined in the preprocessor section) "extract_time", "extract_season", diff --git a/esmvalcore/preprocessor/_other.py b/esmvalcore/preprocessor/_other.py index 
80c947627b..b6e2ad3cc7 100644 --- a/esmvalcore/preprocessor/_other.py +++ b/esmvalcore/preprocessor/_other.py @@ -14,6 +14,7 @@ from iris.cube import Cube from iris.exceptions import CoordinateMultiDimError +from esmvalcore.cmor.table import get_var_info from esmvalcore.iris_helpers import ( ignore_iris_vague_metadata_warnings, rechunk_cube, @@ -30,38 +31,122 @@ if TYPE_CHECKING: from collections.abc import Iterable, Sequence + from esmvalcore.cmor.table import VariableInfo + logger = logging.getLogger(__name__) -def clip(cube, minimum=None, maximum=None): +def align_metadata( + cube: Cube, + target_project: str, + target_mip: str, + target_short_name: str, + strict: bool = True, +) -> Cube: + """Set cube metadata to entries from a specific target project. + + This is useful to align variable metadata of different projects prior to + performing multi-model operations (e.g., + :func:`~esmvalcore.preprocessor.multi_model_statistics`). For example, + standard names differ for some variables between CMIP5 and CMIP6 which + would prevent the calculation of multi-model statistics between CMIP5 and + CMIP6 data. + + Parameters + ---------- + cube: + Input cube. + target_project: + Project from which target metadata is read. + target_mip: + MIP table from which target metadata is read. + target_short_name: + Variable short name from which target metadata is read. + strict: + If ``True``, raise an error if desired metadata cannot be read for + variable ``target_short_name`` of MIP table ``target_mip`` and project + ``target_project``. If ``False``, no error is raised. + + Returns + ------- + iris.cube.Cube + Cube with updated metadata. + + Raises + ------ + KeyError + Invalid ``target_project`` given. + ValueError + If ``strict=True``: Variable ``target_short_name`` not available for + MIP table ``target_mip`` of project ``target_project``. 
+ + """ + cube = cube.copy() + + try: + var_info = _get_var_info(target_project, target_mip, target_short_name) + except ValueError as exc: + if strict: + raise + logger.debug(exc) + return cube + + cube.long_name = var_info.long_name + cube.standard_name = var_info.standard_name + cube.var_name = var_info.short_name + cube.convert_units(var_info.units) + + return cube + + +def _get_var_info(project: str, mip: str, short_name: str) -> VariableInfo: + """Get variable information.""" + var_info = get_var_info(project, mip, short_name) + if var_info is None: + msg = ( + f"Variable '{short_name}' not available for table '{mip}' of " + f"project '{project}'" + ) + raise ValueError(msg) + return var_info + + +def clip( + cube: Cube, + minimum: float | None = None, + maximum: float | None = None, +) -> Cube: """Clip values at a specified minimum and/or maximum value. - Values lower than minimum are set to minimum and values - higher than maximum are set to maximum. + Values lower than minimum are set to minimum and values higher than maximum + are set to maximum. Parameters ---------- - cube: iris.cube.Cube - iris cube to be clipped - minimum: float - lower threshold to be applied on input cube data. - maximum: float - upper threshold to be applied on input cube data. + cube: + Input cube. + minimum: + Lower threshold to be applied on input cube data. + maximum: + Upper threshold to be applied on input cube data. Returns ------- iris.cube.Cube - clipped cube. + Clipped cube. + + Raises + ------ + ValueError + ``minimum`` and ``maximum`` are set to ``None``. + """ if minimum is None and maximum is None: - msg = "Either minimum, maximum or both have to be\ - specified." - raise ValueError( - msg, - ) + msg = "Either minimum, maximum or both have to be specified" + raise ValueError(msg) if minimum is not None and maximum is not None: if maximum < minimum: - msg = "Maximum should be equal or larger than minimum." 
+ msg = "Maximum should be equal or larger than minimum" raise ValueError(msg) cube.data = da.clip(cube.core_data(), minimum, maximum) return cube @@ -235,18 +320,14 @@ def histogram( f"bins cannot be a str (got '{bins}'), must be int or Sequence of " f"int" ) - raise TypeError( - msg, - ) + raise TypeError(msg) allowed_norms = (None, "sum", "integral") if normalization is not None and normalization not in allowed_norms: msg = ( f"Expected one of {allowed_norms} for normalization, got " f"'{normalization}'" ) - raise ValueError( - msg, - ) + raise ValueError(msg) # If histogram is calculated over all coordinates, we can use # dask.array.histogram and do not need to worry about chunks; otherwise, @@ -309,9 +390,7 @@ def _get_bins( f"Cannot calculate histogram for bin_range={bin_range} (or for " f"fully masked data when `bin_range` is not given)" ) - raise ValueError( - msg, - ) + raise ValueError(msg) return (bin_range, bin_edges) diff --git a/tests/integration/cmor/test_table.py b/tests/integration/cmor/test_table.py index dd7688860f..c2b59d7af4 100644 --- a/tests/integration/cmor/test_table.py +++ b/tests/integration/cmor/test_table.py @@ -2,6 +2,7 @@ import os import unittest +from pathlib import Path import pytest @@ -158,6 +159,12 @@ def test_get_activity_from_exp(self): activity = self.variables_info.activities["1pctCO2"] self.assertListEqual(activity, ["CMIP"]) + def test_invalid_path(self): + path = Path(__file__) / "path" / "does" / "not" / "exist" + msg = r"CMOR tables not found in" + with pytest.raises(ValueError, match=msg): + CMIP6Info(path) + class Testobs4mipsInfo(unittest.TestCase): """Test for the obs$mips info class.""" diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index 5bd6ad47dc..1a8de202e4 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -3699,3 +3699,183 @@ def test_automatic_already_regrid_era5_grib( "target_grid": "1x1", "scheme": "nearest", 
} + + +def test_align_metadata(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test: + align_metadata: + target_project: CMIP6 + + diagnostics: + diagnostic_name: + variables: + tas: + preprocessor: test + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + + scripts: null + """) + recipe = get_recipe(tmp_path, content, session) + + # Check align_metadata settings have been updated + tasks = {t for task in recipe.tasks for t in task.flatten()} + assert len(tasks) == 1 + task = next(iter(tasks)) + products = task.products + assert len(products) == 1 + product = next(iter(task.products)) + assert "align_metadata" in product.settings + assert product.settings["align_metadata"]["target_project"] == "CMIP6" + assert product.settings["align_metadata"]["target_mip"] == "Amon" + assert product.settings["align_metadata"]["target_short_name"] == "tas" + + +def test_align_metadata_invalid_project(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test: + align_metadata: + target_project: ZZZ + + diagnostics: + diagnostic_name: + variables: + tas: + preprocessor: test + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + + scripts: null + """) + msg = ( + "align_metadata failed: \"No CMOR tables available for project 'ZZZ'. 
" + "The following tables are available: custom, CMIP6, CMIP5, CMIP3, OBS, " + "OBS6, native6, obs4MIPs, ana4mips, EMAC, CORDEX, IPSLCM, ICON, CESM, " + 'ACCESS."' + ) + with pytest.raises(RecipeError) as exc: + get_recipe(tmp_path, content, session) + assert str(exc.value) == INITIALIZATION_ERROR_MSG + assert exc.value.failed_tasks[0].message == msg + + +def test_align_metadata_invalid_name(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test: + align_metadata: + target_project: CMIP6 + target_short_name: zzz + + diagnostics: + diagnostic_name: + variables: + tas: + preprocessor: test + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + + scripts: null + """) + msg = ( + "align_metadata failed: Variable 'zzz' not available for table 'Amon' " + "of project 'CMIP6'. Set `strict=False` to ignore this." + ) + with pytest.raises(RecipeError) as exc: + get_recipe(tmp_path, content, session) + assert str(exc.value) == INITIALIZATION_ERROR_MSG + assert exc.value.failed_tasks[0].message == msg + + +def test_align_metadata_invalid_short_name_not_strict( + tmp_path, + patched_datafinder, + session, +): + content = dedent(""" + preprocessors: + test: + align_metadata: + target_project: CMIP6 + target_short_name: zzz + strict: false + + diagnostics: + diagnostic_name: + variables: + tas: + preprocessor: test + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + + scripts: null + """) + recipe = get_recipe(tmp_path, content, session) + + # Check align_metadata settings have been updated + tasks = {t for task in recipe.tasks for t in task.flatten()} + assert len(tasks) == 1 + task = next(iter(tasks)) + products = task.products + assert len(products) == 1 + product = next(iter(task.products)) + assert "align_metadata" in product.settings 
+ assert product.settings["align_metadata"]["target_project"] == "CMIP6" + assert product.settings["align_metadata"]["target_mip"] == "Amon" + assert product.settings["align_metadata"]["target_short_name"] == "zzz" + assert product.settings["align_metadata"]["strict"] is False + + +def test_align_metadata_missing_arg(tmp_path, patched_datafinder, session): + content = dedent(""" + preprocessors: + test: + align_metadata: + target_short_name: tas + + diagnostics: + diagnostic_name: + variables: + tas: + preprocessor: test + project: CMIP6 + mip: Amon + exp: historical + timerange: '20000101/20001231' + ensemble: r1i1p1f1 + grid: gn + additional_datasets: + - {dataset: CanESM5} + + scripts: null + """) + msg = "Missing required argument" + with pytest.raises(ValueError, match=msg): + get_recipe(tmp_path, content, session) diff --git a/tests/unit/preprocessor/_other/test_other.py b/tests/unit/preprocessor/_other/test_other.py index c4dce441dd..8a01dc77a6 100644 --- a/tests/unit/preprocessor/_other/test_other.py +++ b/tests/unit/preprocessor/_other/test_other.py @@ -3,7 +3,6 @@ import unittest import dask.array as da -import iris.coord_categorisation import iris.coords import numpy as np import pytest @@ -20,12 +19,84 @@ from iris.exceptions import CoordinateMultiDimError from numpy.testing import assert_array_equal -from esmvalcore.preprocessor._other import clip, cumulative_sum, histogram +from esmvalcore.preprocessor import ( + align_metadata, + clip, + cumulative_sum, + histogram, +) from tests.unit.preprocessor._compare_with_refs.test_compare_with_refs import ( get_3d_cube, ) +def test_align_metadata(): + cube = Cube(0.0, units="celsius") + + new_cube = align_metadata(cube, "CMIP6", "Amon", "tas") + + assert new_cube is not cube + + assert new_cube.long_name == "Near-Surface Air Temperature" + assert new_cube.standard_name == "air_temperature" + assert new_cube.units == "K" + assert new_cube.var_name == "tas" + np.testing.assert_allclose(new_cube.data, 273.15) + + 
assert cube.long_name is None + assert cube.standard_name is None + assert cube.units == "celsius" + assert cube.var_name is None + np.testing.assert_allclose(cube.data, 0.0) + + +@pytest.mark.parametrize("short_name", ["sic", "siconc"]) +def test_align_metadata_alt_names(short_name): + cube = Cube(0.0, units="%") + + new_cube = align_metadata(cube, "CMIP6", "SImon", short_name) + + assert new_cube is not cube + + assert new_cube.long_name == "Sea-Ice Area Percentage (Ocean Grid)" + assert new_cube.standard_name == "sea_ice_area_fraction" + assert new_cube.units == "%" + assert new_cube.var_name == "siconc" + np.testing.assert_allclose(new_cube.data, 0.0) + + assert cube.long_name is None + assert cube.standard_name is None + assert cube.units == "%" + assert cube.var_name is None + np.testing.assert_allclose(cube.data, 0.0) + + +def test_align_metadata_invalid_project(): + cube = Cube(0.0, units="1") + msg = "No CMOR tables available for project 'ZZZ'" + with pytest.raises(KeyError, match=msg): + align_metadata(cube, "ZZZ", "Amon", "tas") + + +def test_align_metadata_invalid_short_name_strict(): + cube = Cube(0.0, units="1") + msg = "Variable 'zzz' not available for table 'Amon' of project 'CMIP6'" + with pytest.raises(ValueError, match=msg): + align_metadata(cube, "CMIP6", "Amon", "zzz") + + +def test_align_metadata_invalid_short_name_not_strict(): + cube = Cube(0.0, units="1") + new_cube = align_metadata(cube, "CMIP6", "Amon", "zzz", strict=False) + assert new_cube is not cube + assert new_cube == cube + assert new_cube.long_name is None + assert new_cube.standard_name is None + assert new_cube.units == "1" + assert new_cube.var_name is None + np.testing.assert_allclose(cube.data, 0.0) + + class TestOther(unittest.TestCase): """Test class for _other."""