114 changes: 79 additions & 35 deletions doc/recipe/preprocessor.rst
@@ -2911,53 +2911,56 @@ See also :func:`esmvalcore.preprocessor.distance_metric`.
.. _Weighted Earth mover's distance: https://pythonot.github.io/
quickstart.html#computing-wasserstein-distance


.. _Memory use:

Information on maximum memory required
======================================
In the most general case, we can set upper limits on the maximum memory the
analysis will require:
.. _Other:


``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed;
Other
=====

``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed;
Miscellaneous functions that do not belong to any of the other categories.

where
.. _align_metadata:

* ``Ms``: maximum memory for a non-multi-model module
* ``Mm``: maximum memory for a multi-model module
* ``R``: computational efficiency of the module; ``R`` is typically 2-3
* ``N``: number of datasets
* ``F_eff``: average size of data per dataset, where ``F_eff = e x f x F``;
  here ``e`` is the factor that describes how lazy the data is (``e = 1`` for
  fully realized data) and ``f`` describes how much the data was shrunk by the
  immediately preceding module, e.g. time extraction, area selection or level
  extraction.
  Note that for ``fix_data``, ``f`` relates only to the time extraction; if
  the data is exact in time (no time selection), ``f = 1`` for ``fix_data``.
  For a large number of datasets, ``R + N ≈ N``; hence, assuming fully
  realized data with an average size of 1.5 GB for 10 years of 3D netCDF
  data, ``N`` datasets will require:
``align_metadata``
------------------

This function sets cube metadata to the entries of a specified target project.
This is useful for aligning the variable metadata of different projects prior
to performing multi-model operations (e.g., :ref:`multi-model statistics`).
For example, standard names differ between CMIP5 and CMIP6 for some variables,
which would prevent the calculation of multi-model statistics across CMIP5 and
CMIP6 data.

``Ms = 1.5 x (N - 1)`` GB
The ``align_metadata`` preprocessor supports the following arguments in the
recipe:

``Mm = 1.5 x (N - 2)`` GB
* ``target_project`` (:obj:`str`): Project from which target metadata is read.
* ``target_mip`` (:obj:`str`; optional): MIP table from which target metadata
is read.
If not given, use the MIP tables of the corresponding variables defined in
the recipe.
* ``target_short_name`` (:obj:`str`; optional): Variable short name from which
target metadata is read.
If not given, use the short names of the corresponding variables defined in
the recipe.
* ``strict`` (:obj:`bool`; optional, default: ``True``): If ``True``, raise an
  error if the desired metadata cannot be read for variable
  ``target_short_name`` of MIP table ``target_mip`` and project
  ``target_project``.
  If ``False``, no error is raised.

As a rule of thumb, the maximum memory required at a given time for
multi-model analysis can be estimated by multiplying the number of datasets by
the average file size of all the datasets.
This estimate is high because it assumes that all data is fully realized in
memory; the amount of realized data will gradually decrease as the use of
``dask`` increases.
Example:

.. _Other:
.. code-block:: yaml

Other
=====
preprocessors:
calculate_multi_model_statistics:
align_metadata:
target_project: CMIP6
multi_model_statistics:
span: overlap
statistics: [mean, median]

Miscellaneous functions that do not belong to any of the other categories.
See also :func:`esmvalcore.preprocessor.align_metadata`.
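Conceptually, the alignment overwrites project-dependent metadata entries of
each variable with those of the target project. The sketch below (plain Python
with made-up metadata values, not the actual ESMValCore implementation, which
reads the target metadata from the CMOR tables of ``target_project``)
illustrates the idea:

```python
# Hypothetical sketch of aligning variable metadata to a target project
# before multi-model operations. The metadata keys are realistic, but the
# values are invented for illustration.
def align_to_target(cube_meta: dict, target_meta: dict) -> dict:
    """Overwrite project-dependent metadata entries with the target's."""
    aligned = dict(cube_meta)
    for key in ("short_name", "standard_name", "long_name", "units"):
        if key in target_meta:
            aligned[key] = target_meta[key]
    return aligned


# Hypothetical CMIP5-style vs. CMIP6-style metadata for "the same" variable.
cmip5_meta = {"short_name": "var5", "standard_name": "old_standard_name"}
cmip6_meta = {"short_name": "var6", "standard_name": "new_standard_name"}
print(align_to_target(cmip5_meta, cmip6_meta))
```

After alignment, both datasets carry the target project's names, so the
multi-model preprocessors no longer see them as different variables.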

.. _cumulative_sum:

@@ -3085,3 +3088,44 @@ Example:
normalization: sum

See also :func:`esmvalcore.preprocessor.histogram`.


.. _Memory use:

Information on maximum memory required
======================================
In the most general case, we can set upper limits on the maximum memory the
analysis will require:


``Ms = (R + N) x F_eff - F_eff`` - when no multi-model analysis is performed;

``Mm = (2R + N) x F_eff - 2F_eff`` - when multi-model analysis is performed;

where

* ``Ms``: maximum memory for a non-multi-model module
* ``Mm``: maximum memory for a multi-model module
* ``R``: computational efficiency of the module; ``R`` is typically 2-3
* ``N``: number of datasets
* ``F_eff``: average size of data per dataset, where ``F_eff = e x f x F``;
  here ``e`` is the factor that describes how lazy the data is (``e = 1`` for
  fully realized data) and ``f`` describes how much the data was shrunk by the
  immediately preceding module, e.g. time extraction, area selection or level
  extraction.
  Note that for ``fix_data``, ``f`` relates only to the time extraction; if
  the data is exact in time (no time selection), ``f = 1`` for ``fix_data``.
  For a large number of datasets, ``R + N ≈ N``; hence, assuming fully
  realized data with an average size of 1.5 GB for 10 years of 3D netCDF
  data, ``N`` datasets will require:


``Ms = 1.5 x (N - 1)`` GB

``Mm = 1.5 x (N - 2)`` GB
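For concreteness, the two estimates above can be evaluated with a small helper
(a sketch under the stated assumptions: fully realized data with an average
size of 1.5 GB per dataset):

```python
def max_memory_gb(
    n_datasets: int,
    avg_size_gb: float = 1.5,
    multi_model: bool = False,
) -> float:
    """Estimate the maximum required memory (GB) via Ms/Mm above."""
    if multi_model:
        return avg_size_gb * (n_datasets - 2)  # Mm = 1.5 x (N - 2)
    return avg_size_gb * (n_datasets - 1)      # Ms = 1.5 x (N - 1)


print(max_memory_gb(10))                    # Ms for 10 datasets -> 13.5 GB
print(max_memory_gb(10, multi_model=True))  # Mm for 10 datasets -> 12.0 GB
```

So an analysis of 10 such datasets needs roughly 12-13.5 GB of memory if all
data is realized at once.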

As a rule of thumb, the maximum memory required at a given time for
multi-model analysis can be estimated by multiplying the number of datasets by
the average file size of all the datasets.
This estimate is high because it assumes that all data is fully realized in
memory; the amount of realized data will gradually decrease as the use of
``dask`` increases.
26 changes: 26 additions & 0 deletions esmvalcore/_recipe/check.py
@@ -19,6 +19,7 @@
from esmvalcore.local import _get_start_end_year, _parse_period
from esmvalcore.preprocessor import TIME_PREPROCESSORS, PreprocessingTask
from esmvalcore.preprocessor._multimodel import _get_operator_and_kwargs
from esmvalcore.preprocessor._other import _get_var_info
from esmvalcore.preprocessor._regrid import (
HORIZONTAL_SCHEMES_IRREGULAR,
HORIZONTAL_SCHEMES_REGULAR,
@@ -42,6 +43,31 @@
logger = logging.getLogger(__name__)


def align_metadata(step_settings: dict[str, Any]) -> None:
"""Check settings of preprocessor ``align_metadata``."""
project = step_settings.get("target_project")
mip = step_settings.get("target_mip")
short_name = step_settings.get("target_short_name")
strict = step_settings.get("strict", True)

# Any missing arguments will be reported later
if project is None or mip is None or short_name is None:
return

try:
_get_var_info(project, mip, short_name)
except ValueError as exc:
if strict:
msg = (
f"align_metadata failed: {exc}. Set `strict=False` to ignore "
f"this."
)
raise RecipeError(msg) from exc
except KeyError as exc:
msg = f"align_metadata failed: {exc}"
raise RecipeError(msg) from exc


def ncl_version() -> None:
"""Check the NCL version."""
ncl = which("ncl")
19 changes: 19 additions & 0 deletions esmvalcore/_recipe/recipe.py
@@ -592,10 +592,28 @@ def update_ancestors(
settings[key] = value


def _update_align_metadata(
settings: PreprocessorSettings,
dataset: Dataset,
) -> None:
"""Update settings for ``align_metadata``."""
if "align_metadata" in settings:
settings["align_metadata"].setdefault(
"target_mip",
dataset.facets["mip"],
)
settings["align_metadata"].setdefault(
"target_short_name",
dataset.facets["short_name"],
)
check.align_metadata(settings["align_metadata"])


def _update_extract_shape(
settings: PreprocessorSettings,
session: Session,
) -> None:
"""Update settings for ``extract_shape``."""
if "extract_shape" in settings:
shapefile = settings["extract_shape"].get("shapefile")
if shapefile:
@@ -785,6 +803,7 @@ def _update_preproc_functions(
missing_vars: set[str],
) -> None:
session = dataset.session
_update_align_metadata(settings, dataset)
_update_extract_shape(settings, session)
_update_weighting_settings(settings, dataset.facets)
try:
16 changes: 4 additions & 12 deletions esmvalcore/cmor/table.py
@@ -53,9 +53,7 @@ def _update_cmor_facets(facets):
f"Unable to load CMOR table (project) '{project}' for variable "
f"'{short_name}' with mip '{mip}'"
)
raise RecipeError(
msg,
)
raise RecipeError(msg)
facets["original_short_name"] = table_entry.short_name
for key in _CMOR_KEYS:
if key not in facets:
@@ -115,9 +113,7 @@ def get_var_info(
f"No CMOR tables available for project '{project}'. The following "
f"tables are available: {', '.join(CMOR_TABLES)}."
)
raise KeyError(
msg,
)
raise KeyError(msg)

# CORDEX X-hourly tables define the mip as ending in 'h' instead of 'hr'
if project == "CORDEX" and mip.endswith("hr"):
@@ -444,9 +440,7 @@ def _get_cmor_path(cmor_tables_path):
if os.path.isdir(cmor_tables_path):
return cmor_tables_path
msg = f"CMOR tables not found in {cmor_tables_path}"
raise ValueError(
msg,
)
raise ValueError(msg)

def _load_table(self, json_file):
with open(json_file, encoding="utf-8") as inf:
@@ -1060,9 +1054,7 @@ def __init__(self, cmor_tables_path: str | Path | None = None) -> None:
f"Custom CMOR tables path {self._user_table_folder} is "
f"not a directory"
)
raise ValueError(
msg,
)
raise ValueError(msg)
self._read_table_dir(self._user_table_folder)
else:
self._user_table_folder = None
11 changes: 9 additions & 2 deletions esmvalcore/preprocessor/__init__.py
@@ -51,7 +51,12 @@
ensemble_statistics,
multi_model_statistics,
)
from esmvalcore.preprocessor._other import clip, cumulative_sum, histogram
from esmvalcore.preprocessor._other import (
align_metadata,
clip,
cumulative_sum,
histogram,
)
from esmvalcore.preprocessor._regrid import (
extract_coordinate_points,
extract_levels,
@@ -115,7 +120,7 @@
# Concatenate all cubes in one
"concatenate",
"cmor_check_metadata",
# Extract years given by dataset keys (start_year and end_year)
# Extract years given by dataset keys (timerange/start_year and end_year)
"clip_timerange",
# Data reformatting/CMORization
"fix_data",
@@ -124,6 +129,8 @@
"add_supplementary_variables",
# Derive variable
"derive",
# Align metadata
"align_metadata",
# Time extraction (as defined in the preprocessor section)
"extract_time",
"extract_season",