diff --git a/doc/sphinx/source/recipes/figures/benchmarking/annual_cycle.png b/doc/sphinx/source/recipes/figures/benchmarking/annual_cycle.png
new file mode 100644
index 0000000000..9836b2f6f0
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/annual_cycle.png differ
diff --git a/doc/sphinx/source/recipes/figures/benchmarking/boxplots.png b/doc/sphinx/source/recipes/figures/benchmarking/boxplots.png
new file mode 100644
index 0000000000..247d42065a
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/boxplots.png differ
diff --git a/doc/sphinx/source/recipes/figures/benchmarking/diurnal_cycle.png b/doc/sphinx/source/recipes/figures/benchmarking/diurnal_cycle.png
new file mode 100644
index 0000000000..27ad9913bc
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/diurnal_cycle.png differ
diff --git a/doc/sphinx/source/recipes/figures/benchmarking/map.png b/doc/sphinx/source/recipes/figures/benchmarking/map.png
new file mode 100644
index 0000000000..80be821c1c
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/map.png differ
diff --git a/doc/sphinx/source/recipes/figures/benchmarking/timeseries.png b/doc/sphinx/source/recipes/figures/benchmarking/timeseries.png
new file mode 100644
index 0000000000..4cb5e0ea23
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/timeseries.png differ
diff --git a/doc/sphinx/source/recipes/figures/benchmarking/zonal.png b/doc/sphinx/source/recipes/figures/benchmarking/zonal.png
new file mode 100644
index 0000000000..52a3938620
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/benchmarking/zonal.png differ
diff --git a/doc/sphinx/source/recipes/figures/model_evaluation/diurnal_cycle_clt_sepacific_3hr.png b/doc/sphinx/source/recipes/figures/model_evaluation/diurnal_cycle_clt_sepacific_3hr.png
new file mode 100644
index 0000000000..35bee589eb
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/model_evaluation/diurnal_cycle_clt_sepacific_3hr.png differ
diff --git a/doc/sphinx/source/recipes/figures/monitor/diurnal_cycle_clt_tropics_3hr.png b/doc/sphinx/source/recipes/figures/monitor/diurnal_cycle_clt_tropics_3hr.png
new file mode 100644
index 0000000000..c91f757036
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/monitor/diurnal_cycle_clt_tropics_3hr.png differ
diff --git a/doc/sphinx/source/recipes/figures/monitor/diurnalcycle_pr_tropics_EC-Earth3_3hr_historical_r1i1p1f1.png b/doc/sphinx/source/recipes/figures/monitor/diurnalcycle_pr_tropics_EC-Earth3_3hr_historical_r1i1p1f1.png
new file mode 100644
index 0000000000..1e2a77cec8
Binary files /dev/null and b/doc/sphinx/source/recipes/figures/monitor/diurnalcycle_pr_tropics_EC-Earth3_3hr_historical_r1i1p1f1.png differ
diff --git a/doc/sphinx/source/recipes/index.rst b/doc/sphinx/source/recipes/index.rst
index 4b175d3eb9..21391c1c87 100644
--- a/doc/sphinx/source/recipes/index.rst
+++ b/doc/sphinx/source/recipes/index.rst
@@ -21,6 +21,7 @@ large variety of input data.
 .. toctree::
    :maxdepth: 1
 
+   recipe_benchmarking
    recipe_model_evaluation
    recipe_monitor
    recipe_portrait
diff --git a/doc/sphinx/source/recipes/recipe_benchmarking.rst b/doc/sphinx/source/recipes/recipe_benchmarking.rst
new file mode 100644
index 0000000000..d6328cd0d5
--- /dev/null
+++ b/doc/sphinx/source/recipes/recipe_benchmarking.rst
@@ -0,0 +1,175 @@
+.. _recipe_benchmarking:
+
+Model Benchmarking
+==================
+
+Overview
+--------
+
+These recipes and diagnostics are based on :ref:`recipe_monitor <recipe_monitor>`, which allows plotting arbitrary preprocessor output, i.e., arbitrary variables from arbitrary datasets. An extension of these diagnostics is used to benchmark a model simulation with other datasets (e.g. CMIP6). The benchmarking features are described in `Lauer et al.`_.
+
+.. _`Lauer et al.`: https://doi.org/10.5194/egusphere-2024-1518
+
+Available recipes and diagnostics
+---------------------------------
+
+Recipes are stored in `recipes/model_evaluation`
+
+* recipe_model_benchmarking_annual_cycle.yml
+* recipe_model_benchmarking_boxplots.yml
+* recipe_model_benchmarking_diurnal_cycle.yml
+* recipe_model_benchmarking_maps.yml
+* recipe_model_benchmarking_timeseries.yml
+* recipe_model_benchmarking_zonal.yml
+
+Diagnostics are stored in `diag_scripts/monitor/`
+
+* :ref:`multi_datasets.py
+  <api.esmvaltool.diag_scripts.monitor.multi_datasets>`:
+  Monitoring diagnostic to show multiple datasets in one plot (incl. biases).
+
+
+Recipe settings
+~~~~~~~~~~~~~~~
+
+See :ref:`multi_datasets.py<api.esmvaltool.diag_scripts.monitor.multi_datasets>` for a list of all possible configuration options that can be specified in the recipe.
+
+.. note::
+   Please note that exactly one dataset (the dataset to be benchmarked) needs to specify the facet ``benchmark_dataset: true`` in the dataset entry of the recipe. For line plots (i.e. annual cycle, diurnal cycle, time series), it is recommended to specify a particular line color and line style in the ``scripts`` section of the recipe for the dataset to be benchmarked (``benchmark_dataset: true``) so that this dataset is easy to identify in the plot. In the example below, MIROC6 is the dataset to be benchmarked and ERA5 is used as a reference dataset.
+
+.. code-block:: yaml
+
+   scripts:
+     allplots:
+       script: monitor/multi_datasets.py
+       plot_folder: '{plot_dir}'
+       plot_filename: '{plot_type}_{real_name}_{mip}'
+       group_variables_by: variable_group
+       facet_used_for_labels: alias
+       plots:
+         diurnal_cycle:
+           annual_mean_kwargs: false
+           legend_kwargs:
+             loc: upper right
+           plot_kwargs:
+             'MIROC6':
+               color: red
+               label: '{alias}'
+               linestyle: '-'
+               linewidth: 2
+               zorder: 4
+             ERA5:
+               color: black
+               label: '{dataset}'
+               linestyle: '-'
+               linewidth: 2
+               zorder: 3
+             MultiModelPercentile10:
+               color: gray
+               label: '{dataset}'
+               linestyle: '--'
+               linewidth: 1
+               zorder: 2
+             MultiModelPercentile90:
+               color: gray
+               label: '{dataset}'
+               linestyle: '--'
+               linewidth: 1
+               zorder: 2
+             default:
+               color: lightgray
+               label: null
+               linestyle: '-'
+               linewidth: 1
+               zorder: 1
+
+Variables
+---------
+
+Any, but the variables' number of dimensions should match the ones expected by each plot.
+
+References
+----------
+
+* Lauer, A., L. Bock, B. Hassler, P. Jöckel, L. Ruhe, and M. Schlund: Monitoring and benchmarking Earth
+  System Model simulations with ESMValTool v2.12.0, EGUsphere [preprint], https://doi.org/10.5194/egusphere-2024-1518, 2024.
+
+Example plots
+-------------
+
+.. _fig_benchmarking_annual_cycle:
+.. figure::  /recipes/figures/benchmarking/annual_cycle.png
+   :align:   center
+   :width:   16cm
+
+   (Left) Multi-year global mean (2000-2004) of the seasonal cycle of near-surface temperature
+   in K from a simulation of MIROC6 and the reference dataset HadCRUT5 (black). The thin gray
+   lines show individual CMIP6 models used for comparison, the dashed gray lines show the 10%
+   and 90% percentiles of these CMIP6 models. (Right) same as (left) but for area-weighted RMSE
+   of near-surface temperature. The light blue shading shows the range of the 10% to 90%
+   percentiles of RMSE values from the ensemble of CMIP6 models used for comparison. Created
+   with recipe_model_benchmarking_annual_cycle.yml.
+
+.. _fig_benchmarking_boxplots:
+.. figure::  /recipes/figures/benchmarking/boxplots.png
+   :align:   center
+   :width:   16cm
+
+   (Left) Global area-weighted RMSE (smaller=better), (middle) weighted Pearson’s correlation
+   coefficient (higher=better) and (right) weighted Earth mover’s distance (smaller=better) of
+   the geographical pattern of 5-year means of different variables from a simulation of MIROC6
+   (red cross) in comparison to the CMIP6 ensemble (boxplot). Reference datasets for calculating
+   the three metrics are: near-surface temperature (tas): HadCRUT5, surface temperature (ts):
+   HadISST, precipitation (pr): GPCP-SG, air pressure at sea level (psl): ERA5, shortwave (rsut)
+   and longwave (rlut) radiative fluxes at TOA and shortwave (swcre) and longwave (lwcre) cloud
+   radiative effects: CERES-EBAF. Each box indicates the range from the first quartile to the
+   third quartile, the vertical lines show the median, and the whiskers the minimum and maximum
+   values, excluding the outliers. Outliers are defined as being outside 1.5 times the
+   interquartile range. Created with recipe_model_benchmarking_boxplots.yml.
+
+.. _fig_benchmarking_diurn_cycle:
+.. figure::  /recipes/figures/benchmarking/diurnal_cycle.png
+   :align:   center
+   :width:   10cm
+
+   Area-weighted RMSE of the annual mean diurnal cycle (year 2000) of precipitation averaged over
+   the tropical ocean (ocean grid cells in the latitude belt 30°S to 30°N) from a simulation of
+   MIROC6 compared with ERA5 data (black). The light blue shading shows the range of the
+   10% to 90% percentiles of RMSE values from the ensemble of CMIP6 models used for comparison.
+   Created with recipe_model_benchmarking_diurnal_cycle.yml.
+
+.. _fig_benchmarking_map:
+.. figure::  /recipes/figures/benchmarking/map.png
+   :align:   center
+   :width:   10cm
+
+   5-year annual mean (2000-2004) area-weighted RMSE of the precipitation rate in mm day-1 from a
+   simulation of MIROC6 compared with GPCP-SG data. The stippled areas mask grid cells where the
+   RMSE is smaller than the 90% percentile of RMSE values from an ensemble of CMIP6 models.
+   Created with recipe_model_benchmarking_maps.yml.
+
+.. _fig_benchmarking_timeseries:
+.. figure::  /recipes/figures/benchmarking/timeseries.png
+   :align:   center
+   :width:   16cm
+
+   (Left) Time series from 2000 through 2014 of global average monthly mean temperature anomalies
+   (reference period 2000-2009) of the near-surface temperature in K from a simulation of MIROC6
+   (red) and the reference dataset HadCRUT5 (black). The thin gray lines show individual CMIP6
+   models used for comparison, the dashed gray lines show the 10% and 90% percentiles of these
+   CMIP6 models. (Right) same as (left) but for area-weighted RMSE of the near-surface air
+   temperature. The light blue shading shows the range of the 10% to 90% percentiles of RMSE
+   values from the ensemble of CMIP6 models used for comparison. Created with
+   recipe_model_benchmarking_timeseries.yml.
+
+.. _fig_benchmarking_zonal:
+.. figure::  /recipes/figures/benchmarking/zonal.png
+   :align:   center
+   :width:   10cm
+
+   5-year annual mean bias (2000-2004) of the zonally averaged temperature in K from a historical
+   simulation of MIROC6 compared with ERA5 reanalysis data. The stippled areas mask grid cells
+   where the absolute BIAS (:math:`|BIAS|`) is smaller than the maximum of the absolute 10%
+   (:math:`|p10|`) and the absolute 90% (:math:`|p90|`) percentiles from an ensemble of CMIP6
+   models, i.e. :math:`|BIAS| < \max( |p10|, |p90|)`. Created with
+   recipe_model_benchmarking_zonal.yml.
diff --git a/doc/sphinx/source/recipes/recipe_model_evaluation.rst b/doc/sphinx/source/recipes/recipe_model_evaluation.rst
index c61f34aa62..0e8b8ec095 100644
--- a/doc/sphinx/source/recipes/recipe_model_evaluation.rst
+++ b/doc/sphinx/source/recipes/recipe_model_evaluation.rst
@@ -62,37 +62,38 @@ section).
 Example plots
 -------------
 
-.. _fig_1:
 .. figure::  /recipes/figures/model_evaluation/map_tas_MPI-ESM1-2-HR_Amon.jpg
    :align:   center
    :width:   14cm
 
-Global climatology of 2m near-surface air temperature.
+   Global climatology of 2m near-surface air temperature.
 
-.. _fig_2:
 .. figure::  /recipes/figures/model_evaluation/map_swcre_MPI-ESM1-2-HR_Amon.jpg
    :align:   center
    :width:   14cm
 
-Global climatology of the shortwave cloud radiative effect (SWCRE).
+   Global climatology of the shortwave cloud radiative effect (SWCRE).
 
-.. _fig_3:
 .. figure::  /recipes/figures/model_evaluation/timeseries_rtnt_ambiguous_dataset_Amon.jpg
    :align:   center
    :width:   14cm
 
-Time series of the global mean top-of-the-atmosphere net radiative flux.
+   Time series of the global mean top-of-the-atmosphere net radiative flux.
 
-.. _fig_4:
 .. figure::  /recipes/figures/model_evaluation/variable_vs_lat_pr_Amon.jpg
    :align:   center
    :width:   14cm
 
-Zonal mean precipitation.
+   Zonal mean precipitation.
 
-.. _fig_5:
 .. figure::  /recipes/figures/model_evaluation/annual_cycle_clt_southerocean_Amon.jpg
    :align:   center
    :width:   14cm
 
-Annual cycle of Southern Ocean total cloud cover.
+   Annual cycle of Southern Ocean total cloud cover.
+
+.. figure::  /recipes/figures/model_evaluation/diurnal_cycle_clt_sepacific_3hr.png
+   :align:   center
+   :width:   14cm
+
+   Diurnal cycle of Southeast Pacific total cloud cover.
diff --git a/doc/sphinx/source/recipes/recipe_monitor.rst b/doc/sphinx/source/recipes/recipe_monitor.rst
index 8f4893fc12..277f77f279 100644
--- a/doc/sphinx/source/recipes/recipe_monitor.rst
+++ b/doc/sphinx/source/recipes/recipe_monitor.rst
@@ -145,88 +145,102 @@ Example plots
    :align:   center
    :width:   14cm
 
-Global climatology of tas.
+   Global climatology of tas.
 
 .. _fig_seasonclimglobal:
 .. figure::  /recipes/figures/monitor/seasonclim.png
    :align:   center
    :width:   14cm
 
-Seasonal climatology of pr, with a custom colorbar.
+   Seasonal climatology of pr, with a custom colorbar.
 
 .. _fig_monthlyclimglobal:
 .. figure::  /recipes/figures/monitor/monclim.png
    :align:   center
    :width:   14cm
 
-Monthly climatology of sivol, only for March and September.
+   Monthly climatology of sivol, only for March and September.
 
 .. _fig_timeseries:
 .. figure::  /recipes/figures/monitor/timeseries.png
    :align:   center
    :width:   14cm
 
-Timeseries of Niño 3.4 index, computed directly with the preprocessor.
+   Timeseries of Niño 3.4 index, computed directly with the preprocessor.
 
 .. _fig_annual_cycle:
 .. figure::  /recipes/figures/monitor/annualcycle.png
    :align:   center
    :width:   14cm
 
-Annual cycle of tas.
+   Annual cycle of tas.
 
 .. _fig_timeseries_with_ref:
 .. figure::  /recipes/figures/monitor/timeseries_with_ref.png
    :align:   center
    :width:   14cm
 
-Timeseries of tas including a reference dataset.
+   Timeseries of tas including a reference dataset.
 
 .. _fig_annual_cycle_with_ref:
 .. figure::  /recipes/figures/monitor/annualcycle_with_ref.png
    :align:   center
    :width:   14cm
 
-Annual cycle of tas including a reference dataset.
+   Annual cycle of tas including a reference dataset.
+
+.. _fig_diurnal_cycle:
+.. figure::  /recipes/figures/monitor/diurnalcycle_pr_tropics_EC-Earth3_3hr_historical_r1i1p1f1.png
+   :align:   center
+   :width:   14cm
+
+   Diurnal cycle of precipitation in the Tropics from EC-Earth3.
+
+.. _fig_diurnal_cycle_with_ref:
+.. figure::  /recipes/figures/monitor/diurnal_cycle_clt_tropics_3hr.png
+   :align:   center
+   :width:   14cm
+
+   Diurnal cycle of clt including a reference dataset.
 
 .. _fig_map_with_ref:
 .. figure::  /recipes/figures/monitor/map_with_ref.png
    :align:   center
    :width:   14cm
 
-Global climatology of tas including a reference dataset.
+   Global climatology of tas including a reference dataset.
 
 .. _fig_zonal_mean_profile_with_ref:
 .. figure::  /recipes/figures/monitor/zonalmean_profile_with_ref.png
    :align:   center
    :width:   14cm
 
-Zonal mean profile of ta including a reference dataset.
+   Zonal mean profile of ta including a reference dataset.
 
 .. _fig_1d_profile_with_ref:
 .. figure::  /recipes/figures/monitor/1d_profile_with_ref.png
    :align:   center
    :width:   14cm
 
-1D profile of ta including a reference dataset.
+   1D profile of ta including a reference dataset.
 
 .. _fig_variable_vs_lat_with_ref:
 .. figure::  /recipes/figures/monitor/variable_vs_lat_with_ref.png
    :align:   center
    :width:   14cm
 
-Zonal mean pr including a reference dataset.
+   Zonal mean pr including a reference dataset.
 
 .. _fig_hovmoeller_z_vs_time_with_ref:
 .. figure::  /recipes/figures/monitor/hovmoeller_z_vs_time_with_ref.png
    :align:   center
    :width:   14cm
 
-Hovmoeller plot (pressure vs. time) of ta including a reference dataset.
+   Hovmoeller plot (pressure vs. time) of ta including a reference dataset.
 
 .. _fig_hovmoeller_time_vs_lat_with_ref:
 .. figure:: /recipes/figures/monitor/hovmoeller_time_vs_lat_with_ref.png
    :align:   center
    :width:   14cm
 
-Hovmoeller plot (time vs. latitude) of tas including a reference dataset
+   Hovmoeller plot (time vs. latitude) of tas including a reference dataset
diff --git a/doc/sphinx/source/recipes/recipe_thermodyn_diagtool.rst b/doc/sphinx/source/recipes/recipe_thermodyn_diagtool.rst
index 6d64a7b589..5ee51dfb9c 100644
--- a/doc/sphinx/source/recipes/recipe_thermodyn_diagtool.rst
+++ b/doc/sphinx/source/recipes/recipe_thermodyn_diagtool.rst
@@ -27,7 +27,7 @@ in pressure levels, the daily fields of 2D near-surface temperature and horizont
 required to perform a vertical interpolation, substituting data in pressure levels where surface pressure is
 lower than the respective level and fields are not stored as an output of the analysed model.
 
-The material entropy production is computed by using the indirect or the direct method (or both). The former 
+The material entropy production is computed by using the indirect or the direct method (or both). The former
 method relies on the convergence of radiative heat in the atmosphere (cfr. Lucarini et al., 2011; Pascale et al., 2011),
 the latter on all viscous and non-viscous dissipative processes occurring in the atmosphere
 (namely the sensible heat fluxes, the hydrological cycle with its components and the kinetic energy dissipation).
@@ -139,12 +139,10 @@ References
 Example plots
 -------------
 
-.. _fig_1:
 .. figure:: /recipes/figures/thermodyn_diagtool/meridional_transp.png
    :align:   left
    :width:   14cm
 
-.. _fig_2:
 .. figure:: /recipes/figures/thermodyn_diagtool/CanESM2_wmb_transp.png
    :align:   right
    :width:   14cm
diff --git a/esmvaltool/config-references.yml b/esmvaltool/config-references.yml
index 1ece453343..adc6ff23ba 100644
--- a/esmvaltool/config-references.yml
+++ b/esmvaltool/config-references.yml
@@ -810,6 +810,7 @@ projects:
   crescendo: EU H2020 project CRESCENDO
   dlrveu2: DLR project VEU2
   dlrveu: DLR project VEU
+  dlrmabak: DLR project MABAK
   embrace: EU FP7 project EMBRACE
   esm2025: EU H2020 project ESM2025 - Earth system models for the future
   esmval: DLR project ESMVal
diff --git a/esmvaltool/diag_scripts/clouds/clouds.ncl b/esmvaltool/diag_scripts/clouds/clouds.ncl
index c05c091cf4..d3d1ef3b8e 100644
--- a/esmvaltool/diag_scripts/clouds/clouds.ncl
+++ b/esmvaltool/diag_scripts/clouds/clouds.ncl
@@ -114,7 +114,8 @@ begin
 
   variables = metadata_att_as_array(variable_info, "short_name")
   if (.not. any(variables .eq. var0)) then
-    errstr = "diagnostic " + diag + " requires the following variable: " + var0
+    errstr = "diagnostic " + DIAG_SCRIPT \
+             + " requires the following variable: " + var0
     error_msg("f", DIAG_SCRIPT, "", errstr)
   end if
 
@@ -539,6 +540,10 @@ begin
       res@cnLevels            = ispan(0, 60, 5)
     end if
 
+    if (var0.eq."ts") then
+      res@cnLevels            = ispan(274, 304, 2)
+    end if
+
 ;    res@lbLabelBarOn         = False
     res@gsnRightString       = ""
 
diff --git a/esmvaltool/diag_scripts/monitor/monitor.py b/esmvaltool/diag_scripts/monitor/monitor.py
index dda5aa4f3d..19cf39a734 100644
--- a/esmvaltool/diag_scripts/monitor/monitor.py
+++ b/esmvaltool/diag_scripts/monitor/monitor.py
@@ -27,6 +27,11 @@
       produce multi panel plots for data with `shape_id` or `region`
       coordinates of length > 1. Supported coordinates: `time`, `shape_id`
       (optional) and `region` (optional).
+    - Diurnal cycle (plot type ``diurnal_cycle``): Generate a diurnal cycle
+      plot (timeseries-like hourly climatology from 0 to 24 hours). It will
+      produce multi panel plots for data with `shape_id` or `region`
+      coordinates of length > 1. Supported coordinates: `time`, `shape_id`
+      (optional) and `region` (optional).
 
 Configuration options in recipe
 -------------------------------
@@ -39,10 +44,10 @@
     monitor configuration file can be found :ref:`here <monitor_config_file>`.
 plots: dict, optional
     Plot types plotted by this diagnostic (see list above). Dictionary keys
-    must be ``clim``, ``seasonclim``, ``monclim``, ``timeseries`` or
-    ``annual_cycle``. Dictionary values are dictionaries used as options for
-    the corresponding plot. The allowed options for the different plot types
-    are given below.
+    must be ``clim``, ``seasonclim``, ``monclim``, ``timeseries``,
+    ``annual_cycle`` or ``diurnal_cycle``. Dictionary values are dictionaries
+    used as options for the corresponding plot. The allowed options for the
+    different plot types are given below.
 plot_filename: str, optional
     Filename pattern for the plots.
     Defaults to ``{plot_type}_{real_name}_{dataset}_{mip}_{exp}_{ensemble}``.
@@ -98,6 +103,10 @@
 ----------------------------------------------------
 None
 
+Configuration options for plot type ``diurnal_cycle``
+-----------------------------------------------------
+None
+
 .. hint::
 
    Extra arguments given to the recipe are ignored, so it is safe to use yaml
@@ -166,6 +175,7 @@ def compute(self):
 
                 self.timeseries(cube, var_info)
                 self.plot_annual_cycle(cube, var_info)
+                self.plot_diurnal_cycle(cube, var_info)
                 self.plot_monthly_climatology(cube, var_info)
                 self.plot_seasonal_climatology(cube, var_info)
                 self.plot_climatology(cube, var_info)
@@ -280,6 +290,57 @@ def plot_annual_cycle(self, cube, var_info):
             caption=caption,
         )
 
+    def plot_diurnal_cycle(self, cube, var_info):
+        """Plot the diurnal cycle according to configuration.
+
+        The key 'diurnal_cycle' must be passed to the 'plots' option in the
+        configuration; otherwise this method returns without plotting.
+
+        Parameters
+        ----------
+        cube: iris.cube.Cube
+            Data to plot. Must be 1D with time or 2D with an extra 'shape_id'
+            or 'region' coordinate. In that case, the plot will be a multiple
+            one with one figure for each region.
+        var_info: dict
+            Variable's metadata from ESMValTool.
+
+        Warning
+        -------
+        The hourly climatology is done inside the function so the users can
+        plot both the timeseries and the diurnal cycle in one go.
+        """
+        if 'diurnal_cycle' not in self.plots:
+            return
+        cube = climate_statistics(cube, period='hour')
+
+        plotter = PlotSeries()
+        plotter.outdir = self.get_plot_folder(var_info)
+        plotter.img_template = self.get_plot_path('diurnalcycle', var_info,
+                                                  add_ext=False)
+        plotter.filefmt = self.cfg['output_file_type']
+        options = {
+            'xlabel': '',
+            'xlimits': None,
+            'suptitle': 'Diurnal cycle',
+        }
+        for region_coord in ('shape_id', 'region'):
+            if cube.coords(region_coord):
+                # Same x-axis coordinate as the single-panel plot below.
+                plotter.multiplot_cube(cube, 'hour', region_coord, **options)
+                return
+        plotter.plot_cube(cube, 'hour', **options)
+        caption = (f"Diurnal cycle of {var_info[n.LONG_NAME]} of "
+                   f"dataset {var_info[n.DATASET]} (project "
+                   f"{var_info[n.PROJECT]}) from {var_info[n.START_YEAR]} to "
+                   f"{var_info[n.END_YEAR]}.")
+        self.record_plot_provenance(
+            self.get_plot_path('diurnalcycle', var_info),
+            var_info,
+            'Diurnal cycle',
+            caption=caption,
+        )
+
     def plot_monthly_climatology(self, cube, var_info):
         """Plot the monthly climatology as a multipanel plot.
 
diff --git a/esmvaltool/diag_scripts/monitor/multi_datasets.py b/esmvaltool/diag_scripts/monitor/multi_datasets.py
index a5b3afca2d..851f1e7906 100644
--- a/esmvaltool/diag_scripts/monitor/multi_datasets.py
+++ b/esmvaltool/diag_scripts/monitor/multi_datasets.py
@@ -10,6 +10,15 @@
 facet ``reference_for_monitor_diags: true`` in the definition of the dataset in
 the recipe. Note that at most one reference dataset per variable is supported.
 
+Please note that all benchmarking plot types (i.e. all plot types starting with
+``benchmarking_``) require exactly one dataset (the dataset to be benchmarked)
+to have the facet ``benchmark_dataset: true`` in the dataset entry of the
+recipe.  For benchmarking line plots (i.e. ``benchmarking_annual_cycle``,
+``benchmarking_diurnal_cycle``, ``benchmarking_timeseries``), it is recommended
+to specify a particular line color and line style in the ``scripts`` section of
+the recipe for the dataset to be benchmarked (``benchmark_dataset: true``) so
+that this dataset is easy to identify in the plot.
+
 Currently supported plot types (use the option ``plots`` to specify them):
     - Time series (plot type ``timeseries``): for each variable separately, all
       datasets are plotted in one single figure. Input data needs to be 1D with
@@ -17,6 +26,9 @@
     - Annual cycle (plot type ``annual_cycle``): for each variable separately,
       all datasets are plotted in one single figure. Input data needs to be 1D
       with single dimension `month_number`.
+    - Diurnal cycle (plot type ``diurnal_cycle``): for each variable
+      separately, all datasets are plotted in one single figure. Input data
+      needs to be 1D with single dimension `hour`.
     - Maps (plot type ``map``): for each variable and dataset, an individual
       map is plotted. If a reference dataset is defined, also include this
       dataset and a bias plot into the figure. Note that if a reference dataset
@@ -64,6 +76,28 @@
       :func:`esmvalcore.preprocessor.regrid_time` and
       :func:`esmvalcore.preprocessor.regrid` for this). Input data
       needs to be 2D with dimensions `time`, `latitude`/`longitude`.
+    - Benchmarking plot annual cycles (``benchmarking_annual_cycle``):
+      Same as plot type ``annual_cycle`` but including the range of metric
+      results from an ensemble of models as shading.
+    - Benchmarking box plots (``benchmarking_boxplot``):
+      Box plots showing the metric results for given variables from a given
+      model and the range from the first quartile to the third quartile, the
+      median, and minimum and maximum values (excluding the outliers) from
+      an ensemble of models for comparison.
+    - Benchmarking plot diurnal cycles (``benchmarking_diurnal_cycle``):
+      Same as plot type ``diurnal_cycle`` but including the range of metric
+      results from an ensemble of models as shading.
+    - Benchmarking map plots (``benchmarking_map``):
+      Same as plot type ``map`` but with stippled areas masking grid cells
+      where the selected metric is smaller than the 90% percentile of
+      corresponding values from an ensemble of models used for comparison.
+    - Benchmarking plot time series (``benchmarking_timeseries``):
+      Same as plot type ``timeseries`` but including the range of metric
+      results from an ensemble of models as shading.
+    - Benchmarking plot zonal mean profiles (plot type ``benchmarking_zonal``):
+      Same as plot type ``zonal_mean_profile`` but with stippled areas masking
+      grid cells where the selected metric is smaller than the 90% percentile
+      of corresponding values from an ensemble of models used for comparison.
 
 Author
 ------
@@ -155,8 +189,8 @@
     the time axis using :class:`matplotlib.dates.DateFormatter`. If ``None``,
     use the default formatting imposed by the iris plotting function.
 
-Configuration options for plot type ``annual_cycle``
-----------------------------------------------------
+Configuration options for plot type ``annual_cycle`` and ``diurnal_cycle``
+--------------------------------------------------------------------------
 gridline_kwargs: dict, optional
     Optional keyword arguments for grid lines. By default, ``color: lightgrey,
     alpha: 0.5`` are used. Use ``gridline_kwargs: false`` to not show grid
@@ -603,6 +637,152 @@
     lat/lon on x-axis.
 
 
+Configuration options for plot type ``benchmarking_annual_cycle``
+-----------------------------------------------------------------
+Same as for plot type ``annual_cycle``.
+
+Configuration options for plot type ``benchmarking_boxplot``
+------------------------------------------------------------
+fontsize: int, optional (default: None)
+    Fontsize used for ticks, labels and titles. For the latter, use the given
+    fontsize plus 2. Does not affect suptitles. If not given, use default
+    matplotlib values. For a more fine-grained definition of fontsizes, use the
+    option ``matplotlib_rc_params`` (see above).
+plot_kwargs: dict, optional
+    Optional keyword arguments for the plot function defined by ``plot_func``.
+    Dictionary keys are elements identified by ``facet_used_for_labels`` or
+    ``default``, e.g., ``CMIP6`` if ``facet_used_for_labels: project`` or
+    ``historical`` if ``facet_used_for_labels: exp``. Dictionary values are
+    dictionaries used as keyword arguments for the plot function defined by
+    ``plot_func``. String arguments can include facets in curly brackets which
+    will be derived from the corresponding dataset, e.g., ``{project}``,
+    ``{short_name}``, ``{exp}``. Examples: ``default: {levels: 2}, CMIP6:
+    {vmin: 200, vmax: 250}``.
+pyplot_kwargs: dict, optional
+    Optional calls to functions of :mod:`matplotlib.pyplot`. Dictionary keys
+    are functions of :mod:`matplotlib.pyplot`. Dictionary values are used as
+    argument(s) for these functions (if values are dictionaries, these are
+    interpreted as keyword arguments; otherwise a single argument is assumed).
+    String arguments can include facets in curly brackets which will be derived
+    from the corresponding dataset, e.g., ``{project}``, ``{short_name}``,
+    ``{exp}``. Examples: ``title: 'Awesome Plot of {long_name}'``, ``xlabel:
+    '{short_name}'``, ``xlim: [0, 5]``.
+var_order: list of str, optional
+    Optional list of strings containing variable names to define the order of
+    the variables plotted.
+
+Configuration options for plot type ``benchmarking_diurnal_cycle``
+------------------------------------------------------------------
+Same as for plot type ``diurnal_cycle``.
+
+Configuration options for plot type ``benchmarking_map``
+--------------------------------------------------------
+cbar_label: str, optional (default: '{short_name} [{units}]')
+    Colorbar label. Can include facets in curly brackets which will be derived
+    from the corresponding dataset, e.g., ``{project}``, ``{short_name}``,
+    ``{exp}``.
+cbar_kwargs: dict, optional
+    Optional keyword arguments for :func:`matplotlib.pyplot.colorbar`. By
+    default, uses ``orientation: horizontal, aspect: 30``.
+fontsize: int, optional (default: None)
+    Fontsize used for ticks, labels and titles. For the latter, use the given
+    fontsize plus 2. Does not affect suptitles. If not given, use default
+    matplotlib values. For a more fine-grained definition of fontsizes, use the
+    option ``matplotlib_rc_params`` (see above).
+plot_func: str, optional (default: 'contourf')
+    Plot function used to plot the maps. Must be a function of :mod:`iris.plot`
+    that supports plotting of 2D cubes with coordinates latitude and longitude.
+plot_kwargs: dict, optional
+    Optional keyword arguments for the plot function defined by ``plot_func``.
+    Dictionary keys are elements identified by ``facet_used_for_labels`` or
+    ``default``, e.g., ``CMIP6`` if ``facet_used_for_labels: project`` or
+    ``historical`` if ``facet_used_for_labels: exp``. Dictionary values are
+    dictionaries used as keyword arguments for the plot function defined by
+    ``plot_func``. String arguments can include facets in curly brackets which
+    will be derived from the corresponding dataset, e.g., ``{project}``,
+    ``{short_name}``, ``{exp}``. Examples: ``default: {levels: 2}, CMIP6:
+    {vmin: 200, vmax: 250}``. In addition to the normalization_ options
+    supported by the plot function, the option ``norm: centered`` can be
+    specified. In this case, the keywords ``vcenter`` and ``halfrange`` should
+    be used instead of ``vmin`` or ``vmax`` (see
+    :class:`~matplotlib.colors.CenteredNorm`).
+projection: str, optional (default: 'Robinson')
+    Projection used for the map plot. Needs to be a valid projection class of
+    :mod:`cartopy.crs`. Keyword arguments can be specified using the option
+    ``projection_kwargs``.
+projection_kwargs: dict, optional
+    Optional keyword arguments for the projection given by ``projection``. For
+    the default projection ``Robinson``, the default keyword arguments
+    ``central_longitude: 10`` are used.
+pyplot_kwargs: dict, optional
+    Optional calls to functions of :mod:`matplotlib.pyplot`. Dictionary keys
+    are functions of :mod:`matplotlib.pyplot`. Dictionary values are used as
+    argument(s) for these functions (if values are dictionaries, these are
+    interpreted as keyword arguments; otherwise a single argument is assumed).
+    String arguments can include facets in curly brackets which will be derived
+    from the corresponding dataset, e.g., ``{project}``, ``{short_name}``,
+    ``{exp}``. Examples: ``title: 'Awesome Plot of {long_name}'``, ``xlabel:
+    '{short_name}'``, ``xlim: [0, 5]``.
+rasterize: bool, optional (default: True)
+    If ``True``, use rasterization_ for map plots to produce smaller files.
+    This is only relevant for vector graphics (e.g., ``output_file_type:
+    pdf,svg,ps``).
+
+Configuration options for plot type ``benchmarking_timeseries``
+---------------------------------------------------------------
+Same as for plot type ``timeseries``.
+
+Configuration options for plot type ``benchmarking_zonal``
+----------------------------------------------------------
+cbar_label: str, optional (default: '{short_name} [{units}]')
+    Colorbar label. Can include facets in curly brackets which will be derived
+    from the corresponding dataset, e.g., ``{project}``, ``{short_name}``,
+    ``{exp}``.
+cbar_kwargs: dict, optional
+    Optional keyword arguments for :func:`matplotlib.pyplot.colorbar`. By
+    default, uses ``orientation: vertical``.
+fontsize: int, optional (default: None)
+    Fontsize used for ticks, labels and titles. For the latter, use the given
+    fontsize plus 2. Does not affect suptitles. If not given, use default
+    matplotlib values. For a more fine-grained definition of fontsizes, use the
+    option ``matplotlib_rc_params`` (see above).
+log_y: bool, optional (default: True)
+    Use logarithmic Y-axis.
+plot_func: str, optional (default: 'contourf')
+    Plot function used to plot the profiles. Must be a function of
+    :mod:`iris.plot` that supports plotting of 2D cubes with coordinates
+    latitude and altitude/air_pressure.
+plot_kwargs: dict, optional
+    Optional keyword arguments for the plot function defined by ``plot_func``.
+    Dictionary keys are elements identified by ``facet_used_for_labels`` or
+    ``default``, e.g., ``CMIP6`` if ``facet_used_for_labels: project`` or
+    ``historical`` if ``facet_used_for_labels: exp``. Dictionary values are
+    dictionaries used as keyword arguments for the plot function defined by
+    ``plot_func``. String arguments can include facets in curly brackets which
+    will be derived from the corresponding dataset, e.g., ``{project}``,
+    ``{short_name}``, ``{exp}``. Examples: ``default: {levels: 2}, CMIP6:
+    {vmin: 200, vmax: 250}``. In addition to the normalization_ options
+    supported by the plot function, the option ``norm: centered`` can be
+    specified. In this case, the keywords ``vcenter`` and ``halfrange`` should
+    be used instead of ``vmin`` or ``vmax`` (see
+    :class:`~matplotlib.colors.CenteredNorm`).
+pyplot_kwargs: dict, optional
+    Optional calls to functions of :mod:`matplotlib.pyplot`. Dictionary keys
+    are functions of :mod:`matplotlib.pyplot`. Dictionary values are used as
+    argument(s) for these functions (if values are dictionaries, these are
+    interpreted as keyword arguments; otherwise a single argument is assumed).
+    String arguments can include facets in curly brackets which will be derived
+    from the corresponding dataset, e.g., ``{project}``, ``{short_name}``,
+    ``{exp}``. Examples: ``title: 'Awesome Plot of {long_name}'``, ``xlabel:
+    '{short_name}'``, ``xlim: [0, 5]``.
+rasterize: bool, optional (default: True)
+    If ``True``, use rasterization_ for profile plots to produce smaller files.
+    This is only relevant for vector graphics (e.g., ``output_file_type:
+    pdf,svg,ps``).
+show_y_minor_ticklabels: bool, optional (default: False)
+    Show tick labels for the minor ticks on the Y axis.
+
+
 .. hint::
 
    Extra arguments given to the recipe are ignored, so it is safe to use yaml
@@ -628,6 +808,7 @@
 import matplotlib.dates as mdates
 import matplotlib.pyplot as plt
 import numpy as np
+import pandas as pd
 import seaborn as sns
 from iris.analysis.cartography import area_weights
 from iris.coord_categorisation import add_year
@@ -704,12 +885,19 @@ def __init__(self, config):
         self.supported_plot_types = [
             'timeseries',
             'annual_cycle',
+            'diurnal_cycle',
             'map',
             'zonal_mean_profile',
             '1d_profile',
             'variable_vs_lat',
             'hovmoeller_z_vs_time',
             'hovmoeller_time_vs_lat_or_lon',
+            'benchmarking_annual_cycle',
+            'benchmarking_boxplot',
+            'benchmarking_diurnal_cycle',
+            'benchmarking_map',
+            'benchmarking_timeseries',
+            'benchmarking_zonal',
         ]
         for (plot_type, plot_options) in self.plots.items():
             if plot_type not in self.supported_plot_types:
@@ -728,12 +916,44 @@ def __init__(self, config):
                 self.plots[plot_type].setdefault('pyplot_kwargs', {})
                 self.plots[plot_type].setdefault('time_format', None)
 
+            elif plot_type == 'benchmarking_timeseries':
+                self.plots[plot_type].setdefault('annual_mean_kwargs', {})
+                self.plots[plot_type].setdefault('gridline_kwargs', {})
+                self.plots[plot_type].setdefault('legend_kwargs', {})
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+                self.plots[plot_type].setdefault('time_format', None)
+
             elif plot_type == 'annual_cycle':
                 self.plots[plot_type].setdefault('gridline_kwargs', {})
                 self.plots[plot_type].setdefault('legend_kwargs', {})
                 self.plots[plot_type].setdefault('plot_kwargs', {})
                 self.plots[plot_type].setdefault('pyplot_kwargs', {})
 
+            elif plot_type == 'benchmarking_annual_cycle':
+                self.plots[plot_type].setdefault('gridline_kwargs', {})
+                self.plots[plot_type].setdefault('legend_kwargs', {})
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+
+            elif plot_type == 'diurnal_cycle':
+                self.plots[plot_type].setdefault('gridline_kwargs', {})
+                self.plots[plot_type].setdefault('legend_kwargs', {})
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+
+            elif plot_type == 'benchmarking_diurnal_cycle':
+                self.plots[plot_type].setdefault('gridline_kwargs', {})
+                self.plots[plot_type].setdefault('legend_kwargs', {})
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+
+            elif plot_type == 'benchmarking_boxplot':
+                self.plots[plot_type].setdefault('fontsize', None)
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+                self.plots[plot_type].setdefault('var_order', None)
+
             elif plot_type == 'map':
                 self.plots[plot_type].setdefault(
                     'cbar_label', '{short_name} [{units}]')
@@ -768,6 +988,25 @@ def __init__(self, config):
                 self.plots[plot_type].setdefault('x_pos_stats_avg', 0.0)
                 self.plots[plot_type].setdefault('x_pos_stats_bias', 0.92)
 
+            elif plot_type == 'benchmarking_map':
+                self.plots[plot_type].setdefault(
+                    'cbar_label', '{short_name} [{units}]')
+                self.plots[plot_type].setdefault(
+                    'cbar_kwargs', {'orientation': 'horizontal', 'aspect': 30}
+                )
+                self.plots[plot_type].setdefault('fontsize', None)
+                self.plots[plot_type].setdefault('plot_func', 'contourf')
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                if 'projection' not in self.plots[plot_type]:
+                    self.plots[plot_type].setdefault('projection', 'Robinson')
+                    self.plots[plot_type].setdefault(
+                        'projection_kwargs', {'central_longitude': 10}
+                    )
+                else:
+                    self.plots[plot_type].setdefault('projection_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+                self.plots[plot_type].setdefault('rasterize', True)
+
             elif plot_type == 'zonal_mean_profile':
                 self.plots[plot_type].setdefault(
                     'cbar_label', '{short_name} [{units}]')
@@ -798,6 +1037,22 @@ def __init__(self, config):
                 self.plots[plot_type].setdefault('x_pos_stats_avg', 0.01)
                 self.plots[plot_type].setdefault('x_pos_stats_bias', 0.7)
 
+            elif plot_type == 'benchmarking_zonal':
+                self.plots[plot_type].setdefault(
+                    'cbar_label', '{short_name} [{units}]')
+                self.plots[plot_type].setdefault(
+                    'cbar_kwargs', {'orientation': 'vertical'}
+                )
+                self.plots[plot_type].setdefault('fontsize', None)
+                self.plots[plot_type].setdefault('log_y', True)
+                self.plots[plot_type].setdefault('plot_func', 'contourf')
+                self.plots[plot_type].setdefault('plot_kwargs', {})
+                self.plots[plot_type].setdefault('pyplot_kwargs', {})
+                self.plots[plot_type].setdefault('rasterize', True)
+                self.plots[plot_type].setdefault(
+                    'show_y_minor_ticklabels', False
+                )
+
             elif plot_type == '1d_profile':
                 self.plots[plot_type].setdefault('aspect_ratio', 1.5)
                 self.plots[plot_type].setdefault('gridline_kwargs', {})
@@ -809,6 +1064,7 @@ def __init__(self, config):
                 self.plots[plot_type].setdefault(
                     'show_y_minor_ticklabels', False
                 )
+
             elif plot_type == 'variable_vs_lat':
                 self.plots[plot_type].setdefault('gridline_kwargs', {})
                 self.plots[plot_type].setdefault('legend_kwargs', {})
@@ -1076,6 +1332,20 @@ def _get_map_projection(self):
 
         return getattr(ccrs, projection)(**projection_kwargs)
 
+    def _get_benchmarking_projection(self):
+        """Get projection used for benchmarking map plots."""
+        plot_type = 'benchmarking_map'
+        projection = self.plots[plot_type]['projection']
+        projection_kwargs = self.plots[plot_type]['projection_kwargs']
+
+        # Check if desired projection is valid
+        if not hasattr(ccrs, projection):
+            raise AttributeError(
+                f"Got invalid projection '{projection}' for plotting "
+                f"{plot_type}, expected class of cartopy.crs")
+
+        return getattr(ccrs, projection)(**projection_kwargs)
+
     def _get_plot_func(self, plot_type):
         """Get plot function."""
         plot_func = self.plots[plot_type]['plot_func']
@@ -1114,8 +1384,10 @@ def _get_plot_kwargs(self, plot_type, dataset, bias=False):
                 plot_kwargs[key] = val
 
         # Default settings for different plot types
-        if plot_type in ('timeseries', 'annual_cycle', '1d_profile',
-                         'variable_vs_lat'):
+        if plot_type in ('timeseries', 'annual_cycle',
+                         'benchmarking_annual_cycle', '1d_profile',
+                         'diurnal_cycle', 'benchmarking_diurnal_cycle',
+                         'variable_vs_lat', 'benchmarking_timeseries'):
             plot_kwargs.setdefault('label', label)
 
         if plot_kwargs.get('norm') == 'centered':
@@ -1911,43 +2183,246 @@ def _plot_hovmoeller_time_vs_lat_or_lon_without_ref(self, plot_func,
         netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
         return (plot_path, {netcdf_path: cube})
 
-    def _process_pyplot_kwargs(self, plot_type, dataset):
-        """Process functions for :mod:`matplotlib.pyplot`."""
-        pyplot_kwargs = self.plots[plot_type]['pyplot_kwargs']
-        for (func, arg) in pyplot_kwargs.items():
-            if isinstance(arg, str):
-                arg = self._fill_facet_placeholders(
-                    arg,
-                    dataset,
-                    f"pyplot_kwargs of {plot_type} '{func}: {arg}'",
+    def _plot_benchmarking_map(self, plot_func, dataset, percentile_dataset,
+                               metric):
+        """Plot benchmarking map plot."""
+        plot_type = 'benchmarking_map'
+        logger.info("Plotting benchmarking map for '%s'",
+                    self._get_label(dataset))
+
+        # Get cube to plot (dimensions are not checked in this method)
+        cube = dataset['cube']
+
+        # Create plot with desired settings
+        with mpl.rc_context(self._get_custom_mpl_rc_params(plot_type)):
+            fig = plt.figure(**self.cfg['figure_kwargs'])
+            axes = fig.add_subplot(
+                projection=self._get_benchmarking_projection())
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            plot_kwargs['extend'] = "both"
+
+            # apply stippling (dots) to all grid cells that do not exceed
+            # the upper percentile given by 'percentile_dataset[]'
+            mask_cube = self._get_benchmark_mask(cube, percentile_dataset,
+                                                 metric)
+            hatching_plot_kwargs = {
+                'colors': 'none',
+                'levels': [.5, 1.5],
+                'hatches': ['......'],
+            }
+
+            if plot_func is iris.plot.contourf:
+                # see https://github.com/SciTools/cartopy/issues/2457
+                # and https://github.com/SciTools/cartopy/issues/2468
+                plot_kwargs['transform_first'] = True
+                hatching_plot_kwargs['transform_first'] = True
+                npx = da if cube.has_lazy_data() else np
+                cube_to_plot = cube.copy(
+                    npx.ma.filled(cube.core_data(), np.nan)
+                )
+                mask_cube_to_plot = mask_cube.copy(
+                    npx.ma.filled(mask_cube.core_data(), np.nan)
                 )
-            if arg is None:
-                getattr(plt, func)()
-            elif isinstance(arg, dict):
-                getattr(plt, func)(**arg)
             else:
-                getattr(plt, func)(arg)
+                cube_to_plot = cube
+                mask_cube_to_plot = mask_cube
 
-    @staticmethod
-    def _check_cube_dimensions(cube, plot_type):
-        """Check that cube has correct dimensional variables."""
-        expected_dimensions_dict = {
-            'annual_cycle': (['month_number'],),
-            'map': (['latitude', 'longitude'],),
-            'zonal_mean_profile': (['latitude', 'air_pressure'],
-                                   ['latitude', 'altitude']),
-            'timeseries': (['time'],),
-            '1d_profile': (['air_pressure'],
-                           ['altitude']),
-            'variable_vs_lat': (['latitude'],),
-            'hovmoeller_z_vs_time': (['time', 'air_pressure'],
-                                     ['time', 'altitude']),
-            'hovmoeller_time_vs_lat_or_lon': (['time', 'latitude'],
-                                              ['time', 'longitude']),
-        }
-        if plot_type not in expected_dimensions_dict:
-            raise NotImplementedError(f"plot_type '{plot_type}' not supported")
-        expected_dimensions = expected_dimensions_dict[plot_type]
+            # Plot
+            plot_map = plot_func(cube_to_plot, **plot_kwargs)
+            hatching = plot_func(mask_cube_to_plot, **hatching_plot_kwargs)
+
+            # set color for stippling to 'black' (default = 'white')
+            hatching.set_edgecolor('black')
+            hatching.set_linewidth(0.)
+
+            axes.coastlines()
+
+            # Setup colorbar
+            fontsize = (
+                self.plots[plot_type]['fontsize'] or
+                mpl.rcParams['axes.labelsize']
+            )
+            colorbar = fig.colorbar(plot_map, ax=axes,
+                                    **self._get_cbar_kwargs(plot_type))
+            colorbar.set_label(self._get_cbar_label(plot_type, dataset),
+                               fontsize=fontsize)
+            colorbar.ax.tick_params(labelsize=fontsize)
+
+            # Customize plot
+            axes.set_title(self._get_label(dataset))
+            fig.suptitle(dataset['long_name'])
+            self._process_pyplot_kwargs(plot_type, dataset)
+
+            # Rasterization
+            if self.plots[plot_type]['rasterize']:
+                self._set_rasterized([axes])
+
+        # File paths
+        plot_path = self.get_plot_path(plot_type, dataset)
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+
+        return (plot_path, {netcdf_path: cube})
+
+    def _plot_benchmarking_boxplot(self, df, cubes, variables, datasets):
+        """Plot benchmarking boxplot."""
+        plot_type = 'benchmarking_boxplot'
+        logger.info("Plotting benchmarking boxplot for '%s'",
+                    self._get_label(datasets[0]))
+
+        # Create plot with desired settings
+        with mpl.rc_context(self._get_custom_mpl_rc_params(plot_type)):
+            fig = plt.figure(**self.cfg['figure_kwargs'])
+            metric = cubes[0].long_name.partition("of")[0]
+            fig.suptitle(f"{metric}of {self._get_label(datasets[0])}")
+
+            sns.set_style('darkgrid')
+
+            for i, var in enumerate(variables):
+                axes = plt.subplot(1, len(variables), i+1)
+                plot_kwargs = self._get_plot_kwargs(plot_type, datasets[i])
+                plot_kwargs['axes'] = axes
+
+                plot_boxplot = sns.boxplot(data=df[df['Variable'] == var])
+                plot_boxplot.set(xticklabels=[])
+
+                plt.scatter(0, cubes[i].data, marker='x', s=200, linewidths=2,
+                            color="red", zorder=3)
+
+                plt.xlabel(var)
+                if cubes[i].units != 1:
+                    plt.ylabel(cubes[i].units)
+
+                # Customize plot
+                self._process_pyplot_kwargs(plot_type, datasets[i])
+
+        # File paths
+        datasets[0]['variable_group'] = (
+            datasets[0]['short_name'].partition("_")[0])
+        plot_path = self.get_plot_path(plot_type,  datasets[0])
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+
+        return (plot_path, {netcdf_path: cubes[0]})
+
+    def _plot_benchmarking_zonal(self, plot_func, dataset, percentile_dataset,
+                                 metric):
+        """Plot benchmarking zonal mean profile."""
+        plot_type = 'benchmarking_zonal'
+        logger.info("Plotting benchmarking zonal mean profile"
+                    " for '%s'",
+                    self._get_label(dataset))
+
+        # Get cube to plot (dimensions are not checked in this method)
+        cube = dataset['cube']
+
+        # Create plot with desired settings
+        with mpl.rc_context(self._get_custom_mpl_rc_params(plot_type)):
+            fig = plt.figure(**self.cfg['figure_kwargs'])
+            axes = fig.add_subplot()
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            plot_kwargs['extend'] = "both"
+            plot_benchmarking_zonal = plot_func(cube, **plot_kwargs)
+
+            # apply stippling (dots) to all grid cells that do not exceed
+            # the upper percentile given by 'percentile_dataset[]'
+
+            mask_cube = self._get_benchmark_mask(cube, percentile_dataset,
+                                                 metric)
+            hatching = plot_func(
+               mask_cube,
+               colors='none',
+               levels=[.5, 1.5],
+               hatches=['......'],
+            )
+
+            # set color for stippling to 'black' (default = 'white')
+            hatching.set_edgecolor('black')
+            hatching.set_linewidth(0.)
+
+            # Setup colorbar
+            fontsize = (
+                self.plots[plot_type]['fontsize'] or
+                mpl.rcParams['axes.labelsize']
+            )
+            colorbar = fig.colorbar(plot_benchmarking_zonal, ax=axes,
+                                    **self._get_cbar_kwargs(plot_type))
+            colorbar.set_label(self._get_cbar_label(plot_type, dataset),
+                               fontsize=fontsize)
+            colorbar.ax.tick_params(labelsize=fontsize)
+
+            # Customize plot
+            axes.set_title(self._get_label(dataset))
+            fig.suptitle(dataset['long_name'])
+            axes.set_xlabel('latitude [°N]')
+            z_coord = cube.coord(axis='Z')
+            axes.set_ylabel(f'{z_coord.long_name} [{z_coord.units}]')
+            if self.plots[plot_type]['log_y']:
+                axes.set_yscale('log')
+                axes.get_yaxis().set_major_formatter(
+                    FormatStrFormatter('%.1f'))
+            if self.plots[plot_type]['show_y_minor_ticklabels']:
+                axes.get_yaxis().set_minor_formatter(
+                    FormatStrFormatter('%.1f'))
+            else:
+                axes.get_yaxis().set_minor_formatter(NullFormatter())
+            self._process_pyplot_kwargs(plot_type, dataset)
+
+            # Rasterization
+            if self.plots[plot_type]['rasterize']:
+                self._set_rasterized([axes])
+
+        # File paths
+        plot_path = self.get_plot_path(plot_type, dataset)
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+
+        return (plot_path, {netcdf_path: cube})
+
+    def _process_pyplot_kwargs(self, plot_type, dataset):
+        """Process functions for :mod:`matplotlib.pyplot`."""
+        pyplot_kwargs = self.plots[plot_type]['pyplot_kwargs']
+        for (func, arg) in pyplot_kwargs.items():
+            if isinstance(arg, str):
+                arg = self._fill_facet_placeholders(
+                    arg,
+                    dataset,
+                    f"pyplot_kwargs of {plot_type} '{func}: {arg}'",
+                )
+            if arg is None:
+                getattr(plt, func)()
+            elif isinstance(arg, dict):
+                getattr(plt, func)(**arg)
+            else:
+                getattr(plt, func)(arg)
+
+    @staticmethod
+    def _check_cube_dimensions(cube, plot_type):
+        """Check that cube has correct dimensional variables."""
+        expected_dimensions_dict = {
+            'annual_cycle': (['month_number'],),
+            'benchmarking_boxplot': ([''],),
+            'diurnal_cycle': (['hour'],),
+            'map': (['latitude', 'longitude'],),
+            'benchmarking_annual_cycle': (['month_number'],),
+            'benchmarking_diurnal_cycle': (['hour'],),
+            'benchmarking_map': (['latitude', 'longitude'],),
+            'benchmarking_timeseries': (['time'],),
+            'benchmarking_zonal': (['latitude', 'air_pressure'],
+                                   ['latitude', 'altitude']),
+            'zonal_mean_profile': (['latitude', 'air_pressure'],
+                                   ['latitude', 'altitude']),
+            'timeseries': (['time'],),
+            '1d_profile': (['air_pressure'],
+                           ['altitude']),
+            'variable_vs_lat': (['latitude'],),
+            'hovmoeller_z_vs_time': (['time', 'air_pressure'],
+                                     ['time', 'altitude']),
+            'hovmoeller_time_vs_lat_or_lon': (['time', 'latitude'],
+                                              ['time', 'longitude']),
+        }
+        if plot_type not in expected_dimensions_dict:
+            raise NotImplementedError(f"plot_type '{plot_type}' not supported")
+        expected_dimensions = expected_dimensions_dict[plot_type]
         for dims in expected_dimensions:
             cube_dims = [cube.coords(dim, dim_coords=True) for dim in dims]
             if all(cube_dims) and cube.ndim == len(dims):
@@ -1955,50 +2430,544 @@ def _check_cube_dimensions(cube, plot_type):
         expected_dims_str = ' or '.join(
             [str(dims) for dims in expected_dimensions]
         )
-        raise ValueError(
-            f"Expected cube that exactly has the dimensional coordinates "
-            f"{expected_dims_str}, got {cube.summary(shorten=True)}")
+        raise ValueError(
+            f"Expected cube that exactly has the dimensional coordinates "
+            f"{expected_dims_str}, got {cube.summary(shorten=True)}")
+
+    @staticmethod
+    def _fill_facet_placeholders(string, dataset, description):
+        """Substitute ``{facet}`` placeholders in string from dataset."""
+        try:
+            return string.format(**dataset)
+        except KeyError as exc:
+            message = (
+                f"Not all necessary facets in {description} available for "
+                f"dataset\n{pformat(dataset)}")
+            raise ValueError(message) from exc
+
+    @staticmethod
+    def _get_multi_dataset_facets(datasets):
+        """Map each facet to its shared value or ``ambiguous_<facet>``."""
+        facets = {}
+        for facet in {key for dataset in datasets for key in dataset}:
+            values = [dataset.get(facet) for dataset in datasets]
+            if all(value == values[0] for value in values):
+                facets[facet] = values[0]
+            else:
+                facets[facet] = f'ambiguous_{facet}'
+        return facets
+
+    def _get_reference_dataset(self, datasets):
+        """Return the single flagged reference dataset, or None if absent."""
+        variable = datasets[0][self.cfg['group_variables_by']]
+        candidates = [
+            dataset for dataset in datasets
+            if dataset.get('reference_for_monitor_diags', False)
+        ]
+        if len(candidates) > 1:
+            raise ValueError(
+                f"Expected at most 1 reference dataset (with "
+                f"'reference_for_monitor_diags: true' for variable "
+                f"'{variable}', got {len(candidates):d}")
+        return candidates[0] if candidates else None
+
+    def _get_benchmarking_reference(self, datasets):
+        """Return the reference dataset for the benchmarking metric.
+
+        Prefers the unique dataset flagged ``reference_for_metric``; falls
+        back to the one named by its own ``reference_dataset`` facet.
+        Raises ValueError if no reference dataset can be determined.
+        """
+        variable = datasets[0][self.cfg['group_variables_by']]
+        ref_datasets = [d for d in datasets if
+                        d.get('reference_for_metric', False)]
+        if len(ref_datasets) == 1:
+            return ref_datasets[0]
+
+        # Try variable attribute "reference_dataset" (skipped when missing)
+        for dataset in datasets:
+            ref_name = dataset.get('reference_dataset')
+            if ref_name is not None and ref_name == dataset.get('dataset'):
+                return dataset
+        raise ValueError(
+            f"Expected exactly 1 reference dataset for variable "
+            f"'{variable}', got {len(ref_datasets)}")
+
+    def _get_benchmark_datasets(self, datasets):
+        """Return all datasets flagged with ``benchmark_dataset``."""
+        variable = datasets[0][self.cfg['group_variables_by']]
+        flagged = [dataset for dataset in datasets
+                   if dataset.get('benchmark_dataset', False)]
+        # At least one flagged dataset is required
+        if not flagged:
+            raise ValueError(
+                f"Expected at least 1 benchmark dataset (with "
+                f"'benchmark_dataset: true' for variable "
+                f"'{variable}'), got {len(flagged):d}")
+        return flagged
+
+    def _get_benchmark_group(self, datasets):
+        """Return datasets that are neither benchmarked nor the reference."""
+        excluded_flags = ('benchmark_dataset', 'reference_for_metric')
+        return [
+            dataset for dataset in datasets
+            if not any(dataset.get(flag, False) for flag in excluded_flags)]
+
+    def _get_benchmark_mask(self, cube, percentile_dataset, metric):
+        """Build a 0/1 mask cube by comparing cube to the percentiles."""
+        mask_cube = cube.copy()
+        first, last = 0, len(percentile_dataset) - 1
+
+        if metric == 'bias':
+            # Threshold is the larger magnitude of the two outer entries
+            threshold = np.maximum(np.abs(percentile_dataset[first].data),
+                                   np.abs(percentile_dataset[last].data))
+            mask_cube.data = np.where(np.abs(cube.data) >= threshold, 0, 1)
+        elif metric in ('emd', 'rmse'):
+            # 0 where the value reaches or exceeds the first entry
+            threshold = percentile_dataset[first].data
+            mask_cube.data = np.where(cube.data >= threshold, 0, 1)
+        elif metric == 'pearsonr':
+            # 0 where the value is at or below the first entry
+            threshold = percentile_dataset[first].data
+            mask_cube.data = np.where(cube.data <= threshold, 0, 1)
+        else:
+            raise ValueError(
+                f"Could not create benchmarking mask, unknown benchmarking "
+                f"metric: '{metric}'")
+
+        return mask_cube
+
+    def _get_benchmark_metric(self, datasets):
+        """Infer the benchmarking metric from the first short_name.
+
+        Falls back to 'bias' (with an info log) if nothing matches.
+        """
+        short_name = datasets[0].get('short_name', '')
+        # Checked in this order; the first substring match wins
+        for candidate in ('rmse', 'pearsonr', 'emd'):
+            if candidate in short_name:
+                return candidate
+        metric = 'bias'  # default
+        logger.info(
+            "Could not determine metric from short_name, "
+            "assuming benchmarking metric = %s", metric)
+        return metric
+
+    def _get_benchmark_percentiles(self, datasets):
+        """Get percentile datasets from multi-model statistics preprocessor.
+
+        Selects the datasets whose ``multi_model_statistics`` facet contains
+        "Percentile" and sorts them by percentile in descending order.
+
+        Returns
+        -------
+        list of dict
+            Percentile datasets, highest percentile first.
+
+        Raises
+        ------
+        ValueError
+            Fewer percentile datasets are available than the benchmarking
+            metric requires (2 for 'bias', 1 for all other metrics).
+
+        """
+        variable = datasets[0][self.cfg['group_variables_by']]
+
+        def _percentile_value(dataset):
+            """Extract the numeric percentile from the statistics name."""
+            stat = dataset['multi_model_statistics']
+            # float (not int) so fractional percentiles like 97.5 also work
+            return float(stat.replace('MultiModelPercentile', ''))
+
+        # Select all datasets created by a percentile statistic
+        percentiles = [
+            dataset for dataset in datasets
+            if 'Percentile' in (dataset.get('multi_model_statistics') or '')
+        ]
+        # Sort list of percentile datasets by percentile with highest
+        # percentile first (descending order)
+        percentiles.sort(key=_percentile_value, reverse=True)
+
+        # Get number of percentiles expected depending on benchmarking metric
+        expected_numbers = {'bias': 2, 'rmse': 1, 'pearsonr': 1, 'emd': 1}
+        metric = self._get_benchmark_metric(datasets)
+        if metric not in expected_numbers:
+            raise ValueError(f"Unknown benchmarking metric: '{metric}'.")
+        numperc = expected_numbers[metric]
+
+        if len(percentiles) < numperc:
+            raise ValueError(
+                f"Expected at least {numperc} percentile datasets (created "
+                f"with multi-model statistics preprocessor) for variable "
+                f"'{variable}', got {len(percentiles):d}")
+
+        return percentiles
+
+    def create_timeseries_plot(self, datasets):
+        """Create time series plot of all datasets in a single figure."""
+        plot_type = 'timeseries'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        logger.info("Plotting %s", plot_type)
+        fig = plt.figure(**self.cfg['figure_kwargs'])
+        axes = fig.add_subplot()
+
+        # Plot all datasets in one single figure; collect input files and
+        ancestors = []  # for the provenance record
+        cubes = {}  # label -> cube, written to the netCDF file below
+        for dataset in datasets:
+            ancestors.append(dataset['filename'])
+            cube = dataset['cube']
+            cubes[self._get_label(dataset)] = cube
+            self._check_cube_dimensions(cube, plot_type)
+
+            # Plot original time series
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            iris.plot.plot(cube, **plot_kwargs)
+
+            # Plot annual means unless 'annual_mean_kwargs' is False
+            annual_mean_kwargs = self.plots[plot_type]['annual_mean_kwargs']
+            if annual_mean_kwargs is not False:
+                logger.debug("Plotting annual means")
+                if not cube.coords('year'):
+                    add_year(cube, 'time')
+                annual_mean_cube = cube.aggregated_by('year',
+                                                      iris.analysis.MEAN)
+                plot_kwargs.pop('label', None)  # annual means get no label
+                plot_kwargs.update(annual_mean_kwargs)
+                iris.plot.plot(annual_mean_cube, **plot_kwargs)
+
+        # Default plot appearance (title and labels from the common facets)
+        multi_dataset_facets = self._get_multi_dataset_facets(datasets)
+        axes.set_title(multi_dataset_facets['long_name'])
+        axes.set_xlabel('time')
+        # Apply the configured tick label format to the time axis (if any)
+        if self.plots[plot_type]['time_format'] is not None:
+            axes.get_xaxis().set_major_formatter(
+                mdates.DateFormatter(self.plots[plot_type]['time_format']))
+        axes.set_ylabel(
+            f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
+            f"[{multi_dataset_facets['units']}]"
+        )
+        gridline_kwargs = self._get_gridline_kwargs(plot_type)
+        if gridline_kwargs is not False:
+            axes.grid(**gridline_kwargs)
+
+        # Legend (skipped if 'legend_kwargs' is False)
+        legend_kwargs = self.plots[plot_type]['legend_kwargs']
+        if legend_kwargs is not False:
+            axes.legend(**legend_kwargs)
+
+        # Customize plot appearance with the user-given pyplot_kwargs
+        self._process_pyplot_kwargs(plot_type, multi_dataset_facets)
+
+        # Save plot
+        plot_path = self.get_plot_path(plot_type, multi_dataset_facets)
+        fig.savefig(plot_path, **self.cfg['savefig_kwargs'])
+        logger.info("Wrote %s", plot_path)
+        plt.close()
+
+        # Save netCDF file (named after the plot file's stem)
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+        var_attrs = {
+            n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
+        }
+        io.save_1d_data(cubes, netcdf_path, 'time', var_attrs)
+
+        # Provenance tracking (same record for plot and netCDF file)
+        caption = (f"Time series of {multi_dataset_facets['long_name']} for "
+                   f"various datasets.")
+        provenance_record = {
+            'ancestors': ancestors,
+            'authors': ['schlund_manuel'],
+            'caption': caption,
+            'plot_types': ['line'],
+            'long_names': [var_attrs['long_name']],
+        }
+        with ProvenanceLogger(self.cfg) as provenance_logger:
+            provenance_logger.log(plot_path, provenance_record)
+            provenance_logger.log(netcdf_path, provenance_record)
+
+    def create_benchmarking_timeseries(self, datasets):
+        """Create time series benchmarking plot.
+
+        Benchmark datasets are drawn as lines on top of a band shaded
+        between the multi-model percentiles.
+        """
+        plot_type = 'benchmarking_timeseries'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        logger.info("Plotting %s", plot_type)
+
+        # Get dataset to be benchmarked
+        plot_datasets = self._get_benchmark_datasets(datasets)
+        # Get percentiles from multi-model statistics
+        percentile_dataset = self._get_benchmark_percentiles(datasets)
+
+        fig = plt.figure(**self.cfg['figure_kwargs'])
+        axes = fig.add_subplot()
+
+        # Plot all benchmark datasets in one single figure and record every
+        # input file that contributes to it for provenance tracking
+        ancestors = []
+        cubes = {}
+        for dataset in plot_datasets:
+            ancestors.append(dataset['filename'])
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            iris.plot.plot(dataset['cube'], **plot_kwargs)
+        for dataset in percentile_dataset:
+            ancestors.append(dataset['filename'])
+
+        # Shade the band between the last and the first percentile dataset;
+        # with a single percentile, shade from the lower y-axis limit up to
+        # it. The cubes attached to the datasets are used (no file reload).
+        yval2 = percentile_dataset[0]['cube']
+        if len(percentile_dataset) > 1:
+            yval1 = percentile_dataset[-1]['cube']
+        else:
+            yval1 = yval2.copy()
+            ymin, __ = axes.get_ylim()
+            yval1.data = np.full(len(yval1.data), ymin)
+
+        dataset = plot_datasets[0]
+        iris.plot.fill_between(dataset['cube'].coord('time'), yval1, yval2,
+                               facecolor='lightblue', edgecolor='lightblue',
+                               linewidth=3, zorder=1, alpha=0.8, axes=axes)
+
+        # Default plot appearance
+        multi_dataset_facets = self._get_multi_dataset_facets(datasets)
+        axes.set_title(multi_dataset_facets['long_name'])
+        axes.set_xlabel('time')
+        # apply time formatting
+        if self.plots[plot_type]['time_format'] is not None:
+            axes.get_xaxis().set_major_formatter(
+                mdates.DateFormatter(self.plots[plot_type]['time_format']))
+        axes.set_ylabel(
+            f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
+            f"[{multi_dataset_facets['units']}]"
+        )
+        gridline_kwargs = self._get_gridline_kwargs(plot_type)
+        if gridline_kwargs is not False:
+            axes.grid(**gridline_kwargs)
+
+        # Legend
+        legend_kwargs = self.plots[plot_type]['legend_kwargs']
+        if legend_kwargs is not False:
+            axes.legend(**legend_kwargs)
+
+        # Customize plot appearance
+        self._process_pyplot_kwargs(plot_type, multi_dataset_facets)
+
+        # Save plot
+        plot_path = self.get_plot_path(plot_type, multi_dataset_facets)
+        fig.savefig(plot_path, **self.cfg['savefig_kwargs'])
+        logger.info("Wrote %s", plot_path)
+        plt.close()
+
+        # Save netCDF file
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+        var_attrs = {
+            n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
+        }
+        cubes[self._get_label(dataset)] = dataset['cube']
+        io.save_1d_data(cubes, netcdf_path, 'time', var_attrs)
+
+        # Provenance tracking
+        caption = (f"Time series of {multi_dataset_facets['long_name']} for "
+                   f"various datasets.")
+        provenance_record = {
+            'ancestors': ancestors,
+            'authors': ['schlund_manuel'],
+            'caption': caption,
+            'plot_types': ['line'],
+            'long_names': [var_attrs['long_name']],
+        }
+        with ProvenanceLogger(self.cfg) as provenance_logger:
+            provenance_logger.log(plot_path, provenance_record)
+            provenance_logger.log(netcdf_path, provenance_record)
+
+    def create_annual_cycle_plot(self, datasets):
+        """Create annual cycle plot of all datasets in a single figure."""
+        plot_type = 'annual_cycle'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        logger.info("Plotting %s", plot_type)
+        fig = plt.figure(**self.cfg['figure_kwargs'])
+        axes = fig.add_subplot()
+
+        # Plot all datasets in one single figure; collect input files and
+        ancestors = []  # for the provenance record
+        cubes = {}  # label -> cube, written to the netCDF file below
+        for dataset in datasets:
+            ancestors.append(dataset['filename'])
+            cube = dataset['cube']
+            cubes[self._get_label(dataset)] = cube
+            self._check_cube_dimensions(cube, plot_type)
+
+            # Plot annual cycle
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            iris.plot.plot(cube, **plot_kwargs)
+
+        # Default plot appearance (title and labels from the common facets)
+        multi_dataset_facets = self._get_multi_dataset_facets(datasets)
+        axes.set_title(multi_dataset_facets['long_name'])
+        axes.set_xlabel('Month')
+        axes.set_ylabel(
+            f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
+            f"[{multi_dataset_facets['units']}]"
+        )
+        axes.set_xticks(range(1, 13), [str(m) for m in range(1, 13)])
+        gridline_kwargs = self._get_gridline_kwargs(plot_type)
+        if gridline_kwargs is not False:
+            axes.grid(**gridline_kwargs)
+
+        # Legend (skipped if 'legend_kwargs' is False)
+        legend_kwargs = self.plots[plot_type]['legend_kwargs']
+        if legend_kwargs is not False:
+            axes.legend(**legend_kwargs)
+
+        # Customize plot appearance with the user-given pyplot_kwargs
+        self._process_pyplot_kwargs(plot_type, multi_dataset_facets)
+
+        # Save plot
+        plot_path = self.get_plot_path(plot_type, multi_dataset_facets)
+        fig.savefig(plot_path, **self.cfg['savefig_kwargs'])
+        logger.info("Wrote %s", plot_path)
+        plt.close()
+
+        # Save netCDF file (named after the plot file's stem)
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+        var_attrs = {
+            n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
+        }
+        io.save_1d_data(cubes, netcdf_path, 'month_number', var_attrs)
+
+        # Provenance tracking (same record for plot and netCDF file)
+        caption = (f"Annual cycle of {multi_dataset_facets['long_name']} for "
+                   f"various datasets.")
+        provenance_record = {
+            'ancestors': ancestors,
+            'authors': ['schlund_manuel'],
+            'caption': caption,
+            'plot_types': ['seas'],
+            'long_names': [var_attrs['long_name']],
+        }
+        with ProvenanceLogger(self.cfg) as provenance_logger:
+            provenance_logger.log(plot_path, provenance_record)
+            provenance_logger.log(netcdf_path, provenance_record)
+
+    def create_benchmarking_annual(self, datasets):
+        """Create benchmarking annual cycle plot."""
+        plot_type = 'benchmarking_annual_cycle'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        logger.info("Plotting %s", plot_type)
+
+        # Get dataset to be benchmarked
+        plot_datasets = self._get_benchmark_datasets(datasets)
+        # Get percentiles from multi-model statistics
+        percentile_dataset = self._get_benchmark_percentiles(datasets)
+
+        fig = plt.figure(**self.cfg['figure_kwargs'])
+        axes = fig.add_subplot()
+
+        # Plot all datasets in one single figure
+        ancestors = []
+        cubes = {}
+
+        # Plot annual cycle(s)
+        for dataset in plot_datasets:
+            cube = dataset['cube']
+            plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
+            plot_kwargs['axes'] = axes
+            iris.plot.plot(cube, **plot_kwargs)
+
+        yval2 = percentile_dataset[0]['cube']
+        if len(percentile_dataset) > 1:
+            idx = len(percentile_dataset) - 1
+            yval1 = percentile_dataset[idx]['cube']
+        else:
+            yval1 = yval2.copy()
+            ymin, __ = axes.get_ylim()
+            yval1.data = np.full(len(yval1.data), ymin)
+
+        iris.plot.fill_between(cube.coord('month_number'), yval1, yval2,
+                               facecolor='lightblue',
+                               linewidth=0, zorder=1, alpha=0.8)
+
+        # Default plot appearance
+        multi_dataset_facets = self._get_multi_dataset_facets(datasets)
+        axes.set_title(multi_dataset_facets['long_name'])
+        axes.set_xlabel('Month')
+        axes.set_ylabel(
+            f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
+            f"[{multi_dataset_facets['units']}]"
+        )
+        axes.set_xticks(range(1, 13), [str(m) for m in range(1, 13)])
+        gridline_kwargs = self._get_gridline_kwargs(plot_type)
+        if gridline_kwargs is not False:
+            axes.grid(**gridline_kwargs)
 
-    @staticmethod
-    def _fill_facet_placeholders(string, dataset, description):
-        """Fill facet placeholders."""
-        try:
-            string = string.format(**dataset)
-        except KeyError as exc:
-            raise ValueError(
-                f"Not all necessary facets in {description} available for "
-                f"dataset\n{pformat(dataset)}") from exc
-        return string
+        # Legend
+        legend_kwargs = self.plots[plot_type]['legend_kwargs']
+        if legend_kwargs is not False:
+            axes.legend(**legend_kwargs)
 
-    @staticmethod
-    def _get_multi_dataset_facets(datasets):
-        """Derive common facets for multiple datasets."""
-        all_keys = {key for dataset in datasets for key in dataset}
-        multi_dataset_facets = {}
-        for key in all_keys:
-            if all(d.get(key) == datasets[0].get(key) for d in datasets):
-                multi_dataset_facets[key] = datasets[0].get(key)
-            else:
-                multi_dataset_facets[key] = f'ambiguous_{key}'
-        return multi_dataset_facets
+        # Customize plot appearance
+        self._process_pyplot_kwargs(plot_type, multi_dataset_facets)
 
-    def _get_reference_dataset(self, datasets):
-        """Extract reference dataset."""
-        variable = datasets[0][self.cfg['group_variables_by']]
-        ref_datasets = [d for d in datasets if
-                        d.get('reference_for_monitor_diags', False)]
-        if len(ref_datasets) > 1:
-            raise ValueError(
-                f"Expected at most 1 reference dataset (with "
-                f"'reference_for_monitor_diags: true' for variable "
-                f"'{variable}', got {len(ref_datasets):d}")
-        if ref_datasets:
-            return ref_datasets[0]
-        return None
+        # Save plot
+        plot_path = self.get_plot_path(plot_type, multi_dataset_facets)
+        fig.savefig(plot_path, **self.cfg['savefig_kwargs'])
+        logger.info("Wrote %s", plot_path)
+        plt.close()
 
-    def create_timeseries_plot(self, datasets):
-        """Create time series plot."""
-        plot_type = 'timeseries'
+        # Save netCDF file
+        netcdf_path = get_diagnostic_filename(Path(plot_path).stem, self.cfg)
+        var_attrs = {
+            n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
+        }
+        dataset = plot_datasets[0]
+        cubes[self._get_label(dataset)] = dataset['cube']
+        io.save_1d_data(cubes, netcdf_path, 'month_number', var_attrs)
+
+        # Provenance tracking
+        caption = (f"Annual cycle of {multi_dataset_facets['long_name']} for "
+                   f"various datasets.")
+        provenance_record = {
+            'ancestors': ancestors,
+            'authors': ['schlund_manuel'],
+            'caption': caption,
+            'plot_types': ['seas'],
+            'long_names': [var_attrs['long_name']],
+        }
+        with ProvenanceLogger(self.cfg) as provenance_logger:
+            provenance_logger.log(plot_path, provenance_record)
+            provenance_logger.log(netcdf_path, provenance_record)
+
+    def create_diurnal_cycle_plot(self, datasets):
+        """Create diurnal cycle plot."""
+        plot_type = 'diurnal_cycle'
         if plot_type not in self.plots:
             return
 
@@ -2018,35 +2987,21 @@ def create_timeseries_plot(self, datasets):
             cubes[self._get_label(dataset)] = cube
             self._check_cube_dimensions(cube, plot_type)
 
-            # Plot original time series
+            # Plot diurnal cycle
             plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
             plot_kwargs['axes'] = axes
             iris.plot.plot(cube, **plot_kwargs)
 
-            # Plot annual means if desired
-            annual_mean_kwargs = self.plots[plot_type]['annual_mean_kwargs']
-            if annual_mean_kwargs is not False:
-                logger.debug("Plotting annual means")
-                if not cube.coords('year'):
-                    add_year(cube, 'time')
-                annual_mean_cube = cube.aggregated_by('year',
-                                                      iris.analysis.MEAN)
-                plot_kwargs.pop('label', None)
-                plot_kwargs.update(annual_mean_kwargs)
-                iris.plot.plot(annual_mean_cube, **plot_kwargs)
-
         # Default plot appearance
         multi_dataset_facets = self._get_multi_dataset_facets(datasets)
         axes.set_title(multi_dataset_facets['long_name'])
-        axes.set_xlabel('time')
-        # apply time formatting
-        if self.plots[plot_type]['time_format'] is not None:
-            axes.get_xaxis().set_major_formatter(
-                mdates.DateFormatter(self.plots[plot_type]['time_format']))
+        axes.set_xlabel('Hour')
         axes.set_ylabel(
             f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
             f"[{multi_dataset_facets['units']}]"
         )
+        axes.set_xticks(range(0, 24), minor=True)
+        axes.set_xticks(range(0, 24, 3), [str(m) for m in range(0, 24, 3)])
         gridline_kwargs = self._get_gridline_kwargs(plot_type)
         if gridline_kwargs is not False:
             axes.grid(**gridline_kwargs)
@@ -2070,25 +3025,25 @@ def create_timeseries_plot(self, datasets):
         var_attrs = {
             n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
         }
-        io.save_1d_data(cubes, netcdf_path, 'time', var_attrs)
+        io.save_1d_data(cubes, netcdf_path, 'hour', var_attrs)
 
         # Provenance tracking
-        caption = (f"Time series of {multi_dataset_facets['long_name']} for "
+        caption = (f"Diurnal cycle of {multi_dataset_facets['long_name']} for "
                    f"various datasets.")
         provenance_record = {
             'ancestors': ancestors,
             'authors': ['schlund_manuel'],
             'caption': caption,
-            'plot_types': ['line'],
+            'plot_types': ['seas'],
             'long_names': [var_attrs['long_name']],
         }
         with ProvenanceLogger(self.cfg) as provenance_logger:
             provenance_logger.log(plot_path, provenance_record)
             provenance_logger.log(netcdf_path, provenance_record)
 
-    def create_annual_cycle_plot(self, datasets):
-        """Create annual cycle plot."""
-        plot_type = 'annual_cycle'
+    def create_benchmarking_diurnal(self, datasets):
+        """Create benchmarking diurnal cycle plot."""
+        plot_type = 'benchmarking_diurnal_cycle'
         if plot_type not in self.plots:
             return
 
@@ -2096,32 +3051,50 @@ def create_annual_cycle_plot(self, datasets):
             raise ValueError(f"No input data to plot '{plot_type}' given")
 
         logger.info("Plotting %s", plot_type)
+
+        # Get dataset to be benchmarked
+        plot_datasets = self._get_benchmark_datasets(datasets)
+        # Get percentiles from multi-model statistics
+        percentile_dataset = self._get_benchmark_percentiles(datasets)
+
         fig = plt.figure(**self.cfg['figure_kwargs'])
         axes = fig.add_subplot()
 
         # Plot all datasets in one single figure
         ancestors = []
         cubes = {}
-        for dataset in datasets:
-            ancestors.append(dataset['filename'])
-            cube = dataset['cube']
-            cubes[self._get_label(dataset)] = cube
-            self._check_cube_dimensions(cube, plot_type)
 
-            # Plot annual cycle
+        # Plot diurnal cycle(s)
+        for dataset in plot_datasets:
+            cube = dataset['cube']
             plot_kwargs = self._get_plot_kwargs(plot_type, dataset)
             plot_kwargs['axes'] = axes
             iris.plot.plot(cube, **plot_kwargs)
 
+        yval2 = percentile_dataset[0]['cube']
+        if len(percentile_dataset) > 1:
+            idx = len(percentile_dataset) - 1
+            yval1 = percentile_dataset[idx]['cube']
+        else:
+            yval1 = yval2.copy()
+            ymin, __ = axes.get_ylim()
+            yval1.data = np.full(len(yval1.data), ymin)
+
+        iris.plot.fill_between(cube.coord('hour'), yval1, yval2,
+                               facecolor='lightblue',
+                               linewidth=0,
+                               zorder=1, alpha=0.8)
+
         # Default plot appearance
         multi_dataset_facets = self._get_multi_dataset_facets(datasets)
         axes.set_title(multi_dataset_facets['long_name'])
-        axes.set_xlabel('Month')
+        axes.set_xlabel('Hour')
         axes.set_ylabel(
             f"{multi_dataset_facets[self.cfg['group_variables_by']]} "
             f"[{multi_dataset_facets['units']}]"
         )
-        axes.set_xticks(range(1, 13), [str(m) for m in range(1, 13)])
+        axes.set_xticks(range(0, 24), minor=True)
+        axes.set_xticks(range(0, 24, 3), [str(m) for m in range(0, 24, 3)])
         gridline_kwargs = self._get_gridline_kwargs(plot_type)
         if gridline_kwargs is not False:
             axes.grid(**gridline_kwargs)
@@ -2145,10 +3118,12 @@ def create_annual_cycle_plot(self, datasets):
         var_attrs = {
             n: datasets[0][n] for n in ('short_name', 'long_name', 'units')
         }
-        io.save_1d_data(cubes, netcdf_path, 'month_number', var_attrs)
+        dataset = plot_datasets[0]
+        cubes[self._get_label(dataset)] = dataset['cube']
+        io.save_1d_data(cubes, netcdf_path, 'hour', var_attrs)
 
         # Provenance tracking
-        caption = (f"Annual cycle of {multi_dataset_facets['long_name']} for "
+        caption = (f"Diurnal cycle of {multi_dataset_facets['long_name']} for "
                    f"various datasets.")
         provenance_record = {
             'ancestors': ancestors,
@@ -2161,6 +3136,94 @@ def create_annual_cycle_plot(self, datasets):
             provenance_logger.log(plot_path, provenance_record)
             provenance_logger.log(netcdf_path, provenance_record)
 
+    def create_benchmarking_boxplot(self):
+        """Create boxplot.
+
+        Gathers one value per variable and benchmarking-group dataset.
+        """
+        plot_type = 'benchmarking_boxplot'
+        if plot_type not in self.plots:
+            return
+
+        dframe = pd.DataFrame(columns=['Variable', 'Dataset', 'Value'])
+        ifile = 0
+
+        cubes = iris.cube.CubeList()
+        benchmark_datasets = []
+        variables = []
+        # Input files of all variables (not only the last) for provenance
+        ancestors = []
+
+        for (var_key, datasets) in self.grouped_input_data.items():
+            logger.info("Processing variable %s", var_key)
+
+            if not datasets:
+                raise ValueError(f"No input data to plot '{plot_type}' given")
+
+            # Get dataset to be benchmarked
+            plot_datasets = self._get_benchmark_datasets(datasets)
+            benchmark_dataset = plot_datasets[0]
+
+            logger.info("Plotting %s for dataset %s",
+                        plot_type, benchmark_dataset['dataset'])
+
+            # Get datasets for benchmarking
+            benchmark_group = self._get_benchmark_group(datasets)
+            logger.info("Benchmarking group of %i datasets.",
+                        len(benchmark_group))
+
+            ancestors.append(benchmark_dataset['filename'])
+            for dataset in benchmark_group:
+                ancestors.append(dataset['filename'])
+                cube = iris.load_cube(dataset['filename'])
+                dframe.loc[ifile] = [var_key, dataset['dataset'], cube.data]
+                ifile = ifile + 1
+
+            dframe['Value'] = dframe['Value'].astype(str).astype(float)
+
+            cubes.append(benchmark_dataset['cube'])
+            benchmark_datasets.append(benchmark_dataset)
+            variables.append(var_key)
+
+        # order of variables
+        if self.plots[plot_type]['var_order']:
+            var_order = self.plots[plot_type]['var_order']
+            if set(variables) == set(var_order):
+                ind = [variables.index(var_order[i])
+                       for i in range(len(variables))]
+                cubes = iris.cube.CubeList([cubes[i] for i in ind])
+                benchmark_datasets = [benchmark_datasets[i] for i in ind]
+                variables = var_order
+            else:
+                raise ValueError("List of ordered variables do not agree with"
+                                 " processed variables")
+
+        (plot_path, netcdf_paths) = (
+            self._plot_benchmarking_boxplot(dframe, cubes, variables,
+                                            benchmark_datasets)
+        )
+
+        # Save plot
+        plt.savefig(plot_path, **self.cfg['savefig_kwargs'])
+        logger.info("Wrote %s", plot_path)
+        plt.close()
+
+        # Save netCDF file(s)
+        for (netcdf_path, cube) in netcdf_paths.items():
+            io.iris_save(cube, netcdf_path)
+
+        # Provenance tracking: log the plot once, then each netCDF file
+        provenance_record = {
+            'ancestors': ancestors,
+            'authors': ['bock_lisa', 'schlund_manuel'],
+            'caption': "Boxplot.",
+            'plot_types': ['box'],
+        }
+        with ProvenanceLogger(self.cfg) as provenance_logger:
+            provenance_logger.log(plot_path, provenance_record)
+            for netcdf_path in netcdf_paths:
+                provenance_logger.log(netcdf_path, provenance_record)
+
     def create_map_plot(self, datasets):
         """Create map plot."""
         plot_type = 'map'
@@ -2234,6 +3297,71 @@ def create_map_plot(self, datasets):
                 for netcdf_path in netcdf_paths:
                     provenance_logger.log(netcdf_path, provenance_record)
 
+    def create_benchmarking_map_plot(self, datasets):
+        """Create one benchmarking map plot per dataset to be benchmarked."""
+        plot_type = 'benchmarking_map'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        # Get reference dataset
+        ref_dataset = self._get_benchmarking_reference(datasets)
+        # Get dataset to be benchmarked
+        plot_datasets = self._get_benchmark_datasets(datasets)
+        # Get percentiles from multi-model statistics
+        percentile_dataset = self._get_benchmark_percentiles(datasets)
+        # Get benchmarking metric
+        metric = self._get_benchmark_metric(datasets)
+
+        # Get plot function
+        plot_func = self._get_plot_func(plot_type)
+
+        # Load the percentile cubes once (they are passed to every plot) and
+        # remember their files, which are inputs of every plot as well
+        percentile_data = []
+        percentile_files = []
+        for dataset_to_load in percentile_dataset:
+            filename = dataset_to_load['filename']
+            logger.info("Loading %s", filename)
+            percentile_data.append(iris.load_cube(filename))
+            percentile_files.append(filename)
+
+        for dataset in plot_datasets:
+            (plot_path, netcdf_paths) = (
+                self._plot_benchmarking_map(plot_func, dataset,
+                                            percentile_data, metric)
+            )
+            caption = (
+                f"Map plot of {dataset['long_name']} of dataset "
+                f"{dataset['alias']}."
+            )
+            ancestors = [dataset['filename'], ref_dataset['filename']]
+            ancestors.extend(percentile_files)
+
+            # Save plot
+            plt.savefig(plot_path, **self.cfg['savefig_kwargs'])
+            logger.info("Wrote %s", plot_path)
+            plt.close()
+
+            # Save netCDFs
+            for (netcdf_path, cube) in netcdf_paths.items():
+                io.iris_save(cube, netcdf_path)
+
+            # Provenance tracking
+            provenance_record = {
+                'ancestors': ancestors,
+                'authors': ['schlund_manuel'],
+                'caption': caption,
+                'plot_types': ['map'],
+                'long_names': [dataset['long_name']],
+            }
+            with ProvenanceLogger(self.cfg) as provenance_logger:
+                provenance_logger.log(plot_path, provenance_record)
+                for netcdf_path in netcdf_paths:
+                    provenance_logger.log(netcdf_path, provenance_record)
+
     def create_zonal_mean_profile_plot(self, datasets):
         """Create zonal mean profile plot."""
         plot_type = 'zonal_mean_profile'
@@ -2309,6 +3437,72 @@ def create_zonal_mean_profile_plot(self, datasets):
                 for netcdf_path in netcdf_paths:
                     provenance_logger.log(netcdf_path, provenance_record)
 
+    def create_benchmarking_zonal_plot(self, datasets):
+        """Create benchmarking zonal mean profile plot."""
+        plot_type = 'benchmarking_zonal'
+        if plot_type not in self.plots:
+            return
+
+        if not datasets:
+            raise ValueError(f"No input data to plot '{plot_type}' given")
+
+        # Get dataset to be benchmarked
+        plot_datasets = self._get_benchmark_datasets(datasets)
+        # Get percentiles from multi-model statistics
+        percentile_dataset = self._get_benchmark_percentiles(datasets)
+        # Get benchmarking metric
+        metric = self._get_benchmark_metric(datasets)
+
+        # Get plot function
+        plot_func = self._get_plot_func(plot_type)
+
+        # Load the percentile cubes once (they are passed to every plot) and
+        # remember their files, which are inputs of every plot as well
+        percentile_data = []
+        percentile_files = []
+        for dataset_to_load in percentile_dataset:
+            filename = dataset_to_load['filename']
+            logger.info("Loading %s", filename)
+            percentile_data.append(iris.load_cube(filename))
+            percentile_files.append(filename)
+
+        # Create a single plot (plus its netCDF files and provenance record)
+        # for each dataset to be benchmarked
+        for dataset in plot_datasets:
+            (plot_path, netcdf_paths) = (
+                self._plot_benchmarking_zonal(plot_func, dataset,
+                                              percentile_data, metric)
+            )
+            ancestors = [dataset['filename']]
+            ancestors.extend(percentile_files)
+
+            caption = (
+                f"Zonal mean profile of {dataset['long_name']} of dataset "
+                f"{dataset['alias']}."
+            )
+
+            # Save plot
+            plt.savefig(plot_path, **self.cfg['savefig_kwargs'])
+            logger.info("Wrote %s", plot_path)
+            plt.close()
+
+            # Save netCDFs
+            for (netcdf_path, cube) in netcdf_paths.items():
+                io.iris_save(cube, netcdf_path)
+
+            # Provenance tracking
+            provenance_record = {
+                'ancestors': ancestors,
+                'authors': ['schlund_manuel'],
+                'caption': caption,
+                'plot_types': ['vert'],
+                'long_names': [dataset['long_name']],
+            }
+            with ProvenanceLogger(self.cfg) as provenance_logger:
+                provenance_logger.log(plot_path, provenance_record)
+                for netcdf_path in netcdf_paths:
+                    provenance_logger.log(netcdf_path, provenance_record)
+
     def create_1d_profile_plot(self, datasets):
         """Create 1D profile plot."""
         plot_type = '1d_profile'
@@ -2636,10 +3830,17 @@ def create_hovmoeller_time_vs_lat_or_lon_plot(self, datasets):
     def compute(self):
         """Plot preprocessed data."""
         with mpl.rc_context(self.cfg['matplotlib_rc_params']):
+            self.create_benchmarking_boxplot()
             for (var_key, datasets) in self.grouped_input_data.items():
                 logger.info("Processing variable %s", var_key)
                 self.create_timeseries_plot(datasets)
                 self.create_annual_cycle_plot(datasets)
+                self.create_benchmarking_annual(datasets)
+                self.create_benchmarking_diurnal(datasets)
+                self.create_benchmarking_map_plot(datasets)
+                self.create_benchmarking_timeseries(datasets)
+                self.create_benchmarking_zonal_plot(datasets)
+                self.create_diurnal_cycle_plot(datasets)
                 self.create_map_plot(datasets)
                 self.create_zonal_mean_profile_plot(datasets)
                 self.create_1d_profile_plot(datasets)
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_annual_cycle.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_annual_cycle.yml
new file mode 100644
index 0000000000..6b3e6c9f2a
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_annual_cycle.yml
@@ -0,0 +1,195 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+  description: >
+    Benchmarking: annual cycle.
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+  maintainer:
+    - lauer_axel
+  references:
+    - lauer25gmd
+  projects:
+    - dlrmabak
+
+
+# Note: the following models are just examples
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: BCC-ESM1, grid: gn}
+  - {dataset: CAMS-CSM1-0, grid: gn}
+  - {dataset: CanESM5, grid: gn}
+  - {dataset: CanESM5-CanOE, grid: gn, ensemble: r1i1p2f1}
+  - {dataset: CESM2, grid: gn}
+  - {dataset: CESM2-FV2, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM-FV2, grid: gn, institute: NCAR}
+  - {dataset: CIESM}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-CM6-1-HR, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: E3SM-1-0}
+  - {dataset: E3SM-1-1, institute: E3SM-Project}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-f3-L}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: FIO-ESM-2-0, grid: gn}
+  - {dataset: GFDL-ESM4, grid: gr1}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: GISS-E2-1-H, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: INM-CM4-8, grid: gr1}
+  - {dataset: INM-CM5-0, grid: gr1}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MCM-UA-1-0, grid: gn}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: NorESM2-LM, grid: gn, institute: NCC}
+  - {dataset: NorESM2-MM, grid: gn, institute: NCC}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, alias: MIROC6, benchmark_dataset: true}
+
+preprocessors:
+
+  pp_tas:
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    climate_statistics:
+      period: month
+    area_statistics:
+      operator: mean
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+  pp_tas_metric:
+    custom_order: true
+    regrid_time:
+      calendar: standard
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    climate_statistics:
+      period: month
+    distance_metric:
+      metric: rmse
+      coords: [longitude, latitude]
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+
+diagnostics:
+
+  annual_cycle:
+    description: Create "classical" annual cycle plot including a reference dataset.
+    variables:
+      tas:
+        timerange: '2000/2004'
+        preprocessor: pp_tas
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1
+        grid: gr
+        reference_dataset: HadCRUT5
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground, version: 5.0.1.0-analysis, tier: 2, alias: HadCRUT5}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        facet_used_for_labels: alias
+        plots:
+          annual_cycle:
+            annual_mean_kwargs: false
+            plot_kwargs:
+              'MIROC6':
+                color: red
+                label: '{alias}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 4
+              HadCRUT5:
+                color: black
+                label: '{dataset}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 3
+              MultiModelPercentile10:
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              MultiModelPercentile90:
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              default:
+                color: lightgray
+                label: null
+                linestyle: '-'
+                linewidth: 1
+                zorder: 1
+
+  benchmarking_annual_cycle:
+    description: Create "benchmarking" annual cycle plot.
+    variables:
+      tas:
+        timerange: '2000/2004'
+        preprocessor: pp_tas_metric
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1
+        grid: gr
+        reference_dataset: HadCRUT5
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground, version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        facet_used_for_labels: alias
+        plots:
+          benchmarking_annual_cycle:
+            plot_kwargs:
+              'MIROC6':
+                color: red
+                label: '{alias}'
+                linestyle: '-'
+                linewidth: 1.5
+                zorder: 3
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_boxplots.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_boxplots.yml
new file mode 100644
index 0000000000..8d887e5d68
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_boxplots.yml
@@ -0,0 +1,440 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+
+  description: |
+    Benchmarking: Box plots.
+    The diagnostic called in this recipe uses the seaborn.boxplot function.
+    See seaborn manual for detailed information
+    (https://seaborn.pydata.org/generated/seaborn.boxplot.html).
+
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+
+  maintainer:
+    - bock_lisa
+
+  references:
+    - lauer25gmd
+
+  projects:
+    - dlrmabak
+
+
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: BCC-ESM1, grid: gn}
+  - {dataset: CAMS-CSM1-0, grid: gn}
+  - {dataset: CanESM5, grid: gn}
+  - {dataset: CanESM5-CanOE, grid: gn, ensemble: r1i1p2f1}
+  - {dataset: CESM2, grid: gn}
+  - {dataset: CESM2-FV2, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM-FV2, grid: gn, institute: NCAR}
+  - {dataset: CIESM}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-CM6-1-HR, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: E3SM-1-0}
+  - {dataset: E3SM-1-1, institute: E3SM-Project}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-f3-L}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: GFDL-ESM4, grid: gr1}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: GISS-E2-1-H, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: INM-CM4-8, grid: gr1}
+  - {dataset: INM-CM5-0, grid: gr1}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: NorESM2-LM, grid: gn, institute: NCC}
+  - {dataset: NorESM2-MM, grid: gn, institute: NCC}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, benchmark_dataset: true}
+
+
+VAR_SETTINGS: &var_settings
+  project: CMIP6
+  mip: Amon
+  exp: historical
+  ensemble: r1i1p1f1
+  grid: gr
+  timerange: '2000/2004'
+
+
+preprocessors:
+
+  rmse:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_rmse
+      coords: [latitude, longitude]
+
+  rmse_pr:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    convert_units:
+      units: mm day-1
+    distance_metric:
+      metric: weighted_rmse
+      coords: [latitude, longitude]
+
+  rmse_sst:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    mask_below_threshold:
+      threshold: 273.15
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_rmse
+      coords: [latitude, longitude]
+
+  rmse_land:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    mask_landsea:
+      mask_out: sea
+    distance_metric:
+      metric: weighted_rmse
+      coords: [latitude, longitude]
+
+  pearsonr:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_pearsonr
+      coords: [latitude, longitude]
+
+  pearsonr_pr:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    convert_units:
+      units: mm day-1
+    distance_metric:
+      metric: weighted_pearsonr
+      coords: [latitude, longitude]
+
+  pearsonr_sst:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    mask_below_threshold:
+      threshold: 273.15
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_pearsonr
+      coords: [latitude, longitude]
+
+  pearsonr_land:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    mask_landsea:
+      mask_out: sea
+    distance_metric:
+      metric: weighted_pearsonr
+      coords: [latitude, longitude]
+
+  emd:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_emd
+      coords: [latitude, longitude]
+
+  emd_pr:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    convert_units:
+      units: mm day-1
+    distance_metric:
+      metric: weighted_emd
+      coords: [latitude, longitude]
+
+  emd_sst:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    mask_below_threshold:
+      threshold: 273.15
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    distance_metric:
+      metric: weighted_emd
+      coords: [latitude, longitude]
+
+  emd_land:
+    custom_order: true
+    climate_statistics:
+      operator: mean
+    regrid:
+      target_grid: 2x2
+      scheme: nearest
+    mask_landsea:
+      mask_out: sea
+    distance_metric:
+      metric: weighted_emd
+      coords: [latitude, longitude]
+
+
+diagnostics:
+
+  plot_boxplots_rmse:
+    description: Plot boxplots for different variables.
+    variables:
+      tas_land:
+        <<: *var_settings
+        preprocessor: rmse_land
+        short_name: tas
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground,
+             version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+      lwcre:
+        <<: *var_settings
+        preprocessor: rmse
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      pr:
+        <<: *var_settings
+        preprocessor: rmse_pr
+        additional_datasets:
+          - {dataset: GPCP-SG, project: OBS, type: atmos, version: 2.3, tier: 2,
+             reference_for_metric: true}
+      psl:
+        <<: *var_settings
+        preprocessor: rmse
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: v1,
+             tier: 3, reference_for_metric: true}
+      rlut:
+        <<: *var_settings
+        preprocessor: rmse
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      rsut:
+        <<: *var_settings
+        preprocessor: rmse
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      swcre:
+        <<: *var_settings
+        preprocessor: rmse
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      sst:
+        <<: *var_settings
+        preprocessor: rmse_sst
+        short_name: ts
+        additional_datasets:
+          - {dataset: HadISST, project: OBS, type: reanaly, version: 1, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_boxplot:
+            var_order: ['tas_land', 'sst', 'pr', 'psl', 'rsut', 'rlut', 'swcre', 'lwcre']
+
+
+  plot_boxplots_pearsonr:
+    description: Plot boxplots for different variables.
+    variables:
+      tas_land:
+        <<: *var_settings
+        short_name: tas
+        preprocessor: pearsonr_land
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground,
+             version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+      lwcre:
+        <<: *var_settings
+        preprocessor: pearsonr
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      pr:
+        <<: *var_settings
+        preprocessor: pearsonr_pr
+        additional_datasets:
+          - {dataset: GPCP-SG, project: OBS, type: atmos, version: 2.3, tier: 2,
+             reference_for_metric: true}
+      psl:
+        <<: *var_settings
+        preprocessor: pearsonr
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: v1,
+             tier: 3, reference_for_metric: true}
+      rlut:
+        <<: *var_settings
+        preprocessor: pearsonr
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      rsut:
+        <<: *var_settings
+        preprocessor: pearsonr
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      swcre:
+        <<: *var_settings
+        preprocessor: pearsonr
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      sst:
+        <<: *var_settings
+        preprocessor: pearsonr_sst
+        short_name: ts
+        additional_datasets:
+          - {dataset: HadISST, project: OBS, type: reanaly, version: 1, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_boxplot:
+            var_order: ['tas_land', 'sst', 'pr', 'psl', 'rsut', 'rlut', 'swcre', 'lwcre']
+
+
+  plot_boxplots_emd:
+    description: Plot boxplots for different variables.
+    variables:
+      tas_land:
+        <<: *var_settings
+        preprocessor: emd_land
+        short_name: tas
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground,
+             version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+      lwcre:
+        <<: *var_settings
+        preprocessor: emd
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      pr:
+        <<: *var_settings
+        preprocessor: emd_pr
+        additional_datasets:
+          - {dataset: GPCP-SG, project: OBS, type: atmos, version: 2.3, tier: 2,
+             reference_for_metric: true}
+      psl:
+        <<: *var_settings
+        preprocessor: emd
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: v1,
+             tier: 3, reference_for_metric: true}
+      rlut:
+        <<: *var_settings
+        preprocessor: emd
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      rsut:
+        <<: *var_settings
+        preprocessor: emd
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      swcre:
+        <<: *var_settings
+        preprocessor: emd
+        derive: true
+        force_derivation: true
+        channel: Amon
+        additional_datasets:
+          - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.2,
+             tier: 2, start_year: 2001, end_year: 2020, reference_for_metric: true}
+      sst:
+        <<: *var_settings
+        preprocessor: emd_sst
+        short_name: ts
+        additional_datasets:
+          - {dataset: HadISST, project: OBS, type: reanaly, version: 1, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_boxplot:
+            var_order: ['tas_land', 'sst', 'pr', 'psl', 'rsut', 'rlut', 'swcre', 'lwcre']
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_diurnal_cycle.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_diurnal_cycle.yml
new file mode 100644
index 0000000000..4fbbbb6b69
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_diurnal_cycle.yml
@@ -0,0 +1,203 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+  description: >
+    Benchmarking: diurnal cycle.
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+  maintainer:
+    - lauer_axel
+  references:
+    - lauer25gmd
+  projects:
+    - dlrmabak
+
+
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, benchmark_dataset: true, alias: MIROC6}
+
+
+preprocessors:
+
+  pp_diurn_Tropics:
+    custom_order: true
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+    mask_landsea:
+      mask_out: land
+    resample_hours:
+      interval: 3
+      offset: 1
+      interpolate: linear
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+    convert_units:
+      units: mm day-1
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+  pp_diurn_Tropics_metric:
+    custom_order: true
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+    mask_landsea:
+      mask_out: land
+    resample_hours:
+      interval: 3
+      offset: 1
+      interpolate: linear
+    climate_statistics:
+      period: hourly
+    convert_units:
+      units: mm day-1
+    distance_metric:
+      metric: rmse
+      coords: [longitude, latitude]
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+
+diagnostics:
+
+  diurnal_cycle:
+    description: Classical diurnal cycle plot including reference dataset.
+    variables:
+      pr_tropics: &var_settings
+        project: CMIP6
+        timerange: '2000/2000'
+        preprocessor: pp_diurn_Tropics
+        short_name: pr
+        exp: historical
+        mip: 3hr
+        ensemble: r1i1p1f1
+        grid: gr
+        reference_dataset: ERA5
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: 'v1', frequency: 1hr,
+             tier: 3, reference_for_metric: true, alias: ERA5}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        plot_filename: '{plot_type}_{real_name}_{mip}'
+        group_variables_by: variable_group
+        facet_used_for_labels: alias
+        plots:
+          diurnal_cycle:
+            annual_mean_kwargs: false
+            legend_kwargs:
+              loc: upper right
+            plot_kwargs:
+              'MIROC6':
+                color: red
+                label: '{alias}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 4
+              ERA5:
+                color: black
+                label: '{dataset}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 3
+              MultiModelPercentile10:
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              MultiModelPercentile90:
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              default:
+                color: lightgray
+                label: null
+                linestyle: '-'
+                linewidth: 1
+                zorder: 1
+
+  benchmarking_diurnal_cycle:
+    description: Create "benchmarking" diurnal cycle plot.
+    variables:
+      pr_tropics:
+        <<: *var_settings
+        preprocessor: pp_diurn_Tropics_metric
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        plot_filename: '{plot_type}_{real_name}_{mip}'
+        group_variables_by: variable_group
+        facet_used_for_labels: alias
+        plots:
+          benchmarking_diurnal_cycle:
+            legend_kwargs:
+              loc: upper right
+            plot_kwargs:
+              'MIROC6':
+                color: red
+                label: '{alias}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 4
+            pyplot_kwargs:
+              title: '{short_name}'
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_maps.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_maps.yml
new file mode 100644
index 0000000000..bb747852b7
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_maps.yml
@@ -0,0 +1,117 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+  description: >
+    Benchmarking: map plots.
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+  maintainer:
+    - lauer_axel
+  references:
+    - lauer25gmd
+  projects:
+    - dlrmabak
+
+
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: BCC-ESM1, grid: gn}
+  - {dataset: CAMS-CSM1-0, grid: gn}
+  - {dataset: CanESM5, grid: gn}
+  - {dataset: CanESM5-CanOE, grid: gn, ensemble: r1i1p2f1}
+  - {dataset: CESM2, grid: gn}
+  - {dataset: CESM2-FV2, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM-FV2, grid: gn, institute: NCAR}
+  - {dataset: CIESM}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-CM6-1-HR, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: E3SM-1-0}
+  - {dataset: E3SM-1-1, institute: E3SM-Project}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-f3-L}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: FIO-ESM-2-0, grid: gn}
+  - {dataset: GFDL-ESM4, grid: gr1}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: GISS-E2-1-H, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: INM-CM4-8, grid: gr1}
+  - {dataset: INM-CM5-0, grid: gr1}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MCM-UA-1-0, grid: gn}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: NorESM2-LM, grid: gn, institute: NCC}
+  - {dataset: NorESM2-MM, grid: gn, institute: NCC}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, benchmark_dataset: true, alias: MIROC6}
+
+
+preprocessors:
+
+  pp_pr:  # precipitation preprocessor for the RMSE benchmarking map
+    custom_order: true  # NOTE(review): presumably runs the steps in the order written below - confirm in the ESMValCore docs
+    regrid_time:
+      calendar: standard  # put all datasets on the same calendar
+    regrid:
+      target_grid: 2x2  # common 2x2 degree grid for every dataset
+      scheme: linear
+    convert_units:
+      units: mm day-1
+    distance_metric:
+      metric: rmse  # RMSE is evaluated along the time coordinate only
+      coords: [time]
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile  # 90th percentile across the datasets
+          percent: 90
+      exclude: [reference_dataset, MIROC6]  # keep the reference and MIROC6 out of the statistics
+
+
+diagnostics:
+
+  benchmarking_maps:
+    description: Plot RMSE map.
+    variables:
+      pr:
+        timerange: '2000/2004'
+        preprocessor: pp_pr
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1  # default; individual datasets override it in the datasets list
+        grid: gr  # default; individual datasets override it in the datasets list
+        reference_dataset: GPCP-SG
+        additional_datasets:  # version is quoted so it stays a string instead of the float 2.3
+          - {dataset: GPCP-SG, project: OBS, type: atmos, version: '2.3', tier: 2, reference_for_metric: true, alias: GPCP-SG}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_map:
+            plot_kwargs:
+              default:  # applied to every dataset without its own entry
+                cmap: 'cool'
+                levels: [0.0, 0.5, 1.0, 2.0, 3.0, 5.0, 7.5, 10.0]
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_timeseries.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_timeseries.yml
new file mode 100644
index 0000000000..c07882305c
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_timeseries.yml
@@ -0,0 +1,208 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+  description: >
+    Benchmarking: time series.
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+  maintainer:
+    - lauer_axel
+  references:
+    - lauer25gmd
+  projects:
+    - dlrmabak
+
+
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: BCC-ESM1, grid: gn}
+  - {dataset: CAMS-CSM1-0, grid: gn}
+  - {dataset: CanESM5, grid: gn}
+  - {dataset: CanESM5-CanOE, grid: gn, ensemble: r1i1p2f1}
+  - {dataset: CESM2, grid: gn}
+  - {dataset: CESM2-FV2, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM-FV2, grid: gn, institute: NCAR}
+  - {dataset: CIESM}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-CM6-1-HR, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: E3SM-1-0}
+  - {dataset: E3SM-1-1, institute: E3SM-Project}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-f3-L}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: FIO-ESM-2-0, grid: gn}
+  - {dataset: GFDL-ESM4, grid: gr1}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: GISS-E2-1-H, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: INM-CM4-8, grid: gr1}
+  - {dataset: INM-CM5-0, grid: gr1}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MCM-UA-1-0, grid: gn}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: NorESM2-LM, grid: gn, institute: NCC}
+  - {dataset: NorESM2-MM, grid: gn, institute: NCC}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, benchmark_dataset: true}
+
+preprocessors:
+
+  pp_tas:
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    anomalies:
+      period: month
+      reference:
+        start_year: 2000
+        start_month: 1
+        start_day: 1
+        end_year: 2009
+        end_month: 12
+        end_day: 31
+    area_statistics:
+      operator: mean
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+  pp_tas_metric:
+    custom_order: true
+    regrid_time:
+      calendar: standard
+    regrid:
+      target_grid: 2x2
+      scheme: linear
+    anomalies:
+      period: month
+      reference:
+        start_year: 2000
+        start_month: 1
+        start_day: 1
+        end_year: 2009
+        end_month: 12
+        end_day: 31
+    distance_metric:
+      metric: rmse
+      coords: [longitude, latitude]
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile
+          percent: 10
+        - operator: percentile
+          percent: 90
+      exclude: [reference_dataset, MIROC6]
+
+
+diagnostics:
+
+  timeseries:
+    description: Plot "classical" time series of global mean anomalies including a reference dataset.
+    variables:
+      tas:
+        timerange: '2000/2014'
+        preprocessor: pp_tas
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1  # default; individual datasets override it in the datasets list
+        grid: gr  # default; individual datasets override it in the datasets list
+        reference_dataset: HadCRUT5
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground, version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          timeseries:
+            annual_mean_kwargs: false  # per the multi_datasets.py docs, false turns off annual-mean lines
+            plot_kwargs:
+              MIROC6:  # dataset flagged benchmark_dataset in the datasets list
+                color: red
+                label: '{dataset}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 4
+              HadCRUT5:  # reference dataset
+                color: black
+                label: '{dataset}'
+                linestyle: '-'
+                linewidth: 2
+                zorder: 3
+              MultiModelPercentile10:  # lower percentile requested in pp_tas
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              MultiModelPercentile90:  # upper percentile requested in pp_tas
+                color: gray
+                label: '{dataset}'
+                linestyle: '--'
+                linewidth: 1
+                zorder: 2
+              default:  # every other dataset: thin light line without a legend label
+                color: lightgray
+                label: null
+                linestyle: '-'
+                linewidth: 1
+                zorder: 1
+
+  benchmarking_timeseries:
+    description: Plot "benchmarking" time series of global mean anomalies.
+    variables:
+      tas:
+        timerange: '2000/2014'
+        preprocessor: pp_tas_metric
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1  # default; individual datasets override it in the datasets list
+        grid: gr  # default; individual datasets override it in the datasets list
+        reference_dataset: HadCRUT5
+        additional_datasets:
+          - {dataset: HadCRUT5, project: OBS, type: ground, version: 5.0.1.0-analysis, tier: 2, reference_for_metric: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_timeseries:
+            # Annual-mean lines are switched off; plot_kwargs must appear only once in this mapping.
+            annual_mean_kwargs: false
+            plot_kwargs:
+              MIROC6:  # dataset flagged benchmark_dataset in the datasets list
+                color: red
+                label: '{dataset}'
+                linestyle: '-'
+                linewidth: 1.5
+                zorder: 3
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_zonal.yml b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_zonal.yml
new file mode 100644
index 0000000000..1b22e51a05
--- /dev/null
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_benchmarking_zonal.yml
@@ -0,0 +1,124 @@
+# ESMValTool
+---
+documentation:
+  title: Benchmarking of a single model.
+  description: >
+    Benchmarking: zonal mean plots.
+  authors:
+    - lauer_axel
+    - bock_lisa
+    - hassler_birgit
+    - lindenlaub_lukas
+    - schlund_manuel
+  maintainer:
+    - lauer_axel
+  references:
+    - lauer25gmd
+  projects:
+    - dlrmabak
+
+
+datasets:
+  - {dataset: ACCESS-CM2, grid: gn, institute: CSIRO-ARCCSS}
+  - {dataset: ACCESS-ESM1-5, grid: gn, institute: CSIRO}
+  - {dataset: AWI-CM-1-1-MR, grid: gn}
+  - {dataset: AWI-ESM-1-1-LR, grid: gn}
+  - {dataset: BCC-CSM2-MR, grid: gn}
+  - {dataset: BCC-ESM1, grid: gn}
+  - {dataset: CAMS-CSM1-0, grid: gn}
+  - {dataset: CanESM5, grid: gn}
+  - {dataset: CanESM5-CanOE, grid: gn, ensemble: r1i1p2f1}
+  - {dataset: CESM2, grid: gn}
+  - {dataset: CESM2-FV2, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM, grid: gn, institute: NCAR}
+  - {dataset: CESM2-WACCM-FV2, grid: gn, institute: NCAR}
+  - {dataset: CIESM}
+  - {dataset: CNRM-CM6-1, ensemble: r1i1p1f2}
+  - {dataset: CNRM-CM6-1-HR, ensemble: r1i1p1f2}
+  - {dataset: CNRM-ESM2-1, ensemble: r1i1p1f2}
+  - {dataset: E3SM-1-0}
+  - {dataset: E3SM-1-1, institute: E3SM-Project}
+  - {dataset: EC-Earth3-Veg}
+  - {dataset: FGOALS-f3-L}
+  - {dataset: FGOALS-g3, grid: gn}
+  - {dataset: GFDL-ESM4, grid: gr1}
+  - {dataset: GISS-E2-1-G, grid: gn}
+  - {dataset: GISS-E2-1-H, grid: gn}
+  - {dataset: HadGEM3-GC31-LL, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: HadGEM3-GC31-MM, ensemble: r1i1p1f3, grid: gn}
+  - {dataset: INM-CM4-8, grid: gr1}
+  - {dataset: INM-CM5-0, grid: gr1}
+  - {dataset: IPSL-CM6A-LR}
+  - {dataset: KACE-1-0-G}
+  - {dataset: MCM-UA-1-0, grid: gn}
+  - {dataset: MIROC-ES2L, ensemble: r1i1p1f2, grid: gn}
+  - {dataset: MPI-ESM-1-2-HAM, grid: gn}
+  - {dataset: MPI-ESM1-2-HR, grid: gn}
+  - {dataset: MPI-ESM1-2-LR, grid: gn}
+  - {dataset: MRI-ESM2-0, grid: gn}
+  - {dataset: NESM3, grid: gn}
+  - {dataset: NorESM2-LM, grid: gn, institute: NCC}
+  - {dataset: NorESM2-MM, grid: gn, institute: NCC}
+  - {dataset: SAM0-UNICON, grid: gn}
+  - {dataset: UKESM1-0-LL, ensemble: r1i1p1f2, grid: gn}
+  # Dataset to be benchmarked
+  - {dataset: MIROC6, grid: gn, benchmark_dataset: true}
+
+
+preprocessors:
+  pp_ta:  # air temperature preprocessor for the zonal-mean bias plot
+    custom_order: true  # NOTE(review): presumably runs the steps in the order written below - confirm in the ESMValCore docs
+    regrid_time:
+      calendar: standard  # put all datasets on the same calendar
+    extract_levels:
+      levels: {cmor_table: CMIP6, coordinate: plev27}  # levels are taken from the plev27 coordinate of the CMIP6 table
+      coordinate: air_pressure
+      scheme: linear
+    regrid:
+      target_grid: 2x2  # common 2x2 degree grid for every dataset
+      scheme: linear
+    mask_below_threshold:
+      threshold: 0.0  # mask values below 0.0
+    climate_statistics:
+      operator: mean
+    zonal_statistics:
+      operator: mean
+    bias:
+      bias_type: absolute  # absolute (not relative) bias
+    multi_model_statistics:
+      span: overlap
+      statistics:
+        - operator: percentile  # 10th percentile across the datasets
+          percent: 10
+        - operator: percentile  # 90th percentile across the datasets
+          percent: 90
+      exclude: [reference_dataset, MIROC6]  # keep the reference and MIROC6 out of the statistics
+
+
+diagnostics:
+
+  benchmarking_zonal:
+    description: Plot zonal mean profile.
+    variables:
+      ta:
+        timerange: '2000/2004'
+        preprocessor: pp_ta
+        project: CMIP6
+        mip: Amon
+        exp: historical
+        ensemble: r1i1p1f1
+        grid: gr
+        reference_dataset: ERA5
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: 'v1', tier: 3, reference_for_metric: true, reference_for_bias: true}
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        group_variables_by: variable_group
+        plots:
+          benchmarking_zonal:
+            plot_kwargs:
+              default:
+                cmap: 'bwr'
+                levels: [-5, -3, -2, -1, 0, 1, 2, 3, 5]
diff --git a/esmvaltool/recipes/model_evaluation/recipe_model_evaluation_clouds_cycles.yml b/esmvaltool/recipes/model_evaluation/recipe_model_evaluation_clouds_cycles.yml
index 8139a04dfc..431bb5246e 100644
--- a/esmvaltool/recipes/model_evaluation/recipe_model_evaluation_clouds_cycles.yml
+++ b/esmvaltool/recipes/model_evaluation/recipe_model_evaluation_clouds_cycles.yml
@@ -22,6 +22,9 @@ datasets:
 timerange_for_models: &time_period
   timerange: '2000/2014'  # can be specified, this is just an example
 
+timerange_diurnal: &time_diurnal_period
+  timerange: '20060101/20060201'  # can be specified, this is just an example
+
 
 preprocessors:
 
@@ -31,6 +34,14 @@ preprocessors:
     climate_statistics:
       period: month
 
+  pp_Tropics:
+    <<: *global_settings
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+
   pp_SEPacific:
     <<: *global_settings
     extract_region:
@@ -59,21 +70,76 @@ preprocessors:
       start_latitude: 45
       end_latitude: 60
 
+  pp_diurn_Tropics:
+    custom_order: true
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+
+  pp_diurn_SEPacific:
+    custom_order: true
+    local_solar_time:
+    extract_region:
+      start_longitude: 265
+      end_longitude: 275
+      start_latitude: -25
+      end_latitude: -5
+    mask_landsea:
+      mask_out: land
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+
+  pp_diurn_SouthernOcean:
+    custom_order: true
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -65
+      end_latitude: -30
+    mask_landsea:
+      mask_out: land
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+
+  pp_diurn_StormTracks:
+    custom_order: true
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: 45
+      end_latitude: 60
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+
 
 diagnostics:
 
   anncyc:
     description: Plot annual cycles including reference datasets.
     variables:
-      clt_global: &clt_settings
+      clt_tropics: &clt_settings
+        # Anchor with the settings shared by the other clt variable groups (merged via *clt_settings).
         <<: *time_period
-        preprocessor: pp_global
+        preprocessor: pp_Tropics
         short_name: clt
         mip: Amon
         additional_datasets:
           - {dataset: ESACCI-CLOUD, project: OBS, type: sat, version: AVHRR-AMPM-fv3.0, tier: 2}
-      clt_tropics:
-        <<: *clt_settings
       clt_sepacific:
         <<: *clt_settings
         preprocessor: pp_SEPacific
@@ -83,15 +149,13 @@ diagnostics:
       clt_stormtracks:
         <<: *clt_settings
         preprocessor: pp_StormTracks
-      clivi_global: &clivi_settings
+      clivi_tropics: &clivi_settings
         <<: *time_period
-        preprocessor: pp_global
+        preprocessor: pp_Tropics
         short_name: clivi
         mip: Amon
         additional_datasets:
           - {dataset: ESACCI-CLOUD, project: OBS, type: sat, version: AVHRR-AMPM-fv3.0, tier: 2}
-      clivi_tropics:
-        <<: *clivi_settings
       clivi_sepacific:
         <<: *clivi_settings
         preprocessor: pp_SEPacific
@@ -101,16 +165,14 @@ diagnostics:
       clivi_stormtracks:
         <<: *clivi_settings
         preprocessor: pp_StormTracks
-      lwp_global: &lwp_settings
+      lwp_tropics: &lwp_settings
         <<: *time_period
-        preprocessor: pp_global
+        preprocessor: pp_Tropics
         short_name: lwp
         derive: true
         mip: Amon
         additional_datasets:
           - {dataset: ESACCI-CLOUD, project: OBS, type: sat, version: AVHRR-AMPM-fv3.0, tier: 2}
-      lwp_tropics:
-        <<: *lwp_settings
       lwp_sepacific:
         <<: *lwp_settings
         preprocessor: pp_SEPacific
@@ -120,16 +182,14 @@ diagnostics:
       lwp_stormtracks:
         <<: *lwp_settings
         preprocessor: pp_StormTracks
-      swcre_global: &swcre_settings
+      swcre_tropics: &swcre_settings
         <<: *time_period
-        preprocessor: pp_global
+        preprocessor: pp_Tropics
         short_name: swcre
         derive: true
         mip: Amon
         additional_datasets:
           - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.1, tier: 2}
-      swcre_tropics:
-        <<: *swcre_settings
       swcre_sepacific:
         <<: *swcre_settings
         preprocessor: pp_SEPacific
@@ -139,16 +199,14 @@ diagnostics:
       swcre_stormtracks:
         <<: *swcre_settings
         preprocessor: pp_StormTracks
-      lwcre_global: &lwcre_settings
+      lwcre_tropics: &lwcre_settings
         <<: *time_period
-        preprocessor: pp_global
+        preprocessor: pp_Tropics
         short_name: lwcre
         derive: true
         mip: Amon
         additional_datasets:
           - {dataset: CERES-EBAF, project: OBS, type: sat, version: Ed4.1, tier: 2}
-      lwcre_tropics:
-        <<: *lwcre_settings
       lwcre_sepacific:
         <<: *lwcre_settings
         preprocessor: pp_SEPacific
@@ -175,5 +233,46 @@ diagnostics:
                 color: C1
               ESACCI-CLOUD:
                 color: black
+              CERES-EBAF:
+                color: black
+            pyplot_kwargs:
+              title: '{short_name}'
+
+  diurncyc:  # diurnal-cycle counterpart of the anncyc diagnostic
+    description: Example plot diurnal cycles including reference datasets.
+    variables:
+      clt_tropics: &clt_diurn_settings  # base settings merged into the regional groups below
+        <<: *time_diurnal_period  # timerange '20060101/20060201' from the timerange_diurnal section
+        preprocessor: pp_diurn_Tropics
+        short_name: clt
+        mip: 3hr
+        additional_datasets:
+          - {dataset: ERA5, project: native6, type: reanaly, version: 'v1', frequency: 1hr, tier: 3}
+      clt_sepacific:
+        <<: *clt_diurn_settings
+        preprocessor: pp_diurn_SEPacific
+      clt_southerocean:  # NOTE(review): spelling "southerocean" (missing "n") - check against the anncyc group names before renaming
+        <<: *clt_diurn_settings
+        preprocessor: pp_diurn_SouthernOcean
+      clt_stormtracks:
+        <<: *clt_diurn_settings
+        preprocessor: pp_diurn_StormTracks
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        plot_filename: '{plot_type}_{real_name}_{mip}'
+        group_variables_by: variable_group
+        plots:
+          diurnal_cycle:
+            legend_kwargs:
+              loc: upper right
+            plot_kwargs:  # one colour per dataset; ERA5 is drawn in black
+              MPI-ESM1-2-HR:
+                color: C0
+              MPI-ESM1-2-LR:
+                color: C1
+              ERA5:
+                color: black
             pyplot_kwargs:
               title: '{short_name}'
diff --git a/esmvaltool/recipes/monitor/recipe_monitor.yml b/esmvaltool/recipes/monitor/recipe_monitor.yml
index a37f186583..32b773211d 100644
--- a/esmvaltool/recipes/monitor/recipe_monitor.yml
+++ b/esmvaltool/recipes/monitor/recipe_monitor.yml
@@ -12,7 +12,7 @@ documentation:
 
 
 datasets:
-  - {project: CMIP6, dataset: EC-Earth3, exp: historical, ensemble: r1i1p1f1, start_year: 1850, end_year: 2014}
+  - {project: CMIP6, dataset: EC-Earth3, exp: historical, ensemble: r1i1p1f1}
 
 preprocessors:
   timeseries_regular:
@@ -114,6 +114,22 @@ preprocessors:
     climate_statistics:
       period: month
 
+  pp_diurn_tropics:
+    custom_order: true
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+    mask_landsea:
+      mask_out: land
+    local_solar_time:
+    area_statistics:
+      operator: mean
+    convert_units:
+      units: mm day-1
+
+
 diagnostics:
   plot_timeseries_annual_cycle:
     description: "Plot time series and annualcycles"
@@ -122,6 +138,7 @@ diagnostics:
         mip: Amon
         preprocessor: timeseries_regular
         grid: gr
+        timerange: 1850/2014
     scripts:
       plot: &plot_default
         script: monitor/monitor.py
@@ -140,12 +157,14 @@ diagnostics:
         mip: Omon
         preprocessor: nino3
         grid: gn
+        timerange: 1850/2014
       nino34:
         plot_name: 'Niño 3.4 index'
         short_name: tos
         mip: Omon
         preprocessor: nino34
         grid: gn
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
@@ -161,18 +180,35 @@ diagnostics:
         preprocessor: mlotstnorth
         grid: gn
         plot_name: Mixed layer depth average above 50ºN
+        timerange: 1850/2014
       mlotst-south:
         short_name: mlotst
         mip: Omon
         preprocessor: mlotstsouth
         grid: gn
         plot_name: Mixed layer depth average below 40ºS
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
         plots:
           annual_cycle: {}
 
+  plot_diurnal_cycle:
+    description: "Plot diurnal cycle"
+    variables:
+      pr_tropics:
+        timerange: '20010101/20010201'  # can be specified, this is just an example
+        preprocessor: pp_diurn_tropics
+        short_name: pr
+        mip: 3hr
+        grid: gr
+    scripts:
+      plot:
+        script: monitor/monitor.py
+        plots:
+          diurnal_cycle: {}
+
   global_climatologies:
     description: "Plot map data"
     variables:
@@ -180,76 +216,92 @@ diagnostics:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       ps:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       rsns:
         derive: true
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       rlns:
         derive: true
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       hfss:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       hfls:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       vas:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       pr:
         mip: Amon
         preprocessor: climatology_pr
         grid: gr
+        timerange: 1850/2014
       evspsbl:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       ua200:
         short_name: ua
         mip: Amon
         preprocessor: climatology_200hPa
         grid: gr
         plot_name: Eastward Wind at 200 hPa
+        timerange: 1850/2014
       ua500:
         short_name: ua
         mip: Amon
         preprocessor: climatology_500hPa
         grid: gr
         plot_name: Eastward Wind at 500 hPa
+        timerange: 1850/2014
       zg200:
         short_name: zg
         mip: Amon
         preprocessor: climatology_200hPa
         grid: gr
         plot_name: Geopotential height at 200 hPa
+        timerange: 1850/2014
       zg500:
         short_name: zg
         mip: Amon
         preprocessor: climatology_500hPa
         grid: gr
         plot_name: Geopotential height at 500 hPa
+        timerange: 1850/2014
       tos:
         mip: Omon
         preprocessor: climatology
         grid: gn
+        timerange: 1850/2014
       zos:
         mip: Omon
         preprocessor: climatology
         grid: gn
+        timerange: 1850/2014
       sos:
         mip: Omon
         preprocessor: climatology
         grid: gn
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
@@ -265,18 +317,22 @@ diagnostics:
         mip: Amon
         preprocessor: climatology
         grid: gr
+        timerange: 1850/2014
       pr:
         mip: Amon
         preprocessor: climatology_pr
         grid: gr
+        timerange: 1850/2014
       tos:
         mip: Omon
         preprocessor: climatology
         grid: gn
+        timerange: 1850/2014
       sos:
         mip: Omon
         grid: gn
         preprocessor: climatology
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
@@ -294,6 +350,7 @@ diagnostics:
         preprocessor: nao_djf
         eof_name: NAO as first EOF in DJF
         pc_name: NAO index as first PC in DJF
+        timerange: 1850/2014
       sam:
         short_name: psl
         mip: Amon
@@ -301,6 +358,7 @@ diagnostics:
         preprocessor: sam_jja
         eof_name: SAM as first EOF in JJA
         pc_name: SAM index as first PC in JJA
+        timerange: 1850/2014
     scripts:
       eof:
         <<: *plot_default
@@ -313,6 +371,7 @@ diagnostics:
         mip: SImon
         preprocessor: climatology
         grid: gn
+        timerange: 1850/2014
 
     scripts:
       plot:
@@ -333,6 +392,7 @@ diagnostics:
         mip: Omon
         preprocessor: clim_fma
         grid: gn
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
@@ -348,6 +408,7 @@ diagnostics:
         mip: Omon
         preprocessor: clim_aso
         grid: gn
+        timerange: 1850/2014
     scripts:
       plot:
         <<: *plot_default
diff --git a/esmvaltool/recipes/monitor/recipe_monitor_with_refs.yml b/esmvaltool/recipes/monitor/recipe_monitor_with_refs.yml
index f5eea141dc..b5c7d093d1 100644
--- a/esmvaltool/recipes/monitor/recipe_monitor_with_refs.yml
+++ b/esmvaltool/recipes/monitor/recipe_monitor_with_refs.yml
@@ -102,6 +102,19 @@ preprocessors:
     zonal_statistics:
       operator: mean
 
+  pp_diurn_Tropics:
+    custom_order: true
+    local_solar_time:
+    extract_region:
+      start_longitude: 0
+      end_longitude: 360
+      start_latitude: -30
+      end_latitude: 30
+    area_statistics:
+      operator: mean
+    climate_statistics:
+      period: hourly
+
 
 diagnostics:
 
@@ -265,3 +278,29 @@ diagnostics:
             fontsize: 10
             show_x_minor_ticks: false
             time_format: '%Y'
+
+  plot_diurnal_cycle:
+    description: Example plot diurnal cycle including reference dataset.
+    variables:
+      clt_tropics:
+        preprocessor: pp_diurn_Tropics
+        short_name: clt
+        mip: 3hr
+        timerange: '20060101/20060201'
+    scripts:
+      allplots:
+        script: monitor/multi_datasets.py
+        plot_folder: '{plot_dir}'
+        plot_filename: '{plot_type}_{real_name}_{mip}'
+        group_variables_by: variable_group
+        plots:
+          diurnal_cycle:
+            legend_kwargs:
+              loc: upper right
+            plot_kwargs:
+              MPI-ESM1-2-HR:
+                color: C0
+              MPI-ESM1-2-LR:
+                color: C1
+            pyplot_kwargs:
+              title: '{short_name}'
diff --git a/esmvaltool/references/lauer25gmd.bibtex b/esmvaltool/references/lauer25gmd.bibtex
new file mode 100644
index 0000000000..69332ddfe5
--- /dev/null
+++ b/esmvaltool/references/lauer25gmd.bibtex
@@ -0,0 +1,13 @@
+@article{lauer25gmd,
+	doi = {10.5194/egusphere-2024-1518},
+	url = {https://egusphere.copernicus.org/preprints/2024/egusphere-2024-1518/},
+	year = {2024},
+	publisher = {European Geosciences Union},
+	address = {Göttingen, Germany},
+	volume = {2024},
+	number = {},
+	pages = {1--35},
+	author = {Lauer, A. and Bock, L. and Hassler, B. and J\"ockel, P. and Ruhe, L. and Schlund, M.},
+	title = {Monitoring and benchmarking Earth System Model simulations with ESMValTool v2.12.0},
+	journal = {EGUsphere}
+}