Mobt775 Add SAMOS apply plugins and all CLIs #2153

New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
gavinevans merged 95 commits into metoppv:master from brhooper:mobt775_samos_3
Oct 8, 2025
improver/calibration/__init__.py
            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -6,6 +6,7 @@
  
    and coefficient inputs.

    """

    import glob

    from collections import OrderedDict

    from pathlib import Path

    from typing import Dict, List, Optional, Tuple, Union

    @@ -19,6 +20,8 @@
  
        get_diagnostic_cube_name_from_probability_name,

    )

    from improver.utilities.cube_manipulation import MergeCubes

    from improver.utilities.flatten import flatten

    from improver.utilities.load import load_cubelist

    class CalibrationSchemas:

    @@ -268,71 +271,76 @@ def split_forecasts_and_bias_files(cubes: CubeList) -> Tuple[Cube, Optional[Cube
  
        return forecast_cube, bias_cubes

    def split_pickle_parquet_and_netcdf(

        files: List[Path],

    ) -> Tuple[Optional[CubeList], Optional[List[Path]], Optional[object]]:

        """Split the input files into pickle, parquet, and netcdf files.

        Any or all of NetCDF, Parquet, and pickle files can be loaded. Only a single

        pickle file is expected, but multiple netCDF and parquet files can be provided.

    def split_netcdf_parquet_pickle(files):

        """Split the input files into netcdf, parquet, and pickle files.

        Only a single pickle file is expected.

        Args:

            files:

                A list of input file paths.

                A list of input file paths which will be split into pickle,

                parquet, and netcdf files.

        Returns:

            - A flattened cube list containing all the cubes loaded from NetCDF files, or None.

            - A list of paths to Parquet files, or None.

            - A loaded pickle file, or None.

            - A flattened cube list containing all the cubes contained within the

              provided paths to NetCDF files.

            - A list of paths to Parquet files.

            - A loaded pickle file.

        Raises:

            ValueError: If the path provided is not loadable as a pickle file, parquet file

                or netcdf file.

            ValueError: If multiple pickle files provided, as only one is ever expected.

        """

        cubes = iris.cube.CubeList()

        loaded_pickle = None

        cubes = CubeList([])

        loaded_pickles = []

        parquets = []

        for file_path in files:

            if not file_path.exists():

                continue

        for file in files:

            file_paths = glob.glob(str(file))

            for file_path_str in file_paths:

                file_path = Path(file_path_str)

                if not file_path.exists():

                    continue

            # Directories indicate we are working with parquet files.

            if file_path.is_dir():

                parquets.append(file_path)

                continue

                # Directories indicate we are working with parquet files.

                if file_path.is_dir():

                    parquets.append(file_path)

                    continue

            try:

                cube = iris.load(file_path)

                cubes.extend(cube)

            except ValueError:

                if loaded_pickle is not None:

                    msg = "Multiple pickle inputs have been provided. Only one is expected."

                    raise ValueError(msg)

                try:

                    loaded_pickle = joblib.load(file_path)

                except Exception as e:

                    msg = f"Failed to load {file_path}: {e}"

                    raise ValueError(msg)

                    cube = load_cubelist(str(file_path))

                    cubes.extend(cube)

                except ValueError:

                    try:

                        loaded_pickles.append(joblib.load(file_path))

                    except Exception as e:

                        msg = f"Failed to load {file_path}: {e}"

                        raise ValueError(msg)

        if len(loaded_pickles) > 1:

            msg = "Multiple pickle inputs have been provided. Only one is expected."

            raise ValueError(msg)

        return (

            cubes if cubes else None,

            parquets if parquets else None,

            loaded_pickle if loaded_pickle else None,

            loaded_pickles[0] if loaded_pickles else None,

        )

    def identify_parquet_type(

        parquet_paths: List[Path],

    ) -> Tuple[Optional[Path], Optional[Path]]:

    def identify_parquet_type(parquet_paths: List[Path]):

        """Determine whether the provided parquet paths contain forecast or truth data.

        This is done by checking the columns within the parquet files for the presence

        of a forecast_period column which is only present for forecast data.

        Args:

            parquet_paths:

                A list of paths to Parquet files.

        Returns:

            - The path to the Parquet file containing the historical forecasts.

            - The path to the Parquet file containing the truths.

        """

        # import here to avoid dependency on pyarrow for all of improver

        import pyarrow.parquet as pq

        forecast_table_path = None

    @@ -351,6 +359,134 @@ def identify_parquet_type(
  
        return forecast_table_path, truth_table_path

    def split_cubes_for_samos(

        cubes: CubeList,

        gam_features: List[str],

        truth_attribute: Optional[str] = None,

        expect_emos_coeffs: bool = False,

        expect_emos_fields: bool = False,

    ):

        """Function to split the forecast, truth, gam additional predictors and emos

        additional predictor cubes.

        Args:

            cubes:

                A list of input cubes which will be split into relevant groups.

            gam_features:

                A list of strings containing the names of the additional fields

                required for the SAMOS GAMs.

            truth_attribute:

                An attribute and its value in the format of "attribute=value",

                which must be present on truth cubes. If None, no truth cubes are

                expected or returned.

            expect_emos_coeffs:

                If True, EMOS coefficient cubes are expected to be found in the input

                cubes. If False, an error will be raised if any such cubes are found.

            expect_emos_fields:

                If True, additional EMOS fields are expected to be found in the input

                cubes. If False, an error will be raised if any such cubes are found.

        Raises:

            IOError:

                If EMOS coefficients cubes are found when they are not expected.

            IOError:

                If additional fields cubes are found which do not match the features in

                gam_features.

            IOError:

                If probability cubes are provided with more than one name.

        Returns:

            - A cube containing all the historic forecasts, or None if no such cubes

              were found.

            - A cube containing all the truth data, or None if no such cubes were found

              or no truth_attribute was provided.

            - A cubelist containing all the additional fields required for the GAMs,

              or None if no such cubes were found.

            - A cubelist containing all the EMOS coefficient cubes, or None if no such

              cubes were found.

            - A cubelist containing all the additional fields required for EMOS,

              or None if no such cubes were found.

            - A cube containing a probability template, or None if no such cube is found.

        """

        forecast = iris.cube.CubeList([])

        truth = iris.cube.CubeList([])

        gam_additional_fields = iris.cube.CubeList([])

        emos_coefficients = iris.cube.CubeList([])

        emos_additional_fields = iris.cube.CubeList([])

        prob_template = None

        # Prepare variables used to split forecast and truth.

        truth_key, truth_value = None, None

        if truth_attribute:

            truth_key, truth_value = truth_attribute.split("=")

        for cube in flatten(cubes):

            if "time" in [c.name() for c in cube.coords()]:

                if truth_key and cube.attributes.get(truth_key) == truth_value:

                    truth.append(cube.copy())

                else:

                    forecast.append(cube.copy())

            elif "emos_coefficient" in cube.name():

                emos_coefficients.append(cube.copy())

            elif cube.name() in gam_features:

                gam_additional_fields.append(cube.copy())

            else:

                emos_additional_fields.append(cube.copy())

        # Check that all required inputs are present and no unexpected cubes have been

        # found.

        missing_inputs = []

        if len(forecast) == 0:

            missing_inputs.append("forecast")

        if truth_key and len(truth) == 0:

            missing_inputs.append("truth")

        if missing_inputs:

            raise IOError(f"Missing {' and '.join(missing_inputs)} input.")

        if not expect_emos_coeffs and len(emos_coefficients) > 0:

            msg = (

                f"Found EMOS coefficients cubes when they were not expected. The following "

                f"such cubes were found: {[c.name() for c in emos_coefficients]}."

            )

            raise IOError(msg)

        if not expect_emos_fields and len(emos_additional_fields) > 0:

            msg = (

                f"Found additional fields cubes which do not match the features in "

                f"gam_features. The following cubes were found: "

                f"{[c.name() for c in emos_additional_fields]}."

            )

            raise IOError(msg)

        # Split out prob_template cube if required.

        forecast_names = [c.name() for c in forecast]

        prob_forecast_names = [name for name in forecast_names if "probability" in name]

        if len(set(prob_forecast_names)) > 1:

            msg = (

                "Providing multiple probability cubes is not supported. A probability cube "

                "can either be provided as the forecast or the probability template, but "

                f"not both. Cubes provided: {prob_forecast_names}."

            )

            raise IOError(msg)

        else:

            if len(set(forecast_names)) > 1:

                prob_template = forecast.extract(prob_forecast_names[0])[0]

                forecast.remove(prob_template)

        forecast = MergeCubes()(forecast)

        if truth_key:

            truth = MergeCubes()(truth)

        return (

            None if not forecast else forecast,

            None if not truth else truth,

            None if not gam_additional_fields else gam_additional_fields,

            None if not emos_coefficients else emos_coefficients,

            None if not emos_additional_fields else emos_additional_fields,

            prob_template,

        )

    def validity_time_check(forecast: Cube, validity_times: List[str]) -> bool:

        """Check the validity time of the forecast matches the accepted validity times

        within the validity times list.
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Mobt775 Add SAMOS apply plugins and all CLIs #2153

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!