diff --git a/improver/calibration/__init__.py b/improver/calibration/__init__.py index 054a2d4b2b..fa6c58af99 100644 --- a/improver/calibration/__init__.py +++ b/improver/calibration/__init__.py @@ -6,6 +6,7 @@ and coefficient inputs. """ +import glob from collections import OrderedDict from pathlib import Path from typing import Dict, List, Optional, Tuple, Union @@ -19,6 +20,8 @@ get_diagnostic_cube_name_from_probability_name, ) from improver.utilities.cube_manipulation import MergeCubes +from improver.utilities.flatten import flatten +from improver.utilities.load import load_cubelist class CalibrationSchemas: @@ -268,71 +271,76 @@ def split_forecasts_and_bias_files(cubes: CubeList) -> Tuple[Cube, Optional[Cube return forecast_cube, bias_cubes -def split_pickle_parquet_and_netcdf( - files: List[Path], -) -> Tuple[Optional[CubeList], Optional[List[Path]], Optional[object]]: - """Split the input files into pickle, parquet, and netcdf files. - Any or all of NetCDF, Parquet, and pickle files can be loaded. Only a single - pickle file is expected, but multiple netCDF and parquet files can be provided. +def split_netcdf_parquet_pickle(files): + """Split the input files into netcdf, parquet, and pickle files. + Only a single pickle file is expected. Args: files: - A list of input file paths. + A list of input file paths which will be split into pickle, + parquet, and netcdf files. + Returns: - - A flattened cube list containing all the cubes loaded from NetCDF files, or None. - - A list of paths to Parquet files, or None. - - A loaded pickle file, or None. + - A flattened cube list containing all the cubes contained within the + provided paths to NetCDF files. + - A list of paths to Parquet files. + - A loaded pickle file. + Raises: - ValueError: If the path provided is not loadable as a pickle file, parquet file - or netcdf file. ValueError: If multiple pickle files provided, as only one is ever expected. """ - cubes = iris.cube.CubeList() - loaded_pickle = None + cubes = CubeList([]) + loaded_pickles = [] parquets = [] - for file_path in files: - if not file_path.exists(): - continue + for file in files: + file_paths = glob.glob(str(file)) + for file_path_str in file_paths: + file_path = Path(file_path_str) + if not file_path.exists(): + continue - # Directories indicate we are working with parquet files. - if file_path.is_dir(): - parquets.append(file_path) - continue + # Directories indicate we are working with parquet files. + if file_path.is_dir(): + parquets.append(file_path) + continue - try: - cube = iris.load(file_path) - cubes.extend(cube) - except ValueError: - if loaded_pickle is not None: - msg = "Multiple pickle inputs have been provided. Only one is expected." - raise ValueError(msg) try: - loaded_pickle = joblib.load(file_path) - except Exception as e: - msg = f"Failed to load {file_path}: {e}" - raise ValueError(msg) + cube = load_cubelist(str(file_path)) + cubes.extend(cube) + except ValueError: + try: + loaded_pickles.append(joblib.load(file_path)) + except Exception as e: + msg = f"Failed to load {file_path}: {e}" + raise ValueError(msg) + + if len(loaded_pickles) > 1: + msg = "Multiple pickle inputs have been provided. Only one is expected." 
+ raise ValueError(msg) return ( cubes if cubes else None, parquets if parquets else None, - loaded_pickle if loaded_pickle else None, + loaded_pickles[0] if loaded_pickles else None, ) -def identify_parquet_type( - parquet_paths: List[Path], -) -> Tuple[Optional[Path], Optional[Path]]: +def identify_parquet_type(parquet_paths: List[Path]): """Determine whether the provided parquet paths contain forecast or truth data. This is done by checking the columns within the parquet files for the presence of a forecast_period column which is only present for forecast data. + Args: parquet_paths: A list of paths to Parquet files. + + Returns: - The path to the Parquet file containing the historical forecasts. - The path to the Parquet file containing the truths. """ + # import here to avoid dependency on pyarrow for all of improver import pyarrow.parquet as pq forecast_table_path = None @@ -351,6 +359,134 @@ def identify_parquet_type( return forecast_table_path, truth_table_path +def split_cubes_for_samos( + cubes: CubeList, + gam_features: List[str], + truth_attribute: Optional[str] = None, + expect_emos_coeffs: bool = False, + expect_emos_fields: bool = False, +): + """Function to split the forecast, truth, gam additional predictors and emos + additional predictor cubes. + + Args: + cubes: + A list of input cubes which will be split into relevant groups. + gam_features: + A list of strings containing the names of the additional fields + required for the SAMOS GAMs. + truth_attribute: + An attribute and its value in the format of "attribute=value", + which must be present on truth cubes. If None, no truth cubes are + expected or returned. + expect_emos_coeffs: + If True, EMOS coefficient cubes are expected to be found in the input + cubes. If False, an error will be raised if any such cubes are found. + expect_emos_fields: + If True, additional EMOS fields are expected to be found in the input + cubes. If False, an error will be raised if any such cubes are found. + + Raises: + IOError: + If EMOS coefficients cubes are found when they are not expected. + IOError: + If additional fields cubes are found which do not match the features in + gam_features. + IOError: + If probability cubes are provided with more than one name. + + Returns: + - A cube containing all the historic forecasts, or None if no such cubes + were found. + - A cube containing all the truth data, or None if no such cubes were found + or no truth_attribute was provided. + - A cubelist containing all the additional fields required for the GAMs, + or None if no such cubes were found. + - A cubelist containing all the EMOS coefficient cubes, or None if no such + cubes were found. + - A cubelist containing all the additional fields required for EMOS, + or None if no such cubes were found. + - A cube containing a probability template, or None if no such cube is found. + """ + forecast = iris.cube.CubeList([]) + truth = iris.cube.CubeList([]) + gam_additional_fields = iris.cube.CubeList([]) + emos_coefficients = iris.cube.CubeList([]) + emos_additional_fields = iris.cube.CubeList([]) + prob_template = None + + # Prepare variables used to split forecast and truth. 
+ truth_key, truth_value = None, None + if truth_attribute: + truth_key, truth_value = truth_attribute.split("=") + + for cube in flatten(cubes): + if "time" in [c.name() for c in cube.coords()]: + if truth_key and cube.attributes.get(truth_key) == truth_value: + truth.append(cube.copy()) + else: + forecast.append(cube.copy()) + elif "emos_coefficient" in cube.name(): + emos_coefficients.append(cube.copy()) + elif cube.name() in gam_features: + gam_additional_fields.append(cube.copy()) + else: + emos_additional_fields.append(cube.copy()) + + # Check that all required inputs are present and no unexpected cubes have been + # found. + missing_inputs = [] + if len(forecast) == 0: + missing_inputs.append("forecast") + if truth_key and len(truth) == 0: + missing_inputs.append("truth") + if missing_inputs: + raise IOError(f"Missing {' and '.join(missing_inputs)} input.") + + if not expect_emos_coeffs and len(emos_coefficients) > 0: + msg = ( + f"Found EMOS coefficients cubes when they were not expected. The following " + f"such cubes were found: {[c.name() for c in emos_coefficients]}." + ) + raise IOError(msg) + + if not expect_emos_fields and len(emos_additional_fields) > 0: + msg = ( + f"Found additional fields cubes which do not match the features in " + f"gam_features. The following cubes were found: " + f"{[c.name() for c in emos_additional_fields]}." + ) + raise IOError(msg) + + # Split out prob_template cube if required. + forecast_names = [c.name() for c in forecast] + prob_forecast_names = [name for name in forecast_names if "probability" in name] + if len(set(prob_forecast_names)) > 1: + msg = ( + "Providing multiple probability cubes is not supported. A probability cube " + "can either be provided as the forecast or the probability template, but " + f"not both. Cubes provided: {prob_forecast_names}." + ) + raise IOError(msg) + else: + if len(set(forecast_names)) > 1: + prob_template = forecast.extract(prob_forecast_names[0])[0] + forecast.remove(prob_template) + + forecast = MergeCubes()(forecast) + if truth_key: + truth = MergeCubes()(truth) + + return ( + None if not forecast else forecast, + None if not truth else truth, + None if not gam_additional_fields else gam_additional_fields, + None if not emos_coefficients else emos_coefficients, + None if not emos_additional_fields else emos_additional_fields, + prob_template, + ) + + def validity_time_check(forecast: Cube, validity_times: List[str]) -> bool: """Check the validity time of the forecast matches the accepted validity times within the validity times list. diff --git a/improver/calibration/emos_calibration.py b/improver/calibration/emos_calibration.py index 78e204f9ee..59123e2e49 100644 --- a/improver/calibration/emos_calibration.py +++ b/improver/calibration/emos_calibration.py @@ -885,9 +885,10 @@ def _create_cubelist( cubelist is for a separate EMOS coefficient e.g. alpha, beta, gamma, delta. 
""" - (template_dims, spatial_associated_coords) = ( - self._get_spatial_associated_coordinates(historic_forecasts) - ) + ( + template_dims, + spatial_associated_coords, + ) = self._get_spatial_associated_coordinates(historic_forecasts) coords_to_replace = ( self._create_temporal_coordinates(historic_forecasts) + spatial_associated_coords @@ -1193,6 +1194,7 @@ def guess_and_minimise( forecast_predictor_data = np.ma.stack( broadcast_data_to_time_coord(forecast_predictors) ) + initial_guess = self.compute_initial_guess( truths.data, forecast_predictor_data, @@ -1743,181 +1745,6 @@ def _check_additional_field_sites(self, forecast, additional_fields): ) raise ValueError(msg) - @staticmethod - def _get_attribute( - coefficients: CubeList, attribute_name: str, optional: bool = False - ) -> Optional[Any]: - """Get the value for the requested attribute, ensuring that the - attribute is present consistently across the cubes within the - coefficients cubelist. - - Args: - coefficients: - EMOS coefficients - attribute_name: - Name of expected attribute - optional: - Indicate whether the attribute is allowed to be optional. - - Returns: - Returns None if the attribute is not present. Otherwise, - the value of the attribute is returned. - - Raises: - ValueError: If coefficients do not share the expected attributes. - """ - attributes = [ - str(c.attributes[attribute_name]) - for c in coefficients - if c.attributes.get(attribute_name) is not None - ] - - if not attributes and optional: - return None - if not attributes and not optional: - msg = ( - f"The {attribute_name} attribute must be specified on all " - "coefficients cubes." - ) - raise AttributeError(msg) - - if len(set(attributes)) == 1 and len(attributes) == len(coefficients): - return coefficients[0].attributes[attribute_name] - - msg = ( - "Coefficients must share the same {0} attribute. " - "{0} attributes provided: {1}".format(attribute_name, attributes) - ) - raise AttributeError(msg) - - def _get_input_forecast_type(self, forecast: Cube): - """Identifies whether the forecast is in probability, realization - or percentile space. - - Args: - forecast - - """ - try: - find_percentile_coordinate(forecast) - except CoordinateNotFoundError: - if forecast.name().startswith("probability_of"): - forecast_type = "probabilities" - else: - forecast_type = "realizations" - else: - forecast_type = "percentiles" - self.input_forecast_type = forecast_type - - def _convert_to_realizations( - self, forecast: Cube, realizations_count: Optional[int], ignore_ecc_bounds: bool - ) -> Cube: - """Convert an input forecast of probabilities or percentiles into - pseudo-realizations. 
-
-        Args:
-            forecast
-            realizations_count:
-                Number of pseudo-realizations to generate from the input
-                forecast
-            ignore_ecc_bounds
-
-        Returns:
-            Cube with pseudo-realizations
-        """
-        if not realizations_count:
-            raise ValueError(
-                "The 'realizations_count' argument must be defined "
-                "for forecasts provided as {}".format(self.input_forecast_type)
-            )
-
-        if self.input_forecast_type == "probabilities":
-            conversion_plugin = ConvertProbabilitiesToPercentiles(
-                ecc_bounds_warning=ignore_ecc_bounds
-            )
-        if self.input_forecast_type == "percentiles":
-            conversion_plugin = ResamplePercentiles(
-                ecc_bounds_warning=ignore_ecc_bounds
-            )
-
-        forecast_as_percentiles = conversion_plugin(
-            forecast, no_of_percentiles=realizations_count
-        )
-        forecast_as_realizations = RebadgePercentilesAsRealizations()(
-            forecast_as_percentiles
-        )
-
-        return forecast_as_realizations
-
-    def _format_forecast(
-        self, template: Cube, randomise: bool, random_seed: Optional[int]
-    ) -> Cube:
-        """
-        Generate calibrated probability, percentile or realization output in
-        the desired format.
-
-        Args:
-            template:
-                A template cube containing the coordinates and metadata expected
-                on the calibrated forecast.
-            randomise:
-                If True, order realization output randomly rather than using
-                the input forecast. If forecast type is not realizations, this
-                is ignored.
-            random_seed:
-                For realizations input if randomise is True, random seed for
-                generating re-ordered percentiles. If randomise is False, the
-                random seed may still be used for splitting ties.
-
-        Returns:
-            Calibrated forecast
-        """
-        if self.output_forecast_type == "probabilities":
-            conversion_plugin = ConvertLocationAndScaleParametersToProbabilities(
-                distribution=self.distribution["name"],
-                shape_parameters=self.distribution["shape"],
-            )
-            result = conversion_plugin(
-                self.distribution["location"], self.distribution["scale"], template
-            )
-
-        else:
-            conversion_plugin = ConvertLocationAndScaleParametersToPercentiles(
-                distribution=self.distribution["name"],
-                shape_parameters=self.distribution["shape"],
-            )
-
-            if self.output_forecast_type == "percentiles":
-                perc_coord = find_percentile_coordinate(template)
-                result = conversion_plugin(
-                    self.distribution["location"],
-                    self.distribution["scale"],
-                    template,
-                    percentiles=(
-                        self.percentiles if self.percentiles else perc_coord.points
-                    ),
-                )
-            else:
-                no_of_percentiles = len(template.coord("realization").points)
-                percentiles = conversion_plugin(
-                    self.distribution["location"],
-                    self.distribution["scale"],
-                    template,
-                    no_of_percentiles=no_of_percentiles,
-                )
-                result = EnsembleReordering().process(
-                    percentiles,
-                    template,
-                    random_ordering=randomise,
-                    random_seed=random_seed,
-                )
-
-        # Preserve cell methods from template.
-        for cm in template.cell_methods:
-            result.add_cell_method(cm)
-
-        return result
-
     def process(
         self,
         forecast: Cube,
@@ -1931,7 +1758,8 @@ def process(
         predictor: str = "mean",
         randomise: bool = False,
         random_seed: Optional[int] = None,
-    ) -> Cube:
+        return_parameters: bool = False,
+    ) -> Union[Cube, CubeList]:
         """Calibrate input forecast using pre-calculated coefficients
 
         Args:
@@ -1969,12 +1797,18 @@ def process(
             random_seed:
                 Used in generating calibrated realizations. If input forecast
                 is probabilities or percentiles, this is ignored.
+            return_parameters:
+                If True, return location and scale parameters of the calibrated
+                forecast distribution, rather than forecasts. This option
+                supersedes all other inputs which affect the format of the output.
Returns: Calibrated forecast in the form of the input (ie probabilities - percentiles or realizations) + percentiles or realizations), or a tuple containing cubes of location and + scale parameters of the calibrated forecast distribution if + return_parameters is True. """ - self._get_input_forecast_type(forecast) + self.input_forecast_type = get_forecast_type(forecast) self.output_forecast_type = ( "probabilities" if prob_template else self.input_forecast_type ) @@ -1992,7 +1826,7 @@ def process( forecast_as_realizations = forecast.copy() if self.input_forecast_type != "realizations": - forecast_as_realizations = self._convert_to_realizations( + forecast_as_realizations = convert_to_realizations( forecast.copy(), realizations_count, ignore_ecc_bounds ) @@ -2007,20 +1841,222 @@ def process( tolerate_time_mismatch=tolerate_time_mismatch, ) - self.distribution = { - "name": self._get_attribute(coefficients, "distribution"), - "location": location_parameter, - "scale": scale_parameter, - "shape": self._get_attribute( - coefficients, "shape_parameters", optional=True - ), - } + if return_parameters: + return location_parameter, scale_parameter + else: + self.distribution = { + "name": get_attribute_from_coefficients(coefficients, "distribution"), + "location": location_parameter, + "scale": scale_parameter, + "shape": get_attribute_from_coefficients( + coefficients, "shape_parameters", optional=True + ), + } - template = prob_template if prob_template else forecast - result = self._format_forecast(template, randomise, random_seed) + template = prob_template if prob_template else forecast + result = generate_forecast_from_distribution( + self.distribution, template, self.percentiles, randomise, random_seed + ) - if land_sea_mask: - # fill in masked sea points with uncalibrated data - merge_land_and_sea(result, forecast) + if land_sea_mask: + # fill in masked sea points with uncalibrated data + merge_land_and_sea(result, forecast) - return result + return result + + +def get_forecast_type(forecast: Cube) -> str: + """Identifies whether the forecast is in probability, realization + or percentile space. + + Args: + forecast + + Returns: + forecast_type: str + One of "probabilities", "realizations" or "percentiles" + """ + try: + find_percentile_coordinate(forecast) + except CoordinateNotFoundError: + if forecast.name().startswith("probability_of"): + forecast_type = "probabilities" + else: + forecast_type = "realizations" + else: + forecast_type = "percentiles" + + return forecast_type + + +def convert_to_realizations( + forecast: Cube, realizations_count: Optional[int], ignore_ecc_bounds: bool +) -> Cube: + """Convert an input forecast of probabilities or percentiles into + pseudo-realizations. + + Args: + forecast + realizations_count: + Number of pseudo-realizations to generate from the input + forecast + ignore_ecc_bounds: + If True, allow percentiles from probabilities to exceed the ECC bounds + range. If input is not probabilities, this is ignored. + + Returns: + Cube with pseudo-realizations. 
+ """ + input_forecast_type = get_forecast_type(forecast) + if not realizations_count: + raise ValueError( + "The 'realizations_count' argument must be defined " + f"for forecasts provided as {input_forecast_type}" + ) + + if input_forecast_type == "probabilities": + conversion_plugin = ConvertProbabilitiesToPercentiles( + ecc_bounds_warning=ignore_ecc_bounds + ) + if input_forecast_type == "percentiles": + conversion_plugin = ResamplePercentiles(ecc_bounds_warning=ignore_ecc_bounds) + + forecast_as_percentiles = conversion_plugin( + forecast, no_of_percentiles=realizations_count + ) + forecast_as_realizations = RebadgePercentilesAsRealizations()( + forecast_as_percentiles + ) + + return forecast_as_realizations + + +def get_attribute_from_coefficients( + coefficients: CubeList, attribute_name: str, optional: bool = False +) -> Optional[Any]: + """Get the value for the requested attribute, ensuring that the + attribute is present consistently across the cubes within the + coefficients cubelist. + + Args: + coefficients: + EMOS coefficients + attribute_name: + Name of expected attribute + optional: + Indicate whether the attribute is allowed to be optional. + + Returns: + Returns None if the attribute is not present. Otherwise, + the value of the attribute is returned. + + Raises: + AttributeError: If the expected attribute is not on all coefficients cubes. + AttributeError: If the expected attribute is not the same across all + coefficients cubes. + """ + attributes = [ + str(c.attributes[attribute_name]) + for c in coefficients + if c.attributes.get(attribute_name) is not None + ] + + if not attributes and optional: + return None + if not attributes and not optional: + msg = ( + f"The {attribute_name} attribute must be specified on all " + "coefficients cubes." + ) + raise AttributeError(msg) + + if len(set(attributes)) == 1 and len(attributes) == len(coefficients): + return coefficients[0].attributes[attribute_name] + + msg = ( + "Coefficients must share the same {0} attribute. " + "{0} attributes provided: {1}".format(attribute_name, attributes) + ) + raise AttributeError(msg) + + +def generate_forecast_from_distribution( + distribution: Dict, + template: Cube, + percentiles: Optional[Sequence], + randomise: bool, + random_seed: Optional[int], +) -> Cube: + """ + Generate calibrated probability, percentile or realization output in + the desired format. + + Args: + distribution: + A dictionary with the following key-value pairs: + - "name": the name of the distribution + - "location": a cube of location parameters + - "scale": a cube of scale parameters + - "shape": an optional array of shape parameters + template: + A template cube containing the coordinates and metadata expected + on the calibrated forecast. + percentiles: + The set of percentiles used to create the calibrated forecast if the + template cube contains percentile data. + randomise: + If True, order realization output randomly rather than using + the input forecast. If forecast type is not realizations, this + is ignored. + random_seed: + For realizations input if randomise is True, random seed for + generating re-ordered percentiles. If randomise is False, the + random seed may still be used for splitting ties. 
+ + Returns: + Calibrated forecast + """ + output_forecast_type = get_forecast_type(template) + if output_forecast_type == "probabilities": + conversion_plugin = ConvertLocationAndScaleParametersToProbabilities( + distribution=distribution["name"], + shape_parameters=distribution["shape"], + ) + result = conversion_plugin( + distribution["location"], distribution["scale"], template + ) + + else: + conversion_plugin = ConvertLocationAndScaleParametersToPercentiles( + distribution=distribution["name"], + shape_parameters=distribution["shape"], + ) + + if output_forecast_type == "percentiles": + perc_coord = find_percentile_coordinate(template) + result = conversion_plugin( + distribution["location"], + distribution["scale"], + template, + percentiles=(percentiles if percentiles else perc_coord.points), + ) + else: + no_of_percentiles = len(template.coord("realization").points) + percentiles = conversion_plugin( + distribution["location"], + distribution["scale"], + template, + no_of_percentiles=no_of_percentiles, + ) + result = EnsembleReordering().process( + percentiles, + template, + random_ordering=randomise, + random_seed=random_seed, + ) + + # Preserve cell methods from template. + for cm in template.cell_methods: + result.add_cell_method(cm) + + return result diff --git a/improver/calibration/load_and_train_quantile_regression_random_forest.py b/improver/calibration/load_and_train_quantile_regression_random_forest.py index bc342af45f..5a8d9d558c 100644 --- a/improver/calibration/load_and_train_quantile_regression_random_forest.py +++ b/improver/calibration/load_and_train_quantile_regression_random_forest.py @@ -20,7 +20,7 @@ CalibrationSchemas, get_training_period_cycles, identify_parquet_type, - split_pickle_parquet_and_netcdf, + split_netcdf_parquet_pickle, ) from improver.calibration.quantile_regression_random_forest import ( TrainQuantileRegressionRandomForests, @@ -290,7 +290,7 @@ def process( """ if not self.quantile_forest_installed: return None, None, None - cube_inputs, parquets, _ = split_pickle_parquet_and_netcdf(file_paths) + cube_inputs, parquets, _ = split_netcdf_parquet_pickle(file_paths) # If there are no parquet files, return None. if not parquets: diff --git a/improver/calibration/samos_calibration.py b/improver/calibration/samos_calibration.py index 7535aa0560..4d7e188780 100644 --- a/improver/calibration/samos_calibration.py +++ b/improver/calibration/samos_calibration.py @@ -7,7 +7,7 @@ Statistics (SAMOS). 
""" -from typing import Dict, List, Optional, Tuple +from typing import Dict, List, Optional, Sequence, Tuple import iris import iris.pandas @@ -25,14 +25,21 @@ def GAM(self): from iris.analysis import MEAN, STD_DEV from iris.cube import Cube, CubeList from iris.util import new_axis -from numpy import array, int64, nan +from numpy import array, clip, float32, int64, nan from numpy.ma import masked_all_like from pandas import merge -from improver import BasePlugin +from improver import BasePlugin, PostProcessingPlugin from improver.calibration.emos_calibration import ( + ApplyEMOS, EstimateCoefficientsForEnsembleCalibration, + convert_to_realizations, + generate_forecast_from_distribution, + get_attribute_from_coefficients, + get_forecast_type, ) +from improver.ensemble_copula_coupling.utilities import get_bounds_of_distribution +from improver.metadata.probabilistic import find_threshold_coordinate from improver.utilities.cube_manipulation import collapse_realizations from improver.utilities.generalized_additive_models import GAMFit, GAMPredict from improver.utilities.mathematical_operations import CalculateClimateAnomalies @@ -44,6 +51,7 @@ def GAM(self): def prepare_data_for_gam( input_cube: Cube, additional_fields: Optional[CubeList] = None, + unique_site_id_key: Optional[str] = None, ) -> pd.DataFrame: """ Convert input cubes in to a single, combined dataframe. @@ -56,9 +64,15 @@ def prepare_data_for_gam( additional_fields cubes. Args: - input_cube: A cube of forecast or observation data. - additional_fields: Additional cubes with points which can be matched with points - in input_cube by matching spatial coordinate values. + input_cube: + A cube of forecast or observation data. + additional_fields: + Additional cubes with points which can be matched with points + in input_cube by matching spatial coordinate values. + unique_site_id_key: + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. Returns: A pandas dataframe with rows equal to the number of sites/grid points in @@ -77,11 +91,25 @@ def prepare_data_for_gam( add_ancillary_variables=True, ) df.reset_index(inplace=True) + if additional_fields: - spatial_coords = [ - input_cube.coord(axis="X").name(), - input_cube.coord(axis="Y").name(), - ] + # Check if we are dealing with spot data. + site_data = "spot_index" in [c.name() for c in input_cube.coords()] + + # For site data we should use unique IDs wherever possible. As a + # fallback we can match on latitude, longitude and altitude. We + # need altitude to accommodate e.g. sites with a lower and upper + # forecast altitude. If we are working with gridded data we use the + # spatial coordinates of the input cube. 
+ if site_data and unique_site_id_key is not None: + match_coords = [unique_site_id_key] + elif site_data: + match_coords = ["latitude", "longitude", "altitude"] + else: + match_coords = [ + input_cube.coord(axis="X").name(), + input_cube.coord(axis="Y").name(), + ] for cube in additional_fields: new_df = iris.pandas.as_data_frame( cube, @@ -90,9 +118,7 @@ def prepare_data_for_gam( add_ancillary_variables=True, ) new_df.reset_index(inplace=True) - match_coords = spatial_coords.copy() - match_coords.append(cube.name()) - df = merge(left=df, right=new_df[match_coords], how="left") + df = merge(left=df, right=new_df[match_coords + [cube.name()]], how="left") return df @@ -130,6 +156,63 @@ def convert_dataframe_to_cube( return result +def get_climatological_stats( + input_cube: Cube, + gams: List, + gam_features: List[str], + additional_cubes: Optional[CubeList], + sd_clip: float = 0.25, + unique_site_id_key: Optional[str] = None, +) -> Tuple[Cube, Cube]: + """Function to predict climatological means and standard deviations given fitted + GAMs for each statistic and cubes which can be used to construct a dataframe + containing all required features for those GAMs. + + Args: + input_cube + gams: A list containing two fitted GAMs, the first for predicting the + climatological mean of the locations in input_cube and the second + predicting the climatological standard deviation. + gam_features: + The list of features. These must be either coordinates on input_cube or + share a name with a cube in additional_cubes. The index of each + feature should match the indices used in model_specification. + additional_cubes: + Additional fields to use as supplementary predictors. + sd_clip: + The minimum standard deviation value to allow when predicting from the GAM. + Any predictions below this value will be set to this value. + unique_site_id_key: + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. + + Returns: + A pair of cubes containing climatological mean and climatological standard + deviation predictions respectively. + """ + diagnostic = input_cube.name() + + df = prepare_data_for_gam( + input_cube, additional_cubes, unique_site_id_key=unique_site_id_key + ) + + # Calculate climatological means and standard deviations using previously + # fitted GAMs. + mean_pred = GAMPredict().process(gams[0], df[gam_features]) + sd_pred = GAMPredict().process(gams[1], df[gam_features]) + + # Convert means and standard deviations into cubes + df[diagnostic] = mean_pred + mean_cube = convert_dataframe_to_cube(df, input_cube) + + df[diagnostic] = sd_pred + sd_cube = convert_dataframe_to_cube(df, input_cube) + sd_cube.data = clip(sd_cube.data, a_min=sd_clip, a_max=None) + + return mean_cube, sd_cube + + class TrainGAMsForSAMOS(BasePlugin): """ Class for fitting Generalised Additive Models (GAMs) to training data for use in @@ -150,6 +233,7 @@ def __init__( link: str = "identity", fit_intercept: bool = True, window_length: int = 11, + unique_site_id_key: Optional[str] = None, ): """ Initialize the class. @@ -157,29 +241,33 @@ def __init__( Args: model_specification: A list of lists which each contain three items (in order): - 1. a string containing a single pyGAM term; one of 'linear', - 'spline', 'tensor', or 'factor' - 2. a list of integers which correspond to the features to be - included in that term - 3. a dictionary of kwargs to be included when defining the term + 1. 
a string containing a single pyGAM term; one of 'linear', + 'spline', 'tensor', or 'factor'. + 2. a list of integers which correspond to the features to be + included in that term. + 3. a dictionary of kwargs to be included when defining the term. max_iter: A pyGAM argument which determines the maximum iterations allowed when - fitting the GAM + fitting the GAM. tol: A pyGAM argument determining the tolerance used to define the stopping - criteria + criteria. distribution: - A pyGAM argument determining the distribution to be used in the model + A pyGAM argument determining the distribution to be used in the model. link: - A pyGAM argument determining the link function to be used in the model + A pyGAM argument determining the link function to be used in the model. fit_intercept: A pyGAM argument determining whether to include an intercept term in - the model + the model. window_length: The length of the rolling window used to calculate the mean and standard deviation of the input cube when the input cube does not have a realization dimension coordinate. This must be an odd integer greater than 1. + unique_site_id_key: + An optional key to use for uniquely identifying each site in the + training data. If not provided, the default behavior is to use the + spatial coordinates (latitude, longitude) of each site. """ self.model_specification = model_specification self.max_iter = max_iter @@ -187,6 +275,7 @@ def __init__( self.distribution = distribution self.link = link self.fit_intercept = fit_intercept + self.unique_site_id_key = unique_site_id_key if window_length < 3 or window_length % 2 == 0 or window_length % 1 != 0: raise ValueError( @@ -201,6 +290,7 @@ def apply_aggregator( ) -> Cube: """ Internal function to apply rolling window aggregator to padded cube. + Args: padded_cube: The cube to have rolling window calculation applied to. @@ -448,11 +538,11 @@ def process( ) for stat_cube in stat_cubes: - df = prepare_data_for_gam(stat_cube, additional_fields) - + df = prepare_data_for_gam( + stat_cube, additional_fields, unique_site_id_key=self.unique_site_id_key + ) feature_values = df[features].values targets = df[input_cube.name()].values - output.append(plugin.process(feature_values, targets)) return output @@ -478,6 +568,7 @@ def __init__( self, distribution: str, emos_kwargs: Optional[Dict] = None, + unique_site_id_key: Optional[str] = None, ) -> None: """Initialize the class. @@ -488,56 +579,14 @@ def __init__( emos_kwargs: Keyword arguments accepted by the EstimateCoefficientsForEnsembleCalibration plugin. Should not contain a distribution argument. + unique_site_id_key: + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. """ self.distribution = distribution self.emos_kwargs = emos_kwargs if emos_kwargs else {} - - @staticmethod - def get_climatological_stats( - input_cube: Cube, - gams: List[pygam.GAM], - gam_features: List[str], - additional_fields: Optional[CubeList], - ) -> Tuple[Cube, Cube]: - """Function to predict climatological means and standard deviations given fitted - GAMs for each statistic and cubes which can be used to construct a dataframe - containing all required features for those GAMs. - - Args: - input_cube: - A cube of forecasts or observations from the training dataset. 
- gams: - A list containing two fitted GAMs, the first for predicting the - climatological mean of the locations in input_cube and the second - predicting the climatological standard deviation. - gam_features: - The list of features. These must be either coordinates on input_cube or - share a name with a cube in additional_fields. The index of each - feature should match the indices used in model_specification. - additional_fields: - Additional fields to use as supplementary predictors. - - Returns: - A pair of cubes containing climatological mean and climatological standard - deviation predictions respectively. - """ - diagnostic = input_cube.name() - - df = prepare_data_for_gam(input_cube, additional_fields) - - # Calculate climatological means and standard deviations using previously - # fitted GAMs. - mean_pred = GAMPredict().process(gams[0], df[gam_features]) - sd_pred = GAMPredict().process(gams[1], df[gam_features]) - - # Convert means and standard deviations into cubes - df[diagnostic] = mean_pred - mean_cube = convert_dataframe_to_cube(df, input_cube) - - df[diagnostic] = sd_pred - sd_cube = convert_dataframe_to_cube(df, input_cube) - - return mean_cube, sd_cube + self.unique_site_id_key = unique_site_id_key def climate_anomaly_emos( self, @@ -566,7 +615,6 @@ def climate_anomaly_emos( land points are used to calculate the coefficients. Within the land-sea mask cube land points should be specified as ones, and sea points as zeros. - Returns: CubeList constructed using the coefficients provided and using metadata from the historic_forecasts cube. Each cube within the @@ -634,18 +682,25 @@ def process( land points are used to calculate the EMOS coefficients. Within the land-sea mask cube land points should be specified as ones, and sea points as zeros. - Returns: CubeList constructed using the coefficients provided and using metadata from the historic_forecasts cube. Each cube within the cubelist is for a separate EMOS coefficient e.g. alpha, beta, gamma, delta. """ - forecast_mean, forecast_sd = self.get_climatological_stats( - historic_forecasts, forecast_gams, gam_features, gam_additional_fields + forecast_mean, forecast_sd = get_climatological_stats( + historic_forecasts, + forecast_gams, + gam_features, + gam_additional_fields, + unique_site_id_key=self.unique_site_id_key, ) - truth_mean, truth_sd = self.get_climatological_stats( - truths, truth_gams, gam_features, gam_additional_fields + truth_mean, truth_sd = get_climatological_stats( + truths, + truth_gams, + gam_features, + gam_additional_fields, + unique_site_id_key=self.unique_site_id_key, ) emos_coefficients = self.climate_anomaly_emos( @@ -656,3 +711,252 @@ def process( ) return emos_coefficients + + +class ApplySAMOS(PostProcessingPlugin): + """ + Class to calibrate an input forecast using SAMOS given the following inputs: + - Two GAMs which model, respectively, the climatological mean and standard + deviation of the forecast. This allows the forecast to be converted to + climatological anomalies. + - A set of EMOS coefficients which can be applied to correct the climatological + anomalies. + """ + + def __init__( + self, + percentiles: Optional[Sequence] = None, + unique_site_id_key: Optional[str] = None, + ): + """Initialize class. + + Args: + percentiles: + The set of percentiles used to create the calibrated forecast. + unique_site_id_key: + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. 
+ """ + self.percentiles = [float32(p) for p in percentiles] if percentiles else None + self.unique_site_id_key = unique_site_id_key + + def transform_anomalies_to_original_units( + self, + location_parameter: Cube, + scale_parameter: Cube, + truth_mean: Cube, + truth_sd: Cube, + forecast: Cube, + input_forecast_type: str, + ) -> None: + """Function to transform location and scale parameters which describe a + climatological anomaly distribution to location and scale parameters which + describe a distribution in the units of the original forecast. Both parameter + cubes are modified in place. + + Predictions of mean and standard deviation from the 'truth' GAMs are used for + this transformation. This ensures that the calibrated forecast follows the + 'true' distribution, rather than the distribution of the original forecast, + following the suggested method in: + + Dabernig, M., Mayr, G.J., Messner, J.W. and Zeileis, A. (2017). + Spatial ensemble post-processing with standardized anomalies. + Q.J.R. Meteorol. Soc, 143: 909-916. https://doi.org/10.1002/qj.2975 + + Args: + location_parameter: + Cube containing the location parameter of the climatological anomaly + distribution. This is modified in place. + scale_parameter: + Cube containing the scale parameter of the climatological anomaly + distribution. This is modified in place. + truth_mean: + Cube containing climatological mean predictions of the truths. + truth_sd: + Cube containing climatological standard deviation predictions of the + truths. + forecast: + The original, uncalibrated forecast. + input_forecast_type: + The type of the original, uncalibrated forecast. One of 'realizations', + 'percentiles' or 'probabilities'. + """ + + # The data in these cubes are identical along the realization dimensions. + truth_mean = next(truth_mean.slices_over("realization")) + truth_sd = next(truth_sd.slices_over("realization")) + + # Transform location and scale parameters so that they represent a distribution + # in the units of the original forecast, rather than climatological anomalies. + forecast_units = ( + forecast.units + if input_forecast_type != "probabilities" + else find_threshold_coordinate(forecast).units + ) + location_parameter.data = ( + location_parameter.data * truth_sd.data + ) + truth_mean.data + location_parameter.units = forecast_units + + # The scale parameter returned by ApplyEMOS is the standard deviation for a + # normal distribution. To get the desired standard deviation in + # realization/percentile space we must multiply by the estimated forecast + # standard deviation. + scale_parameter.data = scale_parameter.data * truth_sd.data + scale_parameter.units = forecast_units + + def process( + self, + forecast: Cube, + forecast_gams: List, + truth_gams: List, + gam_features: List[str], + emos_coefficients: CubeList, + gam_additional_fields: Optional[CubeList] = None, + emos_additional_fields: Optional[CubeList] = None, + prob_template: Optional[Cube] = None, + realizations_count: Optional[int] = None, + ignore_ecc_bounds: bool = True, + tolerate_time_mismatch: bool = False, + predictor: str = "mean", + randomise: bool = False, + random_seed: Optional[int] = None, + ): + """Calibrate input forecast using GAMs to convert the forecast to climatological + anomalies and pre-calculated EMOS coefficients to apply to those anomalies. + + Args: + forecast: + Uncalibrated forecast as probabilities, percentiles or + realizations. 
+ forecast_gams: + A list containing two fitted GAMs, the first for predicting the + climatological mean of the historic forecasts at each location and the + second predicting the climatological standard deviation. + truth_gams: + A list containing two fitted GAMs, the first for predicting the + climatological mean of the truths at each location and the + second predicting the climatological standard deviation. + gam_features: + The list of features. These must be either coordinates on input_cube or + share a name with a cube in additional_cubes. The index of each + feature should match the indices used in model_specification. + emos_coefficients: + EMOS coefficients. + gam_additional_fields: + Additional fields to use as supplementary predictors in the GAMs. + emos_additional_fields: + Additional fields to use as supplementary predictors in EMOS. + prob_template: + A cube containing a probability forecast that will be used as + a template when generating probability output when the input + format of the forecast cube is not probabilities i.e. realizations + or percentiles. + realizations_count: + Number of realizations to use when generating the intermediate + calibrated forecast from probability or percentile inputs + ignore_ecc_bounds: + If True, allow percentiles from probabilities to exceed the ECC + bounds range. If input is not probabilities, this is ignored. + tolerate_time_mismatch: + If True, tolerate a mismatch in validity time and forecast + period for coefficients vs forecasts. Use with caution! + predictor: + Predictor to be used to calculate the location parameter of the + calibrated distribution. Value is "mean" or "realizations". + randomise: + Used in generating calibrated realizations. If input forecast + is probabilities or percentiles, this is ignored. + random_seed: + Used in generating calibrated realizations. If input forecast + is probabilities or percentiles, this is ignored. + + Returns: + Calibrated forecast in the form of the input (ie probabilities + percentiles or realizations). + """ + input_forecast_type = get_forecast_type(forecast) + + forecast_as_realizations = forecast.copy() + if input_forecast_type != "realizations": + forecast_as_realizations = convert_to_realizations( + forecast.copy(), realizations_count, ignore_ecc_bounds + ) + forecast_mean, forecast_sd = get_climatological_stats( + forecast_as_realizations, + forecast_gams, + gam_features, + gam_additional_fields, + unique_site_id_key=self.unique_site_id_key, + ) + forecast_ca = CalculateClimateAnomalies(ignore_temporal_mismatch=True).process( + diagnostic_cube=forecast_as_realizations, + mean_cube=forecast_mean, + std_cube=forecast_sd, + ) + + # Returns parameters which describe a climate anomaly distribution. + location_parameter, scale_parameter = ApplyEMOS( + percentiles=self.percentiles + ).process( + forecast=forecast_ca, + coefficients=emos_coefficients, + additional_fields=emos_additional_fields, + prob_template=prob_template, + realizations_count=realizations_count, + ignore_ecc_bounds=ignore_ecc_bounds, + tolerate_time_mismatch=tolerate_time_mismatch, + predictor=predictor, + randomise=randomise, + random_seed=random_seed, + return_parameters=True, + ) + + truth_mean, truth_sd = get_climatological_stats( + forecast_as_realizations, + truth_gams, + gam_features, + gam_additional_fields, + unique_site_id_key=self.unique_site_id_key, + ) + + # Transform location and scale parameters to be in the same units as the + # original forecast. 
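+        # The truth climatology is used for this back-transformation so that
+        # the calibrated forecast follows the observed climate rather than the
+        # model climate (Dabernig et al., 2017).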
+ self.transform_anomalies_to_original_units( + truth_mean=truth_mean, + truth_sd=truth_sd, + location_parameter=location_parameter, + scale_parameter=scale_parameter, + forecast=forecast, + input_forecast_type=input_forecast_type, + ) + + # Generate output in desired format from distribution. + self.distribution = { + "name": get_attribute_from_coefficients(emos_coefficients, "distribution"), + "location": location_parameter, + "scale": scale_parameter, + "shape": get_attribute_from_coefficients( + emos_coefficients, "shape_parameters", optional=True + ), + } + + template = prob_template if prob_template else forecast + result = generate_forecast_from_distribution( + self.distribution, template, self.percentiles, randomise, random_seed + ) + + if input_forecast_type != "probabilities" and not prob_template: + # Enforce that the result is within sensible bounds. + bounds_pairing = get_bounds_of_distribution( + bounds_pairing_key=result.name(), desired_units=result.units + ) + result.data = clip( + result.data, a_min=bounds_pairing[0], a_max=bounds_pairing[1] + ) + + # Enforce correct dtype. + result.data = result.data.astype(dtype=float32) + + return result diff --git a/improver/calibration/utilities.py b/improver/calibration/utilities.py index 68eec73f6d..9c788b4155 100644 --- a/improver/calibration/utilities.py +++ b/improver/calibration/utilities.py @@ -8,10 +8,13 @@ """ +import warnings +from pathlib import Path from typing import List, Set, Tuple, Union import iris import numpy as np +import pandas as pd from iris.coords import DimCoord from iris.cube import Cube, CubeList from numpy import ndarray @@ -472,3 +475,183 @@ def check_data_sufficiency( f"historic forecasts and truth pairs: {proportion_of_nans}." ) raise ValueError(msg) + + +def prepare_cube_no_calibration( + forecast: Cube, + emos_coefficients: CubeList, + ignore_ecc_bounds_exceedance: bool = False, + validity_times: List[str] = None, + percentiles: List[float] = None, + prob_template: Cube = None, +) -> Cube | None: + """ + Function to add appropriate metadata to cubes that cannot be calibrated. If the + forecast can be calibrated then nothing is returned. + + Args: + forecast (iris.cube.Cube): + The forecast to be calibrated. The input format could be either + realizations, probabilities or percentiles. + emos_coefficients (iris.cube.CubeList): + The EMOS coefficients to be applied to the forecast. + ignore_ecc_bounds_exceedance (bool): + If True, where the percentiles exceed the ECC bounds range, + raises a warning rather than an exception. This occurs when the + current forecast is in the form of probabilities and is + converted to percentiles, as part of converting the input + probabilities into realizations. + validity_times (List[str]): + Times at which the forecast must be valid. This must be provided + as a four digit string (HHMM) where the first two digits represent the hour + and the last two digits represent the minutes e.g. 0300 or 0315. If the + forecast provided is at a different validity time then no coefficients + will be applied. + percentiles (List[float]): + The set of percentiles used to create the calibrated forecast. + prob_template (iris.cube.Cube): + Optionally, a cube containing a probability forecast that will be + used as a template when generating probability output when the input + format of the forecast cube is not probabilities i.e. realizations + or percentiles. 
+            If no coefficients are provided and a probability template is
+            provided, the probability template forecast will be returned as
+            the uncalibrated probability forecast.
+
+    Returns:
+        The uncalibrated forecast with updated metadata, or None if the
+        forecast can be calibrated.
+    """
+    from improver.calibration import add_warning_comment, validity_time_check
+    from improver.ensemble_copula_coupling.ensemble_copula_coupling import (
+        ResamplePercentiles,
+    )
+
+    if validity_times is not None and not validity_time_check(forecast, validity_times):
+        if percentiles:
+            # Ensure that a consistent set of percentiles is returned,
+            # regardless of whether calibration is successfully applied.
+            percentiles = [np.float32(p) for p in percentiles]
+            forecast = ResamplePercentiles(
+                ecc_bounds_warning=ignore_ecc_bounds_exceedance
+            )(forecast, percentiles=percentiles)
+        elif prob_template:
+            forecast = prob_template
+        forecast = add_warning_comment(forecast)
+        return forecast
+
+    if emos_coefficients is None:
+        if prob_template:
+            msg = (
+                "There are no coefficients provided for calibration. As a "
+                "probability template has been provided with the aim of "
+                "creating a calibrated probability forecast, the probability "
+                "template will be returned as the uncalibrated probability "
+                "forecast."
+            )
+            warnings.warn(msg)
+            prob_template = add_warning_comment(prob_template)
+            return prob_template
+
+        if percentiles:
+            # Ensure that a consistent set of percentiles is returned,
+            # regardless of whether calibration is successfully applied.
+            percentiles = [np.float32(p) for p in percentiles]
+            forecast = ResamplePercentiles(
+                ecc_bounds_warning=ignore_ecc_bounds_exceedance
+            )(forecast, percentiles=percentiles)
+
+        msg = (
+            "There are no coefficients provided for calibration. The "
+            "uncalibrated forecast will be returned."
+        )
+        warnings.warn(msg)
+
+        forecast = add_warning_comment(forecast)
+        return forecast
+
+
+def convert_parquet_to_cube(
+    forecast: Path,
+    truth: Path,
+    forecast_period: int,
+    cycletime: str,
+    training_length: int,
+    diagnostic: str,
+    percentiles: List[float],
+    experiment: str,
+) -> Union[CubeList, List[None]]:
+    """Function to convert Parquet files containing forecast and truth data
+    into a CubeList for use in calibration.
+
+    Args:
+        forecast (pathlib.Path):
+            The path to a Parquet file containing the historical forecasts
+            to be used for calibration. The expected columns within the
+            Parquet file are: forecast, blend_time, forecast_period,
+            forecast_reference_time, time, wmo_id, percentile, diagnostic,
+            latitude, longitude, period, height, cf_name, units.
+        truth (pathlib.Path):
+            The path to a Parquet file containing the truths to be used
+            for calibration. The expected columns within the
+            Parquet file are: ob_value, time, wmo_id, diagnostic, latitude,
+            longitude and altitude.
+        forecast_period (int):
+            Forecast period to be calibrated in seconds.
+        cycletime (str):
+            Cycletime of a format similar to 20170109T0000Z.
+        training_length (int):
+            Number of days within the training period.
+        diagnostic (str):
+            The name of the diagnostic to be calibrated within the forecast
+            and truth tables. This name is used to filter the Parquet file
+            when reading from disk.
+        percentiles (List[float]):
+            The set of percentiles to be used for estimating coefficients.
+            These should be a set of equally spaced quantiles.
+        experiment (str):
+            A value within the experiment column to select from the forecast
+            table.
+ + Returns: + A CubeList containing the forecast and truth cubes, with the + forecast cube containing the percentiles as an auxiliary coordinate. + """ + from improver.calibration.dataframe_utilities import ( + forecast_and_truth_dataframes_to_cubes, + ) + + # Load forecasts from parquet file filtering by diagnostic and blend_time. + forecast_period_td = pd.Timedelta(int(forecast_period), unit="seconds") + + cycletimes = pd.date_range( + end=pd.Timestamp(cycletime) + - pd.Timedelta(1, unit="days") + - forecast_period_td.floor("D"), + periods=int(training_length), + freq="D", + ) + filters = [[("diagnostic", "==", diagnostic), ("blend_time", "in", cycletimes)]] + forecast_df = pd.read_parquet(forecast, filters=filters) + + # Load truths from parquet file filtering by diagnostic. + filters = [[("diagnostic", "==", diagnostic)]] + truth_df = pd.read_parquet(truth, filters=filters) + if truth_df.empty: + msg = ( + f"The requested filepath {truth} does not contain the " + f"requested contents: {filters}" + ) + raise IOError(msg) + + forecast_cube, truth_cube = forecast_and_truth_dataframes_to_cubes( + forecast_df, + truth_df, + cycletime, + forecast_period, + training_length, + percentiles=percentiles, + experiment=experiment, + ) + if not forecast_cube or not truth_cube: + return [None, None] + else: + return CubeList([forecast_cube, truth_cube]) diff --git a/improver/cli/apply_emos_coefficients.py b/improver/cli/apply_emos_coefficients.py index 848ab3edf8..425ac08087 100644 --- a/improver/cli/apply_emos_coefficients.py +++ b/improver/cli/apply_emos_coefficients.py @@ -107,66 +107,27 @@ def process( The calibrated forecast cube. """ - import warnings - - import numpy as np from improver.calibration import ( - add_warning_comment, split_forecasts_and_coeffs, - validity_time_check, ) from improver.calibration.emos_calibration import ApplyEMOS - from improver.ensemble_copula_coupling.ensemble_copula_coupling import ( - ResamplePercentiles, - ) + from improver.calibration.utilities import prepare_cube_no_calibration (forecast, coefficients, additional_predictors, land_sea_mask, prob_template) = ( split_forecasts_and_coeffs(cubes, land_sea_mask_name) ) - if validity_times is not None and not validity_time_check(forecast, validity_times): - if percentiles: - # Ensure that a consistent set of percentiles are returned, - # regardless of whether EMOS is successfully applied. - percentiles = [np.float32(p) for p in percentiles] - forecast = ResamplePercentiles( - ecc_bounds_warning=ignore_ecc_bounds_exceedance - )(forecast, percentiles=percentiles) - elif prob_template: - forecast = prob_template - forecast = add_warning_comment(forecast) - return forecast - - if coefficients is None: - if prob_template: - msg = ( - "There are no coefficients provided for calibration. As a " - "probability template has been provided with the aim of " - "creating a calibrated probability forecast, the probability " - "template will be returned as the uncalibrated probability " - "forecast." - ) - warnings.warn(msg) - prob_template = add_warning_comment(prob_template) - return prob_template - - if percentiles: - # Ensure that a consistent set of percentiles are returned, - # regardless of whether EMOS is successfully applied. - percentiles = [np.float32(p) for p in percentiles] - forecast = ResamplePercentiles( - ecc_bounds_warning=ignore_ecc_bounds_exceedance - )(forecast, percentiles=percentiles) - - msg = ( - "There are no coefficients provided for calibration. The " - "uncalibrated forecast will be returned." 
-    )
-    warnings.warn(msg)
-
-    forecast = add_warning_comment(forecast)
-    return forecast
+    uncalibrated_forecast = prepare_cube_no_calibration(
+        forecast,
+        coefficients,
+        ignore_ecc_bounds_exceedance=ignore_ecc_bounds_exceedance,
+        validity_times=validity_times,
+        percentiles=percentiles,
+        prob_template=prob_template,
+    )
+    if uncalibrated_forecast is not None:
+        return uncalibrated_forecast
 
     calibration_plugin = ApplyEMOS(percentiles=percentiles)
     result = calibration_plugin(
diff --git a/improver/cli/apply_quantile_regression_random_forest.py b/improver/cli/apply_quantile_regression_random_forest.py
index 765406d6ca..c870302082 100644
--- a/improver/cli/apply_quantile_regression_random_forest.py
+++ b/improver/cli/apply_quantile_regression_random_forest.py
@@ -60,12 +60,12 @@ def process(
         iris.cube.Cube:
             The calibrated forecast cube.
     """
-    from improver.calibration import split_pickle_parquet_and_netcdf
+    from improver.calibration import split_netcdf_parquet_pickle
    from improver.calibration.load_and_apply_quantile_regression_random_forest import (
        PrepareAndApplyQRF,
    )
 
-    cubes, _, qrf_descriptors = split_pickle_parquet_and_netcdf(file_paths)
+    cubes, _, qrf_descriptors = split_netcdf_parquet_pickle(file_paths)
 
     result = PrepareAndApplyQRF(
         feature_config=feature_config,
diff --git a/improver/cli/apply_samos_coefficients.py b/improver/cli/apply_samos_coefficients.py
new file mode 100644
index 0000000000..5011f19030
--- /dev/null
+++ b/improver/cli/apply_samos_coefficients.py
@@ -0,0 +1,184 @@
+#!/usr/bin/env python
+# (C) Crown Copyright, Met Office. All rights reserved.
+#
+# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
+# See LICENSE in the root of the repository for full licensing details.
+"""Script to apply Standardised Anomaly Model Output Statistics (SAMOS) calibration."""
+
+from improver import cli
+
+
+@cli.clizefy
+@cli.with_output
+def process(
+    *file_paths: cli.inputpath,
+    gam_features: cli.comma_separated_list,
+    validity_times: cli.comma_separated_list = None,
+    realizations_count: int = None,
+    randomise=False,
+    random_seed: int = None,
+    ignore_ecc_bounds_exceedance=False,
+    tolerate_time_mismatch=False,
+    predictor="mean",
+    percentiles: cli.comma_separated_list = None,
+    unique_site_id_key: str = None,
+):
+    """Apply coefficients for Standardised Anomaly Model Output Statistics (SAMOS).
+
+    The climatological mean and standard deviation of the forecast are predicted
+    from the provided GAMs and used to convert the forecast to anomaly data. The
+    anomaly data is calibrated using the EMOS plugin and the provided
+    coefficients, and the calibrated forecast is then regenerated from the
+    distributional information and written to a cube. If no coefficients are
+    provided, the input forecast is returned unchanged.
+
+    Args:
+        file_paths (cli.inputpath):
+            A list of input paths containing:
+            - Path to a pickle file containing the GAMs to be used. This pickle
+            file contains two lists, each containing two fitted GAMs. The first
+            list contains GAMs for predicting the climatological mean and
+            standard deviation of the historical forecasts; the second list
+            contains GAMs for predicting the climatological mean and standard
+            deviation of the truths.
+            - Path to a NetCDF file containing the forecast to be calibrated.
+            The input forecast could be given as realizations, probabilities or
+            percentiles.
+            - Path to a NetCDF file containing a cube list that includes the
+            coefficients to be used for calibration, if available.
If None, then the input,
+            or probability template if provided, is returned unchanged.
+            - Optionally, paths to additional NetCDF files that will be provided to
+            the EMOS plugin representing static additional predictors. These static
+            additional predictors are expected not to have a time coordinate. These
+            will be identified by their omission from the gam_features list.
+            - Optionally, paths to additional NetCDF files that contain additional
+            features (static predictors) that will be provided to the GAM to help
+            calculate the climatological statistics. The name of the cubes should
+            match one of the names in the gam_features list.
+            - Optionally, path to a NetCDF file containing the land-sea mask. This
+            is used to ensure that only land points are calibrated. If no land-sea
+            mask is provided, all points will be calibrated.
+            - Optionally, path to a NetCDF file containing a probability forecast
+            that will be used as a template when generating probability output when
+            the input format of the forecast cube is not probabilities, i.e.
+            realizations or percentiles. If no coefficients are provided and a
+            probability template is provided, the probability template forecast will
+            be returned as the uncalibrated probability forecast.
+
+        gam_features (list of str):
+            A list of the names of the cubes that will be used as additional
+            features in the GAM. Additionally, the name of any coordinates
+            that are to be used as features in the GAM.
+        validity_times (List[str]):
+            Times at which the forecast must be valid. This must be provided
+            as a four digit string (HHMM) where the first two digits represent the hour
+            and the last two digits represent the minutes e.g. 0300 or 0315. If the
+            forecast provided is at a different validity time then no coefficients
+            will be applied.
+        realizations_count (int):
+            Option to specify the number of ensemble realizations that will be
+            created from probabilities or percentiles when applying the SAMOS
+            coefficients.
+        randomise (bool):
+            Option to reorder the post-processed forecasts randomly. If not
+            set, the ordering of the raw ensemble is used. This option is
+            only valid when the input format is realizations.
+        random_seed (int):
+            Option to specify a value for the random seed for testing
+            purposes, otherwise the default random seed behaviour is utilised.
+            The random seed is used in the generation of the random numbers
+            used for either the randomise option to order the input
+            percentiles randomly, rather than use the ordering from the raw
+            ensemble, or for splitting tied values within the raw ensemble,
+            so that the values from the input percentiles can be ordered to
+            match the raw ensemble.
+        ignore_ecc_bounds_exceedance (bool):
+            If True, where the percentiles exceed the ECC bounds range, a
+            warning is raised rather than an exception. This occurs when the
+            current forecast is in the form of probabilities and is
+            converted to percentiles, as part of converting the input
+            probabilities into realizations.
+        tolerate_time_mismatch (bool):
+            If True, tolerate a mismatch in validity time and forecast period
+            for coefficients vs forecasts. Use with caution!
+        predictor (str):
+            String to specify the form of the predictor used to calculate
+            the location parameter when estimating the EMOS coefficients.
+            Currently, the ensemble mean ("mean") and the ensemble
+            realizations ("realizations") are supported as the predictors.
+        percentiles (List[float]):
+            The set of percentiles used to create the calibrated forecast.
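+            e.g. "10,20,30,40,50,60,70,80,90", as used in the acceptance
+            tests below.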
+ unique_site_id_key (str): + If working with spot data and available, the name of the coordinate + in the input cubes that contains unique site IDs, e.g. "wmo_id" if + all sites have a valid wmo_id. + + Returns: + iris.cube.Cube: + The calibrated forecast cube. + """ + import scipy.sparse + + from improver.calibration import ( + split_cubes_for_samos, + split_netcdf_parquet_pickle, + ) + from improver.calibration.samos_calibration import ApplySAMOS + from improver.calibration.utilities import prepare_cube_no_calibration + + # monkey-patch to 'tweak' scipy to prevent errors occurring. + def to_array(self): + return self.toarray() + + scipy.sparse.spmatrix.A = property(to_array) + + # Split the input paths into cubes and pickles + cubes, _, gams = split_netcdf_parquet_pickle(file_paths) + + # Split the cubes into forecast cubes, along with any additional fields + # provided for the GAMs and EMOS, and the coefficients to be used for calibration + ( + forecast, + _, + gam_additional_fields, + emos_coefficients, + emos_additional_fields, + prob_template, + ) = split_cubes_for_samos( + cubes=cubes, + gam_features=gam_features, + truth_attribute=None, + expect_emos_coeffs=True, + expect_emos_fields=True, + ) + + uncalibrated_forecast = prepare_cube_no_calibration( + forecast, + emos_coefficients, + ignore_ecc_bounds_exceedance=ignore_ecc_bounds_exceedance, + validity_times=validity_times, + percentiles=percentiles, + prob_template=prob_template, + ) + + if uncalibrated_forecast is not None: + return uncalibrated_forecast + + plugin = ApplySAMOS(percentiles=percentiles, unique_site_id_key=unique_site_id_key) + result = plugin.process( + forecast=forecast, + forecast_gams=gams[0], + truth_gams=gams[1], + gam_features=gam_features, + emos_coefficients=emos_coefficients, + gam_additional_fields=gam_additional_fields, + emos_additional_fields=emos_additional_fields, + prob_template=prob_template, + realizations_count=realizations_count, + ignore_ecc_bounds=ignore_ecc_bounds_exceedance, + tolerate_time_mismatch=tolerate_time_mismatch, + predictor=predictor, + randomise=randomise, + random_seed=random_seed, + ) + + return result diff --git a/improver/cli/estimate_emos_coefficients_from_table.py b/improver/cli/estimate_emos_coefficients_from_table.py index 817a87803f..1c5e2873da 100755 --- a/improver/cli/estimate_emos_coefficients_from_table.py +++ b/improver/cli/estimate_emos_coefficients_from_table.py @@ -42,7 +42,7 @@ def process( Args: forecast (pathlib.Path): The path to a Parquet file containing the historical forecasts - to be used for calibration.The expected columns within the + to be used for calibration. The expected columns within the Parquet file are: forecast, blend_time, forecast_period, forecast_reference_time, time, wmo_id, percentile, diagnostic, latitude, longitude, period, height, cf_name, units. @@ -88,7 +88,7 @@ def process( predictor (str): String to specify the form of the predictor used to calculate the location parameter when estimating the EMOS coefficients. - Currently the ensemble mean ("mean") and the ensemble realizations + Currently, the ensemble mean ("mean") and the ensemble realizations ("realizations") are supported as options. tolerance (float): The tolerance for the Continuous Ranked Probability Score (CRPS) @@ -115,40 +115,21 @@ def process( CubeList containing the coefficients estimated using EMOS. Each coefficient is stored in a separate cube. 
""" - import iris - import pandas as pd from iris.cube import CubeList - from improver.calibration import get_training_period_cycles - from improver.calibration.dataframe_utilities import ( - forecast_and_truth_dataframes_to_cubes, - ) from improver.calibration.emos_calibration import ( EstimateCoefficientsForEnsembleCalibration, ) + from improver.calibration.utilities import convert_parquet_to_cube - # Load forecasts from parquet file filtering by diagnostic and blend_time. - cycletimes = get_training_period_cycles(cycletime, forecast_period, training_length) - filters = [[("diagnostic", "==", diagnostic), ("blend_time", "in", cycletimes)]] - forecast_df = pd.read_parquet(forecast, filters=filters) - - # Load truths from parquet file filtering by diagnostic. - filters = [[("diagnostic", "==", diagnostic)]] - truth_df = pd.read_parquet(truth, filters=filters) - if truth_df.empty: - msg = ( - f"The requested filepath {truth} does not contain the " - f"requested contents: {filters}" - ) - raise IOError(msg) - - forecast_cube, truth_cube = forecast_and_truth_dataframes_to_cubes( - forecast_df, - truth_df, - cycletime, - forecast_period, - training_length, + forecast_cube, truth_cube = convert_parquet_to_cube( + forecast, + truth, + forecast_period=forecast_period, + cycletime=cycletime, + training_length=training_length, + diagnostic=diagnostic, percentiles=percentiles, experiment=experiment, ) diff --git a/improver/cli/estimate_samos_coefficients.py b/improver/cli/estimate_samos_coefficients.py new file mode 100755 index 0000000000..c2fc07f6c2 --- /dev/null +++ b/improver/cli/estimate_samos_coefficients.py @@ -0,0 +1,158 @@ +#!/usr/bin/env python +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. +"""CLI to estimate the Ensemble Model Output Statistics (EMOS) coefficients for +Standardized Anomaly Model Output Statistics (SAMOS).""" + +from improver import cli + + +@cli.clizefy +@cli.with_output +def process( + *file_paths: cli.inputpath, + truth_attribute: str, + gam_features: cli.comma_separated_list, + use_default_initial_guess=False, + units=None, + predictor="mean", + tolerance: float = 0.02, + max_iterations: int = 1000, + unique_site_id_key: str = "wmo_id", +): + """Estimate EMOS coefficients for use with SAMOS. + + Loads in arguments for estimating coefficients for Ensemble Model + Output Statistics (EMOS), otherwise known as Non-homogeneous Gaussian + Regression (NGR). Two sources of input data must be provided: historical + forecasts and historical truth data (to use in calibration). + The estimated coefficients are output as a cube. + + Args: + file_paths (cli.inputpath): + A list of input paths containing: + - Path to a pickle file containing the GAMs to be used. This pickle + file contains two lists, each containing two fitted GAMs. The first list + contains GAMS for predicting each of the climatological mean and + standard deviation of the historical forecasts. The second list contains + GAMS for predicting each of the climatological mean and standard + deviation of the truths. + - Paths to NetCDF files containing the historical forecasts and + corresponding truths used for calibration. They must have the same + diagnostic name and will be separated based on the provided truth + attribute. 
+            - Optionally, paths to additional NetCDF files that will be provided to
+            the EMOS plugin representing static additional predictors. These static
+            additional predictors are expected not to have a time coordinate. These
+            will be identified by their omission from the gam_features list.
+            - Optionally, paths to additional NetCDF files that contain additional
+            features (static predictors) that will be provided to the GAM to help
+            calculate the climatological statistics. The name of the cubes should
+            match one of the names in the gam_features list.
+        truth_attribute (str):
+            An attribute and its value in the format of "attribute=value",
+            which must be present on historical truth cubes.
+        gam_features (list of str):
+            A list of the names of the cubes that will be used as additional
+            features in the GAM. Additionally, the name of any coordinates
+            that are to be used as features in the GAM.
+        use_default_initial_guess (bool):
+            If True, use the default initial guess. The default initial guess
+            assumes no adjustments are required to the initial choice of
+            predictor to generate the calibrated distribution. This means
+            coefficients of 1 for the multiplicative coefficients and 0 for
+            the additive coefficients. If False, the initial guess is computed.
+        units (str):
+            The units that calibration should be undertaken in. The historical
+            forecast and truth will be converted as required.
+        predictor (str):
+            String to specify the form of the predictor used to calculate the
+            location parameter when estimating the EMOS coefficients.
+            Currently, the ensemble mean ("mean") and the ensemble realizations
+            ("realizations") are supported as options.
+        tolerance (float):
+            The tolerance for the Continuous Ranked Probability Score (CRPS)
+            calculated by the minimisation. Once multiple iterations result in
+            a CRPS equal to the same value within the specified tolerance, the
+            minimisation will terminate.
+        max_iterations (int):
+            The maximum number of iterations allowed until the minimisation has
+            converged to a stable solution. If the maximum number of iterations
+            is reached but the minimisation has not yet converged to a stable
+            solution, then the available solution is used anyway, and a warning
+            is raised. If the predictor is "realizations", then the number of
+            iterations may require increasing, as there will be more
+            coefficients to solve.
+        unique_site_id_key (str):
+            If working with spot data and available, the name of the coordinate
+            in the input cubes that contains unique site IDs, e.g. "wmo_id" if
+            all sites have a valid wmo_id. For estimation the default is "wmo_id"
+            as we expect to be including observation data.
+
+    Returns:
+        iris.cube.CubeList:
+            CubeList containing the coefficients estimated using EMOS. Each
+            coefficient is stored in a separate cube.
+    """
+    import scipy.sparse
+
+    from improver.calibration import (
+        split_cubes_for_samos,
+        split_netcdf_parquet_pickle,
+    )
+    from improver.calibration.samos_calibration import TrainEMOSForSAMOS
+
+    # monkey-patch to 'tweak' scipy to prevent errors occurring
+    def to_array(self):
+        return self.toarray()
+
+    scipy.sparse.spmatrix.A = property(to_array)
+
+    # Split the input paths into cubes and pickles
+    cubes, _, gams = split_netcdf_parquet_pickle(file_paths)
+
+    # Split the cubes into forecast and truth cubes, along with any additional fields
+    # provided for the GAMs and EMOS.
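+    # split_cubes_for_samos returns a fixed six-element tuple: (forecast,
+    # truth, gam_additional_fields, emos_coefficients, emos_additional_fields,
+    # prob_template). The coefficients and probability template slots are not
+    # needed when estimating coefficients, so they are discarded with "_".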
+    (
+        forecast,
+        truth,
+        gam_additional_fields,
+        _,
+        emos_additional_fields,
+        _,
+    ) = split_cubes_for_samos(
+        cubes=cubes,
+        gam_features=gam_features,
+        truth_attribute=truth_attribute,
+        expect_emos_coeffs=False,
+        expect_emos_fields=True,
+    )
+
+    if forecast is None or truth is None:
+        return
+
+    # Train EMOS coefficients for the SAMOS model.
+    emos_kwargs = {
+        "use_default_initial_guess": use_default_initial_guess,
+        "desired_units": units,
+        "predictor": predictor,
+        "tolerance": tolerance,
+        "max_iterations": max_iterations,
+    }
+
+    plugin = TrainEMOSForSAMOS(
+        distribution="norm",
+        emos_kwargs=emos_kwargs,
+        unique_site_id_key=unique_site_id_key,
+    )
+    return plugin(
+        historic_forecasts=forecast,
+        truths=truth,
+        forecast_gams=gams[0],
+        truth_gams=gams[1],
+        gam_features=gam_features,
+        gam_additional_fields=gam_additional_fields,
+        emos_additional_fields=emos_additional_fields,
+    )
diff --git a/improver/cli/estimate_samos_coefficients_from_table.py b/improver/cli/estimate_samos_coefficients_from_table.py
new file mode 100755
index 0000000000..60a607ffe8
--- /dev/null
+++ b/improver/cli/estimate_samos_coefficients_from_table.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+# (C) Crown Copyright, Met Office. All rights reserved.
+#
+# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
+# See LICENSE in the root of the repository for full licensing details.
+"""CLI to estimate the Ensemble Model Output Statistics (EMOS) coefficients for
+Standardized Anomaly Model Output Statistics (SAMOS)."""
+
+from improver import cli
+
+
+@cli.clizefy
+@cli.with_output
+def process(
+    *file_paths: cli.inputpath,
+    gam_features: cli.comma_separated_list,
+    use_default_initial_guess=False,
+    units=None,
+    predictor="mean",
+    tolerance: float = 0.02,
+    max_iterations: int = 1000,
+    percentiles: cli.comma_separated_list = None,
+    experiment: str = None,
+    forecast_period: int,
+    training_length: int,
+    diagnostic: str,
+    cycletime: str,
+    unique_site_id_key: str = "wmo_id",
+):
+    """Estimate EMOS coefficients for use with SAMOS.
+
+    Loads in arguments for estimating coefficients for Ensemble Model
+    Output Statistics (EMOS), otherwise known as Non-homogeneous Gaussian
+    Regression (NGR). Two sources of input data must be provided: historical
+    forecasts and historical truth data (to use in calibration).
+    The estimated coefficients are output as a cube.
+
+    Args:
+        file_paths (cli.inputpath):
+            A list of input paths containing:
+            - Path to a pickle file containing the GAMs to be used. This pickle
+            file contains two lists, each containing two fitted GAMs. The first list
+            contains GAMs for predicting each of the climatological mean and standard
+            deviation of the historical forecasts. The second list contains GAMs for
+            predicting each of the climatological mean and standard deviation of the
+            truths.
+            - The path to a Parquet file containing the historical forecasts
+            to be used for calibration. The expected columns within the
+            Parquet file are: forecast, blend_time, forecast_period,
+            forecast_reference_time, time, wmo_id, percentile, diagnostic,
+            latitude, longitude, period, height, cf_name, units.
+            - The path to a Parquet file containing the truths to be used
+            for calibration. The expected columns within the
+            Parquet file are: ob_value, time, wmo_id, diagnostic, latitude,
+            longitude and altitude.
+            - Optionally, paths to additional NetCDF files that contain additional
+            features (static predictors) that will be provided when estimating the
+            SAMOS coefficients. The name of all cubes in this list must be in the
+            gam_features list.
+
+        gam_features (list of str):
+            A list of the names of the cubes that will be used as additional
+            features in the GAM. Additionally, the name of any coordinates
+            that are to be used as features in the GAM.
+        use_default_initial_guess (bool):
+            If True, use the default initial guess. The default initial guess
+            assumes no adjustments are required to the initial choice of
+            predictor to generate the calibrated distribution. This means
+            coefficients of 1 for the multiplicative coefficients and 0 for
+            the additive coefficients. If False, the initial guess is computed.
+        units (str):
+            The units that calibration should be undertaken in. The historical
+            forecast and truth will be converted as required.
+        predictor (str):
+            String to specify the form of the predictor used to calculate the
+            location parameter when estimating the EMOS coefficients.
+            Currently, the ensemble mean ("mean") and the ensemble realizations
+            ("realizations") are supported as options.
+        tolerance (float):
+            The tolerance for the Continuous Ranked Probability Score (CRPS)
+            calculated by the minimisation. Once multiple iterations result in
+            a CRPS equal to the same value within the specified tolerance, the
+            minimisation will terminate.
+        max_iterations (int):
+            The maximum number of iterations allowed until the minimisation has
+            converged to a stable solution. If the maximum number of iterations
+            is reached but the minimisation has not yet converged to a stable
+            solution, then the available solution is used anyway, and a warning
+            is raised. If the predictor is "realizations", then the number of
+            iterations may require increasing, as there will be more
+            coefficients to solve.
+        percentiles (List[float]):
+            The set of percentiles to be used for estimating EMOS coefficients.
+            These should be a set of equally spaced quantiles.
+        experiment (str):
+            A value within the experiment column to select from the forecast
+            table.
+        forecast_period (int):
+            Forecast period to be calibrated in seconds.
+        training_length (int):
+            Number of days within the training period.
+        diagnostic (str):
+            The name of the diagnostic to be calibrated within the forecast
+            and truth tables. This name is used to filter the Parquet file
+            when reading from disk.
+        cycletime (str):
+            Cycletime of a format similar to 20170109T0000Z.
+        unique_site_id_key (str):
+            If working with spot data and available, the name of the coordinate
+            in the input cubes that contains unique site IDs, e.g. "wmo_id" if
+            all sites have a valid wmo_id. For estimation the default is "wmo_id"
+            as we expect to be including observation data.
+
+    Returns:
+        iris.cube.CubeList or None:
+            CubeList containing the coefficients estimated using EMOS. Each
+            coefficient is stored in a separate cube. None if a forecast or
+            truth cube cannot be created from the parquet table.
+    """
+    import scipy.sparse
+
+    from improver.calibration import (
+        identify_parquet_type,
+        split_netcdf_parquet_pickle,
+    )
+    from improver.calibration.samos_calibration import TrainEMOSForSAMOS
+    from improver.calibration.utilities import convert_parquet_to_cube
+
+    # monkey-patch to 'tweak' scipy to prevent errors occurring.
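+    # The dense-array ``.A`` property appears to have been deprecated and
+    # later removed from scipy sparse matrices in recent SciPy releases;
+    # pygam is assumed to still access it, so it is restored as a property.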
+ def to_array(self): + return self.toarray() + + scipy.sparse.spmatrix.A = property(to_array) + + # Split the input paths into cubes and pickles. + samos_additional_predictors, parquets, gams = split_netcdf_parquet_pickle( + file_paths + ) + # Determine which parquet path provides truths and which historic forecasts. + forecast, truth = identify_parquet_type(parquets) + + forecast_cube, truth_cube = convert_parquet_to_cube( + forecast, + truth, + diagnostic=diagnostic, + cycletime=cycletime, + forecast_period=forecast_period, + training_length=training_length, + percentiles=percentiles, + experiment=experiment, + ) + + if not forecast_cube or not truth_cube: + return + + # Train emos coefficients for the SAMOS model. + emos_kwargs = { + "use_default_initial_guess": use_default_initial_guess, + "desired_units": units, + "predictor": predictor, + "tolerance": tolerance, + "max_iterations": max_iterations, + } + + plugin = TrainEMOSForSAMOS( + distribution="norm", + emos_kwargs=emos_kwargs, + unique_site_id_key=unique_site_id_key, + ) + return plugin( + historic_forecasts=forecast_cube, + truths=truth_cube, + forecast_gams=gams[0], + truth_gams=gams[1], + gam_features=gam_features, + gam_additional_fields=samos_additional_predictors, + ) diff --git a/improver/cli/estimate_samos_gams.py b/improver/cli/estimate_samos_gams.py new file mode 100755 index 0000000000..85e31cf353 --- /dev/null +++ b/improver/cli/estimate_samos_gams.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. +"""CLI to estimate the Generalized Additive Model (GAM) for Standardized Anomaly Model +Output Statistics (SAMOS).""" + +from improver import cli + + +@cli.clizefy +@cli.with_output +def process( + *cubes: cli.inputcube, + truth_attribute: str, + gam_features: cli.comma_separated_list, + model_specification: cli.inputjson, + max_iterations: int = 100, + tolerance: float = 0.0001, + distribution: str = "normal", + link: str = "identity", + fit_intercept: bool = True, + unique_site_id_key: str = "wmo_id", +): + """Estimate Generalized Additive Model (GAM) for SAMOS. + + Args: + cubes (list of iris.cube.Cube): + A list of cubes containing the historical forecasts and + corresponding truth used for calibration. They must have the same + cube name and will be separated based on the truth attribute. The + list may also contain additional features (static predictors) that + will be provided when estimating the GAM. + truth_attribute (str): + An attribute and its value in the format of "attribute=value", + which must be present on historical truth cubes. + gam_features (list of str): + A list of the names of the cubes that will be used as additional + features in the GAM. + model_specification (dict): + A list containing three items (in order): + 1. a string containing a single pyGAM term; one of 'l' (linear), + 's' (spline), 'te' (tensor), or 'f' (factor) + 2. a list of integers which correspond to the features to be + included in that term + 3. a dictionary of kwargs to be included when defining the term + max_iterations (int): + The maximum number of iterations to use when estimating the GAM + coefficients. + tolerance (float): + The tolerance for the stopping criteria. + distribution (str): + The distribution to be used in the model. Valid options are normal, binomial, + poisson, gamma, inv-gauss. 
+        link (str):
+            The link function to be used in the model. Valid options are
+            identity, logit, inverse, log or inverse-squared.
+        fit_intercept (bool):
+            Whether to include an intercept term in the model. Default is True.
+        unique_site_id_key (str):
+            If working with spot data and available, the name of the coordinate
+            in the input cubes that contains unique site IDs, e.g. "wmo_id" if
+            all sites have a valid wmo_id. For GAM estimation the default is
+            "wmo_id" as we expect to have a training data set comprising matched
+            obs and forecast sites.
+
+    Returns:
+        List:
+            A list containing the fitted GAMs for the forecast and truth
+            cubes, in that order.
+    """
+
+    from improver.calibration import split_cubes_for_samos
+    from improver.calibration.samos_calibration import TrainGAMsForSAMOS
+
+    # Split the cubes into forecast and truth cubes, along with any additional fields
+    # provided for the GAMs.
+    (
+        forecast,
+        truth,
+        gam_additional_fields,
+        _,
+        _,
+        _,
+    ) = split_cubes_for_samos(
+        cubes=cubes,
+        gam_features=gam_features,
+        truth_attribute=truth_attribute,
+        expect_emos_coeffs=False,
+        expect_emos_fields=False,
+    )
+
+    if forecast is None or truth is None:
+        return
+
+    plugin = TrainGAMsForSAMOS(
+        model_specification=model_specification,
+        max_iter=max_iterations,
+        tol=tolerance,
+        distribution=distribution,
+        link=link,
+        fit_intercept=fit_intercept,
+        unique_site_id_key=unique_site_id_key,
+    )
+
+    truth_gams = plugin.process(
+        input_cube=truth,
+        features=gam_features,
+        additional_fields=gam_additional_fields,
+    )
+
+    forecast_gams = plugin.process(
+        input_cube=forecast,
+        features=gam_features,
+        additional_fields=gam_additional_fields,
+    )
+
+    return [forecast_gams, truth_gams]
diff --git a/improver/cli/estimate_samos_gams_from_table.py b/improver/cli/estimate_samos_gams_from_table.py
new file mode 100755
index 0000000000..3219d35a88
--- /dev/null
+++ b/improver/cli/estimate_samos_gams_from_table.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+# (C) Crown Copyright, Met Office. All rights reserved.
+#
+# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
+# See LICENSE in the root of the repository for full licensing details.
+"""CLI to estimate the Generalized Additive Model (GAM) for Standardized Anomaly Model
+Output Statistics (SAMOS)."""
+
+from improver import cli
+
+
+@cli.clizefy
+@cli.with_output
+def process(
+    *file_paths: cli.inputpath,
+    gam_features: cli.comma_separated_list,
+    model_specification: cli.inputjson,
+    tolerance: float = 0.02,
+    max_iterations: int = 1000,
+    percentiles: cli.comma_separated_list = None,
+    experiment: str = None,
+    forecast_period: int,
+    training_length: int,
+    diagnostic: str,
+    cycletime: str,
+    distribution: str = "normal",
+    link: str = "identity",
+    fit_intercept: bool = True,
+    unique_site_id_key: str = "wmo_id",
+):
+    """Estimate Generalized Additive Model (GAM) for SAMOS.
+
+    Args:
+        file_paths (cli.inputpath):
+            A list of input paths containing:
+            - The path to a Parquet file containing the historical forecasts
+            to be used for calibration. The expected columns within the
+            Parquet file are: forecast, blend_time, forecast_period,
+            forecast_reference_time, time, wmo_id, percentile, diagnostic,
+            latitude, longitude, period, height, cf_name, units.
+            - The path to a Parquet file containing the truths to be used
+            for calibration. The expected columns within the
+            Parquet file are: ob_value, time, wmo_id, diagnostic, latitude,
+            longitude and altitude.
+            - Optionally, paths to additional NetCDF files that contain additional
+            features (static predictors) that will be provided when estimating the
+            GAM.
+        gam_features (list of str):
+            A list of the names of the cubes that will be used as additional
+            features in the GAM. Additionally, the name of any coordinates
+            that are to be used as features in the GAM.
+        model_specification (dict):
+            A list containing three items (in order):
+            1. a string containing a single pyGAM term; one of 'l' (linear),
+            's' (spline), 'te' (tensor), or 'f' (factor)
+            2. a list of integers which correspond to the features to be
+            included in that term
+            3. a dictionary of kwargs to be included when defining the term
+        tolerance (float):
+            The tolerance for the stopping criteria used when estimating the
+            GAM coefficients.
+        max_iterations (int):
+            The maximum number of iterations to use when estimating the GAM
+            coefficients.
+        percentiles (List[float]):
+            The set of percentiles to be extracted from the forecast table
+            when creating the forecast cube. These should be a set of equally
+            spaced quantiles.
+        experiment (str):
+            A value within the experiment column to select from the forecast
+            table.
+        forecast_period (int):
+            Forecast period to be calibrated in seconds.
+        training_length (int):
+            Number of days within the training period.
+        diagnostic (str):
+            The name of the diagnostic to be calibrated within the forecast
+            and truth tables. This name is used to filter the Parquet file
+            when reading from disk.
+        cycletime (str):
+            Cycletime of a format similar to 20170109T0000Z.
+        distribution (str):
+            The distribution to be used in the model. Valid options are
+            normal, binomial, poisson, gamma, inv-gauss.
+        link (str):
+            The link function to be used in the model. Valid options are
+            identity, logit, inverse, log or inverse-squared.
+        fit_intercept (bool):
+            Whether to include an intercept term in the model. Default is True.
+        unique_site_id_key (str):
+            If working with spot data and available, the name of the coordinate
+            in the input cubes that contains unique site IDs, e.g. "wmo_id" if
+            all sites have a valid wmo_id. For GAM estimation the default is
+            "wmo_id" as we expect to have a training data set comprising matched
+            obs and forecast sites.
+
+    Returns:
+        (list of GAM models):
+            A list containing two lists of two fitted GAMs. The first list
+            contains two fitted GAMs, one for predicting the climatological mean
+            of the historical forecasts and the second predicting the
+            climatological standard deviation. The second list contains two
+            fitted GAMs, one for predicting the climatological mean of the truths
+            and the second predicting the climatological standard deviation.
+    """
+
+    from improver.calibration import (
+        identify_parquet_type,
+        split_netcdf_parquet_pickle,
+    )
+    from improver.calibration.samos_calibration import TrainGAMsForSAMOS
+    from improver.calibration.utilities import convert_parquet_to_cube
+
+    # Split the input paths into cubes and pickles.
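+    # Any pickle input is ignored here (the "_" in the unpacking below), as
+    # this CLI fits the GAMs itself rather than loading pre-fitted ones.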
+ additional_predictors, parquets, _ = split_netcdf_parquet_pickle(file_paths) + # Determine which parquet path provides truths and which historic forecasts. + forecast, truth = identify_parquet_type(parquets) + + forecast_cube, truth_cube = convert_parquet_to_cube( + forecast, + truth, + diagnostic=diagnostic, + cycletime=cycletime, + forecast_period=forecast_period, + training_length=training_length, + percentiles=percentiles, + experiment=experiment, + ) + + if not forecast_cube or not truth_cube: + return + + plugin = TrainGAMsForSAMOS( + model_specification=model_specification, + max_iter=max_iterations, + tol=tolerance, + distribution=distribution, + link=link, + fit_intercept=fit_intercept, + unique_site_id_key=unique_site_id_key, + ) + + truth_gams = plugin.process( + input_cube=truth_cube, + features=gam_features, + additional_fields=additional_predictors, + ) + + forecast_gams = plugin.process( + input_cube=forecast_cube, + features=gam_features, + additional_fields=additional_predictors, + ) + + return [forecast_gams, truth_gams] diff --git a/improver/ensemble_copula_coupling/ensemble_copula_coupling.py b/improver/ensemble_copula_coupling/ensemble_copula_coupling.py index 20b989ceab..833b48d944 100644 --- a/improver/ensemble_copula_coupling/ensemble_copula_coupling.py +++ b/improver/ensemble_copula_coupling/ensemble_copula_coupling.py @@ -353,12 +353,13 @@ def _interpolate_percentiles( bounds_pairing = get_bounds_of_distribution( forecast_at_percentiles.name(), cube_units ) - (original_percentiles, forecast_at_reshaped_percentiles) = ( - self._add_bounds_to_percentiles_and_forecast_at_percentiles( - original_percentiles, - forecast_at_reshaped_percentiles, - bounds_pairing, - ) + ( + original_percentiles, + forecast_at_reshaped_percentiles, + ) = self._add_bounds_to_percentiles_and_forecast_at_percentiles( + original_percentiles, + forecast_at_reshaped_percentiles, + bounds_pairing, ) forecast_at_interpolated_percentiles = interpolate_multiple_rows_same_x( @@ -658,10 +659,11 @@ def _probabilities_to_percentiles( ) cube_units = forecast_probabilities.coord(threshold_coord.name()).units bounds_pairing = get_bounds_of_distribution(phenom_name, cube_units) - (threshold_points, probabilities_for_cdf) = ( - self._add_bounds_to_thresholds_and_probabilities( - threshold_points, probabilities_for_cdf, bounds_pairing - ) + ( + threshold_points, + probabilities_for_cdf, + ) = self._add_bounds_to_thresholds_and_probabilities( + threshold_points, probabilities_for_cdf, bounds_pairing ) if np.any(np.diff(probabilities_for_cdf) < 0): @@ -1001,10 +1003,10 @@ def _location_and_scale_parameters_to_percentiles( result[index, :] = percentile_method.ppf(percentile_list) # If percent point function (PPF) returns NaNs, fill in # mean instead of NaN values. NaN will only be generated if the - # scale parameter (standard deviation) is zero. Therefore, if the - # scale parameter (standard deviation) is zero, the mean value is + # scale parameter (standard deviation) is zero or negative. Therefore, if the + # scale parameter (standard deviation) is zero or negative, the mean value is # used for all gridpoints with a NaN. 
-        if np.any(scale_data == 0):
+        if np.any(scale_data <= 0):
             nan_index = np.argwhere(np.isnan(result[index, :]))
             result[index, nan_index] = location_data[nan_index]
         if np.any(np.isnan(result)):
@@ -1393,14 +1395,15 @@ def rank_ecc(
     Raises:
         ValueError: tie_break is not either 'random' or 'realization'
     """
+    if random_seed is not None:
+        random_seed = int(random_seed)
+    random_seed = np.random.RandomState(random_seed)
+
     results = iris.cube.CubeList([])
     for rawfc, calfc in zip(
         raw_forecast_realizations.slices_over("time"),
         post_processed_forecast_percentiles.slices_over("time"),
     ):
-        if random_seed is not None:
-            random_seed = int(random_seed)
-        random_seed = np.random.RandomState(random_seed)
         if random_ordering:
             random_data = random_seed.rand(*rawfc.data.shape)
             # Returns the indices that would sort the array.
diff --git a/improver/utilities/cli_utilities.py b/improver/utilities/cli_utilities.py
index cdc2937b6d..3abad560c9 100644
--- a/improver/utilities/cli_utilities.py
+++ b/improver/utilities/cli_utilities.py
@@ -7,6 +7,8 @@
 import json
 from typing import Dict, Optional
 
+import joblib
+
 
 def load_json_or_none(file_path: Optional[str]) -> Optional[Dict]:
     """If there is a path, runs json.load and returns it. Else returns None.
@@ -16,9 +18,7 @@ def load_json_or_none(file_path: Optional[str]) -> Optional[Dict]:
             File path to the json file to load.
 
     Returns:
-        A dictionary loaded from a json file.
-        or
-        None
+        A dictionary loaded from a json file, or None.
     """
     metadata_dict = None
     if file_path:
@@ -26,3 +26,19 @@ def load_json_or_none(file_path: Optional[str]) -> Optional[Dict]:
         with open(file_path, "r") as input_file:
             metadata_dict = json.load(input_file)
     return metadata_dict
+
+
+def load_pickle_or_none(file_path: Optional[str]) -> Optional[list]:
+    """If there is a path, loads the pickled object and returns it. Else returns None.
+
+    Args:
+        file_path:
+            File path to the pickled object file to load.
+
+    Returns:
+        The loaded pickle contents, typically a list of objects, or None.
+    """
+    if file_path:
+        with open(file_path, "rb") as input_file:
+            return joblib.load(input_file)
+    return None
diff --git a/improver/utilities/compare.py b/improver/utilities/compare.py
index ce48145ad3..64da974fea 100644
--- a/improver/utilities/compare.py
+++ b/improver/utilities/compare.py
@@ -373,13 +373,43 @@ def compare_data(
         reporter(f"different data {name} - {numpy_err_message}")
 
 
+def compare_objects(
+    actual_var: PathLike, desired_var: PathLike, reporter: Callable[[str], None]
+) -> None:
+    """
+    Compare two pickled objects. This is not a complete comparison as two
+    objects may have the same string representation but be different
+    objects.
+
+    Args:
+        actual_var: Path to the pickled object produced by test run.
+        desired_var: Path to the pickled object considered good.
+        reporter: Callback function for reporting differences.
+    """
+    import joblib
+
+    try:
+        with open(actual_var, "rb") as f:
+            actual_data = joblib.load(f)
+    except OSError as exc:
+        reporter(str(exc))
+        return
+    try:
+        with open(desired_var, "rb") as f:
+            desired_data = joblib.load(f)
+    except OSError as exc:
+        reporter(str(exc))
+        return
+    if str(actual_data) != str(desired_data):
+        reporter(f"Different data found in {actual_var} and {desired_var}.")
+
+
 def compare_pickled_forest(
     output_path: PathLike,
     kgo_path: PathLike,
     reporter: Optional[Callable[[str], None]] = None,
 ):
     """Load a pickled forest (e.g. a Random Forest) and compare its contents.
- Args: output_path: data file produced by test run kgo_path: data file considered good e.g. KGO diff --git a/improver/utilities/generalized_additive_models.py b/improver/utilities/generalized_additive_models.py index db9932b84a..52fdeafcee 100644 --- a/improver/utilities/generalized_additive_models.py +++ b/improver/utilities/generalized_additive_models.py @@ -131,6 +131,10 @@ def to_array(self): # Import from pygam here to minimize dependencies from pygam import GAM + # Remove nans from arrays. + predictors = predictors[~np.isnan(targets)] + targets = targets[~np.isnan(targets)] + eqn = self.create_pygam_model() gam = GAM( eqn, diff --git a/improver/utilities/mathematical_operations.py b/improver/utilities/mathematical_operations.py index b80561b59e..17ce5ad569 100644 --- a/improver/utilities/mathematical_operations.py +++ b/improver/utilities/mathematical_operations.py @@ -18,9 +18,7 @@ create_new_diagnostic_cube, generate_mandatory_attributes, ) -from improver.utilities.cube_checker import ( - assert_spatial_coords_match, -) +from improver.utilities.cube_checker import assert_spatial_coords_match from improver.utilities.cube_manipulation import ( enforce_coordinate_ordering, get_dim_coord_names, @@ -468,8 +466,10 @@ def calculate_anomalies( ) -> ndarray: """Calculate climate anomalies from the input cubes.""" anomalies_data = diagnostic_cube.data - mean_cube.data + if std_cube: anomalies_data = anomalies_data / std_cube.data + return anomalies_data @staticmethod @@ -556,6 +556,11 @@ def _create_output_cube( self._add_reference_epoch_metadata(output_cube, mean_cube) anomalies_data = self.calculate_anomalies(diagnostic_cube, mean_cube, std_cube) + # Ensure any mask from the input cube is copied over to the output cube. + try: + anomalies_data.mask = output_cube.data.mask + except AttributeError: + pass output_cube.data = anomalies_data diff --git a/improver_tests/acceptance/SHA256SUMS b/improver_tests/acceptance/SHA256SUMS index f59d51ecb7..0cd3cf06b9 100644 --- a/improver_tests/acceptance/SHA256SUMS +++ b/improver_tests/acceptance/SHA256SUMS @@ -132,6 +132,22 @@ a82a48e690514f82b77cf54d70df10166c56ce1ab977ee5217348ac74d3293c9 ./apply-reliab cf666c51a6355d050406e5c7d15daca1e13d415be05a0cc92429d458474546fc ./apply-reliability-calibration/point_by_point/cubelist_table_point_by_point.nc 06681263dcb0538d844205937637e186352edd2a362b5f4ea5336bc67852acc2 ./apply-reliability-calibration/point_by_point/forecast_point_by_point.nc 30c0807e00af53babe747d04ec84ab5a71c977948c89aeb16d13b79740eb409b ./apply-reliability-calibration/point_by_point/kgo_point_by_point.nc +ff3a00a16fa94697d6e529a6f98384e4602f70a7d6ce26669c3947c8ac2e6f7e ./apply-samos-coefficients/additional_features/landmask.nc +8964d00ec69f384501269e0093bb7c4684d1fd7cf292a8286ee4ea68fe3b73e4 ./apply-samos-coefficients/additional_features/roughness_length.nc +a809cb7eb51c5cc35403e0f9d357a6bdf11431bb05ac6ba9bc10d58abe636d41 ./apply-samos-coefficients/forecast.nc +7c0268d6117de9002033a72a0388c963094c0f87b2448c2455802e41750960e0 ./apply-samos-coefficients/gam_configs/samos_gam.pkl +a979ae8ee62d3e7867ac6ed31a423c03bd65fc52f818d7e24ad212f789352c61 ./apply-samos-coefficients/gam_configs/samos_gam_additional_features.pkl +33ae30a74bc600b0f4ff10feff1ae3443323f1c17e57ec6c2f0aec4cb0e72ae5 ./apply-samos-coefficients/gam_configs/samos_gam_sites.pkl +6802f0378f76fba1c8bf44f7e90d20052048de851873862a80f09883ad5aaaff ./apply-samos-coefficients/kgo_coord.nc +8b1f4642c692a0811dc4d6a6d95ed552a8024bee017b9cc3e2e870410d38ddd3 
./apply-samos-coefficients/kgo_cubes_additional_cubes.nc +b0abca332d83f8df653711c6e431cd8f954dac987bb7d6acdabf3e05193b3f20 ./apply-samos-coefficients/kgo_cubes_emos_and_gam_features.nc +b5cd51b07b1daa7338dee0ad0d3eff626df4c1d692fbb4c35c75dc14f6ab753e ./apply-samos-coefficients/kgo_sites.nc +a3cbc9e1124dae85f911ecb338408a992bb3af0e6459a912d2296f8fc1d219c1 ./apply-samos-coefficients/kgo_with_comment.nc +1af4c8f931ec0b2fd53d9e7225abac7cec9039e651c3956c8289e1d5490d6645 ./apply-samos-coefficients/samos_coefficients/coefficients_coordinates.nc +e8e9661759ebf7cf8e78d259a33a57ba8a5f055a013bda68daf2f265d28de367 ./apply-samos-coefficients/samos_coefficients/coefficients_emos_and_gam_features.nc +17923bf4ed5251a4d4e600084365bbe75f2f0c91c4aadf1893b7392460161fee ./apply-samos-coefficients/samos_coefficients/coefficients_extra_features.nc +1a93dfcda78232e4302601c67b0e1c24c5f7e8fd0a7b55d8be6c52360ca7010d ./apply-samos-coefficients/samos_coefficients/coefficients_sites.nc +45aad45ea94eb663e9344c7fd93e08df3491418bf1d7f3f5be32ef5569bb81e6 ./apply-samos-coefficients/site_forecast.nc cf66fdbefec74d058e131a31dd1f51e4d10595265d21680272eb9590c3cc2fc2 ./between-thresholds/input.nc d40ce0bdf404f36afdeb7b23860da08cb93fa91766ba25b12c1b4ec99a41ca7d ./between-thresholds/kgo.nc a938495a85d1b9488e57e54c26d9120eb6bca016d9cc353967cfa347812e4fdc ./between-thresholds/threshold_ranges_km.json @@ -420,6 +436,33 @@ cdcbaebb219f4be7775f26d7d6fd8384002d1e46320d7f8bc086ea39c7913652 ./estimate-emo 397b0e3712041863f9d8ac45480b67162f6e740cf9b7d2e1521a6844074146e8 ./estimate-emos-coefficients/truncated_normal/truth/20170602T1500Z-PT0000H-horizontal_wind_speed_at_10m.nc 1032b6e41987c08ad4caace25e182a02701177408461c7b573f3e8b304b31c16 ./estimate-emos-coefficients/truncated_normal/truth/20170603T1500Z-PT0000H-horizontal_wind_speed_at_10m.nc fe84db49d69ea62e2e82312c7b3d860c7329c627d79d76c2543d365f69d2e0d8 ./estimate-emos-coefficients/truncated_normal/truth/20170604T1500Z-PT0000H-horizontal_wind_speed_at_10m.nc +cc3fb6e27313a487f17a69ca223523e86f3fa6c7c71f08574d2efcecab5bc4a2 ./estimate-samos-coefficients-from-table/distance_to_water.nc +a67d9606590ecafd2099f5491e683ec722b574c9bc2a2d1105ecc2eba8ed7bc8 ./estimate-samos-coefficients-from-table/gam_coordinates.pkl +8811643fb3b05232e8dfd2bac852f986c7b6f99989946a26f92630d5a588e834 ./estimate-samos-coefficients-from-table/gam_cubes.pkl +d340bbb94fbaa1f975b88ef41e9d8e3dfda54d3935d4c84257cde7bb18d72856 ./estimate-samos-coefficients-from-table/kgo_coordinates.nc +00ef721539ca3754638a032a42f6723e55981f6b4acdf3076ba0648551b6fecf ./estimate-samos-coefficients-from-table/kgo_gam_and_emos_cube.nc +ba78790fdabba784498c3c9b000f0f9c67664e7d7a3367780f67bce2f5858ea3 ./estimate-samos-coefficients-from-table/kgo_gam_cube.nc +0938d182a496e8708301dd19bf715402aceaa0a2da32915c320b22ef30933d1f ./estimate-samos-coefficients-from-table/land_fraction.nc +ff3a00a16fa94697d6e529a6f98384e4602f70a7d6ce26669c3947c8ac2e6f7e ./estimate-samos-coefficients/additional_features/landmask.nc +8964d00ec69f384501269e0093bb7c4684d1fd7cf292a8286ee4ea68fe3b73e4 ./estimate-samos-coefficients/additional_features/roughness_length.nc +7c0268d6117de9002033a72a0388c963094c0f87b2448c2455802e41750960e0 ./estimate-samos-coefficients/gam_configs/samos_gam.pkl +a979ae8ee62d3e7867ac6ed31a423c03bd65fc52f818d7e24ad212f789352c61 ./estimate-samos-coefficients/gam_configs/samos_gam_additional_features.pkl +33ae30a74bc600b0f4ff10feff1ae3443323f1c17e57ec6c2f0aec4cb0e72ae5 ./estimate-samos-coefficients/gam_configs/samos_gam_sites.pkl 
+85f86c253b3d7047b3078761992690251629c549dd22b9fde87fbf604bd2afa6 ./estimate-samos-coefficients/kgo_coordinates.nc
+12d5d92e469d01943123f7c7b5afb944bf7613b65431c44cdaa22277f70d1a7b ./estimate-samos-coefficients/kgo_extra_gam_feature.nc
+e8e9661759ebf7cf8e78d259a33a57ba8a5f055a013bda68daf2f265d28de367 ./estimate-samos-coefficients/kgo_gam_and_emos.nc
+7d89c1a0f1dd64e9c6c0c406f0e2a47a9282778cbe688b363170b84b4f3e87cd ./estimate-samos-coefficients/kgo_sites.nc
+7c0268d6117de9002033a72a0388c963094c0f87b2448c2455802e41750960e0 ./estimate-samos-gam/kgo.pkl
+a979ae8ee62d3e7867ac6ed31a423c03bd65fc52f818d7e24ad212f789352c61 ./estimate-samos-gam/kgo_extra_cube.pkl
+33ae30a74bc600b0f4ff10feff1ae3443323f1c17e57ec6c2f0aec4cb0e72ae5 ./estimate-samos-gam/kgo_sites.pkl
+3bfb4f1b610528f34096745613fc62db3ca10b7f38c54b2495996a9b2cdfc695 ./estimate-samos-gam/output.pkl
+8964d00ec69f384501269e0093bb7c4684d1fd7cf292a8286ee4ea68fe3b73e4 ./estimate-samos-gam/roughness_length.nc
+34222261b96e60ffd281637d5876cf84664a59efc9977407cb3a7de30c99b0c9 ./estimate-samos-gam/samos_model_spec_simple.json
+cc3fb6e27313a487f17a69ca223523e86f3fa6c7c71f08574d2efcecab5bc4a2 ./estimate-samos-gams-from-table/distance_to_water.nc
+a67d9606590ecafd2099f5491e683ec722b574c9bc2a2d1105ecc2eba8ed7bc8 ./estimate-samos-gams-from-table/kgo_coords.pkl
+8811643fb3b05232e8dfd2bac852f986c7b6f99989946a26f92630d5a588e834 ./estimate-samos-gams-from-table/kgo_cubes.pkl
+3ad45b58562c2c654bc988e3eccd55c608a312b615acd1297d569dbef42cbfb9 ./estimate-samos-gams-from-table/roughness.nc
+34222261b96e60ffd281637d5876cf84664a59efc9977407cb3a7de30c99b0c9 ./estimate-samos-gams-from-table/samos_model_spec_simple.json
 386284bf4c5daa3567fb78ca00331494c99ac664902490188dfdf98c39a494ba ./expected-value/deterministic.nc
 64bcba24bc314d1ffc6bb7adbae6b9e6d872cb75e8ef834463e4fb72e55ad16a ./expected-value/kgo.nc
 f61aa0326219c7e3934823bc4d607c2b3ec022a7c1df15d3519aea91207effce ./expected-value/percentile.nc
diff --git a/improver_tests/acceptance/acceptance.py b/improver_tests/acceptance/acceptance.py
index 4ae0efc049..35a39b28ff 100644
--- a/improver_tests/acceptance/acceptance.py
+++ b/improver_tests/acceptance/acceptance.py
@@ -15,7 +15,11 @@
 from improver import cli
 from improver.constants import DEFAULT_TOLERANCE
-from improver.utilities.compare import compare_netcdfs, compare_pickled_forest
+from improver.utilities.compare import (
+    compare_netcdfs,
+    compare_objects,
+    compare_pickled_forest,
+)
 
 RECREATE_DIR_ENVVAR = "RECREATE_KGO"
 ACC_TEST_DIR_ENVVAR = "IMPROVER_ACC_TEST_DIR"
@@ -320,8 +324,11 @@ def compare(
         rtol (float): Relative tolerance
         exclude_vars (Iterable[str]): Variables to exclude from comparison
         exclude_attributes (Iterable[str]): Attributes to exclude from comparison
-        file_type (str): Name of file type to compare, either "netCDF" or
-            "pickled_forest".
+        file_type (str): Name of file type to compare. One of "netCDF",
+            "pickled_forest", or "generic_pickle".
+
+    Raises:
+        ValueError: if file_type is not recognised.
Returns: None @@ -371,6 +378,11 @@ def message_recorder(exception_message): kgo_path, reporter=message_recorder, ) + elif file_type == "generic_pickle": + compare_objects(output_path, kgo_path, reporter=message_recorder) + else: + msg = f"There is no support for comparing the file_type provided: {file_type}" + raise ValueError(msg) if difference_found: if recreate: diff --git a/improver_tests/acceptance/test_apply_samos_coefficients.py b/improver_tests/acceptance/test_apply_samos_coefficients.py new file mode 100644 index 0000000000..af31b637bc --- /dev/null +++ b/improver_tests/acceptance/test_apply_samos_coefficients.py @@ -0,0 +1,149 @@ +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. +"""Tests for the apply-samos-coefficients CLI""" + +import pytest + +from improver.constants import LOOSE_TOLERANCE + +from . import acceptance as acc + +pytestmark = [pytest.mark.acc, acc.skip_if_kgo_missing] +CLI = acc.cli_name_with_dashes(__file__) +run_cli = acc.run_cli(CLI) + + +def test_normal_gam_coordinates(tmp_path): + """Test diagnostic with assumed normal distribution and GAM feature + provided as coordinates on the input cube.""" + kgo_dir = acc.kgo_root() / "apply-samos-coefficients" + kgo_path = kgo_dir / "kgo_coord.nc" + + input_path = kgo_dir / "forecast.nc" + samos_est_path = kgo_dir / "samos_coefficients/coefficients_coordinates.nc" + gam_path = kgo_dir / "gam_configs/samos_gam.pkl" + output_path = tmp_path / "output.nc" + + gam_features = "projection_y_coordinate,projection_x_coordinate,height" + + args = [ + input_path, + samos_est_path, + gam_path, + "--gam-features", + gam_features, + "--realizations-count", + "9", + "--random-seed", + "0", + "--output", + output_path, + ] + run_cli(args) + acc.compare(output_path, kgo_path, atol=LOOSE_TOLERANCE) + + +def test_normal_gam_cubes(tmp_path): + """ + Test apply-samos-coefficients for diagnostic with assumed + normal distribution and additional features provided as cubes. + """ + kgo_dir = acc.kgo_root() / "apply-samos-coefficients" + kgo_path = kgo_dir / "kgo_cubes_additional_cubes.nc" + + input_path = kgo_dir / "forecast.nc" + samos_est_path = kgo_dir / "samos_coefficients/coefficients_extra_features.nc" + gam_path = kgo_dir / "gam_configs/samos_gam_additional_features.pkl" + output_path = tmp_path / "output.nc" + additional_features_path = kgo_dir / "additional_features/roughness_length.nc" + gam_features = ( + "projection_y_coordinate,projection_x_coordinate,vegetative_roughness_length" + ) + + args = [ + input_path, + samos_est_path, + additional_features_path, + gam_path, + "--gam-features", + gam_features, + "--realizations-count", + "9", + "--random-seed", + "0", + "--output", + output_path, + ] + run_cli(args) + acc.compare(output_path, kgo_path, atol=LOOSE_TOLERANCE) + + +def test_normal_sites(tmp_path): + """ + Test apply-samos-coefficients for diagnostic with assumed + normal distribution at sites. 
+    """
+    kgo_dir = acc.kgo_root() / "apply-samos-coefficients"
+    kgo_path = kgo_dir / "kgo_sites.nc"
+
+    input_path = kgo_dir / "site_forecast.nc"
+    samos_est_path = kgo_dir / "samos_coefficients/coefficients_sites.nc"
+    gam_path = kgo_dir / "gam_configs/samos_gam_sites.pkl"
+    output_path = tmp_path / "output.nc"
+    gam_features = "latitude,longitude,height"
+
+    args = [
+        input_path,
+        samos_est_path,
+        gam_path,
+        "--gam-features",
+        gam_features,
+        "--realizations-count",
+        "9",
+        "--random-seed",
+        "0",
+        "--output",
+        output_path,
+    ]
+    run_cli(args)
+    acc.compare(output_path, kgo_path, atol=LOOSE_TOLERANCE)
+
+
+def test_no_coefficients(tmp_path):
+    """Test apply-samos-coefficients when no coefficients provided"""
+    kgo_dir = acc.kgo_root() / "apply-samos-coefficients/"
+    kgo_path = kgo_dir / "kgo_with_comment.nc"
+
+    input_path = kgo_dir / "site_forecast.nc"
+    gam_path = kgo_dir / "gam_configs/samos_gam_sites.pkl"
+    output_path = tmp_path / "output.nc"
+    additional_features_path = kgo_dir / "additional_features/roughness_length.nc"
+    gam_features = (
+        "projection_y_coordinate,projection_x_coordinate,vegetative_roughness_length"
+    )
+
+    args = [
+        input_path,
+        additional_features_path,
+        gam_path,
+        "--gam-features",
+        gam_features,
+        "--realizations-count",
+        "9",
+        "--random-seed",
+        "0",
+        "--output",
+        output_path,
+    ]
+
+    with pytest.warns(UserWarning, match=".*no coefficients provided.*"):
+        run_cli(args)
+
+    # Check output matches kgo.
+    acc.compare(output_path, kgo_path, atol=LOOSE_TOLERANCE)
+    # Check output only differs from input due to addition of a comment.
+    acc.compare(
+        output_path, input_path, atol=LOOSE_TOLERANCE, exclude_attributes=["comment"]
+    )
diff --git a/improver_tests/acceptance/test_estimate_samos_coefficients.py b/improver_tests/acceptance/test_estimate_samos_coefficients.py
new file mode 100644
index 0000000000..8b2a431e5b
--- /dev/null
+++ b/improver_tests/acceptance/test_estimate_samos_coefficients.py
@@ -0,0 +1,140 @@
+# (C) Crown Copyright, Met Office. All rights reserved.
+#
+# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
+# See LICENSE in the root of the repository for full licensing details.
+"""
+Tests for the estimate-samos-coefficients CLI
+
+Many of these tests use globs which are expanded by IMPROVER code itself,
+rather than by shell glob expansion. There are also some directory globs
+which expand directory names in addition to filenames.
+"""
+
+import pytest
+
+from . import acceptance as acc
+
+pytestmark = [pytest.mark.acc, acc.skip_if_kgo_missing]
+CLI = acc.cli_name_with_dashes(__file__)
+run_cli = acc.run_cli(CLI)
+
+pytest.importorskip("statsmodels")
+
+# The EMOS estimation tolerance is defined in units of the variable being
+# calibrated - not in terms of the EMOS coefficients produced by
+# estimate-emos-coefficients and compared against KGOs here.
+# See comments and CLI help messages in
+# improver/cli/estimate_emos_coefficients.py for more detail.
+EST_EMOS_TOLERANCE = 1e-4
+
+# The EMOS coefficients are expected to vary by at most one order of magnitude
+# more than the CRPS tolerance specified.
+COMPARE_EMOS_TOLERANCE = EST_EMOS_TOLERANCE * 10
+
+# Pre-convert to string for easier use in each test
+EST_EMOS_TOL = str(EST_EMOS_TOLERANCE)
+
+
+@pytest.mark.slow
+def test_coordinates(tmp_path):
+    """
+    Test estimate-samos-coefficients for a diagnostic with an assumed
+    normal distribution, where additional GAM features are coordinates on
+    the input cubes.
+    """
+    # The source data is from the estimate-emos-coefficients acceptance tests
+    source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal"
+    history_path = source_emos_dir / "history/*.nc"
+    truth_path = source_emos_dir / "truth/*.nc"
+
+    kgo_dir = acc.kgo_root() / "estimate-samos-coefficients"
+    output_path = tmp_path / "output.nc"
+    kgo_path = kgo_dir / "kgo_coordinates.nc"
+    gam_path = kgo_dir / "gam_configs/samos_gam.pkl"
+    gam_features = "projection_y_coordinate,projection_x_coordinate,height"
+
+    args = [
+        history_path,
+        truth_path,
+        gam_path,
+        "--truth-attribute",
+        "mosg__model_configuration=uk_det",
+        "--gam-features",
+        gam_features,
+        "--output",
+        output_path,
+    ]
+    run_cli(args)
+    acc.compare(
+        output_path, kgo_path, atol=COMPARE_EMOS_TOLERANCE, rtol=COMPARE_EMOS_TOLERANCE
+    )
+
+
+def test_normal_cube_gam_features(tmp_path):
+    """
+    Test estimate-samos-coefficients for a diagnostic with an assumed
+    normal distribution and additional features provided as a cube.
+    """
+    source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal"
+    history_path = source_emos_dir / "history/*.nc"
+    truth_path = source_emos_dir / "truth/*.nc"
+
+    kgo_dir = acc.kgo_root() / "estimate-samos-coefficients"
+    additional_features_path = kgo_dir / "additional_features/roughness_length.nc"
+    output_path = tmp_path / "output.nc"
+    kgo_path = kgo_dir / "kgo_extra_gam_feature.nc"
+    gam_path = kgo_dir / "gam_configs/samos_gam_additional_features.pkl"
+    gam_features = (
+        "projection_y_coordinate,projection_x_coordinate,vegetative_roughness_length"
+    )
+
+    args = [
+        history_path,
+        truth_path,
+        additional_features_path,
+        gam_path,
+        "--truth-attribute",
+        "mosg__model_configuration=uk_det",
+        "--gam-features",
+        gam_features,
+        "--output",
+        output_path,
+    ]
+    run_cli(args)
+    acc.compare(
+        output_path, kgo_path, atol=COMPARE_EMOS_TOLERANCE, rtol=COMPARE_EMOS_TOLERANCE
+    )
+
+
+def test_estimate_samos_coefficients_sites(tmp_path):
+    """
+    Test estimate-samos-coefficients for site data with a diagnostic with an
+    assumed normal distribution, where additional GAM features are coordinates
+    on the input cubes.
+    """
+    # The source data is from the estimate-emos-coefficients acceptance tests
+    source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal/sites"
+    history_path = source_emos_dir / "history/*.nc"
+    truth_path = source_emos_dir / "truth/*.nc"
+
+    kgo_dir = acc.kgo_root() / "estimate-samos-coefficients"
+    output_path = tmp_path / "output.nc"
+    kgo_path = kgo_dir / "kgo_sites.nc"
+    gam_path = kgo_dir / "gam_configs/samos_gam_sites.pkl"
+    gam_features = "latitude,longitude,height"
+
+    args = [
+        history_path,
+        truth_path,
+        gam_path,
+        "--truth-attribute",
+        "mosg__model_configuration=uk_det",
+        "--gam-features",
+        gam_features,
+        "--output",
+        output_path,
+    ]
+    run_cli(args)
+    acc.compare(
+        output_path, kgo_path, atol=COMPARE_EMOS_TOLERANCE, rtol=COMPARE_EMOS_TOLERANCE
+    )
diff --git a/improver_tests/acceptance/test_estimate_samos_coefficients_from_table.py b/improver_tests/acceptance/test_estimate_samos_coefficients_from_table.py
new file mode 100644
index 0000000000..6e6a4536f1
--- /dev/null
+++ b/improver_tests/acceptance/test_estimate_samos_coefficients_from_table.py
@@ -0,0 +1,147 @@
+# (C) Crown Copyright, Met Office. All rights reserved.
+#
+# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license.
+# See LICENSE in the root of the repository for full licensing details.
+""" +Tests for the estimate-samos-coefficients-from-table CLI +""" + +import pytest + +from . import acceptance as acc + +pytestmark = [pytest.mark.acc, acc.skip_if_kgo_missing] +CLI = acc.cli_name_with_dashes(__file__) +run_cli = acc.run_cli(CLI) + +for mod in ["pyarrow", "statsmodels"]: + pytest.importorskip(mod) + +# The EMOS estimation tolerance is defined in units of the variable being +# calibrated - not in terms of the EMOS coefficients produced by +# estimate-emos-coefficients and compared against KGOs here. +# See comments and CLI help messages in +# improver/cli/estimate_emos_coefficients.py for more detail. +EST_EMOS_TOLERANCE = 1e-4 + +# The EMOS coefficients are expected to vary by at most one order of magnitude +# more than the CRPS tolerance specified. +COMPARE_EMOS_TOLERANCE = EST_EMOS_TOLERANCE * 10 + +# Pre-convert to string for easier use in each test +EST_EMOS_TOL = str(EST_EMOS_TOLERANCE) + + +@pytest.mark.slow +def test_additional_features_coords(tmp_path): + """ + Test estimate-samos-coefficients-from-table with an example forecast and truth + table for screen temperature.Extra features for the GAMs are provided + as coordinates on the forecast and truth cubes. + """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-coefficients-from-table/" + kgo_path = kgo_dir / "kgo_coordinates.nc" + output_path = tmp_path / "output.nc" + + gam_config = kgo_dir / "gam_coordinates.pkl" + + compulsory_args = [history_path, truth_path, gam_config] + named_args = [ + "--gam-features", + "latitude,longitude,altitude", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--forecast-period", + "86400", + "--training-length", + "5", + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--output", + output_path, + ] + + run_cli(compulsory_args + named_args) + acc.compare( + output_path, kgo_path, atol=COMPARE_EMOS_TOLERANCE, rtol=COMPARE_EMOS_TOLERANCE + ) + + +@pytest.mark.slow +def test_additional_gam_features_cube(tmp_path): + """ + Test estimate-samos-coefficients-from-table with an example forecast and truth + table for screen temperature. Extra features for the GAMs are provided + as a cube. + """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-coefficients-from-table/" + kgo_path = kgo_dir / "kgo_gam_cube.nc" + output_path = tmp_path / "output.nc" + gam_additional_features = kgo_dir / "distance_to_water.nc" + gam_config = kgo_dir / "gam_coordinates.pkl" + + compulsory_args = [history_path, truth_path, gam_config, gam_additional_features] + named_args = [ + "--gam-features", + "latitude,longitude,distance_to_water", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--forecast-period", + "86400", + "--training-length", + "5", + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--output", + output_path, + ] + + run_cli(compulsory_args + named_args) + acc.compare( + output_path, kgo_path, atol=COMPARE_EMOS_TOLERANCE, rtol=COMPARE_EMOS_TOLERANCE + ) + + +@pytest.mark.slow +def test_return_none(tmp_path): + """ + Test that None is returned if cube cannot be created from table. 
+ """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table_quantiles" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-coefficients-from-table/" + output_path = tmp_path / "output.nc" + gam_config = kgo_dir / "gam_coordinates.pkl" + + compulsory_args = [history_path, truth_path, gam_config] + named_args = [ + "--gam-features", + "projection_y_coordinate,projection_x_coordinate,height", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--forecast-period", + "3600000", + "--training-length", + "5", + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--output", + output_path, + ] + assert run_cli(compulsory_args + named_args) is None diff --git a/improver_tests/acceptance/test_estimate_samos_gams.py b/improver_tests/acceptance/test_estimate_samos_gams.py new file mode 100644 index 0000000000..05b084ae90 --- /dev/null +++ b/improver_tests/acceptance/test_estimate_samos_gams.py @@ -0,0 +1,142 @@ +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. +""" +Tests for the estimate-samos-gams CLI +""" + +import pytest + +from . import acceptance as acc + +pytestmark = [pytest.mark.acc, acc.skip_if_kgo_missing] +CLI = acc.cli_name_with_dashes(__file__) +run_cli = acc.run_cli(CLI) + +pytest.importorskip("statsmodels") + +TOLERANCE = str(1e-4) + + +@pytest.mark.slow +def test_gam_features_on_cube(tmp_path): + """ + Test estimate-samos-gams-coefficients for diagnostic with assumed + normal distribution. + """ + source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal" + history_path = source_emos_dir / "history/*.nc" + truth_path = source_emos_dir / "truth/*.nc" + + kgo_dir = acc.kgo_root() / "estimate-samos-gam" + kgo_path = kgo_dir / "kgo.pkl" + model_specification_path = kgo_dir / "samos_model_spec_simple.json" + output_path = tmp_path / "output.pkl" + + gam_features = "projection_y_coordinate,projection_x_coordinate,height" + args = [ + history_path, + truth_path, + "--distribution", + "normal", + "--truth-attribute", + "mosg__model_configuration=uk_det", + "--tolerance", + TOLERANCE, + "--gam-features", + gam_features, + "--model-specification", + model_specification_path, + "--output", + output_path, + ] + run_cli(args) + + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") + + +def test_gam_cube_gam_features(tmp_path): + """ + Test estimate-samos-gams-coefficients for diagnostic with assumed + normal distribution and additional features provided as a cube. 
+ """ + source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal" + history_path = source_emos_dir / "history/*.nc" + truth_path = source_emos_dir / "truth/*.nc" + + kgo_dir = acc.kgo_root() / "estimate-samos-gam" + kgo_path = kgo_dir / "kgo.pkl" + additional_features_path = kgo_dir / "roughness_length.nc" + model_specification_path = kgo_dir / "samos_model_spec_simple.json" + output_path = tmp_path / "output.pkl" + + gam_features = ( + "projection_x_coordinate,projection_y_coordinate,vegetative_roughness_length" + ) + args = [ + history_path, + truth_path, + additional_features_path, + "--distribution", + "normal", + "--truth-attribute", + "mosg__model_configuration=uk_det", + "--tolerance", + TOLERANCE, + "--gam-features", + gam_features, + "--model-specification", + model_specification_path, + "--output", + output_path, + ] + run_cli(args) + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") + + +def test_gam_at_sites(): + """ + Test estimate-samos-gams-coefficients for diagnostic with assumed + normal distribution and additional features provided as a cube. + """ + source_emos_dir = acc.kgo_root() / "estimate-emos-coefficients/normal/sites" + history_path = source_emos_dir / "history/*.nc" + truth_path = source_emos_dir / "truth/*.nc" + + kgo_dir = acc.kgo_root() / "estimate-samos-gam" + kgo_path = kgo_dir / "kgo_sites.pkl" + model_specification_path = kgo_dir / "samos_model_spec_simple.json" + output_path = kgo_dir / "output.pkl" + + gam_features = "latitude,longitude,height" + args = [ + history_path, + truth_path, + "--distribution", + "normal", + "--truth-attribute", + "mosg__model_configuration=uk_det", + "--tolerance", + TOLERANCE, + "--gam-features", + gam_features, + "--model-specification", + model_specification_path, + "--output", + output_path, + ] + run_cli(args) + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") diff --git a/improver_tests/acceptance/test_estimate_samos_gams_from_table.py b/improver_tests/acceptance/test_estimate_samos_gams_from_table.py new file mode 100644 index 0000000000..928cef52af --- /dev/null +++ b/improver_tests/acceptance/test_estimate_samos_gams_from_table.py @@ -0,0 +1,169 @@ +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. +""" +Tests for the estimate-samos-gams-from-table CLI + +""" + +import pytest + +from . import acceptance as acc + +pytestmark = [pytest.mark.acc, acc.skip_if_kgo_missing] +CLI = acc.cli_name_with_dashes(__file__) +run_cli = acc.run_cli(CLI) + +for mod in ["pyarrow", "statsmodels"]: + pytest.importorskip(mod) + + +@pytest.mark.slow +def test_additional_features_coords( + tmp_path, +): + """ + Test estimate-samos-gams-from-table with an example forecast and truth + table for screen temperature. Extra features for the GAMs are provided + as coordinates on the forecast and truth cubes. 
+ """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-gams-from-table/" + kgo_path = kgo_dir / "kgo_coords.pkl" + + output_path = tmp_path / "output.pkl" + compulsory_args = [history_path, truth_path] + named_args = [ + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--forecast-period", + "86400", + "--training-length", + "5", + "--distribution", + "normal", + "--tolerance", + "1e-4", + "--gam-features", + "latitude,longitude,altitude", + "--model-specification", + kgo_dir / "samos_model_spec_simple.json", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--output", + output_path, + ] + + run_cli(compulsory_args + named_args) + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") + + +@pytest.mark.slow +def test_additional_features_cube( + tmp_path, +): + """ + Test estimate-samos-gams-from-table with an example forecast and truth + table for screen temperature. Extra features for the GAMs are provided + as a cube. + """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-gams-from-table/" + kgo_path = kgo_dir / "kgo_cubes.pkl" + additional_features = kgo_dir / "distance_to_water.nc" + + output_path = tmp_path / "output.pkl" + compulsory_args = [history_path, truth_path, additional_features] + named_args = [ + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--forecast-period", + "86400", + "--training-length", + "5", + "--distribution", + "normal", + "--tolerance", + "1e-4", + "--gam-features", + "latitude,longitude,distance_to_water", + "--model-specification", + kgo_dir / "samos_model_spec_simple.json", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--output", + output_path, + ] + + run_cli(compulsory_args + named_args) + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") + + +@pytest.mark.slow +def test_additional_features_cubes( + tmp_path, +): + """ + Test estimate-samos-gams-from-table with an example forecast and truth + table for screen temperature. Extra features for the GAMs are provided + by multiple cubes. This test demonstrates that the CLI can accept + multiple additional predictor cubes. 
+ """ + source_dir = acc.kgo_root() / "estimate-emos-coefficients-from-table/" + history_path = source_dir / "forecast_table" + truth_path = source_dir / "truth_table" + + kgo_dir = acc.kgo_root() / "estimate-samos-gams-from-table/" + kgo_path = kgo_dir / "kgo_cubes.pkl" + additional_features = [kgo_dir / "distance_to_water.nc", kgo_dir / "roughness.nc"] + + output_path = tmp_path / "output.pkl" + compulsory_args = [history_path, truth_path, *additional_features] + named_args = [ + "--diagnostic", + "temperature_at_screen_level", + "--cycletime", + "20210805T2100Z", + "--forecast-period", + "86400", + "--training-length", + "5", + "--distribution", + "normal", + "--tolerance", + "1e-4", + "--gam-features", + "latitude,longitude,distance_to_water,vegetative_roughness_length", + "--model-specification", + kgo_dir / "samos_model_spec_simple.json", + "--percentiles", + "10,20,30,40,50,60,70,80,90", + "--output", + output_path, + ] + + run_cli(compulsory_args + named_args) + # Compare the output with the known good output. This + # comparison only ensures that the string version of the + # pickled objects are the same, not the actual objects as + # there is no function to compare the GAM class objects. + acc.compare(output_path, kgo_path, file_type="generic_pickle") diff --git a/improver_tests/calibration/samos_calibration/helper_functions.py b/improver_tests/calibration/samos_calibration/helper_functions.py index f224326d20..0a35681b7d 100644 --- a/improver_tests/calibration/samos_calibration/helper_functions.py +++ b/improver_tests/calibration/samos_calibration/helper_functions.py @@ -16,6 +16,12 @@ set_up_variable_cube, ) +FORECAST_ATTRIBUTES = { + "title": "MOGREPS-UK Forecast", + "source": "Met Office Unified Model", + "institution": "Met Office", +} + def create_simple_cube( forecast_type: str, @@ -138,7 +144,7 @@ def create_cubes_for_gam_fitting( # Create array of random noise which increases with x and y, so that there is # some variance in the data to model in the standard deviation GAM. rng = np.random.RandomState(210825) # Set seed for reproducible results. - noise = rng.normal(loc=0.0, scale=addition / 30) + noise = rng.normal(loc=0.0, scale=0.05 + (addition / 30)) input_cube.data = input_cube.data + addition + noise additional_cubes = iris.cube.CubeList([]) diff --git a/improver_tests/calibration/samos_calibration/test_ApplySAMOS.py b/improver_tests/calibration/samos_calibration/test_ApplySAMOS.py new file mode 100644 index 0000000000..f990968d8d --- /dev/null +++ b/improver_tests/calibration/samos_calibration/test_ApplySAMOS.py @@ -0,0 +1,333 @@ +# (C) Crown Copyright, Met Office. All rights reserved. +# +# This file is part of 'IMPROVER' and is released under the BSD 3-Clause license. +# See LICENSE in the root of the repository for full licensing details. 
+"""Unit tests for the ApplySAMOS class within samos_calibration.py""" + +import numpy as np +import pytest +from iris.cube import CubeList + +from improver.calibration.samos_calibration import ( + ApplySAMOS, + TrainGAMsForSAMOS, +) +from improver.threshold import Threshold +from improver_tests.calibration.emos_calibration.test_ApplyEMOS import ( + build_coefficients_cubelist, +) +from improver_tests.calibration.samos_calibration.helper_functions import ( + FORECAST_ATTRIBUTES, + create_cubes_for_gam_fitting, +) + +EXPECTED_DATA_DICT = { + "real_real_emosalt_notgamalt": np.array( + [ + [ + [313.50665, 305.0316, 287.89963], + [280.07907, 287.8345, 296.38815], + [333.15, 333.15, 304.0257], + ], + [ + [313.4974, 304.8424, 288.25708], + [279.9784, 288.0995, 295.84637], + [333.15, 333.15, 302.26242], + ], + [ + [313.50204, 304.65323, 288.61453], + [280.17975, 287.967, 296.11728], + [333.15, 333.15, 303.14404], + ], + ], + dtype=np.float32, + ), + "real_real_notemosalt_notgamalt": np.array( + [ + [ + [273.1705, 280.7735, 287.9103], + [280.7542, 288.1842, 296.4416], + [288.2182, 295.7106, 304.0292], + ], + [ + [273.1613, 280.58432, 288.2678], + [280.65353, 288.44922, 295.8998], + [288.01764, 295.00946, 302.26596], + ], + [ + [273.1659, 280.39514, 288.62524], + [280.8549, 288.3167, 296.17072], + [288.41873, 296.41177, 303.1476], + ], + ], + dtype=np.float32, + ), + "real_real_emosalt_gamalt": np.array( + [ + [ + [313.50665, 305.0316, 287.89963], + [280.07907, 287.8345, 296.38815], + [333.15, 333.15, 304.0257], + ], + [ + [313.4974, 304.8424, 288.25708], + [279.9784, 288.0995, 295.84637], + [333.15, 333.15, 302.26242], + ], + [ + [313.50204, 304.65323, 288.61453], + [280.17975, 287.967, 296.11728], + [333.15, 333.15, 303.14404], + ], + ], + dtype=np.float32, + ), + "real_real_notemosalt_gamalt": np.array( + [ + [ + [273.1705, 280.77353, 287.91034], + [280.7542, 288.18423, 296.4416], + [288.2182, 295.7106, 304.02924], + ], + [ + [273.1613, 280.58432, 288.2678], + [280.65353, 288.44922, 295.8998], + [288.01764, 295.00946, 302.26596], + ], + [ + [273.1659, 280.39514, 288.62524], + [280.8549, 288.3167, 296.17072], + [288.41873, 296.41177, 303.1476], + ], + ], + dtype=np.float32, + ), + "real_prob_emosalt_notgamalt": np.array( + [ + [ + [1.0000, 1.0000, 0.6862], + [0.0000, 0.4333, 1.0000], + [1.0000, 1.0000, 1.0000], + ], + [ + [1.0000, 1.0000, 0.0000], + [0.0000, 0.0000, 0.0000], + [1.0000, 1.0000, 0.5439], + ], + ], + dtype=np.float32, + ), + "prob_prob_notemosalt_notgamalt": np.array( + [ + [ + [0.9772, 1.0000, 0.9974], + [1.0000, 0.9974, 0.9974], + [0.9974, 0.9974, 0.9974], + ], + [ + [0.0228, 0.9772, 0.9918], + [0.9772, 0.9918, 0.9918], + [0.9918, 0.9918, 0.9918], + ], + [ + [0.0000, 0.0228, 0.9772], + [0.0228, 0.9772, 0.9772], + [0.9772, 0.9772, 0.9772], + ], + ], + dtype=np.float32, + ), + "perc_perc_notemosalt_notgamalt": np.array( + [ + [ + [273.1613, 280.39514, 287.91034], + [280.65353, 288.18423, 295.8998], + [288.01764, 295.00946, 302.266], + ], + [ + [273.1659, 280.58432, 288.2678], + [280.7542, 288.3167, 296.17072], + [288.2182, 295.7106, 303.1476], + ], + [ + [273.1705, 280.7735, 288.62524], + [280.8549, 288.44922, 296.44162], + [288.41873, 296.41177, 304.02924], + ], + ], + dtype=np.float32, + ), + "perc_prob_notemosalt_notgamalt": np.array( + [ + [ + [0.0000, 0.0000, 0.6933], + [0.0000, 0.9466, 1.0000], + [0.7685, 1.0000, 1.0000], + ], + [ + [0.0000, 0.0000, 0.0000], + [0.0000, 0.0000, 0.0000], + [0.0000, 0.0000, 0.5450], + ], + ], + dtype=np.float32, + ), + 
"real_prob_notemosalt_notgamalt": np.array( + [ + [ + [0.0000, 0.0000, 0.6933], + [0.0000, 0.9466, 1.0000], + [0.7685, 1.0000, 1.0000], + ], + [ + [0.0000, 0.0000, 0.0000], + [0.0000, 0.0000, 0.0000], + [0.0000, 0.0000, 0.5450], + ], + ], + dtype=np.float32, + ), +} + + +def get_expected_result( + input_format, output_format, emos_include_altitude, gam_include_altitude +): + """Function to get the expected result of the process() method.""" + expected_name = "air_temperature" + if output_format == "probability": + expected_name = "probability_of_air_temperature_above_threshold" + + expected_dims = ( + ["air_temperature", "latitude", "longitude"] + if output_format == "probability" + else [output_format, "latitude", "longitude"] + ) + + emos_field = "emosalt" if emos_include_altitude else "notemosalt" + gam_field = "gamalt" if gam_include_altitude else "notgamalt" + expected_data_key = ( + f"{input_format[:4]}_{output_format[:4]}_{emos_field}_{gam_field}" + ) + try: + expected_data = EXPECTED_DATA_DICT[expected_data_key] + except KeyError: + expected_data = None + + return expected_data, expected_name, expected_dims + + +@pytest.mark.parametrize("percentiles", [None, 5, [5], [5, 10, 15]]) +def test__init__(percentiles): + """Test that the class initializes variables correctly.""" + # Skip test if pyGAM not available. + pytest.importorskip("pygam") + + if percentiles == 5: + msg = "'int' object is not iterable" + with pytest.raises(TypeError, match=msg): + ApplySAMOS(percentiles) + else: + result = ApplySAMOS(percentiles) + assert getattr(result, "percentiles") == percentiles + + +@pytest.mark.parametrize( + "input_format,output_format,emos_include_altitude,gam_include_altitude", + [ + ["realization", "realization", False, False], + ["realization", "realization", True, False], + ["realization", "realization", False, True], + ["realization", "realization", True, True], + ["realization", "probability", False, False], + ["realization", "probability", True, False], + ["percentile", "percentile", False, False], + ["percentile", "probability", False, False], + ["probability", "probability", False, False], + ], +) +def test_process( + input_format, output_format, emos_include_altitude, gam_include_altitude +): + """Test that the process method returns the expected results.""" + # Skip test if pyGAM not available. + pytest.importorskip("pygam") + + # Set up model terms for spatial predictors. 
+ model_specification = [["linear", [0], {}], ["linear", [1], {}]] + features = ["latitude", "longitude"] + n_spatial_points = 3 + n_realizations = 3 + n_times = 20 + + forecast_cube, additional_cubes = create_cubes_for_gam_fitting( + n_spatial_points=n_spatial_points, + n_realizations=n_realizations, + n_times=n_times, + include_altitude=emos_include_altitude, + fixed_forecast_period=True, + ) + + for key, value in FORECAST_ATTRIBUTES.items(): + forecast_cube.attributes[key] = value + + forecast_gams = TrainGAMsForSAMOS(model_specification).process( + forecast_cube, features, additional_cubes + ) + + forecast_slice = next(forecast_cube.slices_over(["forecast_reference_time"])) + if emos_include_altitude: + emos_coefficients = build_coefficients_cubelist( + forecast_slice, + [0, [0.9, 0.1], 0, 1], + CubeList([forecast_slice, additional_cubes[0]]), + ) + else: + emos_coefficients = build_coefficients_cubelist( + forecast_slice, + [0, 1, 0, 1], + CubeList([forecast_slice]), + ) + + if input_format == "realization": + process_kwargs = {} + elif input_format == "percentile": + forecast_slice.coord("realization").rename("percentile") + forecast_slice.coord("percentile").points = np.array([25, 50, 75]) + process_kwargs = { + "realizations_count": n_realizations, + } + elif input_format == "probability": + thresholds = [273, 278, 283] + forecast_slice = Threshold(thresholds, collapse_coord="realization").process( + forecast_slice + ) + process_kwargs = { + "realizations_count": n_realizations, + } + + if output_format == "probability" and not input_format == "probability": + thresholds = [288, 303] + prob_template = Threshold(thresholds, collapse_coord=input_format).process( + forecast_slice + ) + process_kwargs.update({"prob_template": prob_template}) + + expected_data, expected_name, expected_dims = get_expected_result( + input_format, output_format, emos_include_altitude, gam_include_altitude + ) + + result = ApplySAMOS(percentiles=None).process( + forecast=forecast_slice, + forecast_gams=forecast_gams, + truth_gams=forecast_gams, + gam_features=features, + emos_coefficients=emos_coefficients, + gam_additional_fields=None, + emos_additional_fields=additional_cubes, + **process_kwargs, + ) + + np.testing.assert_array_almost_equal(result.data, expected_data, decimal=4) + assert result.name() == expected_name + assert [c.name() for c in result.coords(dim_coords=True)] == expected_dims diff --git a/improver_tests/calibration/samos_calibration/test_TrainEMOSForSAMOS.py b/improver_tests/calibration/samos_calibration/test_TrainEMOSForSAMOS.py index 5ad3332f61..12696a50a2 100644 --- a/improver_tests/calibration/samos_calibration/test_TrainEMOSForSAMOS.py +++ b/improver_tests/calibration/samos_calibration/test_TrainEMOSForSAMOS.py @@ -7,7 +7,10 @@ import numpy as np import pytest -from improver.calibration.samos_calibration import TrainEMOSForSAMOS, TrainGAMsForSAMOS +from improver.calibration.samos_calibration import ( + TrainEMOSForSAMOS, + TrainGAMsForSAMOS, +) from improver_tests.calibration.samos_calibration.helper_functions import ( create_cubes_for_gam_fitting, create_simple_cube, @@ -42,91 +45,6 @@ def test__init__(kwargs): assert getattr(result, key) == kwargs[key] -@pytest.mark.parametrize("include_altitude", [False, True]) -def test_get_climatological_stats( - include_altitude, -): - """Test that the get_climatological_stats method returns the expected results.""" - # Skip test if pyGAM not available. - pytest.importorskip("pygam") - - # Set up model terms for spatial predictors. 
- model_specification = [["linear", [0], {}], ["linear", [1], {}]] - features = ["latitude", "longitude"] - n_spatial_points = 5 - n_realizations = 5 - n_times = 20 - - if include_altitude: - features.append("surface_altitude") - model_specification.append(["spline", [features.index("surface_altitude")], {}]) - - cube_for_gam, additional_cubes_for_gam = create_cubes_for_gam_fitting( - n_spatial_points=n_spatial_points, - n_realizations=n_realizations, - n_times=n_times, - include_altitude=include_altitude, - ) - - gams = TrainGAMsForSAMOS(model_specification).process( - cube_for_gam, features, additional_cubes_for_gam - ) - - cube_for_test, additional_cubes_for_test = create_cubes_for_gam_fitting( - n_spatial_points=2, - n_realizations=2, - n_times=1, - include_altitude=include_altitude, - ) - - result_mean, result_sd = TrainEMOSForSAMOS.get_climatological_stats( - cube_for_test, gams, features, additional_cubes_for_test - ) - - expected_mean = create_simple_cube( - forecast_type="gridded", - n_spatial_points=2, - n_realizations=2, - n_times=1, - fill_value=0.0, - ) - expected_sd = expected_mean.copy() - - if not include_altitude: - expected_mean.data = np.array( - [ - [[284.39363425, 288.14659092], [288.14237183, 291.8953285]], - [[284.39363425, 288.14659092], [288.14237183, 291.8953285]], - ], - dtype=np.float32, - ) - expected_sd.data = np.array( - [ - [[0.36190698, 0.49423461], [0.48704575, 0.61937337]], - [[0.36190698, 0.49423461], [0.48704575, 0.61937337]], - ], - dtype=np.float32, - ) - else: - expected_mean.data = np.array( - [ - [[274.41442381, 288.1640568], [278.16316139, 291.91279438]], - [[274.41442381, 288.1640568], [278.16316139, 291.91279438]], - ], - dtype=np.float32, - ) - expected_sd.data = np.array( - [ - [[0.36048627, 0.50103523], [0.48562504, 0.626174]], - [[0.36048627, 0.50103523], [0.48562504, 0.626174]], - ], - dtype=np.float32, - ) - - assert result_mean == expected_mean - assert result_sd == expected_sd - - def test_climate_anomaly_emos(): """Test that the climate_anomaly_emos method returns the expected results.""" # Skip test if pyGAM not available. @@ -229,9 +147,9 @@ def test_process(include_altitude): "emos_coefficient_delta", ] if include_altitude: - expected_data = [0.0450892, -0.1451314, 0.0, 1.05] + expected_data = [0.02605934, -0.11531556, 0.00025, 1.0] else: - expected_data = [-0.05576817, 0.08254312, 0.0, 1.0012305] + expected_data = [0.02422589, -0.11010934, 0.0, 1.0] for i, cube in enumerate(result): assert expected_names[i] == cube.name() diff --git a/improver_tests/calibration/samos_calibration/test_TrainGAMsForSAMOS.py b/improver_tests/calibration/samos_calibration/test_TrainGAMsForSAMOS.py index 41a726d03e..b5b15c56e8 100644 --- a/improver_tests/calibration/samos_calibration/test_TrainGAMsForSAMOS.py +++ b/improver_tests/calibration/samos_calibration/test_TrainGAMsForSAMOS.py @@ -286,7 +286,6 @@ def test_calculate_cube_statistics_exception(model_specification): "coordinate. The increments between points in the time coordinate " "were: \\[86400 86401\\]. The smallest increment was: 86400." 
) - with pytest.raises(ValueError, match=msg): TrainGAMsForSAMOS( model_specification=model_specification @@ -300,49 +299,49 @@ def test_calculate_cube_statistics_exception(model_specification): False, [["linear", [0], {}], ["linear", [1], {}]], 11, - np.array([288.14942357, 0.50000486], dtype=np.float64), + np.array([288.1492261877016, 0.5494898838013565], dtype=np.float64), ], [ True, [["linear", [0], {}], ["linear", [1], {}]], 11, - np.array([288.16681245, 0.50060569], dtype=np.float64), + np.array([288.16844872649284, 0.5496814185908583], dtype=np.float64), ], [ False, [["tensor", [0, 1], {}]], 11, - np.array([288.13047912, 0.48075810], dtype=np.float64), + np.array([288.1285965937152, 0.5290237015452911], dtype=np.float64), ], [ True, [["tensor", [0, 1], {}]], 11, - np.array([288.13571987, 0.48000211], dtype=np.float64), + np.array([288.13439901991666, 0.5280646018499808], dtype=np.float64), ], [ False, [["linear", [0], {}], ["linear", [1], {}]], 1, - np.array([288.16497963, 0.51062603], dtype=np.float64), + np.array([288.1659800896903, 0.561020575955558], dtype=np.float64), ], [ True, [["linear", [0], {}], ["linear", [1], {}]], 1, - np.array([288.15651954, 0.49548990], dtype=np.float64), + np.array([288.15797722833526, 0.5427105296998994], dtype=np.float64), ], [ False, [["tensor", [0, 1], {}]], 1, - np.array([287.99195197, 0.46191877], dtype=np.float64), + np.array([287.97628879844785, 0.5074109794042518], dtype=np.float64), ], [ True, [["tensor", [0, 1], {}]], 1, - np.array([287.97881919, 0.44321410], dtype=np.float64), + np.array([287.9622424769424, 0.48765295694648797], dtype=np.float64), ], ], ) diff --git a/improver_tests/calibration/samos_calibration/test_samos_utilities.py b/improver_tests/calibration/samos_calibration/test_samos_utilities.py index 3516b157e3..27c999b3cd 100644 --- a/improver_tests/calibration/samos_calibration/test_samos_utilities.py +++ b/improver_tests/calibration/samos_calibration/test_samos_utilities.py @@ -11,11 +11,13 @@ import numpy as np import pandas as pd import pytest -from iris.cube import Cube +from iris.cube import Cube, CubeList from pandas.testing import assert_frame_equal from improver.calibration.samos_calibration import ( + TrainGAMsForSAMOS, convert_dataframe_to_cube, + get_climatological_stats, prepare_data_for_gam, ) from improver.synthetic_data.set_up_test_cubes import ( @@ -23,6 +25,7 @@ set_up_variable_cube, ) from improver_tests.calibration.samos_calibration.helper_functions import ( + create_cubes_for_gam_fitting, create_simple_cube, ) @@ -163,7 +166,7 @@ def test_prepare_data_for_gam_gridded( set_up_kwargs=set_up_kwargs, ) - additional_cubes = [] + additional_cubes = CubeList() if include_altitude: additional_cubes.append(altitude_cube("gridded", set_up_kwargs)) surface_altitude = np.array([10.0, 20.0, 20.0, 10.0] * 2, dtype=np.float32) @@ -194,17 +197,23 @@ def test_prepare_data_for_gam_spot( fill_value=305.0, ) - additional_cubes = [] + additional_cubes = CubeList() if include_altitude: additional_cubes.append(altitude_cube("spot")) surface_altitude = np.array([10.0, 20.0] * 2, dtype=np.float32) spot_dataframe["surface_altitude"] = surface_altitude if include_land_fraction: - additional_cubes.append(land_fraction_cube("spot")) + land_fraction_cube_spot = land_fraction_cube("spot") + land_fraction_cube_spot.coord("wmo_id").points = np.array( + ["00000", "00003", "00002", "00001"], dtype="<U5" + ) + expect_emos_additional_fields = True if len(emos_feature_names) > 0 else False + + ( + result_forecast_cube, + result_truth_cube, + result_gam_additional_fields, + result_emos_coefficients, +
result_emos_additional_fields, + result_prob_template, + ) = split_cubes_for_samos( + merged_input_cubelist, + gam_features=gam_features, + truth_attribute="truth_attribute=truth", + expect_emos_coeffs=False, + expect_emos_fields=expect_emos_additional_fields, + ) + + # Check that the output forecast cube is as expected. + assert isinstance(result_forecast_cube, iris.cube.Cube) + assert result_forecast_cube.shape == (2, 3, 3) + assert "forecast_reference_time" in [ + c.name() for c in result_forecast_cube.coords() + ] + assert "forecast_period" in [c.name() for c in result_forecast_cube.coords()] + + # Check that the output truth cube is as expected. + assert isinstance(result_truth_cube, iris.cube.Cube) + assert result_truth_cube.shape == (2, 3, 3) + assert "forecast_reference_time" not in [ + c.name() for c in result_truth_cube.coords() + ] + assert "forecast_period" not in [c.name() for c in result_truth_cube.coords()] + assert result_truth_cube.attributes["truth_attribute"] == "truth" + + # Check that the output GAM additional fields cubes are as expected. + if len(gam_features) > 0: + assert len(result_gam_additional_fields) == len(gam_features) + for cube in result_gam_additional_fields: + assert cube.name() in gam_features + else: + assert result_gam_additional_fields is None + + # Check that the output EMOS additional fields cubes are as expected. + if len(emos_feature_names) > 0: + assert len(result_emos_additional_fields) == len(emos_feature_names) + for cube in result_emos_additional_fields: + assert cube.name() in emos_feature_names + else: + assert result_emos_additional_fields is None + + # Check all other outputs are None. + assert result_emos_coefficients is None + assert result_prob_template is None + + +@pytest.mark.parametrize("n_prob_templates", [1, 2]) +def test_split_cubes_for_samos_prob_template( + forecast_cubes_multi_time, + truth_cubes_multi_time, + probability_forecast_cubes_multi_time, + n_prob_templates, +): + """Test that the split_cubes_for_samos function correctly separates out input + cubes when a probability template cube is provided.""" + merged_input_cubelist = truth_cubes_multi_time.copy() + merged_input_cubelist.extend(forecast_cubes_multi_time) + + if n_prob_templates == 1: + merged_input_cubelist.append(probability_forecast_cubes_multi_time[0]) + + ( + result_forecast_cube, + result_truth_cube, + result_gam_additional_fields, + result_emos_coefficients, + result_emos_additional_fields, + result_prob_template, + ) = split_cubes_for_samos( + merged_input_cubelist, + gam_features=[], + truth_attribute="truth_attribute=truth", + expect_emos_coeffs=True, + expect_emos_fields=True, + ) + + # Check that the output forecast cube is as expected. + assert isinstance(result_forecast_cube, iris.cube.Cube) + assert result_forecast_cube.shape == (2, 3, 3) + assert "forecast_reference_time" in [ + c.name() for c in result_forecast_cube.coords() + ] + assert "forecast_period" in [c.name() for c in result_forecast_cube.coords()] + + # Check that the output truth cube is as expected. + assert isinstance(result_truth_cube, iris.cube.Cube) + assert result_truth_cube.shape == (2, 3, 3) + assert "forecast_reference_time" not in [ + c.name() for c in result_truth_cube.coords() + ] + assert "forecast_period" not in [c.name() for c in result_truth_cube.coords()] + assert result_truth_cube.attributes["truth_attribute"] == "truth" + + # Check that the output probability template cube is as expected. 
+ assert isinstance(result_prob_template, iris.cube.Cube) + assert result_prob_template.shape == (2, 3, 3) + assert "wind_speed" in [c.name() for c in result_prob_template.coords()] + assert "forecast_reference_time" in [ + c.name() for c in result_prob_template.coords() + ] + assert "forecast_period" in [c.name() for c in result_prob_template.coords()] + + # Check all other outputs are None. + assert result_gam_additional_fields is None + assert result_emos_coefficients is None + assert result_emos_additional_fields is None + + else: + probability_cubes = probability_forecast_cubes_multi_time.copy() + probability_cubes[1].rename("probability_of_air_temperature_above_threshold") + probability_cubes[1].coord("wind_speed").rename("air_temperature") + merged_input_cubelist.extend(probability_cubes) + + with pytest.raises( + IOError, match="Providing multiple probability cubes is not supported." + ): + split_cubes_for_samos( + merged_input_cubelist, + gam_features=[], + truth_attribute="truth_attribute=truth", + expect_emos_coeffs=True, + expect_emos_fields=True, + ) + + +@pytest.mark.skipif(not pyarrow_installed, reason="pyarrow not installed") +@pytest.mark.parametrize( + "include_forecast,include_truth", [(True, False), (False, True), (True, True)] +) +def test_identify_parquet_type_basic(tmp_path, include_forecast, include_truth): + """Test that this function correctly splits out input parquet files into forecasts + and truths.""" + merged_input_paths = [] + if include_forecast: + forecast_dir = create_multi_site_forecast_parquet_file( + tmp_path=tmp_path, + include_forecast_period=True, + ) + merged_input_paths.append(Path(forecast_dir)) + if include_truth: + truth_dir = create_multi_site_forecast_parquet_file( + tmp_path=tmp_path, + include_forecast_period=False, + ) + merged_input_paths.append(Path(truth_dir)) + + ( + result_forecast_paths, + result_truth_paths, + ) = identify_parquet_type(merged_input_paths) + + assert result_forecast_paths == ( + merged_input_paths[0] if include_forecast else None + ) + assert result_truth_paths == (merged_input_paths[-1] if include_truth else None) + + +@pytest.mark.skipif(not pyarrow_installed, reason="pyarrow not installed") +@pytest.mark.parametrize("include_netcdf", [True, False]) +@pytest.mark.parametrize("include_parquet", [True, False]) +@pytest.mark.parametrize("n_pickles", [0, 1, 2]) +def test_split_netcdf_parquet_pickle_basic( + tmp_path, forecast_cubes_multi_time, include_netcdf, include_parquet, n_pickles +): + """Test that this function correctly splits out input files into pickle, parquet + and netcdf files.""" + pytest.importorskip("pygam") + + merged_input_paths = [] + + if include_parquet: + forecast_dir = create_multi_site_forecast_parquet_file( + tmp_path=tmp_path, + include_forecast_period=True, + ) + merged_input_paths.append(Path(forecast_dir)) + + truth_dir = create_multi_site_forecast_parquet_file( + tmp_path=tmp_path, + include_forecast_period=False, + ) + merged_input_paths.append(Path(truth_dir)) + + if include_netcdf: + test_cube, netcdf_path = create_netcdf_file( + tmp_path=tmp_path, + ) + merged_input_paths.append(Path(netcdf_path)) + + if n_pickles > 0: + pickle_object, pickle_path = create_pickle_file( + tmp_path=tmp_path, filename="data1.pkl" + ) + merged_input_paths.append(Path(pickle_path)) + + if n_pickles == 2: + pickle_object, pickle_path = create_pickle_file( + tmp_path=tmp_path, filename="data2.pkl" + ) + merged_input_paths.append(Path(pickle_path)) + + msg = "Multiple pickle inputs have been provided. 
Only one is expected." + with pytest.raises(ValueError, match=msg): + split_netcdf_parquet_pickle(merged_input_paths) + else: + ( + result_cube, + result_parquets, + result_pickles, + ) = split_netcdf_parquet_pickle(merged_input_paths) + + if include_netcdf: + assert len(result_cube) == 1 + result_cube = result_cube[0] + # The save/load process adds a Conventions global attribute to the cube. + test_cube.attributes.globals["Conventions"] = "CF-1.7" + assert result_cube == test_cube + else: + assert result_cube is None + + if include_parquet: + # Parquets are returned as filepaths. + assert result_parquets == merged_input_paths[0:2] + else: + assert result_parquets is None + + if n_pickles == 1: + # The pickled object is a list containing 2 pygam GAMs. + for i in range(len(result_pickles)): + for key in ["max_iter", "tol", "fit_intercept"]: + assert ( + result_pickles[i].get_params()[key] + == pickle_object[i].get_params()[key] + ) + assert result_pickles[i].terms == pickle_object[i].terms + assert result_pickles[i].distribution == pickle_object[i].distribution + assert result_pickles[i].link == pickle_object[i].link + else: + assert result_pickles is None if __name__ == "__main__": diff --git a/improver_tests/calibration/utilities/test_utilities.py b/improver_tests/calibration/utilities/test_utilities.py index 9e6d6ff6ba..db2b7b1e9c 100644 --- a/improver_tests/calibration/utilities/test_utilities.py +++ b/improver_tests/calibration/utilities/test_utilities.py @@ -9,10 +9,15 @@ """ import datetime +import importlib +import re import unittest +from pathlib import Path import iris import numpy as np +import pandas as pd +import pytest from iris.coords import AuxCoord, DimCoord from iris.cube import CubeList from iris.tests import IrisTest @@ -25,23 +30,30 @@ check_forecast_consistency, check_predictor, convert_cube_data_to_2d, + convert_parquet_to_cube, create_unified_frt_coord, filter_non_matching_cubes, flatten_ignoring_masked_data, forecast_coords_match, get_frt_hours, merge_land_and_sea, + prepare_cube_no_calibration, ) from improver.metadata.constants.time_types import TIME_COORDS from improver.synthetic_data.set_up_test_cubes import ( add_coordinate, set_up_percentile_cube, + set_up_probability_cube, set_up_variable_cube, ) from improver.utilities.cube_manipulation import sort_coord_in_cube from ..emos_calibration.helper_functions import SetupCubes +pyarrow_installed = True +if not importlib.util.find_spec("pyarrow"): + pyarrow_installed = False + class Test_convert_cube_data_to_2d(IrisTest): """Test the convert_cube_data_to_2d utility.""" @@ -914,5 +926,374 @@ def test_spot_point_by_point_nans_alternative_proportion(self): ) +@pytest.fixture +def forecast_cube(): + return set_up_variable_cube( + data=np.array( + [[1.0, 2.0, 2.0], [2.0, 1.0, 3.0], [1.0, 3.0, 3.0]], dtype=np.float32 + ), + name="wind_speed", + units="m/s", + ) + + +@pytest.fixture +def forecast_percentile_cube(): + return set_up_percentile_cube( + data=np.array( + [ + [[1.0, 2.0, 2.0], [2.0, 1.0, 3.0], [1.0, 3.0, 3.0]], + [[3.0, 4.0, 4.0], [4.0, 3.0, 5.0], [3.0, 5.0, 5.0]], + ], + dtype=np.float32, + ), + percentiles=np.array([50, 60], dtype=np.float32), + name="wind_speed", + units="m/s", + ) + + +@pytest.fixture +def prob_template(): + return set_up_probability_cube( + data=np.ones((2, 3, 3), dtype=np.float32), + thresholds=[1.5, 2.5], + variable_name="wind_speed", + ) + + +@pytest.fixture +def emos_coefficient_cubes(): + # Set-up coefficient cubes + fp_names = ["wind_speed"] + predictor_index = DimCoord( + 
np.array(range(len(fp_names)), dtype=np.int32), + long_name="predictor_index", + units="1", + ) + dim_coords_and_dims = ((predictor_index, 0),) + predictor_name = AuxCoord(fp_names, long_name="predictor_name", units="no_unit") + aux_coords_and_dims = ((predictor_name, 0),) + + attributes = { + "diagnostic_standard_name": "wind_speed", + "distribution": "norm", + } + alpha = iris.cube.Cube( + np.array(0, dtype=np.float32), + long_name="emos_coefficients_alpha", + units="K", + attributes=attributes, + ) + beta = iris.cube.Cube( + np.array([0.5], dtype=np.float32), + long_name="emos_coefficients_beta", + units="1", + attributes=attributes, + dim_coords_and_dims=dim_coords_and_dims, + aux_coords_and_dims=aux_coords_and_dims, + ) + gamma = iris.cube.Cube( + np.array(0, dtype=np.float32), + long_name="emos_coefficients_gamma", + units="K", + attributes=attributes, + ) + delta = iris.cube.Cube( + np.array(1, dtype=np.float32), + long_name="emos_coefficients_delta", + units="1", + attributes=attributes, + ) + + return CubeList([alpha, beta, gamma, delta]) + + +def create_multi_forecast_period_forecast_parquet_file(tmp_path): + """Create a parquet file with multi-forecast period forecast data.""" + + data_dict = { + "percentile": [50, 50, 50, 50], + "forecast": [277, 270, 280, 269], + "altitude": [10, 83, 10, 83], + "blend_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 4, + "forecast_period": np.repeat( + [ + [pd.Timedelta(int(6 * 3.6e3), unit="seconds")], + [pd.Timedelta(int(12 * 3.6e3), unit="seconds")], + ], + 2, + ), + "forecast_reference_time": [pd.Timestamp("2017-01-02 00:00:00", tz="utc")] * 4, + "latitude": [60.1, 59.9, 60.1, 59.9], + "longitude": [1, 2, 1, 2], + "time": np.repeat( + [ + pd.Timestamp("2017-01-02 06:00:00", tz="utc"), + pd.Timestamp("2017-01-02 12:00:00", tz="utc"), + ], + 2, + ), + "wmo_id": ["03001", "03002", "03001", "03002"], + "station_id": ["03001", "03002", "03001", "03002"], + "cf_name": ["air_temperature"] * 4, + "units": ["K"] * 4, + "experiment": ["latestblend"] * 4, + "period": [pd.NaT] * 4, + "height": [1.5] * 4, + "diagnostic": ["temperature_at_screen_level"] * 4, + } + # Add wind speed to demonstrate filtering. 
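+ # Note: dict.copy() is shallow, but every key changed below is rebound to a + # new object, so the temperature data_dict above is left untouched.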
+ wind_speed_dict = data_dict.copy() + wind_speed_dict["forecast"] = [6, 16, 12, 15] + wind_speed_dict["cf_name"] = "wind_speed" + wind_speed_dict["units"] = "m s-1" + wind_speed_dict["diagnostic"] = "wind_speed_at_10m" + data_df = pd.DataFrame(data_dict) + wind_speed_df = pd.DataFrame(wind_speed_dict) + joined_df = pd.concat([data_df, wind_speed_df], ignore_index=True) + + output_dir = tmp_path / "forecast_parquet_files" + output_dir.mkdir(parents=True, exist_ok=True) + output_path = str(output_dir / "forecast.parquet") + joined_df.to_parquet(output_path, index=False, engine="pyarrow") + + return data_df, output_dir + + +def create_multi_forecast_period_truth_parquet_file(tmp_path): + """Create a parquet file with multi-forecast period truth data.""" + data_dict = { + "diagnostic": ["temperature_at_screen_level"] * 4, + "latitude": [60.1, 59.9, 60.1, 59.9], + "longitude": [1, 2, 1, 2], + "altitude": [10, 83, 10, 83], + "time": np.repeat( + [ + pd.Timestamp("2017-01-02 06:00:00", tz="utc"), + pd.Timestamp("2017-01-02 12:00:00", tz="utc"), + ], + 2, + ), + "wmo_id": ["03001", "03002", "03001", "03002"], + "ob_value": [280, 273, 284, 275], + } + wind_speed_dict = data_dict.copy() + wind_speed_dict["ob_value"] = [2, 11, 10, 14] + wind_speed_dict["diagnostic"] = "wind_speed_at_10m" + data_df = pd.DataFrame(data_dict) + wind_speed_df = pd.DataFrame(wind_speed_dict) + joined_df = pd.concat([data_df, wind_speed_df], ignore_index=True) + + output_dir = tmp_path / "truth_parquet_files" + output_dir.mkdir(parents=True, exist_ok=True) + output_path = str(output_dir / "truth.parquet") + joined_df.to_parquet(output_path, index=False, engine="pyarrow") + return data_df, output_dir + + +@pytest.mark.parametrize( + "include_coeffs,validity_times", + [ + [True, None], + [True, ["0400"]], # matches forecast cube validity time + [True, ["0500"]], # does not match forecast cube validity time + [False, None], + ], +) +def test_prepare_cube_no_calibration_basic( + forecast_cube, emos_coefficient_cubes, include_coeffs, validity_times +): + """Test that the function returns the expected result when coefficients are + provided with and without using the validity_time argument. + """ + result = prepare_cube_no_calibration( + forecast_cube, + emos_coefficients=emos_coefficient_cubes if include_coeffs else None, + validity_times=validity_times, + ) + + if not include_coeffs or validity_times == ["0500"]: + # No matching coefficients so should return the input cube with some + # additional metadata. + assert isinstance(result, iris.cube.Cube) + assert result.coords() == forecast_cube.coords() + assert np.all(result.data == forecast_cube.data) + assert result.attributes["comment"] == ( + "Warning: Calibration of this forecast has been attempted, however, no " + "calibration has been applied." + ) + else: + assert result is None + + +@pytest.mark.parametrize( + "include_coeffs,validity_times", + [ + [True, None], + [True, ["0400"]], # matches forecast cube validity time + [True, ["0500"]], # does not match forecast cube validity time + [False, None], + ], +) +def test_prepare_cube_no_calibration_prob_template( + forecast_cube, emos_coefficient_cubes, prob_template, include_coeffs, validity_times +): + """Test that the function returns the expected result when coefficients and a + probability template are provided with and without using the validity_time argument. 
+ """ + result = prepare_cube_no_calibration( + forecast_cube, + emos_coefficients=emos_coefficient_cubes if include_coeffs else None, + validity_times=validity_times, + prob_template=prob_template, + ) + + if not include_coeffs or validity_times == ["0500"]: + # No matching coefficients so should return the probability template cube with + # some additional metadata. + assert isinstance(result, iris.cube.Cube) + assert result.coords() == prob_template.coords() + assert np.all(result.data == prob_template.data) + assert result.attributes["comment"] == ( + "Warning: Calibration of this forecast has been attempted, however, no " + "calibration has been applied." + ) + else: + assert result is None + + +@pytest.mark.skipif(not pyarrow_installed, reason="pyarrow not installed") +@pytest.mark.parametrize( + "include_coeffs,validity_times", + [ + [True, None], + [True, ["0400"]], # matches forecast cube validity time + [True, ["0500"]], # does not match forecast cube validity time + [False, None], + ], +) +def test_prepare_cube_no_calibration_percentiles( + forecast_percentile_cube, emos_coefficient_cubes, include_coeffs, validity_times +): + """Test that the function returns the expected result when coefficients and a set + of percentiles are provided.""" + percentiles = [55] + + expected = set_up_percentile_cube( + data=np.array( + [ + [[2.0, 3.0, 3.0], [3.0, 2.0, 4.0], [2.0, 4.0, 4.0]], + ], + dtype=np.float32, + ), + percentiles=np.array(percentiles, dtype=np.float32), + name="wind_speed", + units="m/s", + ) + + result = prepare_cube_no_calibration( + forecast_percentile_cube, + emos_coefficients=emos_coefficient_cubes if include_coeffs else None, + validity_times=validity_times, + percentiles=percentiles, + ) + + if not include_coeffs or validity_times == ["0500"]: + # No matching coefficients so should return a cube with percentiles + # resampled. + assert isinstance(result, iris.cube.Cube) + assert result.coords() == expected.coords() + assert np.all(result.data == expected.data) + assert result.attributes["comment"] == ( + "Warning: Calibration of this forecast has been attempted, however, no " + "calibration has been applied." + ) + else: + assert result is None + + +@pytest.mark.skipif(not pyarrow_installed, reason="pyarrow not installed") +@pytest.mark.parametrize("cycletime", ["20170103T0000Z", "20170104T0000Z"]) +def test_convert_parquet_to_cube_basic(tmp_path, cycletime): + """Test that this function returns the expected cubes when provided with valid + inputs.""" + fcs_df, fcs_path = create_multi_forecast_period_forecast_parquet_file(tmp_path) + + truth_df, truth_path = create_multi_forecast_period_truth_parquet_file(tmp_path) + + fcs_cube, truth_cube = convert_parquet_to_cube( + Path(fcs_path), + Path(truth_path), + forecast_period=6 * 3.6e3, # seconds + cycletime=cycletime, + training_length=1, + diagnostic="temperature_at_screen_level", + percentiles=[50], + experiment="latestblend", + ) + + if cycletime == "20170103T0000Z": + # There is valid training data in the dataframes for this cycletime. 
+ assert isinstance(fcs_cube, iris.cube.Cube) + assert isinstance(truth_cube, iris.cube.Cube) + + for cube in [fcs_cube, truth_cube]: + assert cube.name() == "air_temperature" + assert ( + cube.coord("time").points + == pd.Timestamp("2017-01-02 06:00:00", tz="utc").timestamp() + ) + np.testing.assert_array_almost_equal( + cube.coord("altitude").points, np.array([10, 83], dtype=np.float32) + ) + np.testing.assert_array_almost_equal( + cube.coord("latitude").points, np.array([60.1, 59.9], dtype=np.float32) + ) + np.testing.assert_array_almost_equal( + cube.coord("longitude").points, np.array([1, 2], dtype=np.float32) + ) + np.testing.assert_array_equal( + cube.coord("wmo_id").points, np.array(["03001", "03002"]) + ) + + assert ( + fcs_cube.coord("forecast_reference_time").points + == pd.Timestamp("2017-01-02 00:00:00", tz="utc").timestamp() + ) + assert fcs_cube.coord("forecast_period").points == 6 * 3.6e3 + else: + # There is no valid training data in the dataframes for this cycletime. + assert fcs_cube is None + assert truth_cube is None + + +@pytest.mark.skipif(not pyarrow_installed, reason="pyarrow not installed") +def test_convert_parquet_to_cube_exception(tmp_path): + """Test that the correct exception is raised when filtering returns an empty truth + dataframe.""" + fcs_df, fcs_path = create_multi_forecast_period_forecast_parquet_file(tmp_path) + + truth_df, truth_path = create_multi_forecast_period_truth_parquet_file(tmp_path) + + # This input diagnostic does not exist in the forecast or truth dataframes. + diagnostic = "lwe_precipitation_rate" + msg = re.escape( + f"The requested filepath {truth_path} does not contain the requested contents: " + "[[('diagnostic', '==', 'lwe_precipitation_rate')]]" + ) + with pytest.raises(IOError, match=msg): + convert_parquet_to_cube( + Path(fcs_path), + Path(truth_path), + forecast_period=6 * 3.6e3, # seconds + cycletime="20170103T0000Z", + training_length=1, + diagnostic=diagnostic, + percentiles=[50], + experiment="latestblend", + ) + + if __name__ == "__main__": unittest.main()
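Reviewer note: the identify_parquet_type tests above rely on forecast parquet tables carrying a forecast_period column that truth tables lack. The sketch below illustrates that schema check; it assumes a single *.parquet file per directory, and classify_parquet_dirs is an illustrative name rather than the IMPROVER implementation.

    from pathlib import Path
    from typing import Iterable, Optional, Tuple

    import pyarrow.parquet as pq


    def classify_parquet_dirs(
        paths: Iterable[Path],
    ) -> Tuple[Optional[Path], Optional[Path]]:
        """Split parquet directories into (forecast, truth) by schema.

        Forecast tables include a forecast_period column; truth tables do not.
        """
        forecast_dir = truth_dir = None
        for path in paths:
            # Inspect the schema of the first parquet file in the directory;
            # read_schema only reads file metadata, not the row data.
            sample = next(Path(path).glob("*.parquet"), None)
            if sample is None:
                continue
            if "forecast_period" in pq.read_schema(sample).names:
                forecast_dir = Path(path)
            else:
                truth_dir = Path(path)
        return forecast_dir, truth_dir

Because only the parquet footer is inspected, this classification stays cheap even for large training tables.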