diff --git a/.circleci/config.yml b/.circleci/config.yml
index 09e336c98b..6fecaf7675 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -47,6 +47,7 @@ jobs: - coverage-reporter/send_report: coverage-reports: 'test-reports/coverage.xml' project-token: $CODACY_PROJECT_TOKEN
+ skip: true # skip if project-token is not defined (i.e. on a fork)
install: # Test installation
diff --git a/doc/develop/fixing_data.rst b/doc/develop/fixing_data.rst
index 6dbe5fe96b..3008863a34 100644
--- a/doc/develop/fixing_data.rst
+++ b/doc/develop/fixing_data.rst
@@ -353,3 +353,21 @@ For example for monthly data, place the files in the ``/Tier3/MSWEP/latestversio For monthly data (V220), the data must be postfixed with the date, i.e. rename ``global_monthly_050deg.nc`` to ``global_monthly_050deg_197901-201710.nc`` For more info: http://www.gloh2o.org/
+
+.. _extra-facets-fixes:
+
+Use of extra facets in fixes
+============================
+Extra facets are a mechanism to provide additional information for certain kinds
+of data. The general approach is described in :ref:`extra_facets`. Here, we
+describe how they can be used in fixes to mold data into the form required by
+the applicable standard. For example, if the input data is part of an
+observational product that delivers surface temperature with a variable name of
+`t2m` inside a file named `2m_temperature_1950_monthly.nc`, but the same
+variable is called `tas` in the applicable standard, a fix can be created that
+reads the original variable from the correct file, and provides a renamed
+variable to the rest of the processing chain.
+
+Normally, the applicable standard for variables is CMIP6.
+
+For more details, refer to existing uses of this feature as examples.
diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst
index cd8a92eca9..accc7f87f4 100644
--- a/doc/quickstart/configure.rst
+++ b/doc/quickstart/configure.rst
@@ -320,3 +320,73 @@ following documentation section: These four items here are named people, references and projects listed in the ``config-references.yml`` file.
+
+.. _extra_facets:
+
+Extra Facets
+============
+
+Sometimes it is useful to provide extra information for the loading of data,
+particularly in the case of native model data, or observational or other data,
+that generally follows the established standards, but is not part of the big
+supported projects like CMIP, CORDEX, obs4MIPs.
+
+To support this, we provide the extra facets facilities. Facets are the
+key-value pairs described in :ref:`Datasets`. Extra facets allow for the
+addition of more details per project, dataset, mip table, and variable name.
+
+More precisely, one can provide this information in an extra yaml file, named
+`{project}-something.yml`, where `{project}` corresponds to the project as used
+by ESMValTool in :ref:`Datasets` and "something" is arbitrary.
+
+Format of the extra facets files
+--------------------------------
+The extra facets are given in a yaml file, whose file name identifies the
+project. Inside the file there is a hierarchy of nested dictionaries with the
+following levels. At the top there is the `dataset` facet, followed by the `mip`
+table, and finally the `short_name`. The leaf dictionary placed here gives the
+extra facets that will be made available to the data finder and the fix
+infrastructure. The following example illustrates the concept.
+
+.. _extra-facets-example-1:
+
+.. code-block:: yaml
+ :caption: Extra facet example file `native6-era5.yml`
+
+ ERA5:
+ Amon:
+ tas: {source_var_name: "t2m", cds_var_name: "2m_temperature"}
+
+
+Location of the extra facets files
+----------------------------------
+Extra facets files can be placed in several different places. When we use them
+to support a particular use-case within the ESMValTool project, they will be
+provided in the sub-folder `extra_facets` inside the package
+`esmvalcore._config`. If they are used from the user side, they can be either
+placed in `~/.esmvaltool/extra_facets` or in any other directory of the user's
+choosing. In that case this directory must be added to the `config-user.yml`
+file under the `extra_facets_dir` setting, which can take a single directory or
+a list of directories.
+
+The order in which the directories are searched is:
+
+1. The internal directory `esmvalcore._config/extra_facets`
+2. The default user directory `~/.esmvaltool/extra_facets`
+3. The custom user directories in the order in which they are given in
+ `config-user.yml`.
+
+The extra facets files within each of these directories are processed in
+lexicographical order according to their file name.
+
+In all cases it is allowed to supersede information from earlier files in later
+files. This makes it possible for the user to effectively override even internal
+default facets, for example to deal with local particularities in the data
+handling.
+
+Use of extra facets
+-------------------
+For extra facets to be useful, the information that they provide must be
+applied. There are fundamentally two places where this comes into play. One is
+:ref:`the datafinder`, the other is
+:ref:`fixes`.
diff --git a/doc/quickstart/find_data.rst b/doc/quickstart/find_data.rst
index e2fa0a61bd..05905c04a1 100644
--- a/doc/quickstart/find_data.rst
+++ b/doc/quickstart/find_data.rst
@@ -303,3 +303,35 @@ flexible concatenation between two cubes, depending on the particular setup: Note that two cube concatenation is the base operation of an iterative process of reducing multiple cubes from multiple data segments via cube concatenation ie if there is no time-overlapping data, the cubes concatenation is performed in one step.
+
+.. _extra-facets-data-finder:
+
+Use of extra facets in the datafinder
+=====================================
+Extra facets are a mechanism to provide additional information for certain kinds
+of data. The general approach is described in :ref:`extra_facets`. Here, we
+describe how they can be used to locate data files within the datafinder
+framework. This is useful to build paths for directory structures and file names
+that follow a different system than the established DRS for, e.g. CMIP.
+A common application is the location of variables in multi-variable files as
+often found in climate models' native output formats.
+
+Another use case is files that use different names for variables in their
+file name than for the netCDF4 variable name.
+
+To apply the extra facets for this purpose, simply use the corresponding tag in
+the applicable DRS inside the `config-developer.yml` file. For example, given
+the extra facets in :ref:`extra-facets-example-1`, one might write the
+following.
+
+.. _extra-facets-example-2:
+
+.. code-block:: yaml
+ :caption: Example drs use in `config-developer.yml`
+
+ native6:
+ input_file:
+ default: '{name_in_filename}*.nc'
+
+The same replacement mechanism can be employed wherever tags can be
+used, particularly in `input_dir` and `input_file`.
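To make the two YAML examples above concrete, here is a minimal sketch (not part of the patch) of how the new `get_extra_facets` helper added in `esmvalcore/_config/_config.py` would resolve the facets for one variable. The facet values in the comments are taken from the example file `native6-era5.yml` shown in the documentation and are assumptions, not verified output.

.. code-block:: python

    # Minimal sketch, assuming the example file ``native6-era5.yml`` is
    # present in one of the searched extra_facets directories.
    from esmvalcore._config import get_extra_facets

    extra = get_extra_facets(
        project="native6",    # selects files matching ``native6-*.yml``
        dataset="ERA5",
        mip="Amon",
        short_name="tas",
        extra_facets_dir=(),  # no additional user directories configured
    )
    # Expected result for the example file:
    # {'source_var_name': 't2m', 'cds_var_name': '2m_temperature'}
    # A tag such as ``{source_var_name}`` in ``input_dir`` or ``input_file``
    # of config-developer.yml could then be replaced with ``t2m``.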
diff --git a/doc/requirements.txt b/doc/requirements.txt index a948ebe1d0..1a1e74592b 100644 --- a/doc/requirements.txt +++ b/doc/requirements.txt @@ -1,6 +1,7 @@ autodocsumm dask[array] fiona +importlib_resources jinja2 netCDF4 numpy diff --git a/esmvalcore/_config/__init__.py b/esmvalcore/_config/__init__.py index 382237b990..1b7357ac1c 100644 --- a/esmvalcore/_config/__init__.py +++ b/esmvalcore/_config/__init__.py @@ -3,6 +3,7 @@ get_activity, get_institutes, get_project_config, + get_extra_facets, load_config_developer, read_config_developer_file, read_config_user_file, @@ -14,6 +15,7 @@ 'read_config_user_file', 'read_config_developer_file', 'load_config_developer', + 'get_extra_facets', 'get_project_config', 'get_institutes', 'get_activity', diff --git a/esmvalcore/_config/_config.py b/esmvalcore/_config/_config.py index 12e1abe8c8..f799200122 100644 --- a/esmvalcore/_config/_config.py +++ b/esmvalcore/_config/_config.py @@ -1,8 +1,11 @@ """Functions dealing with config-user.yml / config-developer.yml.""" +import collections.abc import datetime import logging import os +import sys import warnings +from functools import lru_cache from pathlib import Path import yaml @@ -13,6 +16,46 @@ CFG = {} +if sys.version_info[:2] >= (3, 9): + # pylint: disable=no-name-in-module + from importlib.resources import files as importlib_files +else: + from importlib_resources import files as importlib_files + + +def _deep_update(dictionary, update): + for key, value in update.items(): + if isinstance(value, collections.abc.Mapping): + dictionary[key] = _deep_update(dictionary.get(key, {}), value) + else: + dictionary[key] = value + return dictionary + + +@lru_cache +def _load_extra_facets(project, extra_facets_dir): + config = {} + config_paths = [ + importlib_files("esmvalcore._config") / "extra_facets", + Path.home() / ".esmvaltool" / "extra_facets", + ] + config_paths.extend([Path(p) for p in extra_facets_dir]) + for config_path in config_paths: + config_file_paths = config_path.glob(f"{project.lower()}-*.yml") + for config_file_path in sorted(config_file_paths): + logger.debug("Loading extra facets from %s", config_file_path) + with config_file_path.open() as config_file: + config_piece = yaml.safe_load(config_file) + if config_piece: + _deep_update(config, config_piece) + return config + + +def get_extra_facets(project, dataset, mip, short_name, extra_facets_dir): + """Read configuration files with additional variable information.""" + project_details = _load_extra_facets(project, extra_facets_dir) + return project_details.get(dataset, {}).get(mip, {}).get(short_name, {}) + def read_config_user_file(config_file, folder_name, options=None): """Read config user file and store settings in a dictionary.""" @@ -61,6 +104,7 @@ def read_config_user_file(config_file, folder_name, options=None): 'output_file_type': 'png', 'output_dir': 'esmvaltool_output', 'auxiliary_data_dir': 'auxiliary_data', + 'extra_facets_dir': tuple(), 'save_intermediary_cubes': False, 'remove_preproc_dir': True, 'max_parallel_tasks': None, @@ -83,6 +127,12 @@ def read_config_user_file(config_file, folder_name, options=None): cfg['output_dir'] = _normalize_path(cfg['output_dir']) cfg['auxiliary_data_dir'] = _normalize_path(cfg['auxiliary_data_dir']) + if isinstance(cfg['extra_facets_dir'], str): + cfg['extra_facets_dir'] = (_normalize_path(cfg['extra_facets_dir']), ) + else: + cfg['extra_facets_dir'] = tuple( + _normalize_path(p) for p in cfg['extra_facets_dir']) + cfg['config_developer_file'] = _normalize_path( 
cfg['config_developer_file']) diff --git a/esmvalcore/_recipe.py b/esmvalcore/_recipe.py index 3e094c9ed4..fcf06502f4 100644 --- a/esmvalcore/_recipe.py +++ b/esmvalcore/_recipe.py @@ -12,7 +12,13 @@ from . import __version__ from . import _recipe_checks as check -from ._config import TAGS, get_activity, get_institutes, get_project_config +from ._config import ( + TAGS, + get_activity, + get_extra_facets, + get_institutes, + get_project_config, +) from ._data_finder import ( get_input_filelist, get_output_file, @@ -93,6 +99,13 @@ def _add_cmor_info(variable, override=False): check.variable(variable, required_keys=cmor_keys) +def _add_extra_facets(variable, extra_facets_dir): + extra_facets = get_extra_facets(variable["project"], variable["dataset"], + variable["mip"], variable["short_name"], + extra_facets_dir) + _augment(variable, extra_facets) + + def _special_name_to_dataset(variable, special_name): """Convert special names to dataset names.""" if special_name in ('reference_dataset', 'alternative_dataset'): @@ -137,6 +150,7 @@ def _update_target_levels(variable, variables, settings, config_user): else: variable_data = _get_dataset_info(dataset, variables) filename = _dataset_to_file(variable_data, config_user) + fix_dir = f"{os.path.splitext(variable_data['filename'])[0]}_fixed" settings['extract_levels']['levels'] = get_reference_levels( filename=filename, project=variable_data['project'], @@ -144,8 +158,7 @@ def _update_target_levels(variable, variables, settings, config_user): short_name=variable_data['short_name'], mip=variable_data['mip'], frequency=variable_data['frequency'], - fix_dir=os.path.splitext(variable_data['filename'])[0] + - '_fixed', + fix_dir=fix_dir, ) @@ -261,12 +274,7 @@ def _get_default_settings(variable, config_user, derive=False): settings['concatenate'] = {} # Configure fixes - fix = { - 'project': variable['project'], - 'dataset': variable['dataset'], - 'short_name': variable['short_name'], - 'mip': variable['mip'], - } + fix = deepcopy(variable) # File fixes fix_dir = os.path.splitext(variable['filename'])[0] + '_fixed' settings['fix_file'] = dict(fix) @@ -349,14 +357,12 @@ def _search_fx_mip(tables, found_mip, variable, fx_info, config_user): found_mip = True fx_info['mip'] = mip fx_info = _add_fxvar_keys(fx_info, variable) - logger.debug( - "For fx variable '%s', found table '%s'", - fx_info['short_name'], mip) + logger.debug("For fx variable '%s', found table '%s'", + fx_info['short_name'], mip) fx_files = _get_input_files(fx_info, config_user)[0] if fx_files: - logger.debug( - "Found fx variables '%s':\n%s", - fx_info['short_name'], pformat(fx_files)) + logger.debug("Found fx variables '%s':\n%s", + fx_info['short_name'], pformat(fx_files)) return found_mip, fx_info, fx_files @@ -369,17 +375,17 @@ def _get_fx_files(variable, fx_info, config_user): try: get_project_config(var_project) except ValueError: - raise RecipeError( - f"Requested fx variable '{fx_info['short_name']}' " - f"with parent variable '{variable}' does not have " - f"a '{var_project}' project in config-developer.") + raise RecipeError(f"Requested fx variable '{fx_info['short_name']}' " + f"with parent variable '{variable}' does not have " + f"a '{var_project}' project in config-developer.") project_tables = CMOR_TABLES[var_project].tables # force only the mip declared by user found_mip = False if not fx_info['mip']: - found_mip, fx_info, fx_files = _search_fx_mip( - project_tables, found_mip, variable, fx_info, config_user) + found_mip, fx_info, fx_files = _search_fx_mip(project_tables, + 
found_mip, variable, + fx_info, config_user) else: fx_cmor = project_tables[fx_info['mip']].get(fx_info['short_name']) if fx_cmor: @@ -395,8 +401,8 @@ def _get_fx_files(variable, fx_info, config_user): # flag a warning if not fx_files: - logger.warning( - "Missing data for fx variable '%s'", fx_info['short_name']) + logger.warning("Missing data for fx variable '%s'", + fx_info['short_name']) # allow for empty lists corrected for by NE masks if fx_files: @@ -448,7 +454,10 @@ def _update_fx_files(step_name, settings, variable, config_user, fx_vars): def _fx_list_to_dict(fx_vars): - """Convert fx list to dictionary. To be deprecated at some point.""" + """Convert fx list to dictionary. + + To be deprecated at some point. + """ user_fx_vars = {} for fx_var in fx_vars: if isinstance(fx_var, dict): @@ -461,6 +470,7 @@ def _fx_list_to_dict(fx_vars): def _update_fx_settings(settings, variable, config_user): """Update fx settings depending on the needed method.""" + # get fx variables either from user defined attribute or fixed def _get_fx_vars_from_attribute(step_settings, step_name): user_fx_vars = step_settings.get('fx_variables') @@ -1012,12 +1022,13 @@ def _initialize_diagnostics(self, raw_diagnostics, raw_datasets): for name, raw_diagnostic in raw_diagnostics.items(): diagnostic = {} diagnostic['name'] = name + additional_datasets = raw_diagnostic.get('additional_datasets', []) + datasets = (raw_datasets + additional_datasets) diagnostic['preprocessor_output'] = \ self._initialize_preprocessor_output( name, raw_diagnostic.get('variables', {}), - raw_datasets + - raw_diagnostic.get('additional_datasets', [])) + datasets) variable_names = tuple(raw_diagnostic.get('variables', {})) diagnostic['scripts'] = self._initialize_scripts( name, raw_diagnostic.get('scripts'), variable_names) @@ -1041,8 +1052,8 @@ def _initialize_datasets(raw_datasets): @staticmethod def _expand_tag(variables, input_tag): - """ - Expand tags such as ensemble members or stardates to multiple datasets. + """Expand tags such as ensemble members or stardates to multiple + datasets. Expansion only supports ensembles defined as strings, not lists. """ @@ -1110,6 +1121,7 @@ def _initialize_variables(self, raw_variable, raw_datasets): if 'fx' not in raw_variable.get('mip', ''): required_keys.update({'start_year', 'end_year'}) for variable in variables: + _add_extra_facets(variable, self._cfg['extra_facets_dir']) if 'institute' not in variable: institute = get_institutes(variable) if institute: diff --git a/esmvalcore/cmor/_fixes/fix.py b/esmvalcore/cmor/_fixes/fix.py index 4595c3914f..3e1e9a8a00 100644 --- a/esmvalcore/cmor/_fixes/fix.py +++ b/esmvalcore/cmor/_fixes/fix.py @@ -1,4 +1,4 @@ -"""Contains the base class for dataset fixes""" +"""Contains the base class for dataset fixes.""" import importlib import inspect import os @@ -7,23 +7,25 @@ class Fix: - """ - Base class for dataset fixes. - """ - def __init__(self, vardef): + """Base class for dataset fixes.""" + def __init__(self, vardef, extra_facets=None): """Initialize fix object. Parameters ---------- vardef: str CMOR table entry - + extra_facets: dict, optional + Extra facets are mainly used for data outside of the big projects + like CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`. """ self.vardef = vardef + if extra_facets is None: + extra_facets = {} + self.extra_facets = extra_facets def fix_file(self, filepath, output_dir): - """ - Apply fixes to the files prior to creating the cube. + """Apply fixes to the files prior to creating the cube. 
Should be used only to fix errors that prevent loading or can not be fixed in the cube (i.e. those related with missing_value @@ -34,7 +36,7 @@ def fix_file(self, filepath, output_dir): filepath: str file to fix output_dir: str - path to the folder to store the fixe files, if required + path to the folder to store the fixed files, if required Returns ------- @@ -42,13 +44,11 @@ def fix_file(self, filepath, output_dir): Path to the corrected file. It can be different from the original filepath if a fix has been applied, but if not it should be the original filepath - """ return filepath def fix_metadata(self, cubes): - """ - Apply fixes to the metadata of the cube. + """Apply fixes to the metadata of the cube. Changes applied here must not require data loading. @@ -63,13 +63,11 @@ def fix_metadata(self, cubes): ------- iris.cube.CubeList Fixed cubes. They can be different instances. - """ return cubes def get_cube_from_list(self, cubes, short_name=None): - """ - Get a cube from the list with a given short name. + """Get a cube from the list with a given short name. Parameters ---------- @@ -96,8 +94,7 @@ def get_cube_from_list(self, cubes, short_name=None): raise Exception('Cube for variable "{}" not found'.format(short_name)) def fix_data(self, cube): - """ - Apply fixes to the data of the cube. + """Apply fixes to the data of the cube. These fixes should be applied before checking the data. @@ -110,7 +107,6 @@ def fix_data(self, cube): ------- iris.cube.Cube Fixed cube. It can be a difference instance. - """ return cube @@ -121,9 +117,8 @@ def __ne__(self, other): return not self.__eq__(other) @staticmethod - def get_fixes(project, dataset, mip, short_name): - """ - Get the fixes that must be applied for a given dataset. + def get_fixes(project, dataset, mip, short_name, extra_facets=None): + """Get the fixes that must be applied for a given dataset. It will look for them at the module esmvalcore.cmor._fixes.PROJECT in the file DATASET, and get @@ -142,6 +137,9 @@ def get_fixes(project, dataset, mip, short_name): dataset: str mip: str short_name: str + extra_facets: dict, optional + Extra facets are mainly used for data outside of the big projects + like CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`. Returns ------- @@ -155,6 +153,9 @@ def get_fixes(project, dataset, mip, short_name): dataset = dataset.replace('-', '_').lower() short_name = short_name.replace('-', '_').lower() + if extra_facets is None: + extra_facets = {} + fixes = [] try: fixes_module = importlib.import_module( @@ -164,7 +165,7 @@ def get_fixes(project, dataset, mip, short_name): classes = dict((name.lower(), value) for name, value in classes) for fix_name in (short_name, mip.lower(), 'allvars'): try: - fixes.append(classes[fix_name](vardef)) + fixes.append(classes[fix_name](vardef, extra_facets)) except KeyError: pass except ImportError: @@ -173,8 +174,7 @@ def get_fixes(project, dataset, mip, short_name): @staticmethod def get_fixed_filepath(output_dir, filepath): - """ - Get the filepath for the fixed file + """Get the filepath for the fixed file. Parameters ---------- diff --git a/esmvalcore/cmor/fix.py b/esmvalcore/cmor/fix.py index 23cc98e4f9..7fb957fd26 100644 --- a/esmvalcore/cmor/fix.py +++ b/esmvalcore/cmor/fix.py @@ -15,7 +15,8 @@ logger = logging.getLogger(__name__) -def fix_file(file, short_name, project, dataset, mip, output_dir): +def fix_file(file, short_name, project, dataset, mip, output_dir, + **extra_facets): """Fix files before ESMValTool can load them. 
This fixes are only for issues that prevent iris from loading the cube or @@ -33,6 +34,9 @@ def fix_file(file, short_name, project, dataset, mip, output_dir): dataset:str output_dir: str Output directory for fixed files + **extra_facets: dict, optional + Extra facets are mainly used for data outside of the big projects like + CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`. Returns ------- @@ -42,7 +46,8 @@ def fix_file(file, short_name, project, dataset, mip, output_dir): for fix in Fix.get_fixes(project=project, dataset=dataset, mip=mip, - short_name=short_name): + short_name=short_name, + extra_facets=extra_facets): file = fix.fix_file(file, output_dir) return file @@ -53,7 +58,8 @@ def fix_metadata(cubes, dataset, mip, frequency=None, - check_level=CheckLevels.DEFAULT): + check_level=CheckLevels.DEFAULT, + **extra_facets): """Fix cube metadata if fixes are required and check it anyway. This method collects all the relevant fixes for a given variable, applies @@ -78,6 +84,9 @@ def fix_metadata(cubes, Variable's data frequency, if available check_level: CheckLevels Level of strictness of the checks. Set to default. + **extra_facets: dict, optional + Extra facets are mainly used for data outside of the big projects like + CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`. Returns ------- @@ -92,7 +101,8 @@ def fix_metadata(cubes, fixes = Fix.get_fixes(project=project, dataset=dataset, mip=mip, - short_name=short_name) + short_name=short_name, + extra_facets=extra_facets) fixed_cubes = [] by_file = defaultdict(list) for cube in cubes: @@ -147,7 +157,8 @@ def fix_data(cube, dataset, mip, frequency=None, - check_level=CheckLevels.DEFAULT): + check_level=CheckLevels.DEFAULT, + **extra_facets): """Fix cube data if fixes add present and check it anyway. This method assumes that metadata is already fixed and checked. @@ -171,6 +182,9 @@ def fix_data(cube, Variable's data frequency, if available check_level: CheckLevels Level of strictness of the checks. Set to default. + **extra_facets: dict, optional + Extra facets are mainly used for data outside of the big projects like + CMIP, CORDEX, obs4MIPs. For details, see :ref:`extra_facets`. Returns ------- @@ -185,7 +199,8 @@ def fix_data(cube, for fix in Fix.get_fixes(project=project, dataset=dataset, mip=mip, - short_name=short_name): + short_name=short_name, + extra_facets=extra_facets): cube = fix.fix_data(cube) checker = _get_cmor_checker(frequency=frequency, table=project, diff --git a/esmvalcore/experimental/config/_config_validators.py b/esmvalcore/experimental/config/_config_validators.py index d83a26f331..4c1492c42f 100644 --- a/esmvalcore/experimental/config/_config_validators.py +++ b/esmvalcore/experimental/config/_config_validators.py @@ -60,23 +60,22 @@ def _listify_validator(scalar_validator, allow_stringlist=False, *, n_items=None, - docstring=None): + docstring=None, + return_type=list): """Apply the validator to a list.""" def func(inp): if isinstance(inp, str): try: - inp = [ + inp = return_type( scalar_validator(val.strip()) for val in inp.split(',') - if val.strip() - ] + if val.strip()) except Exception: if allow_stringlist: # Sometimes, a list of colors might be a single string # of single-letter colornames. So give that a shot. 
- inp = [ + inp = return_type( scalar_validator(val.strip()) for val in inp - if val.strip() - ] + if val.strip()) else: raise # Allow any ordered sequence type -- generators, np.ndarray, pd.Series @@ -87,10 +86,9 @@ def func(inp): # behavior of filtering out any empty strings (behavior was # from the original validate_stringlist()), while allowing # any non-string/text scalar values such as numbers and arrays. - inp = [ + inp = return_type( scalar_validator(val) for val in inp - if not isinstance(val, str) or val - ] + if not isinstance(val, str) or val) else: raise ValidationError( f"Expected str or other non-set iterable, but got {inp}") @@ -166,6 +164,10 @@ def chained(value): validate_pathlist = _listify_validator(validate_path, docstring='Return a list of paths.') +validate_pathtuple = _listify_validator(validate_path, + docstring='Return a tuple of paths.', + return_type=tuple) + validate_int_positive = _chain_validator(validate_int, validate_positive) validate_int_positive_or_none = _make_type_validator(validate_int_positive, allow_none=True) @@ -259,6 +261,7 @@ def deprecate(func, variable, version: str = None): 'exit_on_warning': validate_bool, 'output_dir': validate_path, 'auxiliary_data_dir': validate_path, + 'extra_facets_dir': validate_pathtuple, 'compress_netcdf': validate_bool, 'save_intermediary_cubes': validate_bool, 'remove_preproc_dir': validate_bool, diff --git a/esmvalcore/preprocessor/__init__.py b/esmvalcore/preprocessor/__init__.py index 9fe86d140e..43940f3c4d 100644 --- a/esmvalcore/preprocessor/__init__.py +++ b/esmvalcore/preprocessor/__init__.py @@ -212,13 +212,14 @@ def check_preprocessor_settings(settings): function = function = globals()[step] argspec = inspect.getfullargspec(function) args = argspec.args[1:] - # Check for invalid arguments - invalid_args = set(settings[step]) - set(args) - if invalid_args: - raise ValueError( - "Invalid argument(s): {} encountered for preprocessor " - "function {}. \nValid arguments are: [{}]".format( - ', '.join(invalid_args), step, ', '.join(args))) + if not (argspec.varargs or argspec.varkw): + # Check for invalid arguments + invalid_args = set(settings[step]) - set(args) + if invalid_args: + raise ValueError( + "Invalid argument(s): {} encountered for preprocessor " + "function {}. 
\nValid arguments are: [{}]".format( + ', '.join(invalid_args), step, ', '.join(args))) # Check for missing arguments defaults = argspec.defaults diff --git a/esmvalcore/preprocessor/_ancillary_vars.py b/esmvalcore/preprocessor/_ancillary_vars.py index 53a53d529a..6309d35d43 100644 --- a/esmvalcore/preprocessor/_ancillary_vars.py +++ b/esmvalcore/preprocessor/_ancillary_vars.py @@ -1,13 +1,13 @@ """Preprocessor functions for ancillary variables and cell measures.""" import logging -import iris import dask.array as da +import iris -from esmvalcore.preprocessor._io import load, concatenate_callback, concatenate -from esmvalcore.cmor.fix import fix_metadata, fix_data -from esmvalcore.cmor.check import cmor_check_metadata, cmor_check_data +from esmvalcore.cmor.check import cmor_check_data, cmor_check_metadata +from esmvalcore.cmor.fix import fix_data, fix_metadata +from esmvalcore.preprocessor._io import concatenate, concatenate_callback, load logger = logging.getLogger(__name__) @@ -16,17 +16,16 @@ def _load_fx(var_cube, fx_info, check_level): """Load and CMOR-check fx variables.""" fx_cubes = iris.cube.CubeList() + project = fx_info['project'] + mip = fx_info['mip'] + short_name = fx_info['short_name'] + freq = fx_info['frequency'] + for fx_file in fx_info['filename']: loaded_cube = load(fx_file, callback=concatenate_callback) - short_name = fx_info['short_name'] - project = fx_info['project'] - dataset = fx_info['dataset'] - mip = fx_info['mip'] - freq = fx_info['frequency'] - loaded_cube = fix_metadata(loaded_cube, short_name=short_name, - project=project, dataset=dataset, - mip=mip, frequency=freq, - check_level=check_level) + loaded_cube = fix_metadata(loaded_cube, + check_level=check_level, + **fx_info) fx_cubes.append(loaded_cube[0]) fx_cube = concatenate(fx_cubes) @@ -38,12 +37,13 @@ def _load_fx(var_cube, fx_info, check_level): short_name=short_name, frequency=freq, check_level=check_level) - fx_cube = fix_data(fx_cube, short_name=short_name, project=project, - dataset=dataset, mip=mip, frequency=freq, - check_level=check_level) + fx_cube = fix_data(fx_cube, check_level=check_level, **fx_info) - fx_cube = cmor_check_data(fx_cube, cmor_table=project, mip=mip, - short_name=fx_cube.var_name, frequency=freq, + fx_cube = cmor_check_data(fx_cube, + cmor_table=project, + mip=mip, + short_name=fx_cube.var_name, + frequency=freq, check_level=check_level) return fx_cube @@ -61,9 +61,8 @@ def _is_fx_broadcastable(fx_cube, cube): def add_cell_measure(cube, fx_cube, measure): - """ - Broadcast fx_cube and add it as a cell_measure in - the cube containing the data. + """Broadcast fx_cube and add it as a cell_measure in the cube containing + the data. Parameters ---------- @@ -102,14 +101,13 @@ def add_cell_measure(cube, fx_cube, measure): var_name=fx_cube.var_name, attributes=fx_cube.attributes) cube.add_cell_measure(measure, range(0, measure.ndim)) - logger.debug('Added %s as cell measure in cube of %s.', - fx_cube.var_name, cube.var_name) + logger.debug('Added %s as cell measure in cube of %s.', fx_cube.var_name, + cube.var_name) def add_ancillary_variable(cube, fx_cube): - """ - Broadcast fx_cube and add it as an ancillary_variable in - the cube containing the data. + """Broadcast fx_cube and add it as an ancillary_variable in the cube + containing the data. 
Parameters ---------- @@ -142,10 +140,9 @@ def add_ancillary_variable(cube, fx_cube): def add_fx_variables(cube, fx_variables, check_level): - """ - Load requested fx files, check with CMOR standards and add the - fx variables as cell measures or ancillary variables in - the cube containing the data. + """Load requested fx files, check with CMOR standards and add the fx + variables as cell measures or ancillary variables in the cube containing + the data. Parameters ---------- @@ -189,8 +186,7 @@ def add_fx_variables(cube, fx_variables, check_level): def remove_fx_variables(cube): - """ - Remove fx variables present as cell measures or ancillary variables in + """Remove fx variables present as cell measures or ancillary variables in the cube containing the data. Parameters diff --git a/setup.py b/setup.py index 12f992c827..66ba906f08 100755 --- a/setup.py +++ b/setup.py @@ -34,6 +34,7 @@ 'dask[array]', 'fiona', 'fire', + "importlib_resources;python_version<'3.9'", 'jinja2', 'nc-time-axis', # needed by iris.plot 'netCDF4', diff --git a/tests/integration/test_recipe.py b/tests/integration/test_recipe.py index 8d2f932e4e..dee11ed554 100644 --- a/tests/integration/test_recipe.py +++ b/tests/integration/test_recipe.py @@ -115,35 +115,86 @@ def create_test_file(filename, tracking_id=None): iris.save(cube, filename) -def _get_default_settings_for_chl(fix_dir, save_filename): +def _get_default_settings_for_chl(fix_dir, save_filename, preprocessor): """Get default preprocessor settings for chl.""" + standard_name = ('mass_concentration_of_phytoplankton_' + 'expressed_as_chlorophyll_in_sea_water') defaults = { 'load': { 'callback': concatenate_callback, }, 'concatenate': {}, 'fix_file': { - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'chl', + 'diagnostic': 'diagnostic_name', + 'end_year': 2005, + 'ensemble': 'r1i1p1', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), + 'frequency': 'yr', + 'institute': ['CCCma'], + 'long_name': 'Total Chlorophyll Mass Concentration', 'mip': 'Oyr', + 'modeling_realm': ['ocnBgchem'], + 'original_short_name': 'chl', 'output_dir': fix_dir, + 'preprocessor': preprocessor, + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'chl', + 'standard_name': standard_name, + 'start_year': 2000, + 'units': 'kg m-3', + 'variable_group': 'chl', }, 'fix_data': { 'check_level': CheckLevels.DEFAULT, - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'chl', - 'mip': 'Oyr', + 'diagnostic': 'diagnostic_name', + 'end_year': 2005, + 'ensemble': 'r1i1p1', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), 'frequency': 'yr', + 'institute': ['CCCma'], + 'long_name': 'Total Chlorophyll Mass Concentration', + 'mip': 'Oyr', + 'modeling_realm': ['ocnBgchem'], + 'original_short_name': 'chl', + 'preprocessor': preprocessor, + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'chl', + 'standard_name': standard_name, + 'start_year': 2000, + 'units': 'kg m-3', + 'variable_group': 'chl', }, 'fix_metadata': { 'check_level': CheckLevels.DEFAULT, - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'chl', - 'mip': 'Oyr', + 'diagnostic': 'diagnostic_name', + 'end_year': 2005, + 'ensemble': 'r1i1p1', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), 'frequency': 'yr', + 'institute': ['CCCma'], + 'long_name': 'Total Chlorophyll Mass Concentration', + 'mip': 'Oyr', + 'modeling_realm': ['ocnBgchem'], + 'original_short_name': 
'chl', + 'preprocessor': preprocessor, + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'chl', + 'standard_name': standard_name, + 'start_year': 2000, + 'units': 'kg m-3', + 'variable_group': 'chl', }, 'clip_start_end_year': { 'start_year': 2000, @@ -475,7 +526,8 @@ def test_default_preprocessor(tmp_path, patched_datafinder, config_user): fix_dir = os.path.join( preproc_dir, 'CMIP5_CanESM2_Oyr_historical_r1i1p1_chl_2000-2005_fixed') - defaults = _get_default_settings_for_chl(fix_dir, product.filename) + defaults = _get_default_settings_for_chl(fix_dir, product.filename, + 'default') assert product.settings == defaults @@ -515,7 +567,8 @@ def test_default_preprocessor_custom_order(tmp_path, patched_datafinder, fix_dir = os.path.join( preproc_dir, 'CMIP5_CanESM2_Oyr_historical_r1i1p1_chl_2000-2005_fixed') - defaults = _get_default_settings_for_chl(fix_dir, product.filename) + defaults = _get_default_settings_for_chl(fix_dir, product.filename, + 'default_custom_order') assert product.settings == defaults @@ -553,27 +606,70 @@ def test_default_fx_preprocessor(tmp_path, patched_datafinder, config_user): }, 'concatenate': {}, 'fix_file': { - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'sftlf', + 'diagnostic': 'diagnostic_name', + 'ensemble': 'r0i0p0', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), + 'frequency': 'fx', + 'institute': ['CCCma'], + 'long_name': 'Land Area Fraction', 'mip': 'fx', + 'modeling_realm': ['atmos'], + 'original_short_name': 'sftlf', 'output_dir': fix_dir, + 'preprocessor': 'default', + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'sftlf', + 'standard_name': 'land_area_fraction', + 'units': '%', + 'variable_group': 'sftlf' }, 'fix_data': { 'check_level': CheckLevels.DEFAULT, - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'sftlf', - 'mip': 'fx', + 'diagnostic': 'diagnostic_name', + 'ensemble': 'r0i0p0', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), 'frequency': 'fx', + 'institute': ['CCCma'], + 'long_name': 'Land Area Fraction', + 'mip': 'fx', + 'modeling_realm': ['atmos'], + 'original_short_name': 'sftlf', + 'preprocessor': 'default', + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'sftlf', + 'standard_name': 'land_area_fraction', + 'units': '%', + 'variable_group': 'sftlf' }, 'fix_metadata': { 'check_level': CheckLevels.DEFAULT, - 'project': 'CMIP5', + 'alias': 'CanESM2', 'dataset': 'CanESM2', - 'short_name': 'sftlf', - 'mip': 'fx', + 'diagnostic': 'diagnostic_name', + 'ensemble': 'r0i0p0', + 'exp': 'historical', + 'filename': fix_dir.replace('_fixed', '.nc'), 'frequency': 'fx', + 'institute': ['CCCma'], + 'long_name': 'Land Area Fraction', + 'mip': 'fx', + 'modeling_realm': ['atmos'], + 'original_short_name': 'sftlf', + 'preprocessor': 'default', + 'project': 'CMIP5', + 'recipe_dataset_index': 0, + 'short_name': 'sftlf', + 'standard_name': 'land_area_fraction', + 'units': '%', + 'variable_group': 'sftlf' }, 'cmor_check_metadata': { 'check_level': CheckLevels.DEFAULT, diff --git a/tests/sample_data/experimental/test_run_recipe.py b/tests/sample_data/experimental/test_run_recipe.py index d5efc210a5..e78eae94d4 100644 --- a/tests/sample_data/experimental/test_run_recipe.py +++ b/tests/sample_data/experimental/test_run_recipe.py @@ -53,7 +53,10 @@ def test_run_recipe(task, recipe, tmp_path): assert isinstance(recipe, Recipe) assert isinstance(recipe._repr_html_(), str) - output = recipe.run(task=task) 
+ session = CFG.start_session(recipe.path.stem) + session['extra_facets_dir'] = [] + + output = recipe.run(task=task, session=session) assert len(output) > 0 assert isinstance(output, RecipeOutput) @@ -85,6 +88,9 @@ def test_run_recipe_diagnostic_failing(recipe, tmp_path): CFG['output_dir'] = tmp_path + session = CFG.start_session(recipe.path.stem) + session['extra_facets_dir'] = [] + with pytest.raises(RecipeError): task = 'example/non-existant' - _ = recipe.run(task) + _ = recipe.run(task, session) diff --git a/tests/sample_data/extra_facets/override/test6-01.yml b/tests/sample_data/extra_facets/override/test6-01.yml new file mode 100644 index 0000000000..3f375d1314 --- /dev/null +++ b/tests/sample_data/extra_facets/override/test6-01.yml @@ -0,0 +1,12 @@ +--- +PROJECT1: + Amon: + tas: + source_var_name: "t2m" + cds_var_name: "temperature_2m" + uas: + source_var_name: "u10n" + cds_var_name: "10m_u-component_of_neutral_wind" + vas: + source_var_name: "v10n" + cds_var_name: "10m_v-component_of_neutral_wind" diff --git a/tests/sample_data/extra_facets/override/test6-02.yml b/tests/sample_data/extra_facets/override/test6-02.yml new file mode 100644 index 0000000000..7cf8a552cd --- /dev/null +++ b/tests/sample_data/extra_facets/override/test6-02.yml @@ -0,0 +1,6 @@ +--- +PROJECT1: + Amon: + vas: + source_var_name: "10v" + cds_var_name: "v-component_of_neutral_wind_at_10m" diff --git a/tests/sample_data/extra_facets/simple/test6-01.yml b/tests/sample_data/extra_facets/simple/test6-01.yml new file mode 100644 index 0000000000..d940b8c96b --- /dev/null +++ b/tests/sample_data/extra_facets/simple/test6-01.yml @@ -0,0 +1,5 @@ +--- +PROJECT1: + Amon: + tas: {source_var_name: "2t", cds_var_name: "2m_temperature"} + psl: {source_var_name: "msl", cds_var_name: "mean_sea_level_pressure"} diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py new file mode 100644 index 0000000000..708923890e --- /dev/null +++ b/tests/unit/test_config.py @@ -0,0 +1,61 @@ +from pathlib import Path + +import pytest + +from esmvalcore._config._config import ( + _deep_update, + _load_extra_facets, + importlib_files, +) + +TEST_DEEP_UPDATE = [ + ([{}], {}), + ([dict(a=1, b=2), dict(a=3)], dict(a=3, b=2)), + ([ + dict(a=dict(b=1, c=dict(d=2)), e=dict(f=4, g=5)), + dict(a=dict(b=2, c=3)), + ], dict(a=dict(b=2, c=3), e=dict(f=4, g=5))), +] + + +@pytest.mark.parametrize('dictionaries, expected_merged', TEST_DEEP_UPDATE) +def test_deep_update(dictionaries, expected_merged): + merged = dictionaries[0] + for update in dictionaries[1:]: + merged = _deep_update(merged, update) + assert expected_merged == merged + + +BASE_PATH = importlib_files('tests') +BASE_PATH /= Path('sample_data') / Path('extra_facets') # type: ignore + +TEST_LOAD_EXTRA_FACETS = [ + ('test-nonexistent', tuple(), {}), + ('test-nonexistent', (BASE_PATH / 'simple', ), {}), # type: ignore + ( + 'test6', + (BASE_PATH / 'simple', ), # type: ignore + dict(PROJECT1=dict(Amon=dict( + tas=dict(cds_var_name='2m_temperature', source_var_name='2t'), + psl=dict(cds_var_name='mean_sea_level_pressure', + source_var_name='msl'))))), + ( + 'test6', + (BASE_PATH / 'simple', BASE_PATH / 'override'), # type: ignore + dict(PROJECT1=dict(Amon=dict( + tas=dict(cds_var_name='temperature_2m', source_var_name='t2m'), + psl=dict(cds_var_name='mean_sea_level_pressure', + source_var_name='msl'), + uas=dict(cds_var_name='10m_u-component_of_neutral_wind', + source_var_name='u10n'), + vas=dict(cds_var_name='v-component_of_neutral_wind_at_10m', + source_var_name='10v'), + )))), +] + + 
+@pytest.mark.parametrize('project, extra_facets_dir, expected', + TEST_LOAD_EXTRA_FACETS) +def test_load_extra_facets(project, extra_facets_dir, expected): + extra_facets = _load_extra_facets(project, extra_facets_dir) + assert extra_facets == expected
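To round off, here is a hedged sketch of how a fix class could consume the new `extra_facets` argument introduced on the `Fix` base class. The class name, the `source_var_name` key, and the renaming logic are hypothetical illustrations modelled on the documentation added in this patch; they are not code the patch contains.

.. code-block:: python

    # Hypothetical example only: a fix that renames a native variable using
    # the extra facets passed to the Fix base class in this patch.
    from esmvalcore.cmor._fixes.fix import Fix


    class Tas(Fix):
        """Rename a native 2 m temperature variable to its CMOR short name."""

        def fix_metadata(self, cubes):
            # ``source_var_name`` is an assumed key, matching the example
            # extra facets file ``native6-era5.yml`` from the documentation.
            source_name = self.extra_facets.get('source_var_name', 'tas')
            for cube in cubes:
                if cube.var_name == source_name:
                    # Adopt the short name defined by the CMOR table entry.
                    cube.var_name = self.vardef.short_name
            return cubes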