diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
index 8c3a6455d7..18e41c3c3f 100644
--- a/docs/src/whatsnew/latest.rst
+++ b/docs/src/whatsnew/latest.rst
@@ -92,6 +92,9 @@ This document explains the changes made to Iris for this release
 #. `@wjbenfold`_ added caching to the calculation of the points array in a
    :class:`~iris.coords.DimCoord` created using
    :meth:`~iris.coords.DimCoord.from_regular`. (:pull:`4698`)
+#. `@wjbenfold`_ introduced caching in :func:`iris._lazy_data._optimum_chunksize`
+   and :func:`iris.fileformats.pp_load_rules._epoch_date_hours` to reduce time
+   spent repeating calculations. (:pull:`4716`)
 
 
 🔥 Deprecations
diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py
index 27f09b2a35..ac7ae34511 100644
--- a/lib/iris/_lazy_data.py
+++ b/lib/iris/_lazy_data.py
@@ -10,7 +10,7 @@
 
 """
 
-from functools import wraps
+from functools import lru_cache, wraps
 
 import dask
 import dask.array as da
@@ -47,7 +47,14 @@ def is_lazy_data(data):
     return result
 
 
-def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")):
+@lru_cache
+def _optimum_chunksize_internals(
+    chunks,
+    shape,
+    limit=None,
+    dtype=np.dtype("f4"),
+    dask_array_chunksize=dask.config.get("array.chunk-size"),
+):
     """
     Reduce or increase an initial chunk shape to get close to a chosen ideal
     size, while prioritising the splitting of the earlier (outer) dimensions
@@ -86,7 +93,7 @@ def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")):
     # Set the chunksize limit.
     if limit is None:
         # Fetch the default 'optimal' chunksize from the dask config.
-        limit = dask.config.get("array.chunk-size")
+        limit = dask_array_chunksize
 
     # Convert to bytes
     limit = dask.utils.parse_bytes(limit)
@@ -146,6 +153,25 @@
     return tuple(result)
 
 
+@wraps(_optimum_chunksize_internals)
+def _optimum_chunksize(
+    chunks,
+    shape,
+    limit=None,
+    dtype=np.dtype("f4"),
+):
+    # Passing dask_array_chunksize as an explicit argument, and converting the
+    # list inputs to hashable tuples, makes the result depend only on the
+    # arguments themselves, so _optimum_chunksize_internals can be lru_cached.
+    return _optimum_chunksize_internals(
+        tuple(chunks),
+        tuple(shape),
+        limit=limit,
+        dtype=dtype,
+        dask_array_chunksize=dask.config.get("array.chunk-size"),
+    )
+
+
 def as_lazy_data(data, chunks=None, asarray=False):
     """
     Convert the input array `data` to a dask array.
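The `lru_cache`-plus-wrapper split above is a reusable pattern: keep every input to the cached function hashable and explicit, and let a thin public wrapper normalise anything mutable (the list arguments) or environment-dependent (the dask config). Here is a minimal runnable sketch of the same idea, with hypothetical names and a plain `_CONFIG` dict standing in for `dask.config`:

```python
from functools import lru_cache, wraps

# Hypothetical stand-in for dask.config.get("array.chunk-size").
_CONFIG = {"array.chunk-size": "128MiB"}


@lru_cache
def _pick_chunks_internals(chunks, shape, chunk_size):
    # Every argument is hashable and the config value arrives explicitly,
    # so the cache key captures everything the result depends on.
    return tuple(min(c, s) for c, s in zip(chunks, shape))


@wraps(_pick_chunks_internals)
def pick_chunks(chunks, shape):
    # Normalise unhashable list inputs to tuples and snapshot the current
    # config value, mirroring what _optimum_chunksize does in the patch.
    return _pick_chunks_internals(
        tuple(chunks), tuple(shape), _CONFIG["array.chunk-size"]
    )


pick_chunks([2, 4], [3, 3])  # computed
pick_chunks([2, 4], [3, 3])  # identical call, served from the cache
```

Note that the `dask_array_chunksize` default in the patch is evaluated once at import time, but since the public wrapper always passes the current config value, a user who changes `array.chunk-size` mid-session still gets a freshly keyed, correct result.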
diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py
index 82f40dbf14..c23772f235 100644
--- a/lib/iris/fileformats/pp_load_rules.py
+++ b/lib/iris/fileformats/pp_load_rules.py
@@ -9,6 +9,7 @@
 # SciTools/iris-code-generators:tools/gen_rules.py
 
 import calendar
+from functools import wraps
 
 import cf_units
 import numpy as np
@@ -514,7 +515,7 @@ def _new_coord_and_dims(
 
 _HOURS_UNIT = cf_units.Unit("hours")
 
 
-def _epoch_date_hours(epoch_hours_unit, datetime):
+def _epoch_date_hours_internals(epoch_hours_unit, datetime):
     """
     Return an 'hours since epoch' number for a date.
@@ -589,6 +590,30 @@
     return epoch_hours
 
 
+_epoch_date_hours_cache = {}
+_epoch_date_hours_cache_max_size = 128  # functools.lru_cache's default maxsize
+
+
+@wraps(_epoch_date_hours_internals)
+def _epoch_date_hours(epoch_hours_unit, datetime):
+    # Not using functools.lru_cache because its key lookup performs an equality
+    # check, which fails on datetime objects from different calendars.
+
+    key = (epoch_hours_unit, hash(datetime))
+
+    if key not in _epoch_date_hours_cache:
+        _epoch_date_hours_cache[key] = _epoch_date_hours_internals(
+            epoch_hours_unit, datetime
+        )
+
+    # Limit cache size with FIFO eviction (dicts preserve insertion order).
+    while len(_epoch_date_hours_cache) > _epoch_date_hours_cache_max_size:
+        oldest_item = next(iter(_epoch_date_hours_cache))
+        _epoch_date_hours_cache.pop(oldest_item, None)
+
+    return _epoch_date_hours_cache[key]
+
+
 def _convert_time_coords(
     lbcode,
     lbtim,
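`functools.lru_cache` compares keys for equality when hashes collide, and that comparison fails for cftime datetimes from different calendars, so the patch instead keys a hand-rolled dict on `hash(datetime)` (an `int`, which always compares safely) and caps the dict at the same size `lru_cache` defaults to. A minimal runnable sketch of the pattern, with hypothetical names and a plain `datetime.datetime` standing in for the cftime objects:

```python
import datetime

_cache = {}
_CACHE_MAX_SIZE = 128  # same default maxsize as functools.lru_cache

_EPOCH = datetime.datetime(1970, 1, 1)


def _hours_since_epoch_internals(dt):
    # Stand-in for the expensive conversion being cached.
    return (dt - _EPOCH).total_seconds() / 3600.0


def hours_since_epoch(dt):
    # Key on hash(dt) so any equality check the dict performs compares
    # ints, never the datetime objects themselves.
    key = hash(dt)
    if key not in _cache:
        _cache[key] = _hours_since_epoch_internals(dt)
    # FIFO eviction: dicts preserve insertion order (Python 3.7+), so
    # next(iter(...)) yields the oldest key.
    while len(_cache) > _CACHE_MAX_SIZE:
        _cache.pop(next(iter(_cache)), None)
    return _cache[key]


print(hours_since_epoch(datetime.datetime(1970, 1, 2)))  # 24.0 (computed)
print(hours_since_epoch(datetime.datetime(1970, 1, 2)))  # 24.0 (cached)
```

The trade-offs appear deliberate: two datetimes whose hashes collide would share a cache entry (the unit in the patch's key narrows that risk), and eviction is first-in-first-out rather than true LRU, which should be adequate for the bursts of identical date conversions that PP loading produces.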