From eeae96d973b81d610014521db652ed369e161eca Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Tue, 26 Apr 2022 12:59:45 +0100 Subject: [PATCH 01/13] Caching --- lib/iris/_lazy_data.py | 13 ++++++++++++- lib/iris/coords.py | 18 ++++++++++++++++++ lib/iris/fileformats/pp_load_rules.py | 16 +++++++++------- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 27f09b2a35..9ec877b963 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -10,7 +10,7 @@ """ -from functools import wraps +from functools import lru_cache, wraps import dask import dask.array as da @@ -48,6 +48,17 @@ def is_lazy_data(data): def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): + if isinstance(chunks, list): + chunks = tuple(chunks) + if isinstance(shape, list): + shape = tuple(shape) + return _optimum_chunksize_internals(chunks, shape, limit, dtype) + + +@lru_cache +def _optimum_chunksize_internals( + chunks, shape, limit=None, dtype=np.dtype("f4") +): """ Reduce or increase an initial chunk shape to get close to a chosen ideal size, while prioritising the splitting of the earlier (outer) dimensions diff --git a/lib/iris/coords.py b/lib/iris/coords.py index 0a1aecb983..f810a934e7 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2921,6 +2921,24 @@ def xml_element(self, doc): return element +_dim_coord_cache = {} + + +def dim_coord_from_regular(*args, **kwargs): + coord_system = kwargs.pop("coord_system", None) + key = (args, tuple(kwargs.items())) + if key in _dim_coord_cache: + cached_coord, cached_coord_system = _dim_coord_cache[key] + if coord_system == cached_coord_system: + return cached_coord + else: + new_coord = DimCoord.from_regular( + *args, coord_system=coord_system, **kwargs + ) + _dim_coord_cache[key] = (new_coord, coord_system) + return new_coord + + class AuxCoord(Coord): """ A CF auxiliary coordinate. diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 82f40dbf14..4ce1b34950 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -9,12 +9,13 @@ # SciTools/iris-code-generators:tools/gen_rules.py import calendar +from functools import lru_cache import cf_units import numpy as np from iris.aux_factory import HybridHeightFactory, HybridPressureFactory -from iris.coords import AuxCoord, CellMethod, DimCoord +from iris.coords import AuxCoord, CellMethod, DimCoord, dim_coord_from_regular from iris.fileformats._pp_lbproc_pairs import LBPROC_MAP from iris.fileformats.rules import ( ConversionMetadata, @@ -514,6 +515,7 @@ def _new_coord_and_dims( _HOURS_UNIT = cf_units.Unit("hours") +@lru_cache def _epoch_date_hours(epoch_hours_unit, datetime): """ Return an 'hours since epoch' number for a date. 
@@ -1120,7 +1122,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1141,7 +1143,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1163,7 +1165,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzy, f.bdy, f.lbrow, @@ -1183,7 +1185,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzy, f.bdy, f.lbrow, @@ -1270,7 +1272,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1380,7 +1382,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - DimCoord.from_regular( + dim_coord_from_regular( f.bzx, f.bdx, f.lbnpt, long_name="site_number", units="1" ), 1, From 5214d07f317d14d73a53e5dc105b63cc4cad8de6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Tue, 26 Apr 2022 16:33:39 +0100 Subject: [PATCH 02/13] Move _optimum_chunksize docstring --- lib/iris/_lazy_data.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 9ec877b963..e43490e158 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -48,17 +48,6 @@ def is_lazy_data(data): def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): - if isinstance(chunks, list): - chunks = tuple(chunks) - if isinstance(shape, list): - shape = tuple(shape) - return _optimum_chunksize_internals(chunks, shape, limit, dtype) - - -@lru_cache -def _optimum_chunksize_internals( - chunks, shape, limit=None, dtype=np.dtype("f4") -): """ Reduce or increase an initial chunk shape to get close to a chosen ideal size, while prioritising the splitting of the earlier (outer) dimensions @@ -66,9 +55,9 @@ def _optimum_chunksize_internals( Args: - * chunks (tuple of int, or None): + * chunks (iterable of int, or None): Pre-existing chunk shape of the target data : None if unknown. - * shape (tuple of int): + * shape (iterable of int): The full array shape of the target data. * limit (int): The 'ideal' target chunk size, in bytes. Default from dask.config. @@ -94,6 +83,17 @@ def _optimum_chunksize_internals( "chunks = [c[0] for c in normalise_chunks('auto', ...)]". """ + if isinstance(chunks, list): + chunks = tuple(chunks) + if isinstance(shape, list): + shape = tuple(shape) + return _optimum_chunksize_internals(chunks, shape, limit, dtype) + + +@lru_cache +def _optimum_chunksize_internals( + chunks, shape, limit=None, dtype=np.dtype("f4") +): # Set the chunksize limit. if limit is None: # Fetch the default 'optimal' chunksize from the dask config. 
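
The split introduced in patches 01-02 — a thin public wrapper that normalises unhashable list arguments to tuples before delegating to an lru_cache-decorated internals function — can be seen in isolation in the sketch below. The names expensive/_expensive_internals are invented for illustration only and do not exist in Iris; the real pair at this point in the series is _optimum_chunksize/_optimum_chunksize_internals shown above.

    from functools import lru_cache


    @lru_cache
    def _expensive_internals(chunks, shape):
        # Stand-in for a costly, pure computation such as chunk-size tuning.
        # lru_cache needs every argument to be hashable, hence tuples only.
        return tuple(min(c, s) for c, s in zip(chunks, shape))


    def expensive(chunks, shape):
        # Thin wrapper: convert list arguments to tuples so the call is
        # hashable and equal inputs share a cache entry in the internals.
        if isinstance(chunks, list):
            chunks = tuple(chunks)
        if isinstance(shape, list):
            shape = tuple(shape)
        return _expensive_internals(chunks, shape)


    expensive((2, 4), (10, 20))  # first call: computed and cached
    expensive([2, 4], [10, 20])  # equal inputs passed as lists: cache hit
    print(_expensive_internals.cache_info().hits)  # 1
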
From 31c7e9180c72ed352e9d03a5006051148d79b2f2 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Tue, 26 Apr 2022 17:13:11 +0100 Subject: [PATCH 03/13] Should be copying coords because they're mutable --- lib/iris/coords.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/coords.py b/lib/iris/coords.py index f810a934e7..063ca3e470 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2930,7 +2930,7 @@ def dim_coord_from_regular(*args, **kwargs): if key in _dim_coord_cache: cached_coord, cached_coord_system = _dim_coord_cache[key] if coord_system == cached_coord_system: - return cached_coord + return cached_coord.copy() else: new_coord = DimCoord.from_regular( *args, coord_system=coord_system, **kwargs From 87920d7e45b2fa7582c86c73d2525bfac76762ea Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Tue, 26 Apr 2022 17:22:36 +0100 Subject: [PATCH 04/13] Handle failure of datetime comparisons gracefully --- lib/iris/fileformats/pp_load_rules.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 4ce1b34950..cc11d400a8 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -515,8 +515,7 @@ def _new_coord_and_dims( _HOURS_UNIT = cf_units.Unit("hours") -@lru_cache -def _epoch_date_hours(epoch_hours_unit, datetime): +def _epoch_date_hours_base(epoch_hours_unit, datetime): """ Return an 'hours since epoch' number for a date. @@ -591,6 +590,16 @@ def _epoch_date_hours(epoch_hours_unit, datetime): return epoch_hours +def _epoch_date_hours(*args, **kwargs): + try: + return _epoch_date_hours_cached(*args, **kwargs) + except TypeError: + return _epoch_date_hours_base(*args, **kwargs) + + +_epoch_date_hours_cached = lru_cache(_epoch_date_hours_base) + + def _convert_time_coords( lbcode, lbtim, From 6e386a13e42e49851f6cf0d073cfb5ca25ad0a2a Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 27 Apr 2022 13:28:15 +0100 Subject: [PATCH 05/13] Review fixes and more careful checks on mutable objects --- lib/iris/_lazy_data.py | 41 ++++++++++++++++++--------- lib/iris/coords.py | 22 ++++++++++---- lib/iris/fileformats/pp_load_rules.py | 22 ++++++++------ 3 files changed, 58 insertions(+), 27 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index e43490e158..469601d433 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -10,7 +10,7 @@ """ -from functools import lru_cache, wraps +from functools import wraps import dask import dask.array as da @@ -47,7 +47,9 @@ def is_lazy_data(data): return result -def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): +def _optimum_chunksize_internals( + chunks, shape, limit=None, dtype=np.dtype("f4") +): """ Reduce or increase an initial chunk shape to get close to a chosen ideal size, while prioritising the splitting of the earlier (outer) dimensions @@ -83,17 +85,6 @@ def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): "chunks = [c[0] for c in normalise_chunks('auto', ...)]". """ - if isinstance(chunks, list): - chunks = tuple(chunks) - if isinstance(shape, list): - shape = tuple(shape) - return _optimum_chunksize_internals(chunks, shape, limit, dtype) - - -@lru_cache -def _optimum_chunksize_internals( - chunks, shape, limit=None, dtype=np.dtype("f4") -): # Set the chunksize limit. if limit is None: # Fetch the default 'optimal' chunksize from the dask config. 
@@ -157,6 +148,30 @@ def _optimum_chunksize_internals( return tuple(result) +_optimum_chunksize_cache = {} + + +@wraps(_optimum_chunksize_internals) +def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): + + key = tuple( + [ + tuple(chunks), + tuple(shape), + limit, + dtype, + dask.config.get("array.chunk-size"), + ] + ) + + if key not in _optimum_chunksize_cache: + _optimum_chunksize_cache[key] = _optimum_chunksize_internals( + chunks, shape, limit, dtype + ) + + return _optimum_chunksize_cache[key] + + def as_lazy_data(data, chunks=None, asarray=False): """ Convert the input array `data` to a dask array. diff --git a/lib/iris/coords.py b/lib/iris/coords.py index 063ca3e470..bca9de6dca 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2925,18 +2925,28 @@ def xml_element(self, doc): def dim_coord_from_regular(*args, **kwargs): + # Throughout this function we treat the coord_system specially as it is + # mutable and therefore not hashable. It's therefore cached under the key + # with the coord and checked separately before we believe we have a cache + # hit. coord_system = kwargs.pop("coord_system", None) key = (args, tuple(kwargs.items())) + + # Check for cache hit if key in _dim_coord_cache: cached_coord, cached_coord_system = _dim_coord_cache[key] if coord_system == cached_coord_system: return cached_coord.copy() - else: - new_coord = DimCoord.from_regular( - *args, coord_system=coord_system, **kwargs - ) - _dim_coord_cache[key] = (new_coord, coord_system) - return new_coord + + # Cache miss requires a new coord + new_coord = DimCoord.from_regular( + *args, coord_system=coord_system, **kwargs + ) + + # The versions in the cache should be copies so they can't be mutated + # outside of this function. + _dim_coord_cache[key] = (new_coord.copy(), coord_system.copy()) + return new_coord class AuxCoord(Coord): diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index cc11d400a8..9fae162554 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -9,7 +9,7 @@ # SciTools/iris-code-generators:tools/gen_rules.py import calendar -from functools import lru_cache +from functools import wraps import cf_units import numpy as np @@ -515,7 +515,7 @@ def _new_coord_and_dims( _HOURS_UNIT = cf_units.Unit("hours") -def _epoch_date_hours_base(epoch_hours_unit, datetime): +def _epoch_date_hours_internals(epoch_hours_unit, datetime): """ Return an 'hours since epoch' number for a date. 
@@ -590,14 +590,20 @@ def _epoch_date_hours_base(epoch_hours_unit, datetime): return epoch_hours -def _epoch_date_hours(*args, **kwargs): - try: - return _epoch_date_hours_cached(*args, **kwargs) - except TypeError: - return _epoch_date_hours_base(*args, **kwargs) +_epoch_date_hours_cache = {} + + +@wraps(_epoch_date_hours_internals) +def _epoch_date_hours(epoch_hours_unit, datetime): + key = (epoch_hours_unit, datetime) + + if key not in _epoch_date_hours_cache: + _epoch_date_hours_cache[key] = _epoch_date_hours_internals( + epoch_hours_unit, datetime + ) -_epoch_date_hours_cached = lru_cache(_epoch_date_hours_base) + return _epoch_date_hours_cache[key] def _convert_time_coords( From 6f9bcc017566e4dc8cf923d8c10a9327e3c1a8a1 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Wed, 27 Apr 2022 13:35:06 +0100 Subject: [PATCH 06/13] coord_system doesn't have 'copy' but __repr__ covers the whole thing (I think) --- lib/iris/coords.py | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/lib/iris/coords.py b/lib/iris/coords.py index bca9de6dca..9768cd2e6c 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2926,27 +2926,18 @@ def xml_element(self, doc): def dim_coord_from_regular(*args, **kwargs): # Throughout this function we treat the coord_system specially as it is - # mutable and therefore not hashable. It's therefore cached under the key - # with the coord and checked separately before we believe we have a cache - # hit. + # mutable and therefore not hashable. It is assumed to be identical to any + # coord_system with the same __repr__ output. coord_system = kwargs.pop("coord_system", None) - key = (args, tuple(kwargs.items())) + key = (args, tuple(kwargs.items()), repr(coord_system)) # Check for cache hit - if key in _dim_coord_cache: - cached_coord, cached_coord_system = _dim_coord_cache[key] - if coord_system == cached_coord_system: - return cached_coord.copy() - - # Cache miss requires a new coord - new_coord = DimCoord.from_regular( - *args, coord_system=coord_system, **kwargs - ) + if key not in _dim_coord_cache: + _dim_coord_cache[key] = DimCoord.from_regular( + *args, coord_system=coord_system, **kwargs + ) - # The versions in the cache should be copies so they can't be mutated - # outside of this function. - _dim_coord_cache[key] = (new_coord.copy(), coord_system.copy()) - return new_coord + return _dim_coord_cache[key].copy() class AuxCoord(Coord): From bade0a4f6232ce534c43783123982527c8cd281d Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 28 Apr 2022 12:14:49 +0100 Subject: [PATCH 07/13] Review fixes --- lib/iris/coord_systems.py | 9 +++++++++ lib/iris/fileformats/pp_load_rules.py | 2 ++ 2 files changed, 11 insertions(+) diff --git a/lib/iris/coord_systems.py b/lib/iris/coord_systems.py index 510aafcb48..44da6aeb6a 100644 --- a/lib/iris/coord_systems.py +++ b/lib/iris/coord_systems.py @@ -100,6 +100,15 @@ def _ellipsoid_to_globe(ellipsoid, globe_default): return globe + @abstractmethod + def __repr__(self): + """ + Return a string representing this coordinate system, such that the same + string implies the coordinate system is the same. 
+ + """ + pass + @abstractmethod def as_cartopy_crs(self): """ diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 9fae162554..0c3556f834 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -595,6 +595,8 @@ def _epoch_date_hours_internals(epoch_hours_unit, datetime): @wraps(_epoch_date_hours_internals) def _epoch_date_hours(epoch_hours_unit, datetime): + # Not using functools.lru_cache because it does an equality check that fails + # on datetime objects from different calendars. key = (epoch_hours_unit, datetime) From 3016e35d26dcbabd3e74319ece69a60d99502b68 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 28 Apr 2022 14:32:10 +0100 Subject: [PATCH 08/13] Revert coord_system changes --- lib/iris/_lazy_data.py | 41 +++++++++++---------------- lib/iris/coord_systems.py | 9 ------ lib/iris/coords.py | 19 ------------- lib/iris/fileformats/pp_load_rules.py | 14 ++++----- 4 files changed, 23 insertions(+), 60 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 469601d433..4defa229df 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -10,7 +10,7 @@ """ -from functools import wraps +from functools import lru_cache, wraps import dask import dask.array as da @@ -47,8 +47,13 @@ def is_lazy_data(data): return result +@lru_cache def _optimum_chunksize_internals( - chunks, shape, limit=None, dtype=np.dtype("f4") + chunks, + shape, + limit=None, + dtype=np.dtype("f4"), + dask_array_chunksize=dask.config.get("array.chunk-size"), ): """ Reduce or increase an initial chunk shape to get close to a chosen ideal @@ -57,9 +62,9 @@ def _optimum_chunksize_internals( Args: - * chunks (iterable of int, or None): + * chunks (tuple of int, or None): Pre-existing chunk shape of the target data : None if unknown. - * shape (iterable of int): + * shape (tuple of int): The full array shape of the target data. * limit (int): The 'ideal' target chunk size, in bytes. Default from dask.config. @@ -88,7 +93,7 @@ def _optimum_chunksize_internals( # Set the chunksize limit. if limit is None: # Fetch the default 'optimal' chunksize from the dask config. - limit = dask.config.get("array.chunk-size") + limit = dask_array_chunksize # Convert to bytes limit = dask.utils.parse_bytes(limit) @@ -148,28 +153,14 @@ def _optimum_chunksize_internals( return tuple(result) -_optimum_chunksize_cache = {} - - @wraps(_optimum_chunksize_internals) -def _optimum_chunksize(chunks, shape, limit=None, dtype=np.dtype("f4")): - - key = tuple( - [ - tuple(chunks), - tuple(shape), - limit, - dtype, - dask.config.get("array.chunk-size"), - ] - ) +def _optimum_chunksize(*args, **kwargs): - if key not in _optimum_chunksize_cache: - _optimum_chunksize_cache[key] = _optimum_chunksize_internals( - chunks, shape, limit, dtype - ) - - return _optimum_chunksize_cache[key] + return _optimum_chunksize_internals( + *args, + dask_array_chunksize=dask.config.get("array.chunk-size"), + **kwargs, + ) def as_lazy_data(data, chunks=None, asarray=False): diff --git a/lib/iris/coord_systems.py b/lib/iris/coord_systems.py index 44da6aeb6a..510aafcb48 100644 --- a/lib/iris/coord_systems.py +++ b/lib/iris/coord_systems.py @@ -100,15 +100,6 @@ def _ellipsoid_to_globe(ellipsoid, globe_default): return globe - @abstractmethod - def __repr__(self): - """ - Return a string representing this coordinate system, such that the same - string implies the coordinate system is the same. 
- - """ - pass - @abstractmethod def as_cartopy_crs(self): """ diff --git a/lib/iris/coords.py b/lib/iris/coords.py index 9768cd2e6c..0a1aecb983 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2921,25 +2921,6 @@ def xml_element(self, doc): return element -_dim_coord_cache = {} - - -def dim_coord_from_regular(*args, **kwargs): - # Throughout this function we treat the coord_system specially as it is - # mutable and therefore not hashable. It is assumed to be identical to any - # coord_system with the same __repr__ output. - coord_system = kwargs.pop("coord_system", None) - key = (args, tuple(kwargs.items()), repr(coord_system)) - - # Check for cache hit - if key not in _dim_coord_cache: - _dim_coord_cache[key] = DimCoord.from_regular( - *args, coord_system=coord_system, **kwargs - ) - - return _dim_coord_cache[key].copy() - - class AuxCoord(Coord): """ A CF auxiliary coordinate. diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 0c3556f834..8f0ab78d84 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -15,7 +15,7 @@ import numpy as np from iris.aux_factory import HybridHeightFactory, HybridPressureFactory -from iris.coords import AuxCoord, CellMethod, DimCoord, dim_coord_from_regular +from iris.coords import AuxCoord, CellMethod, DimCoord from iris.fileformats._pp_lbproc_pairs import LBPROC_MAP from iris.fileformats.rules import ( ConversionMetadata, @@ -1139,7 +1139,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1160,7 +1160,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1182,7 +1182,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzy, f.bdy, f.lbrow, @@ -1202,7 +1202,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzy, f.bdy, f.lbrow, @@ -1289,7 +1289,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzx, f.bdx, f.lbnpt, @@ -1399,7 +1399,7 @@ def _all_other_rules(f): ): dim_coords_and_dims.append( ( - dim_coord_from_regular( + DimCoord.from_regular( f.bzx, f.bdx, f.lbnpt, long_name="site_number", units="1" ), 1, From ef7de0f1aebc1da55427f1980928b43f729b8836 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 28 Apr 2022 15:08:07 +0100 Subject: [PATCH 09/13] Still need to handle tuple issue --- lib/iris/_lazy_data.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 4defa229df..d01969e902 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -154,12 +154,20 @@ def _optimum_chunksize_internals( @wraps(_optimum_chunksize_internals) -def _optimum_chunksize(*args, **kwargs): +def _optimum_chunksize( + chunks, + shape, + limit=None, + dtype=np.dtype("f4"), + dask_array_chunksize=dask.config.get("array.chunk-size"), +): return _optimum_chunksize_internals( - *args, + tuple(chunks), + tuple(shape), + limit=None, + dtype=np.dtype("f4"), dask_array_chunksize=dask.config.get("array.chunk-size"), - **kwargs, ) From a6405c51adfdabd5fc0bba0f8516a36839f3fb33 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 28 Apr 2022 15:10:54 +0100 Subject: [PATCH 10/13] What's new --- docs/src/whatsnew/latest.rst | 3 +++ 1 
file changed, 3 insertions(+) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 8c3a6455d7..18e41c3c3f 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -92,6 +92,9 @@ This document explains the changes made to Iris for this release #. `@wjbenfold`_ added caching to the calculation of the points array in a :class:`~iris.coords.DimCoord` created using :meth:`~iris.coords.DimCoord.from_regular`. (:pull:`4698`) +#. `@wjbenfold`_ introduced caching in :func:`_lazy_data._optimum_chunksize` and + :func:`iris.fileformats.pp_load_rules._epoch_date_hours` to reduce time spent + repeating calculations. (:pull:`4716`) 🔥 Deprecations From 580aa71239da89bd31dbfa258f0bfd67c3c1f7b6 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 28 Apr 2022 16:20:07 +0100 Subject: [PATCH 11/13] Fix prompted by test failure --- lib/iris/_lazy_data.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index d01969e902..78ccb5f289 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -159,14 +159,13 @@ def _optimum_chunksize( shape, limit=None, dtype=np.dtype("f4"), - dask_array_chunksize=dask.config.get("array.chunk-size"), ): return _optimum_chunksize_internals( tuple(chunks), tuple(shape), - limit=None, - dtype=np.dtype("f4"), + limit=limit, + dtype=dtype, dask_array_chunksize=dask.config.get("array.chunk-size"), ) From 46965984fbf3e88f4fc2bb973b98f5bfc64156cb Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 5 May 2022 17:03:31 +0100 Subject: [PATCH 12/13] Limit cache size --- lib/iris/_lazy_data.py | 4 +++- lib/iris/fileformats/pp_load_rules.py | 6 ++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 78ccb5f289..ac7ae34511 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -160,7 +160,9 @@ def _optimum_chunksize( limit=None, dtype=np.dtype("f4"), ): - + # By providing dask_array_chunksize as an argument, we make it so that the + # output of _optimum_chunksize_internals depends only on its arguments (and + # thus we can use lru_cache) return _optimum_chunksize_internals( tuple(chunks), tuple(shape), diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 8f0ab78d84..3df97417cd 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -591,6 +591,7 @@ def _epoch_date_hours_internals(epoch_hours_unit, datetime): _epoch_date_hours_cache = {} +_epoch_date_hours_cache_max_size = 128 # lru_cache default @wraps(_epoch_date_hours_internals) @@ -605,6 +606,11 @@ def _epoch_date_hours(epoch_hours_unit, datetime): epoch_hours_unit, datetime ) + # Limit cache size + while len(_epoch_date_hours_cache) > _epoch_date_hours_cache_max_size: + oldest_item = next(iter(_epoch_date_hours_cache)) + _epoch_date_hours_cache.pop(oldest_item, None) + return _epoch_date_hours_cache[key] From 2a791dd3798281e0bb96217b391a49568053ab75 Mon Sep 17 00:00:00 2001 From: Will Benfold Date: Thu, 5 May 2022 17:39:36 +0100 Subject: [PATCH 13/13] Work around comparable datetime issue --- lib/iris/fileformats/pp_load_rules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iris/fileformats/pp_load_rules.py b/lib/iris/fileformats/pp_load_rules.py index 3df97417cd..c23772f235 100644 --- a/lib/iris/fileformats/pp_load_rules.py +++ b/lib/iris/fileformats/pp_load_rules.py @@ -599,7 +599,7 @@ def 
_epoch_date_hours(epoch_hours_unit, datetime): # Not using functools.lru_cache because it does an equality check that fails # on datetime objects from different calendars. - key = (epoch_hours_unit, datetime) + key = (epoch_hours_unit, hash(datetime)) if key not in _epoch_date_hours_cache: _epoch_date_hours_cache[key] = _epoch_date_hours_internals(
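
Patches 12 and 13 together replace the lru_cache approach for _epoch_date_hours with a hand-rolled dict cache: the key uses hash(datetime) so the lookup never compares datetimes from different calendars, and the dict is trimmed back to the lru_cache default size. A minimal standalone sketch of that pattern follows; the names hours_since_epoch/_hours_since_epoch_internals and the trivial internals body are invented for illustration, not the Iris functions themselves.

    import datetime
    from functools import wraps

    _cache = {}
    _cache_max_size = 128  # same default size as functools.lru_cache


    def _hours_since_epoch_internals(unit, date):
        # Stand-in for the real, comparatively expensive conversion.
        return (unit, date)


    @wraps(_hours_since_epoch_internals)
    def hours_since_epoch(unit, date):
        # Key on hash(date) rather than the datetime itself, so the cache
        # lookup never needs to compare datetimes from different calendars.
        key = (unit, hash(date))
        if key not in _cache:
            _cache[key] = _hours_since_epoch_internals(unit, date)
            # Dicts preserve insertion order, so popping the first key evicts
            # the oldest entry; the freshly added key is last and survives.
            while len(_cache) > _cache_max_size:
                _cache.pop(next(iter(_cache)), None)
        return _cache[key]


    hours_since_epoch("hours", datetime.datetime(2022, 4, 26))
    hours_since_epoch("hours", datetime.datetime(2022, 4, 26))  # served from _cache
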