From 9f83a9a65bc1cc34f2196370602e6b539aa7fe7f Mon Sep 17 00:00:00 2001 From: James Frost Date: Sat, 1 Mar 2025 16:12:29 +0000 Subject: [PATCH 01/90] Correct unit to units in AuxCoord docstring (#6348) * Correct unit to units in AuxCoord docstring Fixes #6347 * Add whatsnew entry --- docs/src/whatsnew/latest.rst | 6 +++++- lib/iris/coords.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 8bc63d49a2..32c97b9cac 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -116,7 +116,10 @@ This document explains the changes made to Iris for this release #. `@ESadek-MO`_ and `@trexfeathers`_ created a style guide for ``pytest`` tests, and consolidated ``Test Categories`` and ``Testing Tools`` into - :ref:`contributing_tests` (:issue:`5574`, :pull:`5785`) + :ref:`contributing_tests`. (:issue:`5574`, :pull:`5785`) + +#. `@jfrost-mo`_ corrected ``unit`` to ``units`` in the docstring for + :class:`iris.coords.AuxCoord`. (:issue:`6347`, :pull:`6348`) 💼 Internal @@ -148,6 +151,7 @@ This document explains the changes made to Iris for this release core dev names are automatically included by the common_links.inc: .. _@fnattino: https://github.com/fnattino +.. _@jfrost-mo: https://github.com/jfrost-mo .. _@jrackham-mo: https://github.com/jrackham-mo .. _@stefsmeets: https://github.com/stefsmeets .. _@valeriupredoi: https://github.com/valeriupredoi diff --git a/lib/iris/coords.py b/lib/iris/coords.py index 029d2c603e..bc0991565c 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -2956,7 +2956,7 @@ def __init__(self, *args, **kwargs): Descriptive name of the coordinate. var_name : optional The netCDF variable name for the coordinate. - unit : :class:`~cf_units.Unit`, optional + units : :class:`~cf_units.Unit`, optional The :class:`~cf_units.Unit` of the coordinate's values. Can be a string, which will be converted to a Unit object. bounds : optional From 409b1d16c4fa38b63ece33a61e007118e4f13276 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Thu, 6 Mar 2025 06:16:28 +0000 Subject: [PATCH 02/90] Bump scitools/workflows from 2025.02.3 to 2025.03.1 (#6357) Bumps [scitools/workflows](https://github.com/scitools/workflows) from 2025.02.3 to 2025.03.1. - [Release notes](https://github.com/scitools/workflows/releases) - [Commits](https://github.com/scitools/workflows/compare/2025.02.3...2025.03.1) --- updated-dependencies: - dependency-name: scitools/workflows dependency-type: direct:production ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci-manifest.yml | 2 +- .github/workflows/refresh-lockfiles.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci-manifest.yml b/.github/workflows/ci-manifest.yml index 430977af34..31de8f3b24 100644 --- a/.github/workflows/ci-manifest.yml +++ b/.github/workflows/ci-manifest.yml @@ -23,4 +23,4 @@ concurrency: jobs: manifest: name: "check-manifest" - uses: scitools/workflows/.github/workflows/ci-manifest.yml@2025.02.3 + uses: scitools/workflows/.github/workflows/ci-manifest.yml@2025.03.1 diff --git a/.github/workflows/refresh-lockfiles.yml b/.github/workflows/refresh-lockfiles.yml index ef8cae8322..6b6de39e06 100644 --- a/.github/workflows/refresh-lockfiles.yml +++ b/.github/workflows/refresh-lockfiles.yml @@ -14,5 +14,5 @@ on: jobs: refresh_lockfiles: - uses: scitools/workflows/.github/workflows/refresh-lockfiles.yml@2025.02.3 + uses: scitools/workflows/.github/workflows/refresh-lockfiles.yml@2025.03.1 secrets: inherit From 9fede7c667b71239184a6415e8e403c07531c4ba Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 6 Mar 2025 15:51:05 +0100 Subject: [PATCH 03/90] Use realized data in benchmarks when requested (#6339) * Use realized data in benchmarks when requested * Add whatsnew --- benchmarks/benchmarks/generate_data/__init__.py | 9 ++++++--- benchmarks/benchmarks/generate_data/stock.py | 3 ++- docs/src/whatsnew/latest.rst | 3 +++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/benchmarks/benchmarks/generate_data/__init__.py b/benchmarks/benchmarks/generate_data/__init__.py index bb53e26b2f..3366bec6e0 100644 --- a/benchmarks/benchmarks/generate_data/__init__.py +++ b/benchmarks/benchmarks/generate_data/__init__.py @@ -106,11 +106,14 @@ def load_realised(): file loading, but some benchmarks are only meaningful if starting with real arrays. """ + from iris.fileformats._nc_load_rules import helpers from iris.fileformats.netcdf.loader import _get_cf_var_data as pre_patched def patched(cf_var, filename): return as_concrete_data(pre_patched(cf_var, filename)) - netcdf._get_cf_var_data = patched - yield netcdf - netcdf._get_cf_var_data = pre_patched + netcdf.loader._get_cf_var_data = patched + helpers._get_cf_var_data = patched + yield + netcdf.loader._get_cf_var_data = pre_patched + helpers._get_cf_var_data = pre_patched diff --git a/benchmarks/benchmarks/generate_data/stock.py b/benchmarks/benchmarks/generate_data/stock.py index 04698e8ff5..47014078e7 100644 --- a/benchmarks/benchmarks/generate_data/stock.py +++ b/benchmarks/benchmarks/generate_data/stock.py @@ -162,7 +162,8 @@ def realistic_4d_w_everything(w_mesh=False, lazy=False) -> iris.cube.Cube: lazy : bool If True, the Cube will be returned with all arrays as they would normally be loaded from file (i.e. most will still be lazy Dask - arrays). If False, all arrays will be realised NumPy arrays. + arrays). If False, all arrays (except derived coordinates) will be + realised NumPy arrays. """ diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 32c97b9cac..833488304e 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -145,6 +145,9 @@ This document explains the changes made to Iris for this release #. `@trexfeathers`_ temporarily pinned Sphinx to `<8.2`. (:pull:`6344`, :issue:`6345`) +#. `@bouweandela`_ fixed a bug in the benchmarking code that caused all benchmarks + to be run with lazy data. 
(:pull:`6339`) + .. comment Whatsnew author names (@github name) in alphabetical order. Note that, From c6f65a25bb3c31d4da0bfb5ff372831ad92509c3 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 7 Mar 2025 01:56:14 +0100 Subject: [PATCH 04/90] Make array equal work with masked Dask arrays and add tests (#6325) * Fix issue where the mask of Dask arrays was ignored and improve tests * Optimize unmasked numpy arrays * Add whatsnew * Use dask.array.blockwise for array comparison * Faster comparison * Avoid flattening arrays * Avoid checking points if one coord has bounds and the other does not have them * Small simplification * Add test * Use a separate map and reduce operation to avoid running out of memory on large arrays * Correct order of checking if array is floating point dtype Also consider non-floating point arrays equal with withnans=False --------- Co-authored-by: Patrick Peglar --- docs/src/whatsnew/latest.rst | 3 + lib/iris/coords.py | 11 +- .../tests/unit/concatenate/test_hashing.py | 16 + lib/iris/tests/unit/util/test_array_equal.py | 307 +++++++++++------- lib/iris/util.py | 91 +++++- 5 files changed, 280 insertions(+), 148 deletions(-) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 833488304e..43fd95e49a 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -65,6 +65,9 @@ This document explains the changes made to Iris for this release older NetCDF formats e.g. ``NETCDF4_CLASSIC`` support a maximum precision of 32-bit. (:issue:`6178`, :pull:`6343`) +#. `@bouweandela`_ fixed handling of masked Dask arrays in + :func:`~iris.util.array_equal`. + 💣 Incompatible Changes ======================= diff --git a/lib/iris/coords.py b/lib/iris/coords.py index bc0991565c..ca73dcb729 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -589,21 +589,22 @@ def __eq__(self, other): if hasattr(other, "metadata"): # metadata comparison eq = self.metadata == other.metadata + + # Also consider bounds, if we have them. + # (N.B. though only Coords can ever actually *have* bounds). + if eq and eq is not NotImplemented: + eq = self.has_bounds() is other.has_bounds() + # data values comparison if eq and eq is not NotImplemented: eq = iris.util.array_equal( self._core_values(), other._core_values(), withnans=True ) - - # Also consider bounds, if we have them. - # (N.B. though only Coords can ever actually *have* bounds). 
if eq and eq is not NotImplemented: if self.has_bounds() and other.has_bounds(): eq = iris.util.array_equal( self.core_bounds(), other.core_bounds(), withnans=True ) - else: - eq = not self.has_bounds() and not other.has_bounds() return eq diff --git a/lib/iris/tests/unit/concatenate/test_hashing.py b/lib/iris/tests/unit/concatenate/test_hashing.py index 24062a2af3..88064e4e46 100644 --- a/lib/iris/tests/unit/concatenate/test_hashing.py +++ b/lib/iris/tests/unit/concatenate/test_hashing.py @@ -9,6 +9,8 @@ import pytest from iris import _concatenate +from iris.tests.unit.util.test_array_equal import TEST_CASES +from iris.util import array_equal @pytest.mark.parametrize( @@ -75,6 +77,20 @@ def test_compute_hashes(a, b, eq): assert eq == (hashes["a"] == hashes["b"]) +@pytest.mark.parametrize( + "a,b", + [ + (a, b) + for (a, b, withnans, eq) in TEST_CASES + if isinstance(a, np.ndarray | da.Array) and isinstance(b, np.ndarray | da.Array) + ], +) +def test_compute_hashes_vs_array_equal(a, b): + """Test that hashing give the same answer as `array_equal(withnans=True)`.""" + hashes = _concatenate._compute_hashes({"a": a, "b": b}) + assert array_equal(a, b, withnans=True) == (hashes["a"] == hashes["b"]) + + def test_arrayhash_equal_incompatible_chunks_raises(): hash1 = _concatenate._ArrayHash(1, chunks=((1, 1),)) hash2 = _concatenate._ArrayHash(1, chunks=((2,),)) diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py index 3e1aaf1bfb..eafe123aed 100644 --- a/lib/iris/tests/unit/util/test_array_equal.py +++ b/lib/iris/tests/unit/util/test_array_equal.py @@ -4,133 +4,190 @@ # See LICENSE in the root of the repository for full licensing details. """Test function :func:`iris.util.array_equal`.""" +import dask.array as da import numpy as np import numpy.ma as ma +import pytest from iris.util import array_equal - -class Test: - def test_0d(self): - array_a = np.array(23) - array_b = np.array(23) - array_c = np.array(7) - assert array_equal(array_a, array_b) - assert not array_equal(array_a, array_c) - - def test_0d_and_scalar(self): - array_a = np.array(23) - assert array_equal(array_a, 23) - assert not array_equal(array_a, 45) - - def test_1d_and_sequences(self): - for sequence_type in (list, tuple): - seq_a = sequence_type([1, 2, 3]) - array_a = np.array(seq_a) - assert array_equal(array_a, seq_a) - assert not array_equal(array_a, seq_a[:-1]) - array_a[1] = 45 - assert not array_equal(array_a, seq_a) - - def test_nd(self): - array_a = np.array(np.arange(24).reshape(2, 3, 4)) - array_b = np.array(np.arange(24).reshape(2, 3, 4)) - array_c = np.array(np.arange(24).reshape(2, 3, 4)) - array_c[0, 1, 2] = 100 - assert array_equal(array_a, array_b) - assert not array_equal(array_a, array_c) - - def test_masked_is_not_ignored(self): - array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1]) - array_b = ma.masked_array([2, 2, 2], mask=[1, 0, 1]) - assert array_equal(array_a, array_b) - - def test_masked_is_different(self): - array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1]) - array_b = ma.masked_array([1, 2, 3], mask=[0, 0, 1]) - assert not array_equal(array_a, array_b) - - def test_masked_isnt_unmasked(self): - array_a = np.array([1, 2, 2]) - array_b = ma.masked_array([1, 2, 2], mask=[0, 0, 1]) - assert not array_equal(array_a, array_b) - - def test_masked_unmasked_equivelance(self): - array_a = np.array([1, 2, 2]) - array_b = ma.masked_array([1, 2, 2]) - array_c = ma.masked_array([1, 2, 2], mask=[0, 0, 0]) - assert array_equal(array_a, array_b) - assert 
array_equal(array_a, array_c) - - def test_fully_masked_arrays(self): - array_a = ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True) - array_b = ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True) - assert array_equal(array_a, array_b) - - def test_fully_masked_0d_arrays(self): - array_a = ma.masked_array(3, mask=True) - array_b = ma.masked_array(3, mask=True) - assert array_equal(array_a, array_b) - - def test_fully_masked_string_arrays(self): - array_a = ma.masked_array(["a", "b", "c"], mask=True) - array_b = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]) - assert array_equal(array_a, array_b) - - def test_partially_masked_string_arrays(self): - array_a = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]) - array_b = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]) - assert array_equal(array_a, array_b) - - def test_string_arrays_equal(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "def", "efg"]) - assert array_equal(array_a, array_b) - - def test_string_arrays_different_contents(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "de", "efg"]) - assert not array_equal(array_a, array_b) - - def test_string_arrays_subset(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "def"]) - assert not array_equal(array_a, array_b) - assert not array_equal(array_b, array_a) - - def test_string_arrays_unequal_dimensionality(self): - array_a = np.array("abc") - array_b = np.array(["abc"]) - array_c = np.array([["abc"]]) - assert not array_equal(array_a, array_b) - assert not array_equal(array_b, array_a) - assert not array_equal(array_a, array_c) - assert not array_equal(array_b, array_c) - - def test_string_arrays_0d_and_scalar(self): - array_a = np.array("foobar") - assert array_equal(array_a, "foobar") - assert not array_equal(array_a, "foo") - assert not array_equal(array_a, "foobar.") - - def test_nan_equality_nan_ne_nan(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = array_a.copy() - assert not array_equal(array_a, array_a) - assert not array_equal(array_a, array_b) - - def test_nan_equality_nan_naneq_nan(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - assert array_equal(array_a, array_a, withnans=True) - assert array_equal(array_a, array_b, withnans=True) - - def test_nan_equality_nan_nanne_a(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, 0.0, 3.0]) - assert not array_equal(array_a, array_b, withnans=True) - - def test_nan_equality_a_nanne_b(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, np.nan, 4.0]) - assert not array_equal(array_a, array_b, withnans=True) +ARRAY1 = np.array(np.arange(24).reshape(2, 3, 4)) +ARRAY1[0, 1, 2] = 100 + +ARRAY2 = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) + +TEST_CASES = [ + # test empty + (np.array([]), np.array([]), False, True), + (np.array([]), np.array([], dtype=np.float64), True, True), + # test 0d + (np.array(23), np.array(23), False, True), + (np.array(23), np.array(7), False, False), + # test 0d and scalar + (np.array(23), 23, False, True), + (np.array(23), 45, False, False), + # test 1d and sequences + (np.array([1, 2, 3]), [1, 2, 3], False, True), + (np.array([1, 2, 3]), [1, 2], False, False), + (np.array([1, 45, 3]), [1, 2, 3], False, False), + (np.array([1, 2, 3]), (1, 2, 3), False, True), + (np.array([1, 2, 3]), (1, 2), False, False), + (np.array([1, 
45, 3]), (1, 2, 3), False, False), + # test 3d + ( + np.array(np.arange(24).reshape(2, 3, 4)), + np.array(np.arange(24).reshape(2, 3, 4)), + False, + True, + ), + ( + np.array(np.arange(24).reshape(2, 3, 4)), + ARRAY1, + False, + False, + ), + # test masked is not ignored + ( + ma.masked_array([1, 2, 3], mask=[1, 0, 1]), + ma.masked_array([2, 2, 2], mask=[1, 0, 1]), + False, + True, + ), + # test masked is different + ( + ma.masked_array([1, 2, 3], mask=[1, 0, 1]), + ma.masked_array([1, 2, 3], mask=[0, 0, 1]), + False, + False, + ), + # test masked isn't unmasked + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2], mask=[0, 0, 1]), + False, + False, + ), + ( + ma.masked_array([1, 2, 2], mask=[0, 0, 1]), + ma.masked_array([1, 2, 2]), + False, + False, + ), + ( + np.array([1, 2]), + ma.masked_array([1, 3], mask=[0, 1]), + False, + False, + ), + # test masked/unmasked_equivalence + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2]), + False, + True, + ), + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2], mask=[0, 0, 0]), + False, + True, + ), + # test fully masked arrays + ( + ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True), + ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True), + False, + True, + ), + # test fully masked 0d arrays + ( + ma.masked_array(3, mask=True), + ma.masked_array(3, mask=True), + False, + True, + ), + # test fully masked string arrays + ( + ma.masked_array(["a", "b", "c"], mask=True), + ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]), + False, + True, + ), + # test partially masked string arrays + ( + ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]), + ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]), + False, + True, + ), + # test string arrays equal + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "def", "efg"]), + False, + True, + ), + # test string arrays different contents + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "de", "efg"]), + False, + False, + ), + # test string arrays subset + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "def"]), + False, + False, + ), + ( + np.array(["abc", "def"]), + np.array(["abc", "def", "efg"]), + False, + False, + ), + # test string arrays unequal dimensionality + (np.array("abc"), np.array(["abc"]), False, False), + (np.array(["abc"]), np.array("abc"), False, False), + (np.array("abc"), np.array([["abc"]]), False, False), + (np.array(["abc"]), np.array([["abc"]]), False, False), + # test string arrays 0d and scalar + (np.array("foobar"), "foobar", False, True), + (np.array("foobar"), "foo", False, False), + (np.array("foobar"), "foobar.", False, False), + # test nan equality nan ne nan + (ARRAY2, ARRAY2, False, False), + (ARRAY2, ARRAY2.copy(), False, False), + # test nan equality nan naneq nan + (ARRAY2, ARRAY2, True, True), + (ARRAY2, ARRAY2.copy(), True, True), + # test nan equality nan nanne a + ( + np.array([1.0, np.nan, 2.0, np.nan, 3.0]), + np.array([1.0, np.nan, 2.0, 0.0, 3.0]), + True, + False, + ), + # test nan equality a nanne b + ( + np.array([1.0, np.nan, 2.0, np.nan, 3.0]), + np.array([1.0, np.nan, 2.0, np.nan, 4.0]), + True, + False, + ), +] + + +@pytest.mark.parametrize("lazy", [False, True]) +@pytest.mark.parametrize("array_a,array_b,withnans,eq", TEST_CASES) +def test_array_equal(array_a, array_b, withnans, eq, lazy): + if lazy: + identical = array_a is array_b + if isinstance(array_a, np.ndarray): + array_a = da.asarray(array_a, chunks=2) + if isinstance(array_b, np.ndarray): + array_b = da.asarray(array_b, chunks=1) + if identical: + array_b = 
array_a + assert eq == array_equal(array_a, array_b, withnans=withnans) diff --git a/lib/iris/util.py b/lib/iris/util.py index 94cb077a2f..fef83b4c94 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -387,14 +387,58 @@ def _rolling_window(array): return rw -def array_equal(array1, array2, withnans=False): +def _masked_array_equal( + array1: np.ndarray, + array2: np.ndarray, + equal_nan: bool, +) -> np.ndarray: + """Return whether two, possibly masked, arrays are equal.""" + mask1 = ma.getmask(array1) + mask2 = ma.getmask(array2) + + # Compare mask equality. + if mask1 is ma.nomask and mask2 is ma.nomask: + eq = True + elif mask1 is ma.nomask: + eq = not mask2.any() + elif mask2 is ma.nomask: + eq = not mask1.any() + else: + eq = np.array_equal(mask1, mask2) + + if not eq: + eqs = np.zeros(array1.shape, dtype=bool) + else: + # Compare data equality. + if not (mask1 is ma.nomask or mask2 is ma.nomask): + # Ignore masked data. + ignore = mask1 + else: + ignore = None + + if equal_nan: + # Ignore data that is np.nan in both arrays. + nanmask = np.isnan(array1) & np.isnan(array2) + if ignore is None: + ignore = nanmask + else: + ignore |= nanmask + + eqs = ma.getdata(array1) == ma.getdata(array2) + if ignore is not None: + eqs = np.where(ignore, True, eqs) + + return eqs + + +def array_equal(array1, array2, withnans: bool = False) -> bool: """Return whether two arrays have the same shape and elements. Parameters ---------- array1, array2 : arraylike Args to be compared, normalised if necessary with :func:`np.asarray`. - withnans : bool, default=False + withnans : default=False When unset (default), the result is False if either input contains NaN points. This is the normal floating-point arithmetic result. When set, return True if inputs contain the same value in all elements, @@ -409,31 +453,42 @@ def array_equal(array1, array2, withnans=False): This function maintains laziness when called; it does not realise data. See more at :doc:`/userguide/real_and_lazy_data`. """ - if withnans and (array1 is array2): - return True def normalise_array(array): - if not is_lazy_data(array): - if not ma.isMaskedArray(array): - array = np.asanyarray(array) + if not isinstance(array, np.ndarray | da.Array): + array = np.asanyarray(array) return array array1, array2 = normalise_array(array1), normalise_array(array2) + floating_point_arrays = array1.dtype.kind == "f" or array2.dtype.kind == "f" + if (array1 is array2) and (withnans or not floating_point_arrays): + return True + + if not floating_point_arrays: + withnans = False + eq = array1.shape == array2.shape if eq: - array1_masked = ma.is_masked(array1) - eq = array1_masked == ma.is_masked(array2) - if eq and array1_masked: - eq = np.array_equal(ma.getmaskarray(array1), ma.getmaskarray(array2)) - if eq: - eqs = array1 == array2 - if withnans and (array1.dtype.kind == "f" or array2.dtype.kind == "f"): - eqs = np.where(np.isnan(array1) & np.isnan(array2), True, eqs) - eq = np.all(eqs) - eq = bool(eq) or eq is ma.masked + if is_lazy_data(array1) or is_lazy_data(array2): + # Use a separate map and reduce operation to avoid running out of memory. 
+ ndim = array1.ndim + indices = tuple(range(ndim)) + eq = da.blockwise( + _masked_array_equal, + indices, + array1, + indices, + array2, + indices, + dtype=bool, + meta=np.empty((0,) * ndim, dtype=bool), + equal_nan=withnans, + ).all() + else: + eq = _masked_array_equal(array1, array2, equal_nan=withnans).all() - return eq + return bool(eq) def approx_equal(a, b, max_absolute_error=1e-10, max_relative_error=1e-10): From 0ae0d49b38e8de50a5bc0acf219b192a68d421a4 Mon Sep 17 00:00:00 2001 From: Chris Bunney <48915820+ukmo-ccbunney@users.noreply.github.com> Date: Fri, 7 Mar 2025 09:40:28 +0000 Subject: [PATCH 05/90] Handle NetCDF variable length strings (and other VLen types) (#6340) * Initial workaround - if variable is a "str" type then force it to load. Would be better to force all Variable Length types to be lazy, but can't ascertain this information from the CFAuxiliaryCoordinateVariable instance. * Added some TODO comments * Working solution that checks for VLEN arrays and handles the special variable length "str" case. * Formatting updates * Added size hinting of variable length arrays using CHUNK_CONTROL context manager * Access netCDF variable `datatype` via `cf_var.cf_data` to avoid Mock failures. * Make check on cf_var.cf_data.datatype optional as it assumes underlying storage is netCDF (which for this module is true, but for mock testing is not) * Added warning category * Added unit tests for VLen arrays. * Use 'safe-access' version of netCDF4.VLType to satisfy coding-standards checker. * Updates to comments and added _MEAN_VL_ARRAY_LEN as module variable * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Restored accidentally removed comment * Updated docstring and added docs * Updated Whats New * Fixed some typos in docs * Fixed doctest indent * Missing imports for doctest * Rewording of docs * Missing black lines in doctest * Update lockfiles with iris-sample-data * Better URL for variable length types [Review comment] * Slight rewording of docs and fix some typos [Review comments] * reworded hinting section * Typo --------- Co-authored-by: Patrick Peglar Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/src/further_topics/netcdf_io.rst | 55 +++++++++++++++++ docs/src/whatsnew/latest.rst | 3 + .../fileformats/netcdf/_thread_safe_nc.py | 1 + lib/iris/fileformats/netcdf/loader.py | 61 +++++++++++++++++-- .../netcdf/loader/test__get_cf_var_data.py | 49 ++++++++++++++- requirements/locks/py311-linux-64.lock | 2 +- requirements/locks/py312-linux-64.lock | 2 +- requirements/locks/py313-linux-64.lock | 2 +- 8 files changed, 166 insertions(+), 9 deletions(-) diff --git a/docs/src/further_topics/netcdf_io.rst b/docs/src/further_topics/netcdf_io.rst index 4e1c32b22f..682918d5f4 100644 --- a/docs/src/further_topics/netcdf_io.rst +++ b/docs/src/further_topics/netcdf_io.rst @@ -122,6 +122,61 @@ Iris' optimisation all together, and will take its chunksizes from Dask's behavi (70, 37, 49) +Variable-length datatypes +------------------------- + +The NetCDF4 module provides support for variable-length (or "ragged") data +types (``VLType``); see +`Variable-length data types `_ + +The ``VLType`` allows for storing data where the length of the data in each array element +can vary. When ``VLType`` arrays are loaded into Iris cubes (or numpy), they are stored +as an array of ``Object`` types - essentially an array-of-arrays, rather than a single +multi-dimensional array. 
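+
+As a rough illustration (a hypothetical sketch, not taken from any real file), a
+variable-length variable whose three elements have lengths 2, 1 and 3 would load
+as an object-dtype array along these lines::
+
+    import numpy as np
+
+    # Build an "array of arrays": each element is itself a (differently sized) array.
+    ragged = np.empty(3, dtype=object)
+    ragged[0] = np.array([1.0, 2.0])
+    ragged[1] = np.array([3.0])
+    ragged[2] = np.array([4.0, 5.0, 6.0])
+    # ragged.shape == (3,) and ragged.dtype == object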
+ +The most likely case to encounter variable-length data types is when an array of +strings (not characters) are stored in a NetCDF file. As the string length for any +particular array element can vary the values are stored as an array of ``VLType``. + +As each element of a variable-length array is stored as a ``VLType`` containing +an unknown number of vales, the total size of a variable-length NetCDF array +cannot be known without first loading the data. This makes it difficult for +Iris to make an informed decision on whether to the load the data lazily or not. +The user can aid this decision using *VLType size hinting* described below. + +VLType size hinting +^^^^^^^^^^^^^^^^^^^ + +If the user has some *a priori* knowledge of the average length of the data in +variable-length ``VLType``, this can be provided as a hint to Iris via the +``CHUNK_CONTROL`` context manager and the special ``_vl_hint`` keyword +targeting the variable, e.g. ``CHUNK_CONTROL.set("varname", _vl_hint=5)``. +This allows Iris to make a more informed decision on whether to load the +data lazily. + +For example, consider a netCDF file with an auxiliary coordinate +``experiment_version`` that is stored as a variable-length string type. By +default, Iris will attempt to guess the total array size based on the known +dimension sizes (``time=150`` in this example) and load the data lazily. +However, if it is known prior to loading the file that the strings are all no +longer than 5 characters this information can be passed to the Iris NetCDF +loader so it can be make a more informed decision on lazy loading: + +.. doctest:: + + >>> import iris + >>> from iris.fileformats.netcdf.loader import CHUNK_CONTROL + >>> + >>> sample_file = iris.sample_data_path("vlstr_type.nc") + >>> cube = iris.load_cube(sample_file) + >>> print(cube.coord('experiment_version').has_lazy_points()) + True + >>> with CHUNK_CONTROL.set("expver", _vl_hint=5): + ... cube = iris.load_cube(sample_file) + >>> print(cube.coord('experiment_version').has_lazy_points()) + False + + Split Attributes ----------------- diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 43fd95e49a..07915ad040 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -65,6 +65,9 @@ This document explains the changes made to Iris for this release older NetCDF formats e.g. ``NETCDF4_CLASSIC`` support a maximum precision of 32-bit. (:issue:`6178`, :pull:`6343`) +# `@ukmo-ccbunney` added support for loading NetCDF variable-length string types + and size hinting for better lazy loading (:issue:`6149`, :pull:`6340`) + #. `@bouweandela`_ fixed handling of masked Dask arrays in :func:`~iris.util.array_equal`. diff --git a/lib/iris/fileformats/netcdf/_thread_safe_nc.py b/lib/iris/fileformats/netcdf/_thread_safe_nc.py index 3a556f5447..35588eb2c4 100644 --- a/lib/iris/fileformats/netcdf/_thread_safe_nc.py +++ b/lib/iris/fileformats/netcdf/_thread_safe_nc.py @@ -20,6 +20,7 @@ # Doesn't need thread protection, but this allows all netCDF4 refs to be # replaced with thread_safe refs. default_fillvals = netCDF4.default_fillvals +VLType = netCDF4.VLType class _ThreadSafeWrapper(ABC): diff --git a/lib/iris/fileformats/netcdf/loader.py b/lib/iris/fileformats/netcdf/loader.py index 55d0a88b79..b384667b21 100644 --- a/lib/iris/fileformats/netcdf/loader.py +++ b/lib/iris/fileformats/netcdf/loader.py @@ -196,6 +196,11 @@ def _get_actual_dtype(cf_var): # mostly done for speed improvement. 
See https://github.com/SciTools/iris/pull/5069 _LAZYVAR_MIN_BYTES = 5000 +# A stab in the dark at the mean length of the "ragged dimension" for netCDF "variable +# length arrays" (`NetCDF.VLType` type). Total array size is unknown until the variable is +# read in. Making this number bigger makes it more likely an array will be loaded lazily. +_MEAN_VL_ARRAY_LEN = 10 + def _get_cf_var_data(cf_var, filename): """Get an array representing the data of a CF variable. @@ -215,12 +220,40 @@ def _get_cf_var_data(cf_var, filename): # See https://github.com/SciTools/iris/issues/4994 "Xarray bridge". result = cf_var._data_array else: - total_bytes = cf_var.size * cf_var.dtype.itemsize + # Determine size of data; however can't do this for variable length (VLEN) + # netCDF arrays as the size of the array can only be known by reading the + # data; see https://github.com/Unidata/netcdf-c/issues/1893. + # Note: "Variable length" netCDF types have a datatype of `nc.VLType`. + if isinstance(getattr(cf_var, "datatype", None), _thread_safe_nc.VLType): + msg = ( + f"NetCDF variable `{cf_var.cf_name}` is a variable length type of kind {cf_var.dtype} " + "thus the total data size cannot be known in advance. This may affect the lazy loading " + "of the data." + ) + warnings.warn(msg, category=iris.warnings.IrisLoadWarning) + + # Give user the chance to pass a hint of the average variable length array size via + # the chunk control context manager. This allows for better decisions to be made on + # whether the data should be lazy-loaded or not. + mean_vl_array_len = _MEAN_VL_ARRAY_LEN + if CHUNK_CONTROL.mode is not CHUNK_CONTROL.Modes.AS_DASK: + if chunks := CHUNK_CONTROL.var_dim_chunksizes.get(cf_var.cf_name): + if vl_chunk_hint := chunks.get("_vl_hint"): + mean_vl_array_len = vl_chunk_hint + + # Special handling for strings (`str` type) as these don't have an itemsize attribute; + # assume 4 bytes which is sufficient for unicode character storage + itemsize = 4 if cf_var.dtype is str else cf_var.dtype.itemsize + + # For `VLType` cf_var.size will just return the known dimension size. + total_bytes = cf_var.size * mean_vl_array_len * itemsize + else: + # Normal NCVariable type: + total_bytes = cf_var.size * cf_var.dtype.itemsize + if total_bytes < _LAZYVAR_MIN_BYTES: # Don't make a lazy array, as it will cost more memory AND more time to access. - # Instead fetch the data immediately, as a real array, and return that. result = cf_var[:] - else: # Get lazy chunked data out of a cf variable. # Creates Dask wrappers around data arrays for any cube components which @@ -228,10 +261,13 @@ def _get_cf_var_data(cf_var, filename): dtype = _get_actual_dtype(cf_var) # Make a data-proxy that mimics array access and can fetch from the file. + # Note: Special handling needed for "variable length string" types which + # return a dtype of `str`, rather than a numpy type; use `S1` in this case. + fill_dtype = "S1" if cf_var.dtype is str else cf_var.dtype.str[1:] fill_value = getattr( cf_var.cf_data, "_FillValue", - _thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]], + _thread_safe_nc.default_fillvals[fill_dtype], ) proxy = NetCDFDataProxy( cf_var.shape, dtype, filename, cf_var.cf_name, fill_value @@ -699,6 +735,10 @@ def set( ) -> Iterator[None]: r"""Control the Dask chunk sizes applied to NetCDF variables during loading. + This function can also be used to provide a size hint for the unknown + array lengths when loading "variable-length" NetCDF data types. 
+ See https://unidata.github.io/netcdf4-python/#netCDF4.Dataset.vltypes + Parameters ---------- var_names : str or list of str, default=None @@ -710,7 +750,8 @@ def set( Each key-value pair defines a chunk size for a named file dimension, e.g. ``{'time': 10, 'model_levels':1}``. Values of ``-1`` will lock the chunk size to the full size of that - dimension. + dimension. To specify a size hint for "variable-length" data types + use the special name `_vl_hint`. Notes ----- @@ -734,6 +775,16 @@ def set( i.e. the setting configured by ``dask.config.set({'array.chunk-size': '250MiB'})``. + For variable-length data types the size of the variable (or "ragged") + dimension of the individual array elements cannot be known without + reading the data. This can make it difficult for Iris to determine + whether to load the data lazily or not. If the user has some apriori + knowledge of the mean variable array length this can be passed as + as a size hint via the special `_vl_hint` name. For example a hint + that variable-length string array that contains 4 character experiment + identifiers: + ``CHUNK_CONTROL.set("expver", _vl_hint=4)`` + """ old_mode = self.mode old_var_dim_chunksizes = deepcopy(self.var_dim_chunksizes) diff --git a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py index efa291a0b4..9460ad8b5a 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py +++ b/lib/iris/tests/unit/fileformats/netcdf/loader/test__get_cf_var_data.py @@ -15,6 +15,7 @@ from iris._lazy_data import _optimum_chunksize import iris.fileformats.cf +from iris.fileformats.netcdf._thread_safe_nc import VLType from iris.fileformats.netcdf.loader import CHUNK_CONTROL, _get_cf_var_data @@ -33,7 +34,8 @@ def _make(self, chunksizes=None, shape=None, dtype="i4", **extra_properties): cf_data.chunking = mock.MagicMock(return_value=chunksizes) if shape is None: shape = self.shape - dtype = np.dtype(dtype) + if dtype is not str: # for testing VLen str arrays (dtype=`class `) + dtype = np.dtype(dtype) cf_var = mock.MagicMock( spec=iris.fileformats.cf.CFVariable, dtype=dtype, @@ -103,6 +105,51 @@ def test_arraytype__100f8_is_real(self): var_data = _get_cf_var_data(cf_var, self.filename) self.assertIs(var_data, mock.sentinel.real_data_accessed) + def test_vltype__1000str_is_lazy(self): + # Variable length string type + mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type") + cf_var = self._make(shape=(1000,), dtype=str, datatype=mock_vltype) + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIsInstance(var_data, da.Array) + + def test_vltype__1000str_is_real_with_hint(self): + # Variable length string type with a hint on the array variable length size + mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type") + cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype) + with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=1): + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIs(var_data, mock.sentinel.real_data_accessed) + + def test_vltype__100str_is_real(self): + # Variable length string type + mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type") + cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype) + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIs(var_data, mock.sentinel.real_data_accessed) + + def test_vltype__100str_is_lazy_with_hint(self): + # Variable length string type with a hint 
on the array variable length size + mock_vltype = mock.Mock(spec=VLType, dtype=str, name="varlen string type") + cf_var = self._make(shape=(100,), dtype=str, datatype=mock_vltype) + with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=50): + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIsInstance(var_data, da.Array) + + def test_vltype__100f8_is_lazy(self): + # Variable length float64 type + mock_vltype = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type") + cf_var = self._make(shape=(1000,), dtype="f8", datatype=mock_vltype) + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIsInstance(var_data, da.Array) + + def test_vltype__100f8_is_real_with_hint(self): + # Variable length float64 type with a hint on the array variable length size + mock_vltype = mock.Mock(spec=VLType, dtype="f8", name="varlen float64 type") + cf_var = self._make(shape=(100,), dtype="f8", datatype=mock_vltype) + with CHUNK_CONTROL.set("DUMMY_VAR", _vl_hint=2): + var_data = _get_cf_var_data(cf_var, self.filename) + self.assertIs(var_data, mock.sentinel.real_data_accessed) + def test_cf_data_emulation(self): # Check that a variable emulation object passes its real data directly. emulated_data = mock.Mock() diff --git a/requirements/locks/py311-linux-64.lock b/requirements/locks/py311-linux-64.lock index c35a2009f7..c035923776 100644 --- a/requirements/locks/py311-linux-64.lock +++ b/requirements/locks/py311-linux-64.lock @@ -162,7 +162,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 -https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664 +https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4 https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py311hd18a35c_0.conda#be34c90cce87090d24da64a7c239ca96 https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 diff --git a/requirements/locks/py312-linux-64.lock b/requirements/locks/py312-linux-64.lock index a9c7e309d4..83468bef25 100644 --- a/requirements/locks/py312-linux-64.lock +++ b/requirements/locks/py312-linux-64.lock @@ -156,7 +156,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 -https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664 +https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4 
https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.8-py312h84d6215_0.conda#6713467dc95509683bfa3aca08524e8a https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 diff --git a/requirements/locks/py313-linux-64.lock b/requirements/locks/py313-linux-64.lock index 493ef1b9d1..31fa73bced 100644 --- a/requirements/locks/py313-linux-64.lock +++ b/requirements/locks/py313-linux-64.lock @@ -162,7 +162,7 @@ https://conda.anaconda.org/conda-forge/noarch/hyperframe-6.1.0-pyhd8ed1ab_0.cond https://conda.anaconda.org/conda-forge/noarch/idna-3.10-pyhd8ed1ab_1.conda#39a4f67be3286c86d696df570b1201b7 https://conda.anaconda.org/conda-forge/noarch/imagesize-1.4.1-pyhd8ed1ab_0.tar.bz2#7de5386c8fea29e76b303f37dde4c352 https://conda.anaconda.org/conda-forge/noarch/iniconfig-2.0.0-pyhd8ed1ab_1.conda#6837f3eff7dcea42ecd714ce1ac2b108 -https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.1-pyhd8ed1ab_1.conda#ae376af0a29183e98a95508ed6944664 +https://conda.anaconda.org/conda-forge/noarch/iris-sample-data-2.5.2-pyhd8ed1ab_0.conda#895f6625dd8a246fece9279fcc12c1de https://conda.anaconda.org/conda-forge/linux-64/jack-1.9.22-h7c63dc7_2.conda#f56277b7f079f1b13cbf7fb9b4f194c4 https://conda.anaconda.org/conda-forge/linux-64/kiwisolver-1.4.7-py313h33d0bda_0.conda#9862d13a5e466273d5a4738cffcb8d6c https://conda.anaconda.org/conda-forge/linux-64/libblas-3.9.0-31_h59b9bed_openblas.conda#728dbebd0f7a20337218beacffd37916 From 87454650a4647d46cb196b3f2b7f64e4a3a65cbb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 7 Mar 2025 13:41:52 +0000 Subject: [PATCH 06/90] [pre-commit.ci] pre-commit autoupdate (#6356) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit updates: - [github.com/astral-sh/ruff-pre-commit: v0.9.7 → v0.9.9](https://github.com/astral-sh/ruff-pre-commit/compare/v0.9.7...v0.9.9) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 373079b594..46d0bed381 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -29,7 +29,7 @@ repos: - id: no-commit-to-branch - repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.9.7" + rev: "v0.9.9" hooks: - id: ruff types: [file, python] From cf85915243fedbf5e328edcdab4caa81b403677a Mon Sep 17 00:00:00 2001 From: Patrick Peglar Date: Fri, 7 Mar 2025 17:07:58 +0000 Subject: [PATCH 07/90] Combine cubes 2 (#6334) * Remove unused 'unique' option from load collections and combine methods. * Reinstate 'unique=False' in combine merges to fix load(). * Fix typing for combine submodule. * More typing and other small improvements to combine code. * Move loading-specific parts from CombineOptions to LoadPolicy. * Fix LoadPolicy error handling + test. * Properly support merge_unique, and add equalise_cubes. * Allow LoadPolicy.context with no settings arg. * Turn off varying-reference support for 'legacy' load setting only. * Implement combine_cube as util, and redirect test. * Small docs fixes. * Add functioning doctest example for combine_cubes. * Further doctest fixes. * Add cubelist combine methods. * Better documentation of CombineOptions and LoadPolicy settings. * Recombine LoadPolicy into CombineOptions. * Fix doctest. * Rework CombineOptions.set() tests to cover context() method also. 
* Add tests for cubelist combine functions (and fix). * Add tests for individual combine control keywords. * Added whatsnew, and minimal links in other docs sections. * Review changes: docs improvements; docstrings for cubelist combine methods. * Tiny formatting correction. * Review changes: docs explain not to assign to iris.COMBINE_POLICY. * Review changes: fix obsolete comments + todos in testcode. * Review changes: add test for string arg + kwargs. * Review changes: remove obsolete TODO notes. --- docs/src/userguide/loading_iris_cubes.rst | 7 + docs/src/whatsnew/latest.rst | 24 +- lib/iris/__init__.py | 22 +- lib/iris/_combine.py | 356 +++++++++++------- lib/iris/cube.py | 60 +++ lib/iris/fileformats/rules.py | 6 +- lib/iris/loading.py | 56 +-- lib/iris/tests/integration/test_trajectory.py | 4 +- .../varying_references/test_realdata_load.py | 4 +- .../test_roundtrip_time_varying_references.py | 8 +- lib/iris/tests/unit/combine/__init__.py | 5 + .../tests/unit/combine/test_CombineOptions.py | 226 +++++++++++ lib/iris/tests/unit/cube/test_CubeList.py | 49 +++ lib/iris/tests/unit/test_LoadPolicy.py | 144 ------- lib/iris/tests/unit/test_combine_cubes.py | 90 ----- .../tests/unit/util/test_combine_cubes.py | 142 +++++++ lib/iris/util.py | 143 ++++++- 17 files changed, 916 insertions(+), 430 deletions(-) create mode 100644 lib/iris/tests/unit/combine/__init__.py create mode 100644 lib/iris/tests/unit/combine/test_CombineOptions.py delete mode 100644 lib/iris/tests/unit/test_LoadPolicy.py delete mode 100644 lib/iris/tests/unit/test_combine_cubes.py create mode 100644 lib/iris/tests/unit/util/test_combine_cubes.py diff --git a/docs/src/userguide/loading_iris_cubes.rst b/docs/src/userguide/loading_iris_cubes.rst index b71f033c30..cbba4da39f 100644 --- a/docs/src/userguide/loading_iris_cubes.rst +++ b/docs/src/userguide/loading_iris_cubes.rst @@ -15,6 +15,13 @@ Iris will attempt to return **as few cubes as possible** by collecting together multiple fields with a shared standard name into a single multidimensional cube. +.. hint:: + + There are details at :class:`iris.CombineOptions` on how Iris works to load + fewer and larger cubes : The :data:`iris.COMBINE_POLICY` object allows the user to + control how cubes are combined during the loading process. See the documentation + of the :class:`iris.CombineOptions` class for details. + The :py:func:`iris.load` function automatically recognises the format of the given files and attempts to produce Iris Cubes from their contents. diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 07915ad040..b243796082 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -30,6 +30,26 @@ This document explains the changes made to Iris for this release ✨ Features =========== +#. `@pp-mo`_ renamed the :class:`iris.LoadPolicy` as :class:`iris.CombineOptions` and + :data:`iris.LOAD_POLICY` as :data:`iris.COMBINE_POLICY`, though the original names + remain functional (and refer to the same things) for now. + (:issue:`6203`, :pull:`6334`) + +#. `@pp-mo`_ added new :meth:`~iris.cube.CubeList.combine` and + :meth:`~iris.cube.CubeList.combine_cube` methods of a :class:`~iris.cube.CubeList` + as an alternative way of accessing the :func:`~iris.util.combine_cubes` mechanism. + (:issue:`6203`, :pull:`6334`) + +#. 
`@pp-mo`_ added a new utility function :func:`~iris.util.combine_cubes`, to give + general public access to the combine merge/concatenate mechanism introduced for + generalised loading support via :class:`iris.LoadPolicy` in the Iris 3.11 release. + (:issue:`6203`, :pull:`6334`) + +#. `@pp-mo`_ overhauled the :class:`iris.LoadPolicy` facility by adding a new + ``equalise_cubes_kwarg`` keyword, enabling it to call the + :func:`~iris.util.equalise_cubes` utility function as one of its processing stages. + (:issue:`6203`, :pull:`6334`) + #. `@pp-mo`_ added a new utility function :func:`~iris.util.equalise_cubes`, to help with aligning cubes so they can merge / concatenate. (:issue:`6248`, :pull:`6257`) @@ -48,11 +68,11 @@ This document explains the changes made to Iris for this release However, :meth:`~iris.cube.Cube.transpose` will work, as will :meth:`~iris.cube.Cube.copy`. Note that, ``cube.copy(data=iris.DATALESS)`` will provide a dataless copy of a cube. (:issue:`4447`, :pull:`6253`) - + #. `@ESadek-MO`_ added the :mod:`iris.quickplot` ``footer`` kwarg to render text in the bottom right of the plot figure. (:issue:`6247`, :pull:`6332`) - + 🐛 Bugs Fixed ============= diff --git a/lib/iris/__init__.py b/lib/iris/__init__.py index d622bd18b0..d141dbdb5f 100644 --- a/lib/iris/__init__.py +++ b/lib/iris/__init__.py @@ -18,6 +18,12 @@ :class:`Cubes `, and combine those cubes into higher-dimensional cubes where possible. +.. note:: + + User control of the 'combine' process is provided via a specific + :class:`iris.CombineOptions` object called :data:`iris.COMBINE_POLICY`. + See the :class:`iris.CombineOptions` class for details. + The :func:`load_cube` and :func:`load_cubes` functions are similar to :func:`load`, but they raise an exception if the number of cubes is not what was expected. They are more useful in scripts, where they can @@ -94,14 +100,13 @@ def callback(cube, field, filename): import threading from typing import Callable, Literal +from iris._combine import COMBINE_POLICY as _COMBINE_POLICY +from iris._combine import CombineOptions import iris._constraints import iris.config import iris.io from iris.io import save -from iris.loading import LOAD_POLICY as _LOAD_POLICY from iris.loading import ( - CombineOptions, - LoadPolicy, load, load_cube, load_cubes, @@ -111,8 +116,14 @@ def callback(cube, field, filename): # NOTE: we make an independent local 'LOAD_POLICY' definition here, just so that we # can ensure an entry for it in our API documentation page. -#: A control object containing the current file loading strategy options. -LOAD_POLICY = _LOAD_POLICY +#: An object to control default cube combination and loading options +COMBINE_POLICY = _COMBINE_POLICY + +#: An alias for the :class:`~iris._combine.CombineOptions` class. +LoadPolicy = CombineOptions + +#: An alias for the :data:`~iris.COMBINE_POLICY` object. +LOAD_POLICY = _COMBINE_POLICY from ._deprecation import IrisDeprecation, warn_deprecated @@ -132,6 +143,7 @@ def callback(cube, field, filename): # Restrict the names imported when using "from iris import *" __all__ = [ "AttributeConstraint", + "COMBINE_POLICY", "CombineOptions", "Constraint", "DATALESS", diff --git a/lib/iris/_combine.py b/lib/iris/_combine.py index 7b01dfc87e..5afffe7309 100644 --- a/lib/iris/_combine.py +++ b/lib/iris/_combine.py @@ -11,124 +11,214 @@ publicly available. 
""" +from __future__ import annotations + import contextlib import threading -from typing import Mapping - +from typing import TYPE_CHECKING, Any, Dict, List -class CombineOptions(threading.local): - """A container for cube combination options. +if TYPE_CHECKING: + from iris.cube import Cube, CubeList - Controls for generalised merge/concatenate options. - Also controls the detection and handling of cases where a hybrid coordinate - uses multiple reference fields during loading : for example, a UM file which - contains a series of fields describing time-varying orography. +class CombineOptions(threading.local): + """A control object for Iris loading and cube combination options. - Options can be set directly, or via :meth:`~iris.LoadPolicy.set`, or changed for - the scope of a code block with :meth:`~iris.LoadPolicy.context`. + Both the iris loading functions and the "combine_cubes" utility apply a number of + possible "cube combination" operations to a list of cubes, in a definite sequence, + all of which tend to combine cubes into a smaller number of larger or + higher-dimensional cubes. - .. note :: + This object groups various control options for these behaviours, which apply to + both the :func:`iris.util.combine_cubes` utility method and the core Iris loading + functions "iris.load_xxx". - The default behaviour will "fix" loading for cases like the time-varying - orography case described above. However, this is not strictly - backwards-compatible. If this causes problems, you can force identical loading - behaviour to earlier Iris versions with ``LOAD_POLICY.set("legacy")`` or - equivalent. + The :class:`CombineOptions` class defines the allowed control options, while a + global singleton object :data:`iris.COMBINE_POLICY` holds the current global + default settings. - .. testsetup:: + The individual configurable options are : - from iris import LOAD_POLICY + * ``equalise_cubes_kwargs`` = (dict or None) + Specifies keywords for an :func:`iris.util.equalise_cubes` call, to be applied + before any merge/concatenate step. If ``None``, or empty, no equalisation step + is performed. - Notes - ----- - The individual configurable options are : + * ``merge_concat_sequence`` = "m" / "c" / "cm" / "mc" + Specifies whether to apply :meth:`~iris.cube.CubeList.merge`, or + :meth:`~iris.cube.CubeList.concatenate` operations, or both, in either order. - * ``support_multiple_references`` = True / False - When enabled, the presence of multiple aux-factory reference cubes, which merge - to define a extra dimension, will add that dimension to the loaded cubes. - This is essential for correct support of time-dependent hybrid coordinates (i.e. - aux factories) when loading from fields-based data (e.g. PP or GRIB). - For example (notably) time-dependent orography in UM data on hybrid-heights. + * ``merge_unique`` = True / False + When True, any merge operation will error if its result contains multiple + identical cubes. Otherwise (unique=False), that is a permitted result. - In addition, when such multiple references are detected, an extra concatenate - step is added to the 'merge_concat_sequence' (see below), if none is already - configured there. + .. Note:: - * ``merge_concat_sequence`` = "m" / "c" / "cm" / "mc" - Specifies whether to merge, or concatenate, or both in either order. - This is the "combine" operation which is applied to loaded data. + By default, in a normal :meth:`~iris.cube.CubeList.merge` operation on a + :class:`~iris.cube.CubeList`, ``unique`` defaults to ``True``. 
+ For loading operations, however, the default is ``unique=False``, as this + produces the intended behaviour when loading with multiple constraints. * ``repeat_until_unchanged`` = True / False When enabled, the configured "combine" operation will be repeated until the result is stable (no more cubes are combined). - Several common sets of options are provided in :data:`~iris.LOAD_POLICY.SETTINGS` : + * ``support_multiple_references`` = True / False + When enabled, support cases where a hybrid coordinate has multiple reference + fields : for example, a UM file which contains a series of fields describing a + time-varying orography. + + Alternatively, certain fixed combinations of options can be selected by a + "settings" name, one of :data:`CombineOptions.SETTINGS_NAMES` : * ``"legacy"`` - Produces loading behaviour identical to Iris versions < 3.11, i.e. before the - varying hybrid references were supported. + Apply a plain merge step only, i.e. ``merge_concat_sequence="m"``. + Other options are all "off". + This produces loading behaviour identical to Iris versions < 3.11, i.e. before + the varying hybrid references were supported. * ``"default"`` As "legacy" except that ``support_multiple_references=True``. This differs from "legacy" only when multiple mergeable reference fields are encountered, in which case incoming cubes are extended into the extra dimension, and a concatenate step is added. + Since the handling of multiple references affects only loading operations, + for the purposes of calls to :func:`~iris.util.combine_cubes`, this setting is + *identical* to "legacy". + + .. Warning:: + + The ``"default"`` setting **is** the initial default mode. + + This "fixes" loading for cases like the time-varying orography case + described. However, this setting is not strictly + backwards-compatible. If this causes problems, you can force identical + loading behaviour to earlier Iris versions (< v3.11) with + ``COMBINE_POLICY.set("legacy")`` or equivalent. * ``"recommended"`` - Enables multiple reference handling, *and* applies a merge step followed by - a concatenate step. + In addition to the "merge" step, allow a following "concatenate", i.e. + ``merge_concat_sequence="mc"``. * ``"comprehensive"`` - Like "recommended", but will also *repeat* the merge+concatenate steps until no - further change is produced. + As for "recommended", uses ``merge_concat_sequence="mc"``, but now also + *repeats* the merge+concatenate steps until no further change is produced, + i.e. ``repeat_until_unchanged=True``. + Also applies a prior 'equalise_cubes' call, of the form + ``equalise_cubes(cubes, apply_all=True)``. - .. note :: + .. Note:: - The 'comprehensive' policy makes a maximum effort to reduce the number of + The "comprehensive" policy makes a maximum effort to reduce the number of cubes to a minimum. However, it still cannot combine cubes with a mixture of matching dimension and scalar coordinates. This may be supported at some later date, but for now is not possible without specific user actions. - .. Note :: + .. testsetup:: - See also : :ref:`controlling_merge`. + from iris import COMBINE_POLICY + loadpolicy_old_settings = COMBINE_POLICY.settings() + + .. testcleanup:: + + # restore original settings, so as not to upset other tests + COMBINE_POLICY.set(loadpolicy_old_settings) + + Examples + -------- + Note: :data:`COMBINE_POLICY` is the global control object, which determines + the current default options for loading or :func:`iris.util.combine_cubes` calls. 
+ For the latter case, however, control via argument and keywords is also available. + + .. Note:: + + The ``iris.COMBINE_POLICY`` can be adjusted by either: + + 1. calling ``iris.COMBINE_POLICY.set()``, or + 2. using ``with COMBINE_POLICY.context(): ...``, or + 3. assigning a property ``COMBINE_POLICY.