From 17ca4b86a322546bd29217f77f384cb071d37e18 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 19 Feb 2025 16:52:08 +0100 Subject: [PATCH 01/11] Fix issue where the mask of Dask arrays was ignored and improve tests --- .../tests/unit/concatenate/test_hashing.py | 16 + lib/iris/tests/unit/util/test_array_equal.py | 298 ++++++++++-------- lib/iris/util.py | 32 +- 3 files changed, 212 insertions(+), 134 deletions(-) diff --git a/lib/iris/tests/unit/concatenate/test_hashing.py b/lib/iris/tests/unit/concatenate/test_hashing.py index 24062a2af3..88064e4e46 100644 --- a/lib/iris/tests/unit/concatenate/test_hashing.py +++ b/lib/iris/tests/unit/concatenate/test_hashing.py @@ -9,6 +9,8 @@ import pytest from iris import _concatenate +from iris.tests.unit.util.test_array_equal import TEST_CASES +from iris.util import array_equal @pytest.mark.parametrize( @@ -75,6 +77,20 @@ def test_compute_hashes(a, b, eq): assert eq == (hashes["a"] == hashes["b"]) +@pytest.mark.parametrize( + "a,b", + [ + (a, b) + for (a, b, withnans, eq) in TEST_CASES + if isinstance(a, np.ndarray | da.Array) and isinstance(b, np.ndarray | da.Array) + ], +) +def test_compute_hashes_vs_array_equal(a, b): + """Test that hashing gives the same answer as `array_equal(withnans=True)`.""" + hashes = _concatenate._compute_hashes({"a": a, "b": b}) + assert array_equal(a, b, withnans=True) == (hashes["a"] == hashes["b"]) + + def test_arrayhash_equal_incompatible_chunks_raises(): hash1 = _concatenate._ArrayHash(1, chunks=((1, 1),)) hash2 = _concatenate._ArrayHash(1, chunks=((2,),)) diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py index 3e1aaf1bfb..9f277edb20 100644 --- a/lib/iris/tests/unit/util/test_array_equal.py +++ b/lib/iris/tests/unit/util/test_array_equal.py @@ -4,133 +4,181 @@ # See LICENSE in the root of the repository for full licensing details.
"""Test function :func:`iris.util.array_equal`.""" +import dask.array as da import numpy as np import numpy.ma as ma +import pytest from iris.util import array_equal - -class Test: - def test_0d(self): - array_a = np.array(23) - array_b = np.array(23) - array_c = np.array(7) - assert array_equal(array_a, array_b) - assert not array_equal(array_a, array_c) - - def test_0d_and_scalar(self): - array_a = np.array(23) - assert array_equal(array_a, 23) - assert not array_equal(array_a, 45) - - def test_1d_and_sequences(self): - for sequence_type in (list, tuple): - seq_a = sequence_type([1, 2, 3]) - array_a = np.array(seq_a) - assert array_equal(array_a, seq_a) - assert not array_equal(array_a, seq_a[:-1]) - array_a[1] = 45 - assert not array_equal(array_a, seq_a) - - def test_nd(self): - array_a = np.array(np.arange(24).reshape(2, 3, 4)) - array_b = np.array(np.arange(24).reshape(2, 3, 4)) - array_c = np.array(np.arange(24).reshape(2, 3, 4)) - array_c[0, 1, 2] = 100 - assert array_equal(array_a, array_b) - assert not array_equal(array_a, array_c) - - def test_masked_is_not_ignored(self): - array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1]) - array_b = ma.masked_array([2, 2, 2], mask=[1, 0, 1]) - assert array_equal(array_a, array_b) - - def test_masked_is_different(self): - array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1]) - array_b = ma.masked_array([1, 2, 3], mask=[0, 0, 1]) - assert not array_equal(array_a, array_b) - - def test_masked_isnt_unmasked(self): - array_a = np.array([1, 2, 2]) - array_b = ma.masked_array([1, 2, 2], mask=[0, 0, 1]) - assert not array_equal(array_a, array_b) - - def test_masked_unmasked_equivelance(self): - array_a = np.array([1, 2, 2]) - array_b = ma.masked_array([1, 2, 2]) - array_c = ma.masked_array([1, 2, 2], mask=[0, 0, 0]) - assert array_equal(array_a, array_b) - assert array_equal(array_a, array_c) - - def test_fully_masked_arrays(self): - array_a = ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True) - array_b = 
ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True) - assert array_equal(array_a, array_b) - - def test_fully_masked_0d_arrays(self): - array_a = ma.masked_array(3, mask=True) - array_b = ma.masked_array(3, mask=True) - assert array_equal(array_a, array_b) - - def test_fully_masked_string_arrays(self): - array_a = ma.masked_array(["a", "b", "c"], mask=True) - array_b = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]) - assert array_equal(array_a, array_b) - - def test_partially_masked_string_arrays(self): - array_a = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]) - array_b = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]) - assert array_equal(array_a, array_b) - - def test_string_arrays_equal(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "def", "efg"]) - assert array_equal(array_a, array_b) - - def test_string_arrays_different_contents(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "de", "efg"]) - assert not array_equal(array_a, array_b) - - def test_string_arrays_subset(self): - array_a = np.array(["abc", "def", "efg"]) - array_b = np.array(["abc", "def"]) - assert not array_equal(array_a, array_b) - assert not array_equal(array_b, array_a) - - def test_string_arrays_unequal_dimensionality(self): - array_a = np.array("abc") - array_b = np.array(["abc"]) - array_c = np.array([["abc"]]) - assert not array_equal(array_a, array_b) - assert not array_equal(array_b, array_a) - assert not array_equal(array_a, array_c) - assert not array_equal(array_b, array_c) - - def test_string_arrays_0d_and_scalar(self): - array_a = np.array("foobar") - assert array_equal(array_a, "foobar") - assert not array_equal(array_a, "foo") - assert not array_equal(array_a, "foobar.") - - def test_nan_equality_nan_ne_nan(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = array_a.copy() - assert not array_equal(array_a, array_a) - assert not array_equal(array_a, array_b) - - def 
test_nan_equality_nan_naneq_nan(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - assert array_equal(array_a, array_a, withnans=True) - assert array_equal(array_a, array_b, withnans=True) - - def test_nan_equality_nan_nanne_a(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, 0.0, 3.0]) - assert not array_equal(array_a, array_b, withnans=True) - - def test_nan_equality_a_nanne_b(self): - array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) - array_b = np.array([1.0, np.nan, 2.0, np.nan, 4.0]) - assert not array_equal(array_a, array_b, withnans=True) +ARRAY1 = np.array(np.arange(24).reshape(2, 3, 4)) +ARRAY1[0, 1, 2] = 100 + +ARRAY2 = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) + +TEST_CASES = [ + # test 0d + (np.array(23), np.array(23), False, True), + (np.array(23), np.array(7), False, False), + # test 0d and scalar + (np.array(23), 23, False, True), + (np.array(23), 45, False, False), + # test 1d and sequences + (np.array([1, 2, 3]), [1, 2, 3], False, True), + (np.array([1, 2, 3]), [1, 2], False, False), + (np.array([1, 45, 3]), [1, 2, 3], False, False), + (np.array([1, 2, 3]), (1, 2, 3), False, True), + (np.array([1, 2, 3]), (1, 2), False, False), + (np.array([1, 45, 3]), (1, 2, 3), False, False), + # test 3d + ( + np.array(np.arange(24).reshape(2, 3, 4)), + np.array(np.arange(24).reshape(2, 3, 4)), + False, + True, + ), + ( + np.array(np.arange(24).reshape(2, 3, 4)), + ARRAY1, + False, + False, + ), + # test masked is not ignored + ( + ma.masked_array([1, 2, 3], mask=[1, 0, 1]), + ma.masked_array([2, 2, 2], mask=[1, 0, 1]), + False, + True, + ), + # test masked is different + ( + ma.masked_array([1, 2, 3], mask=[1, 0, 1]), + ma.masked_array([1, 2, 3], mask=[0, 0, 1]), + False, + False, + ), + # test masked isn't unmasked + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2], mask=[0, 0, 1]), + False, + False, + ), + ( + np.array([1, 2]), + 
ma.masked_array([1, 3], mask=[0, 1]), + False, + False, + ), + # test masked/unmasked_equivalence + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2]), + False, + True, + ), + ( + np.array([1, 2, 2]), + ma.masked_array([1, 2, 2], mask=[0, 0, 0]), + False, + True, + ), + # test fully masked arrays + ( + ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True), + ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True), + False, + True, + ), + # test fully masked 0d arrays + ( + ma.masked_array(3, mask=True), + ma.masked_array(3, mask=True), + False, + True, + ), + # test fully masked string arrays + ( + ma.masked_array(["a", "b", "c"], mask=True), + ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]), + False, + True, + ), + # test partially masked string arrays + ( + ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]), + ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]), + False, + True, + ), + # test string arrays equal + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "def", "efg"]), + False, + True, + ), + # test string arrays different contents + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "de", "efg"]), + False, + False, + ), + # test string arrays subset + ( + np.array(["abc", "def", "efg"]), + np.array(["abc", "def"]), + False, + False, + ), + ( + np.array(["abc", "def"]), + np.array(["abc", "def", "efg"]), + False, + False, + ), + # test string arrays unequal dimensionality + (np.array("abc"), np.array(["abc"]), False, False), + (np.array(["abc"]), np.array("abc"), False, False), + (np.array("abc"), np.array([["abc"]]), False, False), + (np.array(["abc"]), np.array([["abc"]]), False, False), + # test string arrays 0d and scalar + (np.array("foobar"), "foobar", False, True), + (np.array("foobar"), "foo", False, False), + (np.array("foobar"), "foobar.", False, False), + # test nan equality nan ne nan + (ARRAY2, ARRAY2, False, False), + (ARRAY2, ARRAY2.copy(), False, False), + # test nan equality nan naneq nan + (ARRAY2, ARRAY2, True, 
True), + (ARRAY2, ARRAY2.copy(), True, True), + # test nan equality nan nanne a + ( + np.array([1.0, np.nan, 2.0, np.nan, 3.0]), + np.array([1.0, np.nan, 2.0, 0.0, 3.0]), + True, + False, + ), + # test nan equality a nanne b + ( + np.array([1.0, np.nan, 2.0, np.nan, 3.0]), + np.array([1.0, np.nan, 2.0, np.nan, 4.0]), + True, + False, + ), +] + + +@pytest.mark.parametrize("lazy", [False, True]) +@pytest.mark.parametrize("array_a,array_b,withnans,eq", TEST_CASES) +def test_array_equal(array_a, array_b, withnans, eq, lazy): + if lazy: + identical = array_a is array_b + if isinstance(array_a, np.ndarray): + array_a = da.asarray(array_a) + if isinstance(array_b, np.ndarray): + array_b = da.asarray(array_b) + if identical: + array_b = array_a + assert eq == array_equal(array_a, array_b, withnans=withnans) diff --git a/lib/iris/util.py b/lib/iris/util.py index dfefb504e9..b0e9d85829 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -422,16 +422,30 @@ def normalise_array(array): eq = array1.shape == array2.shape if eq: - array1_masked = ma.is_masked(array1) - eq = array1_masked == ma.is_masked(array2) - if eq and array1_masked: - eq = np.array_equal(ma.getmaskarray(array1), ma.getmaskarray(array2)) - if eq: - eqs = array1 == array2 + if is_lazy_data(array1) or is_lazy_data(array2): + data1 = da.ma.getdata(array1) + data2 = da.ma.getdata(array2) + mask1 = da.ma.getmaskarray(array1) + mask2 = da.ma.getmaskarray(array2) + else: + data1 = ma.getdata(array1) + data2 = ma.getdata(array2) + mask1 = ma.getmask(array1) + mask2 = ma.getmask(array2) + if not (mask1 is ma.nomask and mask2 is ma.nomask): + # Ensure masks are of the same type. 
+ mask1 = ma.getmaskarray(array1) + mask2 = ma.getmaskarray(array2) + + select = mask1 & mask2 + if withnans and (array1.dtype.kind == "f" or array2.dtype.kind == "f"): - eqs = np.where(np.isnan(array1) & np.isnan(array2), True, eqs) - eq = np.all(eqs) - eq = bool(eq) or eq is ma.masked + select |= np.isnan(data1) & np.isnan(data2) + + data_eq = np.where(select, True, data1 == data2).all() + mask_eq = (mask1 == mask2).all() + + eq = bool(data_eq & mask_eq) return eq From 7304c8b337e05680e81cbe72f984d4efc9e1e477 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 20 Feb 2025 22:17:49 +0100 Subject: [PATCH 02/11] Optimize unmasked numpy arrays --- lib/iris/util.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/lib/iris/util.py b/lib/iris/util.py index b0e9d85829..aa557bf1c8 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -432,17 +432,23 @@ def normalise_array(array): data2 = ma.getdata(array2) mask1 = ma.getmask(array1) mask2 = ma.getmask(array2) - if not (mask1 is ma.nomask and mask2 is ma.nomask): - # Ensure masks are of the same type. 
- mask1 = ma.getmaskarray(array1) - mask2 = ma.getmaskarray(array2) - select = mask1 & mask2 + if mask1 is ma.nomask or mask2 is ma.nomask: + ignore = np.False_ + else: + ignore = mask1 & mask2 if withnans and (array1.dtype.kind == "f" or array2.dtype.kind == "f"): - select |= np.isnan(data1) & np.isnan(data2) + nanmask = np.isnan(data1) & np.isnan(data2) + if ignore is np.False_: + ignore = nanmask + else: + ignore |= nanmask - data_eq = np.where(select, True, data1 == data2).all() + data_eqs = data1 == data2 + if ignore is not np.False_: + data_eqs = np.where(ignore, True, data_eqs) + data_eq = data_eqs.all() mask_eq = (mask1 == mask2).all() eq = bool(data_eq & mask_eq) From d17aa7fb860b1af9f8d64f8b37eb18b287795e73 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 26 Feb 2025 12:33:59 +0100 Subject: [PATCH 03/11] Add whatsnew --- docs/src/whatsnew/latest.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst index 3eef612468..63814dff2f 100644 --- a/docs/src/whatsnew/latest.rst +++ b/docs/src/whatsnew/latest.rst @@ -48,6 +48,9 @@ This document explains the changes made to Iris for this release #. `@rcomer`_ added handling for string stash codes when saving pp files. (:issue:`6239`, :pull:`6289`) +#. `@bouweandela`_ fixed handling of masked Dask arrays in + :func:`~iris.util.array_equal`. 
+ 💣 Incompatible Changes ======================= From 1abf173a9fafe448ad40bf6650483db5b94647d6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Sun, 2 Mar 2025 21:35:26 +0100 Subject: [PATCH 04/11] Use dask.array.blockwise for array comparison --- lib/iris/tests/unit/util/test_array_equal.py | 4 +- lib/iris/util.py | 94 +++++++++++++------- 2 files changed, 65 insertions(+), 33 deletions(-) diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py index 9f277edb20..efa2495194 100644 --- a/lib/iris/tests/unit/util/test_array_equal.py +++ b/lib/iris/tests/unit/util/test_array_equal.py @@ -176,9 +176,9 @@ def test_array_equal(array_a, array_b, withnans, eq, lazy): if lazy: identical = array_a is array_b if isinstance(array_a, np.ndarray): - array_a = da.asarray(array_a) + array_a = da.asarray(array_a, chunks=2) if isinstance(array_b, np.ndarray): - array_b = da.asarray(array_b) + array_b = da.asarray(array_b, chunks=1) if identical: array_b = array_a assert eq == array_equal(array_a, array_b, withnans=withnans) diff --git a/lib/iris/util.py b/lib/iris/util.py index 95d4afd33d..3366c2e5f4 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -387,14 +387,59 @@ def _rolling_window(array): return rw -def array_equal(array1, array2, withnans=False): +def _masked_array_equal( + array1: np.ndarray, + array2: np.ndarray, + equal_nan: bool, +) -> bool: + """Return whether two, possibly masked, arrays are equal.""" + mask1 = ma.getmask(array1) + mask2 = ma.getmask(array2) + + if mask1 is ma.nomask and mask2 is ma.nomask: + eq = True + elif mask1 is ma.nomask: + eq = not mask2.any() + elif mask2 is ma.nomask: + eq = not mask1.any() + else: + eq = np.array_equal(mask1, mask2) + + if eq: + if ma.isMaskedArray(array1): + array1 = array1.compressed() + if ma.isMaskedArray(array2): + array2 = array2.compressed() + eq = np.array_equal(array1, array2, equal_nan=equal_nan) + + return eq + + +def _apply_masked_array_equal( + blocks1: 
Iterable[np.ndarray], + blocks2: Iterable[np.ndarray], + equal_nan: bool, +) -> bool: + """Return whether two, possibly masked, arrays are equal. + + This function is for use with :func:`dask.array.blockwise`. + """ + eq = True + for block1, block2 in zip(blocks1, blocks2, strict=True): + eq = _masked_array_equal(block1, block2, equal_nan=equal_nan) + if eq is False: + break + return eq + + +def array_equal(array1, array2, withnans: bool = False) -> bool: """Return whether two arrays have the same shape and elements. Parameters ---------- array1, array2 : arraylike Args to be compared, normalised if necessary with :func:`np.asarray`. - withnans : bool, default=False + withnans : default=False When unset (default), the result is False if either input contains NaN points. This is the normal floating-point arithmetic result. When set, return True if inputs contain the same value in all elements, @@ -412,6 +457,9 @@ def array_equal(array1, array2, withnans=False): if withnans and (array1 is array2): return True + if withnans and not (array1.dtype.kind == "f" or array2.dtype.kind == "f"): + withnans = False + def normalise_array(array): if not is_lazy_data(array): if not ma.isMaskedArray(array): @@ -423,37 +471,21 @@ def normalise_array(array): eq = array1.shape == array2.shape if eq: if is_lazy_data(array1) or is_lazy_data(array2): - data1 = da.ma.getdata(array1) - data2 = da.ma.getdata(array2) - mask1 = da.ma.getmaskarray(array1) - mask2 = da.ma.getmaskarray(array2) - else: - data1 = ma.getdata(array1) - data2 = ma.getdata(array2) - mask1 = ma.getmask(array1) - mask2 = ma.getmask(array2) - - if mask1 is ma.nomask or mask2 is ma.nomask: - ignore = np.False_ + eq = da.blockwise( + _apply_masked_array_equal, + "", + array1.flatten(), + "i", + array2.flatten(), + "i", + dtype=bool, + meta=np.empty((0,), dtype=bool), + equal_nan=withnans, + ) else: - ignore = mask1 & mask2 + eq = _masked_array_equal(array1, array2, equal_nan=withnans) - if withnans and (array1.dtype.kind 
== "f" or array2.dtype.kind == "f"): - nanmask = np.isnan(data1) & np.isnan(data2) - if ignore is np.False_: - ignore = nanmask - else: - ignore |= nanmask - - data_eqs = data1 == data2 - if ignore is not np.False_: - data_eqs = np.where(ignore, True, data_eqs) - data_eq = data_eqs.all() - mask_eq = (mask1 == mask2).all() - - eq = bool(data_eq & mask_eq) - - return eq + return bool(eq) def approx_equal(a, b, max_absolute_error=1e-10, max_relative_error=1e-10): From cfd48526ed5d20479b0004aa732cc6551dcaf27d Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 4 Mar 2025 14:31:51 +0100 Subject: [PATCH 05/11] Faster comparison --- lib/iris/tests/unit/util/test_array_equal.py | 3 ++ lib/iris/util.py | 30 ++++++++++++++++---- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py index efa2495194..790929bc84 100644 --- a/lib/iris/tests/unit/util/test_array_equal.py +++ b/lib/iris/tests/unit/util/test_array_equal.py @@ -17,6 +17,9 @@ ARRAY2 = np.array([1.0, np.nan, 2.0, np.nan, 3.0]) TEST_CASES = [ + # test empty + (np.array([]), np.array([]), False, True), + (np.array([]), np.array([], dtype=np.float64), True, True), # test 0d (np.array(23), np.array(23), False, True), (np.array(23), np.array(7), False, False), diff --git a/lib/iris/util.py b/lib/iris/util.py index 3366c2e5f4..55fb19045b 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -396,6 +396,7 @@ def _masked_array_equal( mask1 = ma.getmask(array1) mask2 = ma.getmask(array2) + # Compare mask equality. if mask1 is ma.nomask and mask2 is ma.nomask: eq = True elif mask1 is ma.nomask: @@ -406,11 +407,28 @@ def _masked_array_equal( eq = np.array_equal(mask1, mask2) if eq: - if ma.isMaskedArray(array1): - array1 = array1.compressed() - if ma.isMaskedArray(array2): - array2 = array2.compressed() - eq = np.array_equal(array1, array2, equal_nan=equal_nan) + # Compare data equality. 
+ if not (mask1 is ma.nomask or mask2 is ma.nomask): + # Ignore masked data. + ignore = mask1 + else: + ignore = None + + if equal_nan: + # Ignore data that is np.nan in both arrays. + nanmask = np.isnan(array1) & np.isnan(array2) + if ignore is None: + ignore = nanmask + else: + ignore |= nanmask + + # This is faster than using np.array_equal with equal_nan=True. + data1 = array1.data if ma.isMaskedArray(array1) else array1 + data2 = array2.data if ma.isMaskedArray(array2) else array2 + eqs = data1 == data2 + if ignore is not None: + eqs = np.where(ignore, True, eqs) + eq = eqs.all() return eq @@ -427,7 +445,7 @@ def _apply_masked_array_equal( eq = True for block1, block2 in zip(blocks1, blocks2, strict=True): eq = _masked_array_equal(block1, block2, equal_nan=equal_nan) - if eq is False: + if not eq: break return eq From 59604a26b55bf53c4d46cbfef0a0ef4e825bc9b6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 4 Mar 2025 15:59:04 +0100 Subject: [PATCH 06/11] Avoid flattening arrays --- lib/iris/util.py | 46 +++++++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 13 deletions(-) diff --git a/lib/iris/util.py b/lib/iris/util.py index 55fb19045b..ab3ae73d15 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -434,19 +434,39 @@ def _masked_array_equal( def _apply_masked_array_equal( - blocks1: Iterable[np.ndarray], - blocks2: Iterable[np.ndarray], + blocks1: list | np.ndarray, + blocks2: list | np.ndarray, equal_nan: bool, ) -> bool: - """Return whether two, possibly masked, arrays are equal. + """Return whether two collections of arrays are equal or not. This function is for use with :func:`dask.array.blockwise`. + + Parameters + ---------- + blocks1 : + The collection of arrays representing chunks from the first array. Can + be a numpy array or a (nested) list of numpy arrays. + blocks2 : + The collection of arrays representing chunks from the second array. Can + be a numpy array or a (nested) list of numpy arrays. 
+ equal_nan : + Consder NaN values equal. + + Returns + ------- + : + Whether the two collections are equal or not. + """ - eq = True - for block1, block2 in zip(blocks1, blocks2, strict=True): - eq = _masked_array_equal(block1, block2, equal_nan=equal_nan) - if not eq: - break + if isinstance(blocks1, np.ndarray): + eq = _masked_array_equal(blocks1, blocks2, equal_nan=equal_nan) + else: + eq = True + for block1, block2 in zip(blocks1, blocks2, strict=True): + eq = _apply_masked_array_equal(block1, block2, equal_nan=equal_nan) + if not eq: + break return eq @@ -491,11 +511,11 @@ def normalise_array(array): if is_lazy_data(array1) or is_lazy_data(array2): eq = da.blockwise( _apply_masked_array_equal, - "", - array1.flatten(), - "i", - array2.flatten(), - "i", + tuple(), + array1, + tuple(range(array1.ndim)), + array2, + tuple(range(array2.ndim)), dtype=bool, meta=np.empty((0,), dtype=bool), equal_nan=withnans, From b3344e954b0cf0a0c1b40c98b7a129f7f38a81c6 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 4 Mar 2025 15:59:51 +0100 Subject: [PATCH 07/11] Avoid checking points if one coord has bounds and the other does not have them --- lib/iris/coords.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lib/iris/coords.py b/lib/iris/coords.py index bc0991565c..ca73dcb729 100644 --- a/lib/iris/coords.py +++ b/lib/iris/coords.py @@ -589,21 +589,22 @@ def __eq__(self, other): if hasattr(other, "metadata"): # metadata comparison eq = self.metadata == other.metadata + + # Also consider bounds, if we have them. + # (N.B. though only Coords can ever actually *have* bounds). + if eq and eq is not NotImplemented: + eq = self.has_bounds() is other.has_bounds() + # data values comparison if eq and eq is not NotImplemented: eq = iris.util.array_equal( self._core_values(), other._core_values(), withnans=True ) - - # Also consider bounds, if we have them. - # (N.B. though only Coords can ever actually *have* bounds). 
if eq and eq is not NotImplemented: if self.has_bounds() and other.has_bounds(): eq = iris.util.array_equal( self.core_bounds(), other.core_bounds(), withnans=True ) - else: - eq = not self.has_bounds() and not other.has_bounds() return eq From e43997d32be38c14daf5a8898d66cca3b843dfe4 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Tue, 4 Mar 2025 16:25:46 +0100 Subject: [PATCH 08/11] Small simplification --- lib/iris/util.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/iris/util.py b/lib/iris/util.py index ab3ae73d15..a2b393c7be 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -423,9 +423,7 @@ def _masked_array_equal( ignore |= nanmask # This is faster than using np.array_equal with equal_nan=True. - data1 = array1.data if ma.isMaskedArray(array1) else array1 - data2 = array2.data if ma.isMaskedArray(array2) else array2 - eqs = data1 == data2 + eqs = ma.getdata(array1) == ma.getdata(array2) if ignore is not None: eqs = np.where(ignore, True, eqs) eq = eqs.all() From 6fc1665efa7e75a8732727ce74e64aa7235f6330 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 5 Mar 2025 15:35:46 +0100 Subject: [PATCH 09/11] Add test --- lib/iris/tests/unit/util/test_array_equal.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py index 790929bc84..eafe123aed 100644 --- a/lib/iris/tests/unit/util/test_array_equal.py +++ b/lib/iris/tests/unit/util/test_array_equal.py @@ -67,6 +67,12 @@ False, False, ), + ( + ma.masked_array([1, 2, 2], mask=[0, 0, 1]), + ma.masked_array([1, 2, 2]), + False, + False, + ), ( np.array([1, 2]), ma.masked_array([1, 3], mask=[0, 1]), From 092e95f4ee444a6f8f9efa3eef7341ab2b31c8ea Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Wed, 5 Mar 2025 17:08:33 +0100 Subject: [PATCH 10/11] Use a separate map and reduce operation to avoid running out of memory on large arrays --- lib/iris/util.py | 64 
++++++++++++------------------------------------ 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/lib/iris/util.py b/lib/iris/util.py index a2b393c7be..14682314b0 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -391,7 +391,7 @@ def _masked_array_equal( array1: np.ndarray, array2: np.ndarray, equal_nan: bool, -) -> bool: +) -> np.ndarray: """Return whether two, possibly masked, arrays are equal.""" mask1 = ma.getmask(array1) mask2 = ma.getmask(array2) @@ -406,7 +406,9 @@ def _masked_array_equal( else: eq = np.array_equal(mask1, mask2) - if eq: + if not eq: + eqs = np.zeros(array1.shape, dtype=bool) + else: # Compare data equality. if not (mask1 is ma.nomask or mask2 is ma.nomask): # Ignore masked data. @@ -422,50 +424,11 @@ def _masked_array_equal( else: ignore |= nanmask - # This is faster than using np.array_equal with equal_nan=True. eqs = ma.getdata(array1) == ma.getdata(array2) if ignore is not None: eqs = np.where(ignore, True, eqs) - eq = eqs.all() - - return eq - -def _apply_masked_array_equal( - blocks1: list | np.ndarray, - blocks2: list | np.ndarray, - equal_nan: bool, -) -> bool: - """Return whether two collections of arrays are equal or not. - - This function is for use with :func:`dask.array.blockwise`. - - Parameters - ---------- - blocks1 : - The collection of arrays representing chunks from the first array. Can - be a numpy array or a (nested) list of numpy arrays. - blocks2 : - The collection of arrays representing chunks from the second array. Can - be a numpy array or a (nested) list of numpy arrays. - equal_nan : - Consder NaN values equal. - - Returns - ------- - : - Whether the two collections are equal or not. 
- - """ - if isinstance(blocks1, np.ndarray): - eq = _masked_array_equal(blocks1, blocks2, equal_nan=equal_nan) - else: - eq = True - for block1, block2 in zip(blocks1, blocks2, strict=True): - eq = _apply_masked_array_equal(block1, block2, equal_nan=equal_nan) - if not eq: - break - return eq + return eqs def array_equal(array1, array2, withnans: bool = False) -> bool: @@ -507,19 +470,22 @@ def normalise_array(array): eq = array1.shape == array2.shape if eq: if is_lazy_data(array1) or is_lazy_data(array2): + # Use a separate map and reduce operation to avoid running out of memory. + ndim = array1.ndim + indices = tuple(range(ndim)) eq = da.blockwise( - _apply_masked_array_equal, - tuple(), + _masked_array_equal, + indices, array1, - tuple(range(array1.ndim)), + indices, array2, - tuple(range(array2.ndim)), + indices, dtype=bool, - meta=np.empty((0,), dtype=bool), + meta=np.empty((0,) * ndim, dtype=bool), equal_nan=withnans, - ) + ).all() else: - eq = _masked_array_equal(array1, array2, equal_nan=withnans) + eq = _masked_array_equal(array1, array2, equal_nan=withnans).all() return bool(eq) From 8217ea9051e6482a87b85a29a3e05428dc022b68 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Thu, 6 Mar 2025 17:03:12 +0100 Subject: [PATCH 11/11] Correct order of checking if array is floating point dtype Also consider non-floating point arrays equal with withnans=False --- lib/iris/util.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/lib/iris/util.py b/lib/iris/util.py index 14682314b0..fef83b4c94 100644 --- a/lib/iris/util.py +++ b/lib/iris/util.py @@ -453,20 +453,21 @@ def array_equal(array1, array2, withnans: bool = False) -> bool: This function maintains laziness when called; it does not realise data. See more at :doc:`/userguide/real_and_lazy_data`. 
""" - if withnans and (array1 is array2): - return True - - if withnans and not (array1.dtype.kind == "f" or array2.dtype.kind == "f"): - withnans = False def normalise_array(array): - if not is_lazy_data(array): - if not ma.isMaskedArray(array): - array = np.asanyarray(array) + if not isinstance(array, np.ndarray | da.Array): + array = np.asanyarray(array) return array array1, array2 = normalise_array(array1), normalise_array(array2) + floating_point_arrays = array1.dtype.kind == "f" or array2.dtype.kind == "f" + if (array1 is array2) and (withnans or not floating_point_arrays): + return True + + if not floating_point_arrays: + withnans = False + eq = array1.shape == array2.shape if eq: if is_lazy_data(array1) or is_lazy_data(array2):