diff --git a/benchmarks/benchmarks/generate_data/__init__.py b/benchmarks/benchmarks/generate_data/__init__.py
index bb53e26b2f..3366bec6e0 100644
--- a/benchmarks/benchmarks/generate_data/__init__.py
+++ b/benchmarks/benchmarks/generate_data/__init__.py
@@ -106,11 +106,14 @@ def load_realised():
     file loading, but some benchmarks are only meaningful if starting with real
     arrays.
     """
+    from iris.fileformats._nc_load_rules import helpers
     from iris.fileformats.netcdf.loader import _get_cf_var_data as pre_patched
 
     def patched(cf_var, filename):
         return as_concrete_data(pre_patched(cf_var, filename))
 
-    netcdf._get_cf_var_data = patched
-    yield netcdf
-    netcdf._get_cf_var_data = pre_patched
+    netcdf.loader._get_cf_var_data = patched
+    helpers._get_cf_var_data = patched
+    yield
+    netcdf.loader._get_cf_var_data = pre_patched
+    helpers._get_cf_var_data = pre_patched
diff --git a/benchmarks/benchmarks/generate_data/stock.py b/benchmarks/benchmarks/generate_data/stock.py
index 04698e8ff5..47014078e7 100644
--- a/benchmarks/benchmarks/generate_data/stock.py
+++ b/benchmarks/benchmarks/generate_data/stock.py
@@ -162,7 +162,8 @@ def realistic_4d_w_everything(w_mesh=False, lazy=False) -> iris.cube.Cube:
     lazy : bool
         If True, the Cube will be returned with all arrays as they would
         normally be loaded from file (i.e. most will still be lazy Dask
-        arrays). If False, all arrays will be realised NumPy arrays.
+        arrays). If False, all arrays (except derived coordinates) will be
+        realised NumPy arrays.
 
     """
 
diff --git a/docs/src/whatsnew/latest.rst b/docs/src/whatsnew/latest.rst
index 32c97b9cac..881c3ab64e 100644
--- a/docs/src/whatsnew/latest.rst
+++ b/docs/src/whatsnew/latest.rst
@@ -48,11 +48,11 @@ This document explains the changes made to Iris for this release
    However, :meth:`~iris.cube.Cube.transpose` will work, as will
    :meth:`~iris.cube.Cube.copy`. Note that, ``cube.copy(data=iris.DATALESS)``
    will provide a dataless copy of a cube. (:issue:`4447`, :pull:`6253`)
-   
+
 #. `@ESadek-MO`_ added the :mod:`iris.quickplot` ``footer`` kwarg to
    render text in the bottom right of the plot figure.
    (:issue:`6247`, :pull:`6332`)
-   
+
 
 🐛 Bugs Fixed
 =============
@@ -65,6 +65,9 @@ This document explains the changes made to Iris for this release
    older NetCDF formats e.g. ``NETCDF4_CLASSIC`` support a maximum precision of
    32-bit. (:issue:`6178`, :pull:`6343`)
 
+#. `@bouweandela`_ fixed handling of masked Dask arrays in
+   :func:`~iris.util.array_equal`.
+
 
 💣 Incompatible Changes
 =======================
@@ -145,6 +148,8 @@ This document explains the changes made to Iris for this release
 #. `@trexfeathers`_ temporarily pinned Sphinx to `<8.2`.
    (:pull:`6344`, :issue:`6345`)
 
+#. `@bouweandela`_ fixed a bug in the benchmarking code that caused all benchmarks
+   to be run with lazy data. (:pull:`6339`)
 
 .. comment
     Whatsnew author names (@github name) in alphabetical order. Note that,
diff --git a/lib/iris/coords.py b/lib/iris/coords.py
index bc0991565c..ca73dcb729 100644
--- a/lib/iris/coords.py
+++ b/lib/iris/coords.py
@@ -589,21 +589,22 @@ def __eq__(self, other):
         if hasattr(other, "metadata"):
             # metadata comparison
             eq = self.metadata == other.metadata
+
+            # Also consider bounds, if we have them.
+            # (N.B. though only Coords can ever actually *have* bounds).
+            if eq and eq is not NotImplemented:
+                eq = self.has_bounds() is other.has_bounds()
+
             # data values comparison
             if eq and eq is not NotImplemented:
                 eq = iris.util.array_equal(
                     self._core_values(), other._core_values(), withnans=True
                 )
-
-            # Also consider bounds, if we have them.
-            # (N.B. though only Coords can ever actually *have* bounds).
             if eq and eq is not NotImplemented:
                 if self.has_bounds() and other.has_bounds():
                     eq = iris.util.array_equal(
                         self.core_bounds(), other.core_bounds(), withnans=True
                     )
-                else:
-                    eq = not self.has_bounds() and not other.has_bounds()
 
         return eq
 
diff --git a/lib/iris/tests/unit/concatenate/test_hashing.py b/lib/iris/tests/unit/concatenate/test_hashing.py
index 24062a2af3..88064e4e46 100644
--- a/lib/iris/tests/unit/concatenate/test_hashing.py
+++ b/lib/iris/tests/unit/concatenate/test_hashing.py
@@ -9,6 +9,8 @@
 import pytest
 
 from iris import _concatenate
+from iris.tests.unit.util.test_array_equal import TEST_CASES
+from iris.util import array_equal
 
 
 @pytest.mark.parametrize(
@@ -75,6 +77,20 @@ def test_compute_hashes(a, b, eq):
     assert eq == (hashes["a"] == hashes["b"])
 
 
+@pytest.mark.parametrize(
+    "a,b",
+    [
+        (a, b)  # only the inputs are reused; the expected result is recomputed
+        for (a, b, withnans, eq) in TEST_CASES
+        if isinstance(a, np.ndarray | da.Array) and isinstance(b, np.ndarray | da.Array)
+    ],
+)
+def test_compute_hashes_vs_array_equal(a, b):
+    """Test that hashing gives the same answer as `array_equal(withnans=True)`."""
+    hashes = _concatenate._compute_hashes({"a": a, "b": b})
+    assert array_equal(a, b, withnans=True) == (hashes["a"] == hashes["b"])
+
+
 def test_arrayhash_equal_incompatible_chunks_raises():
     hash1 = _concatenate._ArrayHash(1, chunks=((1, 1),))
     hash2 = _concatenate._ArrayHash(1, chunks=((2,),))
diff --git a/lib/iris/tests/unit/util/test_array_equal.py b/lib/iris/tests/unit/util/test_array_equal.py
index 3e1aaf1bfb..eafe123aed 100644
--- a/lib/iris/tests/unit/util/test_array_equal.py
+++ b/lib/iris/tests/unit/util/test_array_equal.py
@@ -4,133 +4,190 @@
 # See LICENSE in the root of the repository for full licensing details.
 """Test function :func:`iris.util.array_equal`."""
 
+import dask.array as da
 import numpy as np
 import numpy.ma as ma
+import pytest
 
 from iris.util import array_equal
 
-
-class Test:
-    def test_0d(self):
-        array_a = np.array(23)
-        array_b = np.array(23)
-        array_c = np.array(7)
-        assert array_equal(array_a, array_b)
-        assert not array_equal(array_a, array_c)
-
-    def test_0d_and_scalar(self):
-        array_a = np.array(23)
-        assert array_equal(array_a, 23)
-        assert not array_equal(array_a, 45)
-
-    def test_1d_and_sequences(self):
-        for sequence_type in (list, tuple):
-            seq_a = sequence_type([1, 2, 3])
-            array_a = np.array(seq_a)
-            assert array_equal(array_a, seq_a)
-            assert not array_equal(array_a, seq_a[:-1])
-            array_a[1] = 45
-            assert not array_equal(array_a, seq_a)
-
-    def test_nd(self):
-        array_a = np.array(np.arange(24).reshape(2, 3, 4))
-        array_b = np.array(np.arange(24).reshape(2, 3, 4))
-        array_c = np.array(np.arange(24).reshape(2, 3, 4))
-        array_c[0, 1, 2] = 100
-        assert array_equal(array_a, array_b)
-        assert not array_equal(array_a, array_c)
-
-    def test_masked_is_not_ignored(self):
-        array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1])
-        array_b = ma.masked_array([2, 2, 2], mask=[1, 0, 1])
-        assert array_equal(array_a, array_b)
-
-    def test_masked_is_different(self):
-        array_a = ma.masked_array([1, 2, 3], mask=[1, 0, 1])
-        array_b = ma.masked_array([1, 2, 3], mask=[0, 0, 1])
-        assert not array_equal(array_a, array_b)
-
-    def test_masked_isnt_unmasked(self):
-        array_a = np.array([1, 2, 2])
-        array_b = ma.masked_array([1, 2, 2], mask=[0, 0, 1])
-        assert not array_equal(array_a, array_b)
-
-    def test_masked_unmasked_equivelance(self):
-        array_a = np.array([1, 2, 2])
-        array_b = ma.masked_array([1, 2, 2])
-        array_c = ma.masked_array([1, 2, 2], mask=[0, 0, 0])
-        assert array_equal(array_a, array_b)
-        assert array_equal(array_a, array_c)
-
-    def test_fully_masked_arrays(self):
-        array_a = ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True)
-        array_b = ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True)
-        assert array_equal(array_a, array_b)
-
-    def test_fully_masked_0d_arrays(self):
-        array_a = ma.masked_array(3, mask=True)
-        array_b = ma.masked_array(3, mask=True)
-        assert array_equal(array_a, array_b)
-
-    def test_fully_masked_string_arrays(self):
-        array_a = ma.masked_array(["a", "b", "c"], mask=True)
-        array_b = ma.masked_array(["a", "b", "c"], mask=[1, 1, 1])
-        assert array_equal(array_a, array_b)
-
-    def test_partially_masked_string_arrays(self):
-        array_a = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1])
-        array_b = ma.masked_array(["a", "b", "c"], mask=[1, 0, 1])
-        assert array_equal(array_a, array_b)
-
-    def test_string_arrays_equal(self):
-        array_a = np.array(["abc", "def", "efg"])
-        array_b = np.array(["abc", "def", "efg"])
-        assert array_equal(array_a, array_b)
-
-    def test_string_arrays_different_contents(self):
-        array_a = np.array(["abc", "def", "efg"])
-        array_b = np.array(["abc", "de", "efg"])
-        assert not array_equal(array_a, array_b)
-
-    def test_string_arrays_subset(self):
-        array_a = np.array(["abc", "def", "efg"])
-        array_b = np.array(["abc", "def"])
-        assert not array_equal(array_a, array_b)
-        assert not array_equal(array_b, array_a)
-
-    def test_string_arrays_unequal_dimensionality(self):
-        array_a = np.array("abc")
-        array_b = np.array(["abc"])
-        array_c = np.array([["abc"]])
-        assert not array_equal(array_a, array_b)
-        assert not array_equal(array_b, array_a)
-        assert not array_equal(array_a, array_c)
-        assert not array_equal(array_b, array_c)
-
-    def test_string_arrays_0d_and_scalar(self):
-        array_a = np.array("foobar")
-        assert array_equal(array_a, "foobar")
-        assert not array_equal(array_a, "foo")
-        assert not array_equal(array_a, "foobar.")
-
-    def test_nan_equality_nan_ne_nan(self):
-        array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
-        array_b = array_a.copy()
-        assert not array_equal(array_a, array_a)
-        assert not array_equal(array_a, array_b)
-
-    def test_nan_equality_nan_naneq_nan(self):
-        array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
-        array_b = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
-        assert array_equal(array_a, array_a, withnans=True)
-        assert array_equal(array_a, array_b, withnans=True)
-
-    def test_nan_equality_nan_nanne_a(self):
-        array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
-        array_b = np.array([1.0, np.nan, 2.0, 0.0, 3.0])
-        assert not array_equal(array_a, array_b, withnans=True)
-
-    def test_nan_equality_a_nanne_b(self):
-        array_a = np.array([1.0, np.nan, 2.0, np.nan, 3.0])
-        array_b = np.array([1.0, np.nan, 2.0, np.nan, 4.0])
-        assert not array_equal(array_a, array_b, withnans=True)
+ARRAY1 = np.array(np.arange(24).reshape(2, 3, 4))
+ARRAY1[0, 1, 2] = 100  # now differs from the plain arange array at one point
+
+ARRAY2 = np.array([1.0, np.nan, 2.0, np.nan, 3.0])  # reused for `a is b` cases
+
+TEST_CASES = [  # each case is (array_a, array_b, withnans, eq)
+    # test empty
+    (np.array([]), np.array([]), False, True),
+    (np.array([]), np.array([], dtype=np.float64), True, True),
+    # test 0d
+    (np.array(23), np.array(23), False, True),
+    (np.array(23), np.array(7), False, False),
+    # test 0d and scalar
+    (np.array(23), 23, False, True),
+    (np.array(23), 45, False, False),
+    # test 1d and sequences
+    (np.array([1, 2, 3]), [1, 2, 3], False, True),
+    (np.array([1, 2, 3]), [1, 2], False, False),
+    (np.array([1, 45, 3]), [1, 2, 3], False, False),
+    (np.array([1, 2, 3]), (1, 2, 3), False, True),
+    (np.array([1, 2, 3]), (1, 2), False, False),
+    (np.array([1, 45, 3]), (1, 2, 3), False, False),
+    # test 3d
+    (
+        np.array(np.arange(24).reshape(2, 3, 4)),
+        np.array(np.arange(24).reshape(2, 3, 4)),
+        False,
+        True,
+    ),
+    (
+        np.array(np.arange(24).reshape(2, 3, 4)),
+        ARRAY1,
+        False,
+        False,
+    ),
+    # test masked is not ignored (masks match; only the hidden data differs)
+    (
+        ma.masked_array([1, 2, 3], mask=[1, 0, 1]),
+        ma.masked_array([2, 2, 2], mask=[1, 0, 1]),
+        False,
+        True,
+    ),
+    # test masked is different
+    (
+        ma.masked_array([1, 2, 3], mask=[1, 0, 1]),
+        ma.masked_array([1, 2, 3], mask=[0, 0, 1]),
+        False,
+        False,
+    ),
+    # test masked isn't unmasked
+    (
+        np.array([1, 2, 2]),
+        ma.masked_array([1, 2, 2], mask=[0, 0, 1]),
+        False,
+        False,
+    ),
+    (
+        ma.masked_array([1, 2, 2], mask=[0, 0, 1]),
+        ma.masked_array([1, 2, 2]),
+        False,
+        False,
+    ),
+    (
+        np.array([1, 2]),
+        ma.masked_array([1, 3], mask=[0, 1]),
+        False,
+        False,
+    ),
+    # test masked/unmasked equivalence
+    (
+        np.array([1, 2, 2]),
+        ma.masked_array([1, 2, 2]),
+        False,
+        True,
+    ),
+    (
+        np.array([1, 2, 2]),
+        ma.masked_array([1, 2, 2], mask=[0, 0, 0]),
+        False,
+        True,
+    ),
+    # test fully masked arrays
+    (
+        ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True),
+        ma.masked_array(np.arange(24).reshape(2, 3, 4), mask=True),
+        False,
+        True,
+    ),
+    # test fully masked 0d arrays
+    (
+        ma.masked_array(3, mask=True),
+        ma.masked_array(3, mask=True),
+        False,
+        True,
+    ),
+    # test fully masked string arrays
+    (
+        ma.masked_array(["a", "b", "c"], mask=True),
+        ma.masked_array(["a", "b", "c"], mask=[1, 1, 1]),
+        False,
+        True,
+    ),
+    # test partially masked string arrays
+    (
+        ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]),
+        ma.masked_array(["a", "b", "c"], mask=[1, 0, 1]),
+        False,
+        True,
+    ),
+    # test string arrays equal
+    (
+        np.array(["abc", "def", "efg"]),
+        np.array(["abc", "def", "efg"]),
+        False,
+        True,
+    ),
+    # test string arrays different contents
+    (
+        np.array(["abc", "def", "efg"]),
+        np.array(["abc", "de", "efg"]),
+        False,
+        False,
+    ),
+    # test string arrays subset
+    (
+        np.array(["abc", "def", "efg"]),
+        np.array(["abc", "def"]),
+        False,
+        False,
+    ),
+    (
+        np.array(["abc", "def"]),
+        np.array(["abc", "def", "efg"]),
+        False,
+        False,
+    ),
+    # test string arrays unequal dimensionality
+    (np.array("abc"), np.array(["abc"]), False, False),
+    (np.array(["abc"]), np.array("abc"), False, False),
+    (np.array("abc"), np.array([["abc"]]), False, False),
+    (np.array(["abc"]), np.array([["abc"]]), False, False),
+    # test string arrays 0d and scalar
+    (np.array("foobar"), "foobar", False, True),
+    (np.array("foobar"), "foo", False, False),
+    (np.array("foobar"), "foobar.", False, False),
+    # test nan equality: nan != nan when withnans is False
+    (ARRAY2, ARRAY2, False, False),
+    (ARRAY2, ARRAY2.copy(), False, False),
+    # test nan equality: nan == nan when withnans is True
+    (ARRAY2, ARRAY2, True, True),
+    (ARRAY2, ARRAY2.copy(), True, True),
+    # test nan equality: nan != number, even when withnans is True
+    (
+        np.array([1.0, np.nan, 2.0, np.nan, 3.0]),
+        np.array([1.0, np.nan, 2.0, 0.0, 3.0]),
+        True,
+        False,
+    ),
+    # test nan equality: differing numbers still differ when withnans is True
+    (
+        np.array([1.0, np.nan, 2.0, np.nan, 3.0]),
+        np.array([1.0, np.nan, 2.0, np.nan, 4.0]),
+        True,
+        False,
+    ),
+]
+
+
+@pytest.mark.parametrize("lazy", [False, True])
+@pytest.mark.parametrize("array_a,array_b,withnans,eq", TEST_CASES)
+def test_array_equal(array_a, array_b, withnans, eq, lazy):
+    if lazy:  # convert any ndarray inputs to Dask arrays
+        identical = array_a is array_b  # wrapping below would lose this identity
+        if isinstance(array_a, np.ndarray):
+            array_a = da.asarray(array_a, chunks=2)
+        if isinstance(array_b, np.ndarray):
+            array_b = da.asarray(array_b, chunks=1)  # chunked differently to array_a
+        if identical:
+            array_b = array_a  # restore identity for the `a is b` cases
+    assert eq == array_equal(array_a, array_b, withnans=withnans)
diff --git a/lib/iris/util.py b/lib/iris/util.py
index 94cb077a2f..14682314b0 100644
--- a/lib/iris/util.py
+++ b/lib/iris/util.py
@@ -387,14 +387,58 @@ def _rolling_window(array):
     return rw
 
 
-def array_equal(array1, array2, withnans=False):
+def _masked_array_equal(
+    array1: np.ndarray,
+    array2: np.ndarray,
+    equal_nan: bool,
+) -> np.ndarray:
+    """Return the elementwise equality of two, possibly masked, arrays.
+
+    If the masks differ, every element is reported as unequal.  Otherwise
+    masked points, and with `equal_nan` points that are NaN in both arrays,
+    count as equal.  The inputs, including their masks, are not modified.
+    """
+    mask1 = ma.getmask(array1)
+    mask2 = ma.getmask(array2)
+
+    # Compare mask equality.
+    if mask1 is ma.nomask and mask2 is ma.nomask:
+        eq = True
+    elif mask1 is ma.nomask:
+        eq = not mask2.any()
+    elif mask2 is ma.nomask:
+        eq = not mask1.any()
+    else:
+        eq = np.array_equal(mask1, mask2)
+
+    if not eq:
+        eqs = np.zeros(array1.shape, dtype=bool)
+    else:
+        # Compare data equality, ignoring masked data.
+        ignore = None if (mask1 is ma.nomask or mask2 is ma.nomask) else mask1
+
+        if equal_nan:
+            # Also ignore data that is np.nan in both arrays.  Combine with `|`
+            # rather than `|=`: `ignore` can be the actual mask of `array1`,
+            # which an in-place update would silently corrupt.
+            nanmask = np.isnan(ma.getdata(array1)) & np.isnan(ma.getdata(array2))
+            ignore = nanmask if ignore is None else ignore | nanmask
+
+        eqs = ma.getdata(array1) == ma.getdata(array2)
+        if ignore is not None:
+            eqs = np.where(ignore, True, eqs)
+
+    return eqs
+
+
+def array_equal(array1, array2, withnans: bool = False) -> bool:
     """Return whether two arrays have the same shape and elements.
 
     Parameters
     ----------
     array1, array2 : arraylike
         Args to be compared, normalised if necessary with :func:`np.asarray`.
-    withnans : bool, default=False
+    withnans : default=False
         When unset (default), the result is False if either input contains NaN
         points.  This is the normal floating-point arithmetic result.
         When set, return True if inputs contain the same value in all elements,
@@ -412,6 +456,9 @@ def array_equal(array1, array2, withnans=False):
     if withnans and (array1 is array2):
         return True
 
+    if withnans and not (array1.dtype.kind == "f" or array2.dtype.kind == "f"):
+        withnans = False
+
     def normalise_array(array):
         if not is_lazy_data(array):
             if not ma.isMaskedArray(array):
@@ -422,18 +469,25 @@ def normalise_array(array):
 
     eq = array1.shape == array2.shape
     if eq:
-        array1_masked = ma.is_masked(array1)
-        eq = array1_masked == ma.is_masked(array2)
-    if eq and array1_masked:
-        eq = np.array_equal(ma.getmaskarray(array1), ma.getmaskarray(array2))
-    if eq:
-        eqs = array1 == array2
-        if withnans and (array1.dtype.kind == "f" or array2.dtype.kind == "f"):
-            eqs = np.where(np.isnan(array1) & np.isnan(array2), True, eqs)
-        eq = np.all(eqs)
-        eq = bool(eq) or eq is ma.masked
+        if is_lazy_data(array1) or is_lazy_data(array2):
+            # Use a separate map and reduce operation to avoid running out of memory.
+            ndim = array1.ndim
+            indices = tuple(range(ndim))
+            eq = da.blockwise(
+                _masked_array_equal,
+                indices,
+                array1,
+                indices,
+                array2,
+                indices,
+                dtype=bool,
+                meta=np.empty((0,) * ndim, dtype=bool),
+                equal_nan=withnans,
+            ).all()
+        else:
+            eq = _masked_array_equal(array1, array2, equal_nan=withnans).all()
 
-    return eq
+    return bool(eq)
 
 
 def approx_equal(a, b, max_absolute_error=1e-10, max_relative_error=1e-10):