diff --git a/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt b/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt new file mode 100644 index 0000000000..274ff0de5d --- /dev/null +++ b/docs/iris/src/whatsnew/contributions_2.1/newfeature_2018-Mar-08_co_realise_cubes.txt @@ -0,0 +1,3 @@ +* Added new function :func:`iris.co_realise_cubes` to compute multiple lazy + values in a single operation, avoiding repeated re-loading of data or + re-calculation of expressions. diff --git a/lib/iris/__init__.py b/lib/iris/__init__.py index 4a9fa43800..24c205aefb 100644 --- a/lib/iris/__init__.py +++ b/lib/iris/__init__.py @@ -112,6 +112,7 @@ def callback(cube, field, filename): from iris._deprecation import IrisDeprecation, warn_deprecated import iris.fileformats import iris.io +from iris._lazy_data import co_realise_cubes try: @@ -127,7 +128,7 @@ def callback(cube, field, filename): __all__ = ['load', 'load_cube', 'load_cubes', 'load_raw', 'save', 'Constraint', 'AttributeConstraint', 'sample_data_path', 'site_configuration', 'Future', 'FUTURE', - 'IrisDeprecation'] + 'IrisDeprecation', 'co_realise_cubes'] Constraint = iris._constraints.Constraint diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py index 15dac90e7f..285aa8d9b1 100644 --- a/lib/iris/_lazy_data.py +++ b/lib/iris/_lazy_data.py @@ -1,4 +1,4 @@ -# (C) British Crown Copyright 2017, Met Office +# (C) British Crown Copyright 2017 - 2018, Met Office # # This file is part of Iris. # @@ -102,6 +102,39 @@ def as_lazy_data(data, chunks=None, asarray=False): return data +def _co_realise_lazy_arrays(arrays): + """ + Compute multiple lazy arrays and return a list of real values. + + All the arrays are computed together, so they can share results for common + graph elements. + + Casts all results with `np.asanyarray`, and converts any MaskedConstants + appearing into masked arrays, to ensure that all return values are + writeable NumPy array objects. + + Any non-lazy arrays are passed through, as they are by `da.compute`. + They undergo the same result standardisation. + + """ + computed_arrays = da.compute(*arrays) + results = [] + for lazy_in, real_out in zip(arrays, computed_arrays): + # Ensure we always have arrays. + # Note : in some cases dask (and numpy) will return a scalar + # numpy.int/numpy.float object rather than an ndarray. + # Recorded in https://github.com/dask/dask/issues/2111. + real_out = np.asanyarray(real_out) + if isinstance(real_out, ma.core.MaskedConstant): + # Convert any masked constants into NumPy masked arrays. + # NOTE: in this case, also apply the original lazy-array dtype, as + # masked constants *always* have dtype float64. + real_out = ma.masked_array(real_out.data, mask=real_out.mask, + dtype=lazy_in.dtype) + results.append(real_out) + return results + + def as_concrete_data(data): """ Return the actual content of a lazy array, as a numpy array. @@ -120,14 +153,7 @@ def as_concrete_data(data): """ if is_lazy_data(data): - # Realise dask array, ensuring the data result is always a NumPy array. - # In some cases dask may return a scalar numpy.int/numpy.float object - # rather than a numpy.ndarray object. - # Recorded in https://github.com/dask/dask/issues/2111. - dtype = data.dtype - data = np.asanyarray(data.compute()) - if isinstance(data, ma.core.MaskedConstant): - data = ma.masked_array(data.data, dtype=dtype, mask=data.mask) + data, = _co_realise_lazy_arrays([data]) return data @@ -158,3 +184,39 @@ def multidim_lazy_stack(stack): result = da.stack([multidim_lazy_stack(subarray) for subarray in stack]) return result + + +def co_realise_cubes(*cubes): + """ + Fetch 'real' data for multiple cubes, in a shared calculation. + + This computes any lazy data, equivalent to accessing each `cube.data`. + However, lazy calculations and data fetches can be shared between the + computations, improving performance. + + Args: + + * cubes (list of :class:`~iris.cube.Cube`): + Arguments, each of which is a cube to be realised. + + For example:: + + # Form stats. + a_std = cube_a.collapsed(['x', 'y'], iris.analysis.STD_DEV) + b_std = cube_b.collapsed(['x', 'y'], iris.analysis.STD_DEV) + ab_mean_diff = (cube_b - cube_a).collapsed(['x', 'y'], + iris.analysis.MEAN) + std_err = (a_std * a_std + b_std * b_std) ** 0.5 + + # Compute stats together (to avoid multiple data passes). + iris.co_realise_cubes(a_std, b_std, ab_mean_diff, std_err) + + + .. Note:: + + Cubes with non-lazy data may also be passed, with no ill effect. + + """ + results = _co_realise_lazy_arrays([cube.core_data() for cube in cubes]) + for cube, result in zip(cubes, results): + cube.data = result diff --git a/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py new file mode 100644 index 0000000000..03782cda85 --- /dev/null +++ b/lib/iris/tests/unit/lazy_data/test_co_realise_cubes.py @@ -0,0 +1,86 @@ +# (C) British Crown Copyright 2018, Met Office +# +# This file is part of Iris. +# +# Iris is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the +# Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Iris is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Iris. If not, see . +"""Test function :func:`iris._lazy data.co_realise_cubes`.""" + +from __future__ import (absolute_import, division, print_function) +from six.moves import (filter, input, map, range, zip) # noqa + +# Import iris.tests first so that some things can be initialised before +# importing anything else. +import iris.tests as tests + +from mock import MagicMock +import numpy as np + +from iris.cube import Cube +from iris._lazy_data import as_lazy_data + +from iris._lazy_data import co_realise_cubes + + +class ArrayAccessCounter(object): + def __init__(self, array): + self.dtype = array.dtype + self.shape = array.shape + self._array = array + self.access_count = 0 + + def __getitem__(self, keys): + self.access_count += 1 + return self._array[keys] + + +class Test_co_realise_cubes(tests.IrisTest): + def test_empty(self): + # Ensure that 'no args' case does not raise an error. + co_realise_cubes() + + def test_basic(self): + real_data = np.arange(3.) + cube = Cube(as_lazy_data(real_data)) + co_realise_cubes(cube) + self.assertFalse(cube.has_lazy_data()) + self.assertArrayAllClose(cube.core_data(), real_data) + + def test_multi(self): + real_data = np.arange(3.) + cube_base = Cube(as_lazy_data(real_data)) + cube_inner = cube_base + 1 + result_a = cube_base + 1 + result_b = cube_inner + 1 + co_realise_cubes(result_a, result_b) + # Check that target cubes were realised. + self.assertFalse(result_a.has_lazy_data()) + self.assertFalse(result_b.has_lazy_data()) + # Check that other cubes referenced remain lazy. + self.assertTrue(cube_base.has_lazy_data()) + self.assertTrue(cube_inner.has_lazy_data()) + + def test_combined_access(self): + wrapped_array = ArrayAccessCounter(np.arange(3.)) + lazy_array = as_lazy_data(wrapped_array) + derived_a = lazy_array + 1 + derived_b = lazy_array + 2 + cube_a = Cube(derived_a) + cube_b = Cube(derived_b) + co_realise_cubes(cube_a, cube_b) + # Though used twice, the source data should only get fetched once. + self.assertEqual(wrapped_array.access_count, 1) + + +if __name__ == '__main__': + tests.main()