diff --git a/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt b/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
new file mode 100644
index 0000000000..389209ae7e
--- /dev/null
+++ b/docs/iris/src/whatsnew/contributions_2.4.0/bugfix_2020-Feb-14_pp_emptyslices.txt
@@ -0,0 +1,5 @@
+* Fixed a problem which was causing file loads to fetch *all* field data
+  whenever UM files (PP or Fieldsfiles) were loaded.
+  With large source files, this made initial loads slow, with large memory
+  usage before any cube data was even fetched, and a large enough file
+  could cause a crash.  The problem occurs only with Dask versions >= 2.0.
diff --git a/lib/iris/fileformats/pp.py b/lib/iris/fileformats/pp.py
index 6cc54a61aa..199cc5130f 100644
--- a/lib/iris/fileformats/pp.py
+++ b/lib/iris/fileformats/pp.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2010 - 2019, Met Office
+# (C) British Crown Copyright 2010 - 2020, Met Office
 #
 # This file is part of Iris.
 #
@@ -52,7 +52,7 @@
     LBPROC_MAP as lbproc_map)
 import iris.fileformats.rules
 import iris.coord_systems
-
+from iris.util import _array_slice_ifempty
 
 try:
     import mo_pack
@@ -644,16 +644,22 @@ def ndim(self):
         return len(self.shape)
 
     def __getitem__(self, keys):
-        with open(self.path, 'rb') as pp_file:
-            pp_file.seek(self.offset, os.SEEK_SET)
-            data_bytes = pp_file.read(self.data_len)
-            data = _data_bytes_to_shaped_array(data_bytes,
-                                               self.lbpack,
-                                               self.boundary_packing,
-                                               self.shape, self.src_dtype,
-                                               self.mdi)
-        data = data.__getitem__(keys)
-        return np.asanyarray(data, dtype=self.dtype)
+        # Check for 'empty' slicings, in which case don't fetch the data:
+        # since Dask v2, 'dask.array.from_array' performs an empty slicing
+        # to snapshot metadata, and we must not fetch the data at that time.
+        result = _array_slice_ifempty(keys, self.shape, self.dtype)
+        if result is None:
+            with open(self.path, 'rb') as pp_file:
+                pp_file.seek(self.offset, os.SEEK_SET)
+                data_bytes = pp_file.read(self.data_len)
+                data = _data_bytes_to_shaped_array(data_bytes,
+                                                   self.lbpack,
+                                                   self.boundary_packing,
+                                                   self.shape, self.src_dtype,
+                                                   self.mdi)
+            result = data.__getitem__(keys)
+
+        return np.asanyarray(result, dtype=self.dtype)
 
     def __repr__(self):
         fmt = '<{self.__class__.__name__} shape={self.shape}' \
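
The pp.py hunk above is the heart of the fix: PPDataProxy.__getitem__ now returns a
zero-size result directly, without touching the file, whenever the requested slicing
is empty. The following is a minimal, self-contained sketch of that deferred-fetch
pattern, assuming Dask >= 2.0 and numpy are installed; the names 'FileBackedArray'
and '_expensive_read' are illustrative stand-ins, not the real Iris classes.

import dask.array as da
import numpy as np


class FileBackedArray(object):
    """Illustrative stand-in for PPDataProxy-style deferred loading."""

    def __init__(self, shape, dtype):
        self.shape = shape
        self.dtype = np.dtype(dtype)
        self.ndim = len(shape)
        self.read_count = 0  # Counts 'real' data fetches.

    def _expensive_read(self):
        # Stands in for opening the file and unpacking the field data.
        self.read_count += 1
        return np.zeros(self.shape, dtype=self.dtype)

    def __getitem__(self, keys):
        keys_tuple = keys if isinstance(keys, tuple) else (keys,)
        if any(key == slice(0, 0) for key in keys_tuple):
            # Empty slicing: return a correctly-shaped zero-size result,
            # with no file access, as 'dask.array.from_array' probes with
            # a [0:0, 0:0, ...] slice to snapshot array metadata.
            shape = list(self.shape)
            for i_dim, key in enumerate(keys_tuple):
                if key == slice(0, 0):
                    shape[i_dim] = 0
            return np.zeros(shape, dtype=self.dtype)[keys]
        return self._expensive_read()[keys]


proxy = FileBackedArray((1000, 1000), np.float32)
lazy = da.from_array(proxy, chunks=(1000, 1000))
assert proxy.read_count == 0      # The metadata probe did not read the file.
section = lazy[:2, :2].compute()  # Only now is the data actually fetched.
assert proxy.read_count == 1

Without the empty-slice check, the metadata probe in 'from_array' would trigger a
full read of every field at load time, which is exactly the problem described in the
whatsnew entry above.
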
diff --git a/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py b/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
index 8dc21f288f..9faa3bd161 100644
--- a/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
+++ b/lib/iris/tests/unit/fileformats/pp/test_PPDataProxy.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2014 - 2019, Met Office
+# (C) British Crown Copyright 2014 - 2020, Met Office
 #
 # This file is part of Iris.
 #
@@ -19,10 +19,14 @@
 from __future__ import (absolute_import, division, print_function)
 from six.moves import (filter, input, map, range, zip)  # noqa
 
+import six
+
 # Import iris.tests first so that some things can be initialised before
 # importing anything else.
 import iris.tests as tests
 
+import numpy as np
+
 from iris.fileformats.pp import PPDataProxy, SplittableInt
 from iris.tests import mock
 
@@ -35,7 +39,7 @@ def test_lbpack_SplittableInt(self):
         self.assertEqual(proxy.lbpack, lbpack)
         self.assertIs(proxy.lbpack, lbpack)
 
-    def test_lnpack_raw(self):
+    def test_lbpack_raw(self):
         lbpack = 4321
         proxy = PPDataProxy(None, None, None, None,
                             None, lbpack, None, None)
@@ -48,5 +52,114 @@ def test_lnpack_raw(self):
         self.assertEqual(proxy.lbpack.n4, lbpack // 1000 % 10)
 
 
+class SliceTranslator():
+    """
+    Class to translate an array-indexing expression into a tuple of keys.
+
+    An instance just returns the argument of its __getitem__ call.
+
+    """
+    def __getitem__(self, keys):
+        return keys
+
+
+# A multidimensional-indexable object that returns its index keys, so we can
+# use multidimensional-indexing notation to specify a slicing expression.
+Slices = SliceTranslator()
+
+
+class Test__getitem__slicing(tests.IrisTest):
+    def _check_slicing(self, test_shape, indices, result_shape,
+                       data_was_fetched=True):
+        # Check behaviour of the getitem call with specific slicings.
+        # Especially: check cases where a fetch does *not* read from the file.
+        # This is necessary because, since Dask 2.0, the "from_array" function
+        # takes a zero-length slice of its array argument, to capture array
+        # metadata, and in those cases we want to avoid file access.
+        test_dtype = np.dtype(np.float32)
+        proxy = PPDataProxy(shape=test_shape, src_dtype=test_dtype,
+                            path=None, offset=None, data_len=None,
+                            lbpack=0,  # Note: a 'real' value is needed.
+                            boundary_packing=None, mdi=None)
+
+        # Mock out the file-open call, to see if the file would be read.
+        if six.PY2:
+            builtin_open_func_name = '__builtin__.open'
+        else:
+            builtin_open_func_name = 'builtins.open'
+        mock_fileopen = self.patch(builtin_open_func_name)
+
+        # Also mock out the '_data_bytes_to_shaped_array' call, to fake
+        # minimal operation in the cases where file-open *does* get called.
+        fake_data = np.zeros(test_shape, dtype=test_dtype)
+        self.patch('iris.fileformats.pp._data_bytes_to_shaped_array',
+                   mock.MagicMock(return_value=fake_data))
+
+        # Test the requested indexing operation.
+        result = proxy.__getitem__(indices)
+
+        # Check the behaviour and results were as expected.
+        self.assertEqual(mock_fileopen.called, data_was_fetched)
+        self.assertIsInstance(result, np.ndarray)
+        self.assertEqual(result.dtype, test_dtype)
+        self.assertEqual(result.shape, result_shape)
+
+    def test_slicing_1d_normal(self):
+        # A 'normal' 1d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[1:10],
+            result_shape=(2,),
+            data_was_fetched=True)
+
+    def test_slicing_1d_empty(self):
+        # A 1d testcase with an empty slicing.
+        self._check_slicing(
+            test_shape=(3,),
+            indices=Slices[0:0],
+            result_shape=(0,),
+            data_was_fetched=False)
+
+    def test_slicing_2d_normal(self):
+        # A 2d testcase with no empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[2, :3],
+            result_shape=(3,),
+            data_was_fetched=True)
+
+    def test_slicing_2d_allempty(self):
+        # A 2d testcase with all empty slices.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0, 0:0],
+            result_shape=(0, 0),
+            data_was_fetched=False)
+
+    def test_slicing_2d_empty_dim0(self):
+        # A 2d testcase with an empty slice on the first dimension only.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[0:0],
+            result_shape=(0, 4),
+            data_was_fetched=False)
+
+    def test_slicing_2d_empty_dim1(self):
+        # A 2d testcase with an empty slice, and an integer index.
+        self._check_slicing(
+            test_shape=(3, 4),
+            indices=Slices[1, 0:0],
+            result_shape=(0,),
+            data_was_fetched=False)
+
+    def test_slicing_complex(self):
+        # Multiple dimensions with multiple empty slices.
+        self._check_slicing(
+            test_shape=(3, 4, 2, 5, 6, 3, 7),
+            indices=Slices[1:3, 2, 0:0, :, 1:1, :100],
+            result_shape=(2, 0, 5, 0, 3, 7),
+            data_was_fetched=False)
+
+
 if __name__ == '__main__':
     tests.main()
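
Two details of these tests are worth calling out: 'SliceTranslator' turns ordinary
multidimensional-indexing notation into a plain tuple of keys that can be passed
around as a test parameter, and patching the builtin 'open' ('builtins.open', or
'__builtin__.open' on Python 2) lets each case assert whether the slicing touched
the file at all. The slice-translator idiom works standalone; a small sketch,
independent of Iris:

import numpy as np


class SliceTranslator(object):
    """Return the key(s) from any indexing expression applied to it."""
    def __getitem__(self, keys):
        return keys


Slices = SliceTranslator()

# Multidimensional indexing notation becomes an ordinary tuple of keys...
keys = Slices[1:3, 2, 0:0]
print(keys)  # (slice(1, 3, None), 2, slice(0, 0, None))

# ...which can be stored and applied to any array later on.
data = np.arange(24).reshape(2, 3, 4)
print(data[Slices[0, :, 1:3]].shape)  # (3, 2)
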
diff --git a/lib/iris/util.py b/lib/iris/util.py
index 19d50ebc62..19e375db64 100644
--- a/lib/iris/util.py
+++ b/lib/iris/util.py
@@ -1,4 +1,4 @@
-# (C) British Crown Copyright 2010 - 2019, Met Office
+# (C) British Crown Copyright 2010 - 2020, Met Office
 #
 # This file is part of Iris.
 #
@@ -915,6 +915,67 @@ def __lt__(self, other):
             return NotImplemented
 
 
+def _array_slice_ifempty(keys, shape, dtype):
+    """
+    Detect cases where an array slice will contain no data, because it has
+    a zero-length dimension, and produce an equivalent result for those cases.
+
+    The function signals 'empty' slicing cases by returning an array equal
+    to the slice result in those cases.
+
+    Args:
+
+    * keys (indexing key, or tuple of keys):
+        The argument from an array __getitem__ call.
+        Only tuples of integers and slices are supported, in particular no
+        newaxis, ellipsis or array keys.
+        These are the types of array access usage we expect from Dask.
+    * shape (tuple of int):
+        The shape of the array being indexed.
+    * dtype (numpy.dtype):
+        The dtype of the array being indexed.
+
+    Returns:
+        result (np.ndarray or None):
+            If 'keys' contains a slice(0, 0), this is an ndarray of the
+            correct resulting shape and provided dtype.
+            Otherwise it is None.
+
+    .. note::
+
+        This is used to prevent DataProxy arraylike objects from fetching
+        their file data when wrapped as Dask arrays.
+        This is because, for Dask >= 2.0, the "dask.array.from_array" call
+        performs a fetch like [0:0, 0:0, ...], to 'snapshot' array metadata.
+        This function enables us to avoid triggering a file data fetch in
+        those cases: this is consistent, because the result will not contain
+        any actual data content.
+
+    """
+    # Convert a single key into a 1-tuple, so we always have a tuple of keys.
+    if isinstance(keys, tuple):
+        keys_tuple = keys
+    else:
+        keys_tuple = (keys,)
+
+    if any(key == slice(0, 0) for key in keys_tuple):
+        # An 'empty' slice is present: return a 'fake' array instead.
+        target_shape = list(shape)
+        for i_dim, key in enumerate(keys_tuple):
+            if key == slice(0, 0):
+                # Reduce dims with empty slicing to length 0.
+                target_shape[i_dim] = 0
+        # Create a prototype result: no memory usage, as some dims are 0.
+        result = np.zeros(target_shape, dtype=dtype)
+        # Index with original keys to produce the desired result shape.
+        # Note: also ok in 0-length dims, as the slice is always '0:0'.
+        result = result[keys]
+    else:
+        result = None
+
+    return result
+
+
 def create_temp_filename(suffix=''):
     """Return a temporary file name.
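
Since the hunk above contains the complete helper, its contract is easy to
demonstrate. A short usage sketch follows; note that '_array_slice_ifempty' is a
private Iris utility, so importing it as below is for illustration only.

import numpy as np

from iris.util import _array_slice_ifempty

shape, dtype = (3, 4), np.dtype(np.float32)

# No empty slice present: returns None, so the caller must fetch the data.
print(_array_slice_ifempty((slice(1, 3),), shape, dtype))  # None

# An empty slice present: a zero-size array of the right shape, no fetch.
print(_array_slice_ifempty((slice(0, 0),), shape, dtype).shape)  # (0, 4)

# Integer keys combine with empty slices just as in numpy indexing.
print(_array_slice_ifempty((1, slice(0, 0)), shape, dtype).shape)  # (0,)

This also shows why returning the zero-size result is safe: it matches the shape
and dtype that Dask's metadata probe expects, while containing no actual data.
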