diff --git a/docs/iris/src/whatsnew/contributions_2.2.0/newfeature_2018-Aug-14_load-nc-chunks.txt b/docs/iris/src/whatsnew/contributions_2.2.0/newfeature_2018-Aug-14_load-nc-chunks.txt new file mode 100644 index 0000000000..2a3780115d --- /dev/null +++ b/docs/iris/src/whatsnew/contributions_2.2.0/newfeature_2018-Aug-14_load-nc-chunks.txt @@ -0,0 +1 @@ +* NetCDF data variable chunk sizes are utilised at load time for significant performance improvements. \ No newline at end of file diff --git a/lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb b/lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb index 942450847b..f301f0fc9b 100644 --- a/lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb +++ b/lib/iris/fileformats/_pyke_rules/fc_rules_cf.krb @@ -1070,7 +1070,7 @@ fc_extras import iris.coord_systems import iris.fileformats.cf as cf import iris.fileformats.netcdf - from iris.fileformats.netcdf import parse_cell_methods, UnknownCellMethodWarning + from iris.fileformats.netcdf import _get_cf_var_data, parse_cell_methods, UnknownCellMethodWarning import iris.fileformats.pp as pp import iris.exceptions import iris.std_names @@ -1712,25 +1712,16 @@ fc_extras # Get units attr_units = get_attr_units(cf_coord_var, attributes) - def cf_var_as_array(cf_var): - dtype = iris.fileformats.netcdf._get_actual_dtype(cf_var) - fill_value = getattr(cf_var.cf_data, '_FillValue', - netCDF4.default_fillvals[dtype.str[1:]]) - proxy = iris.fileformats.netcdf.NetCDFDataProxy( - cf_var.shape, dtype, engine.filename, - cf_var.cf_name, fill_value) - return as_lazy_data(proxy) - # Get any coordinate point data. if isinstance(cf_coord_var, cf.CFLabelVariable): points_data = cf_coord_var.cf_label_data(cf_var) else: - points_data = cf_var_as_array(cf_coord_var) + points_data = _get_cf_var_data(cf_coord_var, engine.filename) # Get any coordinate bounds. 
cf_bounds_var = get_cf_bounds_var(cf_coord_var) if cf_bounds_var is not None: - bounds_data = cf_var_as_array(cf_bounds_var) + bounds_data = _get_cf_var_data(cf_bounds_var, engine.filename) # Handle transposed bounds where the vertex dimension is not # the last one. Test based on shape to support different @@ -1783,16 +1774,7 @@ fc_extras # Get units attr_units = get_attr_units(cf_cm_attr, attributes) - def cf_var_as_array(cf_var): - dtype = cf_var.dtype - fill_value = getattr(cf_var.cf_data, '_FillValue', - netCDF4.default_fillvals[dtype.str[1:]]) - proxy = iris.fileformats.netcdf.NetCDFDataProxy( - cf_var.shape, dtype, engine.filename, - cf_var.cf_name, fill_value) - return as_lazy_data(proxy) - - data = cf_var_as_array(cf_cm_attr) + data = _get_cf_var_data(cf_cm_attr, engine.filename) # Determine the name of the dimension/s shared between the CF-netCDF data variable # and the coordinate being built. diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 6ac9c26811..8c965ef1a5 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -501,8 +501,8 @@ def _get_actual_dtype(cf_var): return dummy_data.dtype -def _load_cube(engine, cf, cf_var, filename): - """Create the cube associated with the CF-netCDF data variable.""" +def _get_cf_var_data(cf_var, filename): + # Get lazy chunked data out of a cf variable. dtype = _get_actual_dtype(cf_var) # Create cube with deferred data, but no metadata @@ -510,7 +510,16 @@ def _load_cube(engine, cf, cf_var, filename): netCDF4.default_fillvals[cf_var.dtype.str[1:]]) proxy = NetCDFDataProxy(cf_var.shape, dtype, filename, cf_var.cf_name, fill_value) - data = as_lazy_data(proxy) + chunks = cf_var.cf_data.chunking() + # Chunks can be an iterable, None, or `'contiguous'`. 
+ if chunks == 'contiguous': + chunks = None + return as_lazy_data(proxy, chunks=chunks) + + +def _load_cube(engine, cf, cf_var, filename): + """Create the cube associated with the CF-netCDF data variable.""" + data = _get_cf_var_data(cf_var, filename) cube = iris.cube.Cube(data) # Reset the pyke inference engine. diff --git a/lib/iris/tests/unit/fileformats/netcdf/test__get_cf_var_data.py b/lib/iris/tests/unit/fileformats/netcdf/test__get_cf_var_data.py new file mode 100644 index 0000000000..6a1aeb9d61 --- /dev/null +++ b/lib/iris/tests/unit/fileformats/netcdf/test__get_cf_var_data.py @@ -0,0 +1,83 @@ +# (C) British Crown Copyright 2018, Met Office +# +# This file is part of Iris. +# +# Iris is free software: you can redistribute it and/or modify it under +# the terms of the GNU Lesser General Public License as published by the +# Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# Iris is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with Iris. If not, see <http://www.gnu.org/licenses/>. +"""Unit tests for the `iris.fileformats.netcdf._get_cf_var_data` function.""" + +from __future__ import (absolute_import, division, print_function) +from six.moves import (filter, input, map, range, zip) # noqa + +# Import iris.tests first so that some things can be initialised before +# importing anything else. 
+import iris.tests as tests + +from dask.array import Array as dask_array +import numpy as np + +from iris._lazy_data import _limited_shape +import iris.fileformats.cf +from iris.fileformats.netcdf import _get_cf_var_data +from iris.tests import mock + + +class Test__get_cf_var_data(tests.IrisTest): + def setUp(self): + self.filename = 'DUMMY' + self.shape = (3, 240, 200) + self.expected_chunks = _limited_shape(self.shape) + + def _make(self, chunksizes): + cf_data = mock.Mock(_FillValue=None) + cf_data.chunking = mock.MagicMock(return_value=chunksizes) + cf_var = mock.MagicMock(spec=iris.fileformats.cf.CFVariable, + dtype=np.dtype('i4'), + cf_data=cf_data, + cf_name='DUMMY_VAR', + shape=self.shape) + return cf_var + + def test_cf_data_type(self): + chunks = [1, 12, 100] + cf_var = self._make(chunks) + lazy_data = _get_cf_var_data(cf_var, self.filename) + self.assertIsInstance(lazy_data, dask_array) + + def test_cf_data_chunks(self): + chunks = [1, 12, 100] + cf_var = self._make(chunks) + lazy_data = _get_cf_var_data(cf_var, self.filename) + lazy_data_chunks = [c[0] for c in lazy_data.chunks] + self.assertArrayEqual(chunks, lazy_data_chunks) + + def test_cf_data_no_chunks(self): + # No chunks means chunks are calculated from the array's shape by + # `iris._lazy_data._limited_shape()`. + chunks = None + cf_var = self._make(chunks) + lazy_data = _get_cf_var_data(cf_var, self.filename) + lazy_data_chunks = [c[0] for c in lazy_data.chunks] + self.assertArrayEqual(lazy_data_chunks, self.expected_chunks) + + def test_cf_data_contiguous(self): + # Chunks 'contiguous' is equivalent to no chunks. 
+ chunks = 'contiguous' + cf_var = self._make(chunks) + lazy_data = _get_cf_var_data(cf_var, self.filename) + lazy_data_chunks = [c[0] for c in lazy_data.chunks] + self.assertArrayEqual(lazy_data_chunks, self.expected_chunks) + + +if __name__ == "__main__": + tests.main() diff --git a/lib/iris/tests/unit/fileformats/netcdf/test__load_cube.py b/lib/iris/tests/unit/fileformats/netcdf/test__load_cube.py index 1e38baac86..41b6ca5ff0 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test__load_cube.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test__load_cube.py @@ -57,6 +57,7 @@ def setUp(self): def _make(self, names, attrs): coords = [DimCoord(i, long_name=name) for i, name in enumerate(names)] + shape = (1,) cf_group = {} for name, cf_attrs in zip(names, attrs): @@ -64,12 +65,14 @@ def _make(self, names, attrs): cf_group[name] = mock.Mock(cf_attrs_unused=cf_attrs_unused) cf = mock.Mock(cf_group=cf_group) + cf_data = mock.Mock(_FillValue=None) + cf_data.chunking = mock.MagicMock(return_value=shape) cf_var = mock.MagicMock(spec=iris.fileformats.cf.CFVariable, dtype=np.dtype('i4'), - cf_data=mock.Mock(_FillValue=None), + cf_data=cf_data, cf_name='DUMMY_VAR', cf_group=coords, - shape=(1,)) + shape=shape) return cf, cf_var def test_flag_pass_thru(self): @@ -129,14 +132,17 @@ def setUp(self): self.valid_max = mock.sentinel.valid_max def _make(self, attrs): + shape = (1,) cf_attrs_unused = mock.Mock(return_value=attrs) + cf_data = mock.Mock(_FillValue=None) + cf_data.chunking = mock.MagicMock(return_value=shape) cf_var = mock.MagicMock(spec=iris.fileformats.cf.CFVariable, dtype=np.dtype('i4'), - cf_data=mock.Mock(_FillValue=None), + cf_data=cf_data, cf_name='DUMMY_VAR', cf_group=mock.Mock(), cf_attrs_unused=cf_attrs_unused, - shape=(1,)) + shape=shape) return cf_var def test_flag_pass_thru(self): diff --git a/lib/iris/tests/unit/fileformats/pyke_rules/compiled_krb/fc_rules_cf_fc/test_build_auxiliary_coordinate.py 
b/lib/iris/tests/unit/fileformats/pyke_rules/compiled_krb/fc_rules_cf_fc/test_build_auxiliary_coordinate.py index a796621cc0..4bb653f23f 100644 --- a/lib/iris/tests/unit/fileformats/pyke_rules/compiled_krb/fc_rules_cf_fc/test_build_auxiliary_coordinate.py +++ b/lib/iris/tests/unit/fileformats/pyke_rules/compiled_krb/fc_rules_cf_fc/test_build_auxiliary_coordinate.py @@ -40,11 +40,13 @@ class TestBoundsVertexDim(tests.IrisTest): def setUp(self): # Create coordinate cf variables and pyke engine. points = np.arange(6).reshape(2, 3) + + cf_data = self._make_cf_data(points) self.cf_coord_var = mock.Mock( spec=CFVariable, dimensions=('foo', 'bar'), cf_name='wibble', - cf_data=mock.Mock(), + cf_data=cf_data, standard_name=None, long_name='wibble', units='m', @@ -54,7 +56,8 @@ def setUp(self): self.engine = mock.Mock( cube=mock.Mock(), - cf_var=mock.Mock(dimensions=('foo', 'bar')), + cf_var=mock.Mock(dimensions=('foo', 'bar'), + cf_data=cf_data), filename='DUMMY', provides=dict(coordinates=[])) @@ -72,14 +75,21 @@ def patched__getitem__(proxy_self, keys): 'iris.fileformats.netcdf.NetCDFDataProxy.__getitem__', new=patched__getitem__) + @staticmethod + def _make_cf_data(vals): + cf_data = mock.Mock(_FillValue=None) + cf_data.chunking = mock.MagicMock(return_value=vals.shape) + return cf_data + def test_slowest_varying_vertex_dim(self): # Create the bounds cf variable. 
bounds = np.arange(24).reshape(4, 2, 3) + cf_data = self._make_cf_data(bounds) self.cf_bounds_var = mock.Mock( spec=CFVariable, dimensions=('nv', 'foo', 'bar'), cf_name='wibble_bnds', - cf_data=mock.Mock(), + cf_data=cf_data, shape=bounds.shape, dtype=bounds.dtype, __getitem__=lambda self, key: bounds[key]) @@ -116,11 +126,12 @@ def test_slowest_varying_vertex_dim(self): def test_fastest_varying_vertex_dim(self): bounds = np.arange(24).reshape(2, 3, 4) + cf_data = self._make_cf_data(bounds) self.cf_bounds_var = mock.Mock( spec=CFVariable, dimensions=('foo', 'bar', 'nv'), cf_name='wibble_bnds', - cf_data=mock.Mock(), + cf_data=cf_data, shape=bounds.shape, dtype=bounds.dtype, __getitem__=lambda self, key: bounds[key]) @@ -155,11 +166,12 @@ def test_fastest_with_different_dim_names(self): # which are 'foo' and 'bar' (as permitted by the cf spec), # this should still work because the vertex dim is the fastest varying. bounds = np.arange(24).reshape(2, 3, 4) + cf_data = self._make_cf_data(bounds) self.cf_bounds_var = mock.Mock( spec=CFVariable, dimensions=('x', 'y', 'nv'), cf_name='wibble_bnds', - cf_data=mock.Mock(), + cf_data=cf_data, shape=bounds.shape, dtype=bounds.dtype, __getitem__=lambda self, key: bounds[key]) @@ -194,11 +206,14 @@ class TestDtype(tests.IrisTest): def setUp(self): # Create coordinate cf variables and pyke engine. points = np.arange(6).reshape(2, 3) + cf_data = mock.Mock(_FillValue=None) + cf_data.chunking = mock.MagicMock(return_value=points.shape) + self.cf_coord_var = mock.Mock( spec=CFVariable, dimensions=('foo', 'bar'), cf_name='wibble', - cf_data=mock.Mock(), + cf_data=cf_data, standard_name=None, long_name='wibble', units='m',