Merged

26 commits
21b17c2
Implemented support for netcdf data packing
Oct 13, 2016
872a8bc
Fixed some bugs in netcdf packing implementation.
Oct 13, 2016
821d55f
Removed test files.
Oct 13, 2016
3e19c7a
New output test files for packing.
jswanljung Oct 13, 2016
a224c88
Removed an inadvertent change to .gitignore
jswanljung Oct 13, 2016
851b6c7
Made changes after review.
jswanljung Oct 14, 2016
4496a04
python 2/3 compatibility fix for zip_longest
Oct 14, 2016
6775972
Tests failing due to six.moves change.
Oct 14, 2016
430f81e
Fix for inadvertent lazy_data loading.
Oct 14, 2016
0f2998a
pep violation fix
Oct 14, 2016
3089dc5
Fixed problems with masked and lazy_data
Oct 14, 2016
46f732a
Fixed a bug in zip_longest arg
Oct 14, 2016
3ac9861
Fixed bug with packing specification check for iterable.
Oct 14, 2016
4109433
Packing specification checks now handling None correctly.
Oct 14, 2016
700de12
Moved packing tests to integration.
Oct 14, 2016
a15fc0b
Addressed bogus pep violations
Oct 14, 2016
cc4a836
Replaced unicode with six.text_type
Oct 14, 2016
6e0a7a6
Removed unnecessary test for str.
Oct 14, 2016
5434785
Reverted inadvertent deleted line break in travis.yml
Oct 14, 2016
cef2b8f
Turns out str test was necessary after all.
Oct 14, 2016
9a2d2df
Put test_netcdf.py imports in alphabetical order.
Oct 14, 2016
e367b93
Added what's new.
Oct 15, 2016
29df858
added what's new entry
Oct 19, 2016
41766ae
Renamed what's new entry to conform to spec
Oct 19, 2016
5f08ab8
Actually set datatype so packing will occur. Replaced CDL files to re…
Oct 19, 2016
aba9510
Removed an extra 'a' that snuck into a docstring.
Oct 19, 2016
@@ -0,0 +1 @@
* Support for saving to netCDF with data packing has been added.
162 changes: 148 additions & 14 deletions lib/iris/fileformats/netcdf.py
@@ -26,9 +26,11 @@

from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa
from six.moves import zip_longest # Previous line may not be tampered with!
import six

import collections
from itertools import repeat
import os
import os.path
import re
@@ -806,7 +808,7 @@ def __exit__(self, type, value, traceback):
def write(self, cube, local_keys=None, unlimited_dimensions=None,
zlib=False, complevel=4, shuffle=True, fletcher32=False,
contiguous=False, chunksizes=None, endian='native',
least_significant_digit=None):
least_significant_digit=None, packing=None):
"""
Wrapper for saving cubes to a NetCDF file.

@@ -885,6 +887,23 @@ def write(self, cube, local_keys=None, unlimited_dimensions=None,
place in unpacked data that is a reliable value". Default is
`None`, or no quantization, or 'lossless' compression.

* packing (type or string or dict or list): A numpy integer datatype
(signed or unsigned) or a string that describes a numpy integer
dtype (e.g. 'i2', 'short', 'u4') or a dict of packing parameters as
described below. This provides support for netCDF data packing as
described in
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values
If this argument is a type (or type string), appropriate values of
scale_factor and add_offset will be automatically calculated based
on `cube.data` and possible masking. For masked data, `fill_value`
is taken from netCDF4.default_fillvals. For more control, pass a
dict with one or more of the following keys: `dtype` (required),
`scale_factor`, `add_offset`, and `fill_value`. Note that automatic
calculation of packing parameters will trigger loading of lazy
data; set them manually using a dict to avoid this. The default is
`None`, in which case the datatype is determined from the cube and
no packing will occur.

Returns:
None.
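The automatic calculation described for the `packing` keyword above can be summarised with a small standalone sketch. It is illustrative only (not part of the Iris API), assumes an unmasked array and a signed 'i2' target dtype, and mirrors the formulae used in `_create_cf_data_variable` further down in this diff:

```python
import numpy as np

# Illustrative, unmasked data; a real cube.data array would be used instead.
data = np.linspace(250.0, 310.0, 1000)
dtype = np.dtype('i2')            # signed 16-bit target, as in packing='i2'

cmin, cmax = data.min(), data.max()
n = dtype.itemsize * 8            # bits available in the packed type

# Unmasked case: spread the data range over all 2**n integer values.
scale_factor = (cmax - cmin) / (2 ** n - 1)
# Signed and unmasked: offset so that cmin maps to the most negative integer.
add_offset = cmin + 2 ** (n - 1) * scale_factor

packed = np.round((data - add_offset) / scale_factor).astype(dtype)
unpacked = packed * scale_factor + add_offset
# Round-trip error is roughly bounded by half of one quantisation step.
assert np.abs(unpacked - data).max() <= scale_factor
```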

@@ -932,7 +951,7 @@ def write(cube, local_keys=None, unlimited_dimensions=None,
cube, dimension_names, local_keys, zlib=zlib, complevel=complevel,
shuffle=shuffle, fletcher32=fletcher32, contiguous=contiguous,
chunksizes=chunksizes, endian=endian,
least_significant_digit=least_significant_digit)
least_significant_digit=least_significant_digit, packing=packing)

# Add coordinate variables.
self._add_dim_coords(cube, dimension_names)
@@ -1833,36 +1852,98 @@ def _create_cf_data_variable(self, cube, dimension_names, local_keys=None,
The newly created CF-netCDF data variable.

"""
if 'packing' in kwargs:
packing = kwargs.pop('packing')
if packing:
scale_factor = None
add_offset = None
fill_value = None
if isinstance(packing, dict):
if 'dtype' not in packing:
msg = "The dtype attribute is required for packing."
raise ValueError(msg)
dtype = np.dtype(packing['dtype'])
if 'scale_factor' in packing:
scale_factor = packing['scale_factor']
if 'add_offset' in packing:
add_offset = packing['add_offset']
if 'fill_value' in packing:
fill_value = packing['fill_value']
else:
masked = ma.isMaskedArray(cube.data)
dtype = np.dtype(packing)
cmax = cube.data.max()
cmin = cube.data.min()
n = dtype.itemsize * 8
if masked:
scale_factor = (cmax - cmin)/(2**n-2)
else:
scale_factor = (cmax - cmin)/(2**n-1)
if dtype.kind == 'u':
add_offset = cmin
elif dtype.kind == 'i':
if masked:
add_offset = (cmax + cmin)/2
else:
add_offset = cmin + 2**(n-1)*scale_factor
needs_fill_value = (cube.has_lazy_data() or
ma.isMaskedArray(cube.data))
if needs_fill_value and fill_value is None:
dtstr = dtype.str[1:]
fill_value = netCDF4.default_fillvals[dtstr]
else:
packing = None

def set_packing_ncattrs(cfvar):
"""Set netCDF packing attributes."""
if packing:
if scale_factor:
_setncattr(cfvar, 'scale_factor', scale_factor)
if add_offset:
_setncattr(cfvar, 'add_offset', add_offset)

cf_name = self._get_cube_variable_name(cube)
while cf_name in self._dataset.variables:
cf_name = self._increment_name(cf_name)

# if netcdf3 avoid streaming due to dtype handling
if (not cube.has_lazy_data() or
self._dataset.file_format in ('NETCDF3_CLASSIC',
'NETCDF3_64BIT')):
# Determine whether there is a cube MDI value.
fill_value = None
if isinstance(cube.data, ma.core.MaskedArray):
fill_value = cube.data.fill_value
self._dataset.file_format in ('NETCDF3_CLASSIC',
'NETCDF3_64BIT')):

if packing is None:
# Determine whether there is a cube MDI value.
if ma.isMaskedArray(cube.data):
fill_value = cube.data.fill_value
else:
fill_value = None

# Get the values in a form which is valid for the file format.
data = self._ensure_valid_dtype(cube.data, 'cube', cube)

if packing is None:
dtype = data.dtype.newbyteorder('=')

# Create the cube CF-netCDF data variable with data payload.
cf_var = self._dataset.createVariable(
cf_name, data.dtype.newbyteorder('='), dimension_names,
cf_name, dtype, dimension_names,
fill_value=fill_value, **kwargs)
set_packing_ncattrs(cf_var)
cf_var[:] = data

else:
# Create the cube CF-netCDF data variable.
# Explicitly assign the fill_value, which will be the type default
# in the case of an unmasked array.
if packing is None:
fill_value = cube.lazy_data().fill_value
dtype = cube.lazy_data().dtype.newbyteorder('=')

cf_var = self._dataset.createVariable(
cf_name, cube.lazy_data().dtype.newbyteorder('='),
dimension_names, fill_value=cube.lazy_data().fill_value,
cf_name, dtype,
dimension_names, fill_value=fill_value,
**kwargs)
set_packing_ncattrs(cf_var)
# stream the data
biggus.save([cube.lazy_data()], [cf_var], masked=True)

@@ -1951,7 +2032,7 @@ def _increment_name(self, varname):
def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
unlimited_dimensions=None, zlib=False, complevel=4, shuffle=True,
fletcher32=False, contiguous=False, chunksizes=None, endian='native',
least_significant_digit=None):
least_significant_digit=None, packing=None):
"""
Save cube(s) to a netCDF file, given the cube and the filename.

@@ -2048,6 +2129,24 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
in unpacked data that is a reliable value". Default is `None`, or no
quantization, or 'lossless' compression.

* packing (type or string or dict or list): A numpy integer datatype
(signed or unsigned) or a string that describes a numpy integer dtype
(e.g. 'i2', 'short', 'u4') or a dict of packing parameters as described
below or an iterable of such types, strings, or dicts.
This provides support for netCDF data packing as described in
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values
If this argument is a type (or type string), appropriate values of
scale_factor and add_offset will be automatically calculated based on
`cube.data` and possible masking. For masked data, `fill_value` is
taken from netCDF4.default_fillvals. For more control, pass a dict with
one or more of the following keys: `dtype` (required), `scale_factor`,
`add_offset`, and `fill_value`. To save multiple cubes with different
packing parameters, pass an iterable of types, strings, dicts, or
`None`, one for each cube. Note that automatic calculation of packing
parameters will trigger loading of lazy data; set them manually using a
dict to avoid this. The default is `None`, in which case the datatype
is determined from the cube and no packing will occur.

Returns:
None.
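A hedged usage sketch of the new `packing` keyword described above. The file names, the three-cube assumption, and the explicit scale/offset values are purely illustrative:

```python
import iris

# Hypothetical paths, for illustration only.
cubes = iris.load('input_fields.pp')   # assume this yields three cubes

# Pack every cube identically: 16-bit signed integers with automatically
# calculated scale_factor and add_offset (this loads any lazy data).
iris.save(cubes, 'packed_auto.nc', packing='i2')

# Per-cube control: a dict fixes the parameters explicitly (so lazy data is
# left untouched), None skips packing, and 'u2' packs into unsigned 16-bit.
packspec = [dict(dtype='i2', scale_factor=0.01, add_offset=273.15),
            None,
            'u2']
iris.save(cubes, 'packed_mixed.nc', packing=packspec)
```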

@@ -2102,13 +2201,48 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
common_keys.difference_update(different_value_keys)
local_keys.update(different_value_keys)

def is_valid_packspec(p):
""" Only checks that the datatype is valid. """
if isinstance(p, dict):
if 'dtype' in p:
return is_valid_packspec(p['dtype'])
else:
msg = "The argument to packing must contain the key 'dtype'."
raise ValueError(msg)
elif (isinstance(p, six.text_type) or isinstance(p, type) or
isinstance(p, str)):
pdtype = np.dtype(p) # Does nothing if it's already a numpy dtype
if pdtype.kind != 'i' and pdtype.kind != 'u':
msg = "The packing datatype must be a numpy integer type."
raise ValueError(msg)
return True
elif p is None:
return True
else:
return False

if is_valid_packspec(packing):
packspecs = repeat(packing)
else:
# Assume iterable, make sure packing is the same length as cubes.
for cube, packspec in zip_longest(cubes, packing, fillvalue=-1):
if cube == -1 or packspec == -1:
msg = ('If packing is a list, it must have the '
'same number of elements as the argument to '
'cube.')
raise ValueError(msg)
if not is_valid_packspec(packspec):
msg = ('Invalid packing argument: {}.'.format(packspec))
raise ValueError(msg)
packspecs = packing

# Initialise Manager for saving
with Saver(filename, netcdf_format) as sman:
# Iterate through the cubelist.
for cube in cubes:
for cube, packspec in zip(cubes, packspecs):
sman.write(cube, local_keys, unlimited_dimensions, zlib, complevel,
shuffle, fletcher32, contiguous, chunksizes, endian,
least_significant_digit)
least_significant_digit, packing=packspec)

conventions = CF_CONVENTIONS_VERSION

117 changes: 117 additions & 0 deletions lib/iris/tests/integration/test_netcdf.py
@@ -24,12 +24,14 @@
import iris.tests as tests

from contextlib import contextmanager
from itertools import repeat
import os.path
import shutil
import tempfile
import warnings

import numpy as np
import numpy.ma as ma

import iris
from iris.coords import CellMethod
@@ -309,5 +311,120 @@ def test_load_laea_grid(self):
self.assertCML(cube, ('netcdf', 'netcdf_laea.cml'))


def _get_scale_factor_add_offset(cube, datatype):
"""Utility function used by netCDF data packing tests."""
if isinstance(datatype, dict):
dt = np.dtype(datatype['dtype'])
else:
dt = np.dtype(datatype)
cmax = cube.data.max()
cmin = cube.data.min()
n = dt.itemsize * 8
if ma.isMaskedArray(cube.data):
masked = True
else:
masked = False
if masked:
scale_factor = (cmax - cmin)/(2**n-2)
else:
scale_factor = (cmax - cmin)/(2**n-1)
if dt.kind == 'u':
add_offset = cmin
elif dt.kind == 'i':
if masked:
add_offset = (cmax + cmin)/2
else:
add_offset = cmin + 2**(n-1)*scale_factor
return (scale_factor, add_offset)


@tests.skip_data
class TestPackedData(tests.IrisTest):
def _single_test(self, datatype, CDLfilename, manual=False):
# Read PP input file.
file_in = tests.get_data_path(
('PP', 'cf_processing',
'000003000000.03.236.000128.1990.12.01.00.00.b.pp'))
cube = iris.load_cube(file_in)
scale_factor, offset = _get_scale_factor_add_offset(cube, datatype)
if manual:
packspec = dict(dtype=datatype, scale_factor=scale_factor,
add_offset=offset)
else:
packspec = datatype
# Write Cube to netCDF file.
with self.temp_filename(suffix='.nc') as file_out:
iris.save(cube, file_out, packing=packspec)
decimal = int(-np.log10(scale_factor))
packedcube = iris.load_cube(file_out)
# Check that packed cube is accurate to expected precision
self.assertArrayAlmostEqual(cube.data, packedcube.data,
decimal=decimal)
# Check the netCDF file against CDL expected output.
self.assertCDL(file_out, ('integration', 'netcdf',
'TestPackedData', CDLfilename))

def test_single_packed_signed(self):
"""Test saving a single CF-netCDF file with packing."""
self._single_test('i2', 'single_packed_signed.cdl')

def test_single_packed_unsigned(self):
"""Test saving a single CF-netCDF file with packing into unsigned. """
self._single_test('u1', 'single_packed_unsigned.cdl')

def test_single_packed_manual_scale(self):
"""Test saving a single CF-netCDF file with packing with scale
factor and add_offset set manually."""
self._single_test('i2', 'single_packed_manual.cdl', manual=True)

def _multi_test(self, CDLfilename, multi_dtype=False):
"""Test saving multiple packed cubes with pack_dtype list."""
# Read PP input file.
file_in = tests.get_data_path(('PP', 'cf_processing',
'abcza_pa19591997_daily_29.b.pp'))
cubes = iris.load(file_in)
# ensure cube order is the same:
cubes.sort(key=lambda cube: cube.cell_methods[0].method)
datatype = 'i2'
scale_factor, offset = _get_scale_factor_add_offset(cubes[0],
datatype)
if multi_dtype:
packdict = dict(dtype=datatype, scale_factor=scale_factor,
add_offset=offset)
packspec = [packdict, None, 'u2']
dtypes = packspec
else:
packspec = datatype
dtypes = repeat(packspec)

# Write Cube to netCDF file.
with self.temp_filename(suffix='.nc') as file_out:
iris.save(cubes, file_out, packing=packspec)
# Check the netCDF file against CDL expected output.
self.assertCDL(file_out, ('integration', 'netcdf',
'TestPackedData', CDLfilename))
packedcubes = iris.load(file_out)
packedcubes.sort(key=lambda cube: cube.cell_methods[0].method)
for cube, packedcube, dtype in zip(cubes, packedcubes, dtypes):
if dtype:
sf, ao = _get_scale_factor_add_offset(cube, dtype)
decimal = int(-np.log10(sf))
# Check that packed cube is accurate to expected precision
self.assertArrayAlmostEqual(cube.data, packedcube.data,
decimal=decimal)
else:
self.assertArrayEqual(cube.data, packedcube.data)

def test_multi_packed_single_dtype(self):
"""Test saving multiple packed cubes with the same pack_dtype."""
# Read PP input file.
self._multi_test('multi_packed_single_dtype.cdl')

def test_multi_packed_multi_dtype(self):
"""Test saving multiple packed cubes with pack_dtype list."""
# Read PP input file.
self._multi_test('multi_packed_multi_dtype.cdl', multi_dtype=True)


if __name__ == "__main__":
tests.main()