Merged

26 commits
21b17c2
Implemented support for netcdf data packing
Oct 13, 2016
872a8bc
Fixed some bugs in netcdf packing implementation.
Oct 13, 2016
821d55f
Removed test files.
Oct 13, 2016
3e19c7a
New output test files for packing.
jswanljung Oct 13, 2016
a224c88
Removed an inadvertent change to .gitignore
jswanljung Oct 13, 2016
851b6c7
Made changes after review.
jswanljung Oct 14, 2016
4496a04
python 2/3 compatibility fix for zip_longest
Oct 14, 2016
6775972
Tests failing due to six.moves change.
Oct 14, 2016
430f81e
Fix for inadvertent lazy_data loading.
Oct 14, 2016
0f2998a
pep violation fix
Oct 14, 2016
3089dc5
Fixed problems with masked and lazy_data
Oct 14, 2016
46f732a
Fixed a bug in zip_longest arg
Oct 14, 2016
3ac9861
Fixed bug with packing specification check for iterable.
Oct 14, 2016
4109433
Packing specification checks now handling None correctly.
Oct 14, 2016
700de12
Moved packing tests to integration.
Oct 14, 2016
a15fc0b
Addressed bogus pep violations
Oct 14, 2016
cc4a836
Replaced unicode with six.text_type
Oct 14, 2016
6e0a7a6
Removed unnecessary test for str.
Oct 14, 2016
5434785
Reverted inadvertent deleted line break in travis.yml
Oct 14, 2016
cef2b8f
Turns out str test was necessary after all.
Oct 14, 2016
9a2d2df
Put test_netcdf.py imports in alphabetical order.
Oct 14, 2016
e367b93
Added what's new.
Oct 15, 2016
29df858
added what's new entry
Oct 19, 2016
41766ae
Renamed what's new entry to conform to spec
Oct 19, 2016
5f08ab8
Actually set datatype so packing will occur. Replaced CDL files to re…
Oct 19, 2016
aba9510
Removed an extra 'a' that snuck into a docstring.
Oct 19, 2016
@@ -0,0 +1 @@
* Support for saving to netCDF with data packing has been added.
162 changes: 148 additions & 14 deletions lib/iris/fileformats/netcdf.py
@@ -26,9 +26,11 @@

from __future__ import (absolute_import, division, print_function)
from six.moves import (filter, input, map, range, zip) # noqa
from six.moves import zip_longest # Previous line may not be tampered with!
import six

import collections
from itertools import repeat
import os
import os.path
import re
@@ -806,7 +808,7 @@ def __exit__(self, type, value, traceback):
def write(self, cube, local_keys=None, unlimited_dimensions=None,
zlib=False, complevel=4, shuffle=True, fletcher32=False,
contiguous=False, chunksizes=None, endian='native',
least_significant_digit=None):
least_significant_digit=None, packing=None):
"""
Wrapper for saving cubes to a NetCDF file.

@@ -885,6 +887,23 @@ def write(self, cube, local_keys=None, unlimited_dimensions=None,
place in unpacked data that is a reliable value". Default is
`None`, or no quantization, or 'lossless' compression.

* packing (type or string or dict or list): A numpy integer datatype
(signed or unsigned) or a string that describes a numpy integer
dtype (e.g. 'i2', 'short', 'u4') or a dict of packing parameters as
described below. This provides support for netCDF data packing as
described in
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values
If this argument is a type (or type string), appropriate values of
scale_factor and add_offset will be automatically calculated based
on `cube.data` and possible masking. For masked data, `fill_value`
is taken from netCDF4.default_fillvals. For more control, pass a
dict with one or more of the following keys: `dtype` (required),
`scale_factor`, `add_offset`, and `fill_value`. Note that automatic
calculation of packing parameters will trigger loading of lazy
data; set them manually using a dict to avoid this. The default is
`None`, in which case the datatype is determined from the cube and
no packing will occur.

Returns:
None.
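The automatic calculation described for the `packing` keyword above can be summarised with a small standalone sketch. It is illustrative only (not part of the Iris API), assumes an unmasked array and a signed 'i2' target dtype, and mirrors the formulae used in `_create_cf_data_variable` further down in this diff:

```python
import numpy as np

# Illustrative, unmasked data; a real cube.data array would be used instead.
data = np.linspace(250.0, 310.0, 1000)
dtype = np.dtype('i2')            # signed 16-bit target, as in packing='i2'

cmin, cmax = data.min(), data.max()
n = dtype.itemsize * 8            # bits available in the packed type

# Unmasked case: spread the data range over all 2**n integer values.
scale_factor = (cmax - cmin) / (2 ** n - 1)
# Signed and unmasked: offset so that cmin maps to the most negative integer.
add_offset = cmin + 2 ** (n - 1) * scale_factor

packed = np.round((data - add_offset) / scale_factor).astype(dtype)
unpacked = packed * scale_factor + add_offset
# Round-trip error is roughly bounded by half of one quantisation step.
assert np.abs(unpacked - data).max() <= scale_factor
```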

@@ -932,7 +951,7 @@ def write(cube, local_keys=None, unlimited_dimensions=None,
cube, dimension_names, local_keys, zlib=zlib, complevel=complevel,
shuffle=shuffle, fletcher32=fletcher32, contiguous=contiguous,
chunksizes=chunksizes, endian=endian,
least_significant_digit=least_significant_digit)
least_significant_digit=least_significant_digit, packing=packing)

# Add coordinate variables.
self._add_dim_coords(cube, dimension_names)
@@ -1833,36 +1852,98 @@ def _create_cf_data_variable(self, cube, dimension_names, local_keys=None,
The newly created CF-netCDF data variable.

"""
if 'packing' in kwargs:
packing = kwargs.pop('packing')
if packing:
scale_factor = None
add_offset = None
fill_value = None
if isinstance(packing, dict):
if 'dtype' not in packing:
msg = "The dtype attribute is required for packing."
raise ValueError(msg)
dtype = np.dtype(packing['dtype'])
if 'scale_factor' in packing:
scale_factor = packing['scale_factor']
if 'add_offset' in packing:
add_offset = packing['add_offset']
if 'fill_value' in packing:
fill_value = packing['fill_value']
else:
masked = ma.isMaskedArray(cube.data)
dtype = np.dtype(packing)
cmax = cube.data.max()
cmin = cube.data.min()
n = dtype.itemsize * 8
if masked:
scale_factor = (cmax - cmin)/(2**n-2)
else:
scale_factor = (cmax - cmin)/(2**n-1)
if dtype.kind == 'u':
add_offset = cmin
elif dtype.kind == 'i':
if masked:
add_offset = (cmax + cmin)/2
else:
add_offset = cmin + 2**(n-1)*scale_factor
needs_fill_value = (cube.has_lazy_data() or
ma.isMaskedArray(cube.data))
if needs_fill_value and fill_value is None:
dtstr = dtype.str[1:]
fill_value = netCDF4.default_fillvals[dtstr]
else:
packing = None

def set_packing_ncattrs(cfvar):
"""Set netCDF packing attributes."""
if packing:
if scale_factor:
_setncattr(cfvar, 'scale_factor', scale_factor)
if add_offset:
_setncattr(cfvar, 'add_offset', add_offset)

cf_name = self._get_cube_variable_name(cube)
while cf_name in self._dataset.variables:
cf_name = self._increment_name(cf_name)

# if netcdf3 avoid streaming due to dtype handling
if (not cube.has_lazy_data() or
self._dataset.file_format in ('NETCDF3_CLASSIC',
'NETCDF3_64BIT')):
# Determine whether there is a cube MDI value.
fill_value = None
if isinstance(cube.data, ma.core.MaskedArray):
fill_value = cube.data.fill_value
self._dataset.file_format in ('NETCDF3_CLASSIC',
'NETCDF3_64BIT')):

if packing is None:
# Determine whether there is a cube MDI value.
if ma.isMaskedArray(cube.data):
fill_value = cube.data.fill_value
else:
fill_value = None

# Get the values in a form which is valid for the file format.
data = self._ensure_valid_dtype(cube.data, 'cube', cube)

if packing is None:
dtype = data.dtype.newbyteorder('=')

# Create the cube CF-netCDF data variable with data payload.
cf_var = self._dataset.createVariable(
cf_name, data.dtype.newbyteorder('='), dimension_names,
cf_name, dtype, dimension_names,
fill_value=fill_value, **kwargs)
set_packing_ncattrs(cf_var)
cf_var[:] = data

else:
# Create the cube CF-netCDF data variable.
# Explicitly assign the fill_value, which will be the type default
# in the case of an unmasked array.
if packing is None:
fill_value = cube.lazy_data().fill_value
dtype = cube.lazy_data().dtype.newbyteorder('=')

cf_var = self._dataset.createVariable(
cf_name, cube.lazy_data().dtype.newbyteorder('='),
dimension_names, fill_value=cube.lazy_data().fill_value,
cf_name, dtype,
dimension_names, fill_value=fill_value,
**kwargs)
set_packing_ncattrs(cf_var)
# stream the data
biggus.save([cube.lazy_data()], [cf_var], masked=True)

@@ -1951,7 +2032,7 @@ def _increment_name(self, varname):
def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
unlimited_dimensions=None, zlib=False, complevel=4, shuffle=True,
fletcher32=False, contiguous=False, chunksizes=None, endian='native',
least_significant_digit=None):
least_significant_digit=None, packing=None):
"""
Save cube(s) to a netCDF file, given the cube and the filename.

@@ -2048,6 +2129,24 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
in unpacked data that is a reliable value". Default is `None`, or no
quantization, or 'lossless' compression.

* packing (type or string or dict or list): A numpy integer datatype
(signed or unsigned) or a string that describes a numpy integer dtype
(e.g. 'i2', 'short', 'u4') or a dict of packing parameters as described
below or an iterable of such types, strings, or dicts.
This provides support for netCDF data packing as described in
http://www.unidata.ucar.edu/software/netcdf/docs/BestPractices.html#bp_Packed-Data-Values
If this argument is a type (or type string), appropriate values of
scale_factor and add_offset will be automatically calculated based on
`cube.data` and possible masking. For masked data, `fill_value` is
taken from netCDF4.default_fillvals. For more control, pass a dict with
one or more of the following keys: `dtype` (required), `scale_factor`,
`add_offset`, and `fill_value`. To save multiple cubes with different
packing parameters, pass an iterable of types, strings, dicts, or
`None`, one for each cube. Note that automatic calculation of packing
parameters will trigger loading of lazy data; set them manually using a
dict to avoid this. The default is `None`, in which case the datatype
is determined from the cube and no packing will occur.

Returns:
None.
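A hedged usage sketch of the new `packing` keyword described above. The file names, the three-cube assumption, and the explicit scale/offset values are purely illustrative:

```python
import iris

# Hypothetical paths, for illustration only.
cubes = iris.load('input_fields.pp')   # assume this yields three cubes

# Pack every cube identically: 16-bit signed integers with automatically
# calculated scale_factor and add_offset (this loads any lazy data).
iris.save(cubes, 'packed_auto.nc', packing='i2')

# Per-cube control: a dict fixes the parameters explicitly (so lazy data is
# left untouched), None skips packing, and 'u2' packs into unsigned 16-bit.
packspec = [dict(dtype='i2', scale_factor=0.01, add_offset=273.15),
            None,
            'u2']
iris.save(cubes, 'packed_mixed.nc', packing=packspec)
```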

@@ -2102,13 +2201,48 @@ def save(cube, filename, netcdf_format='NETCDF4', local_keys=None,
common_keys.difference_update(different_value_keys)
local_keys.update(different_value_keys)

def is_valid_packspec(p):
""" Only checks that the datatype is valid. """
if isinstance(p, dict):
if 'dtype' in p:
return is_valid_packspec(p['dtype'])
else:
msg = "The argument to packing must contain the key 'dtype'."
raise ValueError(msg)
elif (isinstance(p, six.text_type) or isinstance(p, type) or
isinstance(p, str)):
pdtype = np.dtype(p) # Does nothing if it's already a numpy dtype
if pdtype.kind != 'i' and pdtype.kind != 'u':
msg = "The packing datatype must be a numpy integer type."
raise ValueError(msg)
return True
elif p is None:
return True
else:
return False

if is_valid_packspec(packing):
packspecs = repeat(packing)
else:
# Assume iterable, make sure packing is the same length as cubes.
for cube, packspec in zip_longest(cubes, packing, fillvalue=-1):
if cube == -1 or packspec == -1:
msg = ('If packing is a list, it must have the '
'same number of elements as the argument to '
'cube.')
raise ValueError(msg)
if not is_valid_packspec(packspec):
msg = ('Invalid packing argument: {}.'.format(packspec))
raise ValueError(msg)
packspecs = packing

# Initialise Manager for saving
with Saver(filename, netcdf_format) as sman:
# Iterate through the cubelist.
for cube in cubes:
for cube, packspec in zip(cubes, packspecs):
sman.write(cube, local_keys, unlimited_dimensions, zlib, complevel,
shuffle, fletcher32, contiguous, chunksizes, endian,
least_significant_digit)
least_significant_digit, packing=packspec)

conventions = CF_CONVENTIONS_VERSION

117 changes: 117 additions & 0 deletions lib/iris/tests/integration/test_netcdf.py
@@ -24,12 +24,14 @@
import iris.tests as tests

from contextlib import contextmanager
from itertools import repeat
import os.path
import shutil
import tempfile
import warnings

import numpy as np
import numpy.ma as ma

import iris
from iris.coords import CellMethod
@@ -309,5 +311,120 @@ def test_load_laea_grid(self):
self.assertCML(cube, ('netcdf', 'netcdf_laea.cml'))


def _get_scale_factor_add_offset(cube, datatype):
"""Utility function used by netCDF data packing tests."""
if isinstance(datatype, dict):
dt = np.dtype(datatype['dtype'])
else:
dt = np.dtype(datatype)
cmax = cube.data.max()
cmin = cube.data.min()
n = dt.itemsize * 8
if ma.isMaskedArray(cube.data):
masked = True
else:
masked = False
if masked:
scale_factor = (cmax - cmin)/(2**n-2)
else:
scale_factor = (cmax - cmin)/(2**n-1)
if dt.kind == 'u':
add_offset = cmin
elif dt.kind == 'i':
if masked:
add_offset = (cmax + cmin)/2
else:
add_offset = cmin + 2**(n-1)*scale_factor
return (scale_factor, add_offset)


@tests.skip_data
class TestPackedData(tests.IrisTest):
def _single_test(self, datatype, CDLfilename, manual=False):
# Read PP input file.
file_in = tests.get_data_path(
('PP', 'cf_processing',
'000003000000.03.236.000128.1990.12.01.00.00.b.pp'))
cube = iris.load_cube(file_in)
scale_factor, offset = _get_scale_factor_add_offset(cube, datatype)
if manual:
packspec = dict(dtype=datatype, scale_factor=scale_factor,
add_offset=offset)
else:
packspec = datatype
# Write Cube to netCDF file.
with self.temp_filename(suffix='.nc') as file_out:
iris.save(cube, file_out, packing=packspec)
decimal = int(-np.log10(scale_factor))
packedcube = iris.load_cube(file_out)
# Check that packed cube is accurate to expected precision
self.assertArrayAlmostEqual(cube.data, packedcube.data,
decimal=decimal)
# Check the netCDF file against CDL expected output.
self.assertCDL(file_out, ('integration', 'netcdf',
'TestPackedData', CDLfilename))

def test_single_packed_signed(self):
"""Test saving a single CF-netCDF file with packing."""
self._single_test('i2', 'single_packed_signed.cdl')

def test_single_packed_unsigned(self):
"""Test saving a single CF-netCDF file with packing into unsigned. """
self._single_test('u1', 'single_packed_unsigned.cdl')

def test_single_packed_manual_scale(self):
"""Test saving a single CF-netCDF file with packing with scale
factor and add_offset set manually."""
self._single_test('i2', 'single_packed_manual.cdl', manual=True)

def _multi_test(self, CDLfilename, multi_dtype=False):
"""Test saving multiple packed cubes with pack_dtype list."""
# Read PP input file.
file_in = tests.get_data_path(('PP', 'cf_processing',
'abcza_pa19591997_daily_29.b.pp'))
cubes = iris.load(file_in)
# ensure cube order is the same:
cubes.sort(key=lambda cube: cube.cell_methods[0].method)
datatype = 'i2'
scale_factor, offset = _get_scale_factor_add_offset(cubes[0],
datatype)
if multi_dtype:
packdict = dict(dtype=datatype, scale_factor=scale_factor,
add_offset=offset)
packspec = [packdict, None, 'u2']
dtypes = packspec
else:
packspec = datatype
dtypes = repeat(packspec)

# Write Cube to netCDF file.
with self.temp_filename(suffix='.nc') as file_out:
iris.save(cubes, file_out, packing=packspec)
# Check the netCDF file against CDL expected output.
self.assertCDL(file_out, ('integration', 'netcdf',
'TestPackedData', CDLfilename))
packedcubes = iris.load(file_out)
packedcubes.sort(key=lambda cube: cube.cell_methods[0].method)
for cube, packedcube, dtype in zip(cubes, packedcubes, dtypes):
if dtype:
sf, ao = _get_scale_factor_add_offset(cube, dtype)
decimal = int(-np.log10(sf))
# Check that packed cube is accurate to expected precision
self.assertArrayAlmostEqual(cube.data, packedcube.data,
decimal=decimal)
else:
self.assertArrayEqual(cube.data, packedcube.data)

def test_multi_packed_single_dtype(self):
"""Test saving multiple packed cubes with the same pack_dtype."""
# Read PP input file.
self._multi_test('multi_packed_single_dtype.cdl')

def test_multi_packed_multi_dtype(self):
"""Test saving multiple packed cubes with pack_dtype list."""
# Read PP input file.
self._multi_test('multi_packed_multi_dtype.cdl', multi_dtype=True)


if __name__ == "__main__":
tests.main()