From 667a191d6661e4e8925a542451bc7e5b73c9a7cf Mon Sep 17 00:00:00 2001
From: marqh
Date: Thu, 16 Feb 2017 14:47:20 +0000
Subject: [PATCH 1/2] not my data

---
 lib/iris/_lazy_data.py | 25 +++++++++++++++++++------
 lib/iris/cube.py       | 32 +++++++++++++++----------------
 2 files changed, 34 insertions(+), 23 deletions(-)

diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py
index cf6dcc43bf..6a5f5117a0 100644
--- a/lib/iris/_lazy_data.py
+++ b/lib/iris/_lazy_data.py
@@ -22,6 +22,7 @@
 """
 from __future__ import (absolute_import, division, print_function)
 from six.moves import (filter, input, map, range, zip)  # noqa
+import six
 
 import dask.array as da
 import numpy as np
@@ -63,12 +64,24 @@ def as_concrete_data(data):
             # treat all as masked, for standard cube.data behaviour.
             data = data.masked_array()
         else:
+            fill_value = None
+            fill_values = set()
+            for dkey in data.dask.keys():
+                if (isinstance(dkey, six.string_types) and
+                        dkey.startswith('array-original-')):
+                    if hasattr(data.dask.get(dkey), 'fill_value'):
+                        fill_values.add(data.dask.get(dkey).fill_value)
+            if len(fill_values) == 1:
+                fill_value = fill_values.pop()
+            elif len(fill_values) > 1:
+                raise ValueError('Multiple fill values in a dask graph '
+                                 'are not supported')
             # Grab a fill value, in case this is just a converted masked array.
-            fill_value = getattr(data, 'fill_value', None)
+            # fill_value = getattr(data, 'fill_value', None)
             # Realise dask array.
             data = data.compute()
             # Convert NaN arrays into masked arrays for Iris' consumption.
-            mask = np.isnan(data)
+            mask = np.logical_or(np.isnan(data), data == fill_value)
             if np.all(~mask):
                 mask = None
             data = np.ma.masked_array(data, mask=mask,
                                       fill_value=fill_value)
@@ -95,16 +108,16 @@ def as_lazy_data(data):
 
     """
     if not is_lazy_data(data):
         # record the original fill value.
-        fill_value = getattr(data, 'fill_value', None)
+        # fill_value = getattr(data, 'fill_value', None)
        if isinstance(data, np.ma.MaskedArray):
             # Use with NaNs replacing the mask.
             data = array_masked_to_nans(data)
         data = da.from_array(data, chunks=_MAX_CHUNK_SIZE)
         # Attach any fill value to the dask object.
         # Note: this is not passed on to dask arrays derived from this one.
-        data.fill_value = fill_value
-    elif not hasattr(data, 'fill_value'):
-        data.fill_value = None  # make it look more like a biggus Array ?
+        # data.fill_value = fill_value
+    # elif not hasattr(data, 'fill_value'):
+    #     data.fill_value = None  # make it look more like a biggus Array ?
     return data
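
Note: the hunks above make the mask travel as NaNs inside the dask array, and
recover both the mask and a fill value on realisation by scanning the graph
for the source array. A minimal sketch of that round trip, assuming float
data throughout (the 'array-original-' key prefix is a dask-internal naming
detail and may differ between dask versions; the names below are
illustrative, not iris API):

    import dask.array as da
    import numpy as np

    # Mask -> NaN on the way in (what array_masked_to_nans does).
    masked = np.ma.masked_array([1., 2., 3.], mask=[False, True, False],
                                fill_value=-999.)
    lazy = da.from_array(masked.filled(np.nan), chunks=2)

    # NaN -> mask on the way out (what as_concrete_data does).
    realised = np.ma.masked_invalid(lazy.compute())
    realised.fill_value = -999.
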
diff --git a/lib/iris/cube.py b/lib/iris/cube.py
index 54a7369190..fccdcc8b2c 100644
--- a/lib/iris/cube.py
+++ b/lib/iris/cube.py
@@ -716,7 +716,7 @@ def __init__(self, data, standard_name=None, long_name=None,
 
         if not is_lazy_data(data):
             data = np.asarray(data)
-        self._my_data = data
+        self.data_graph = as_lazy_data(data)
 
         #: The "standard name" for the Cube's phenomenon.
         self.standard_name = standard_name
@@ -1592,13 +1592,13 @@ def cell_methods(self, cell_methods):
     @property
     def shape(self):
         """The shape of the data of this cube."""
-        shape = self._my_data.shape
+        shape = self.data_graph.shape
         return shape
 
     @property
     def dtype(self):
         """The :class:`numpy.dtype` of the data of this cube."""
-        return self._my_data.dtype
+        return self.data_graph.dtype
 
     @property
     def ndim(self):
@@ -1642,11 +1642,8 @@ def lazy_data(self, array=None):
             if self.shape or array.shape != (1,):
                 raise ValueError('Require cube data with shape %r, got '
                                  '%r.' % (self.shape, array.shape))
-            self._my_data = array
-        else:
-            array = self._my_data
-            array = as_lazy_data(array)
-        return array
+            self.data_graph = array
+        return self.data_graph
 
     @property
     def data(self):
@@ -1681,7 +1678,7 @@ def data(self):
             (10, 20)
 
         """
-        data = self._my_data
+        data = self.data_graph
         if is_lazy_data(data):
             try:
                 data = as_concrete_data(data)
@@ -1699,8 +1696,8 @@ def data(self):
                     ma.count_masked(data) == 0):
                 data = data.data
             # data may be a numeric type, so ensure an np.ndarray is returned
-            self._my_data = np.asanyarray(data)
-        return self._my_data
+            data = np.asanyarray(data)
+        return data
 
     @data.setter
     def data(self, value):
@@ -1714,10 +1711,11 @@ def data(self, value):
                 raise ValueError('Require cube data with shape %r, got '
                                  '%r.' % (self.shape, data.shape))
 
-        self._my_data = data
+        self.data_graph = as_lazy_data(data)
 
     def has_lazy_data(self):
-        return is_lazy_data(self._my_data)
+        # now this always returns true, new pattern needed
+        return is_lazy_data(self.data_graph)
 
     @property
     def dim_coords(self):
@@ -2182,9 +2180,9 @@ def new_cell_measure_dims(cm_):
             first_slice = None
 
         if first_slice is not None:
-            data = self._my_data[first_slice]
+            data = self.data_graph[first_slice]
         else:
-            data = copy.deepcopy(self._my_data)
+            data = copy.deepcopy(self.data_graph)
 
         for other_slice in slice_gen:
             data = data[other_slice]
@@ -2819,9 +2817,9 @@ def transpose(self, new_order=None):
             raise ValueError('Incorrect number of dimensions.')
 
         if self.has_lazy_data():
-            self._my_data = self.lazy_data().transpose(new_order)
+            self.data_graph = self.lazy_data().transpose(new_order)
         else:
-            self._my_data = self.data.transpose(new_order)
+            self.data_graph = self.data.transpose(new_order)
 
         dim_mapping = {src: dest for dest, src in enumerate(new_order)}
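
Note: the net effect of this first patch is that every Cube wraps its payload
in a dask graph at construction, so has_lazy_data() is always true and
realisation only happens inside the data property. The laziness test itself
reduces to duck-typing on a 'compute' attribute, which is easy to see in
isolation (a trivial sketch, nothing iris-specific):

    import dask.array as da
    import numpy as np

    real = np.arange(4.)
    lazy = da.from_array(real, chunks=2)
    assert not hasattr(real, 'compute')   # plain ndarrays are not "lazy"
    assert hasattr(lazy, 'compute')       # dask arrays are
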
From dd593e8ab11cc01ea5188b65f24d42e62972bb3f Mon Sep 17 00:00:00 2001
From: marqh
Date: Fri, 17 Feb 2017 16:19:18 +0000
Subject: [PATCH 2/2] missing data and fill values

---
 lib/iris/_lazy_data.py         | 56 +++++--------------------
 lib/iris/cube.py               | 75 ++++++++++++++++++++--------------
 lib/iris/fileformats/netcdf.py | 34 +++++++++------
 lib/iris/fileformats/pp.py     | 28 +++++++++----
 lib/iris/fileformats/rules.py  |  3 +-
 5 files changed, 96 insertions(+), 100 deletions(-)

diff --git a/lib/iris/_lazy_data.py b/lib/iris/_lazy_data.py
index 6a5f5117a0..e1f918c874 100644
--- a/lib/iris/_lazy_data.py
+++ b/lib/iris/_lazy_data.py
@@ -28,30 +28,19 @@
 import numpy as np
 
 
-# Whether to recognise biggus arrays as lazy, *as well as* dask.
-# NOTE: in either case, this module will not *make* biggus arrays, only dask.
-_SUPPORT_BIGGUS = True
-
-if _SUPPORT_BIGGUS:
-    import biggus
-
-
 def is_lazy_data(data):
     """
     Return whether the argument is an Iris 'lazy' data array.
 
     At present, this means simply a Dask array.
     We determine this by checking for a "compute" property.
-    NOTE: ***for now only*** accept Biggus arrays also.
 
     """
     result = hasattr(data, 'compute')
-    if not result and _SUPPORT_BIGGUS:
-        result = isinstance(data, biggus.Array)
     return result
 
 
-def as_concrete_data(data):
+def as_concrete_data(data, fill_value=None):
     """
     Return the actual content of the argument, as a numpy masked array.
 
     """
     if is_lazy_data(data):
-        if _SUPPORT_BIGGUS and isinstance(data, biggus.Array):
-            # Realise biggus array.
-            # treat all as masked, for standard cube.data behaviour.
-            data = data.masked_array()
-        else:
-            fill_value = None
-            fill_values = set()
-            for dkey in data.dask.keys():
-                if (isinstance(dkey, six.string_types) and
-                        dkey.startswith('array-original-')):
-                    if hasattr(data.dask.get(dkey), 'fill_value'):
-                        fill_values.add(data.dask.get(dkey).fill_value)
-            if len(fill_values) == 1:
-                fill_value = fill_values.pop()
-            elif len(fill_values) > 1:
-                raise ValueError('Multiple fill values in a dask graph '
-                                 'are not supported')
-            # Grab a fill value, in case this is just a converted masked array.
-            # fill_value = getattr(data, 'fill_value', None)
-            # Realise dask array.
-            data = data.compute()
-            # Convert NaN arrays into masked arrays for Iris' consumption.
-            mask = np.logical_or(np.isnan(data), data == fill_value)
-            if np.all(~mask):
-                mask = None
-            data = np.ma.masked_array(data, mask=mask,
-                                      fill_value=fill_value)
+        # Realise dask array.
+        data = data.compute()
+        # Convert NaN arrays into masked arrays for Iris' consumption.
+        mask = np.isnan(data)
+
+        if np.all(~mask):
+            mask = None
+        data = np.ma.masked_array(data, mask=mask,
+                                  fill_value=fill_value)
     return data
 
 
@@ -107,17 +78,10 @@ def as_lazy_data(data):
 
     """
     if not is_lazy_data(data):
-        # record the original fill value.
-        # fill_value = getattr(data, 'fill_value', None)
         if isinstance(data, np.ma.MaskedArray):
             # Use with NaNs replacing the mask.
             data = array_masked_to_nans(data)
         data = da.from_array(data, chunks=_MAX_CHUNK_SIZE)
-        # Attach any fill value to the dask object.
-        # Note: this is not passed on to dask arrays derived from this one.
-        # data.fill_value = fill_value
-    # elif not hasattr(data, 'fill_value'):
-    #     data.fill_value = None  # make it look more like a biggus Array ?
     return data
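
Note: with this patch the fill value is no longer fished out of the dask
graph; the caller passes it to as_concrete_data() explicitly. One caveat
worth keeping in mind with any NaN-as-mask scheme (an observation about the
approach, not something this patch addresses): integer arrays cannot hold
NaN, so masked integer data has to be promoted to float before the
conversion or the mask is lost. For example:

    import numpy as np

    ints = np.ma.masked_array([1, 2, 3], mask=[False, True, False])
    try:
        ints.filled(np.nan)       # ValueError: NaN has no integer form
    except ValueError:
        as_nans = ints.astype(np.float64).filled(np.nan)
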
diff --git a/lib/iris/cube.py b/lib/iris/cube.py
index fccdcc8b2c..20b1f14757 100644
--- a/lib/iris/cube.py
+++ b/lib/iris/cube.py
@@ -33,6 +33,7 @@
 import zlib
 
 import biggus
+import dask.array as da
 import numpy as np
 import numpy.ma as ma
 
@@ -64,7 +65,8 @@
 class CubeMetadata(collections.namedtuple('CubeMetadata',
                                           ['standard_name', 'long_name',
                                            'var_name', 'units', 'attributes',
-                                           'cell_methods'])):
+                                           'cell_methods',
+                                           'dtype', 'fill_value'])):
     """
     Represents the phenomenon metadata for a single :class:`Cube`.
 
@@ -648,7 +650,7 @@ def __init__(self, data, standard_name=None, long_name=None,
                  var_name=None, units=None, attributes=None,
                  cell_methods=None, dim_coords_and_dims=None,
                  aux_coords_and_dims=None, aux_factories=None,
-                 cell_measures_and_dims=None):
+                 cell_measures_and_dims=None, dtype=None, fill_value=None):
         """
         Creates a cube with data and optional metadata.
 
@@ -714,6 +716,12 @@ def __init__(self, data, standard_name=None, long_name=None,
         if isinstance(data, six.string_types):
             raise TypeError('Invalid data type: {!r}.'.format(data))
 
+        self.shape = data.shape
+        if dtype is not None and dtype != data.dtype:
+            raise ValueError('dtype must match data')
+        self.dtype = data.dtype
+        self.fill_value = fill_value
+
         if not is_lazy_data(data):
             data = np.asarray(data)
         self.data_graph = as_lazy_data(data)
@@ -786,7 +794,8 @@ def metadata(self):
 
         """
         return CubeMetadata(self.standard_name, self.long_name, self.var_name,
-                            self.units, self.attributes, self.cell_methods)
+                            self.units, self.attributes, self.cell_methods,
+                            self.dtype, self.fill_value)
 
     @metadata.setter
     def metadata(self, value):
@@ -1589,16 +1598,16 @@ def cell_methods(self):
     def cell_methods(self, cell_methods):
         self._cell_methods = tuple(cell_methods) if cell_methods else tuple()
 
-    @property
-    def shape(self):
-        """The shape of the data of this cube."""
-        shape = self.data_graph.shape
-        return shape
+    # @property
+    # def shape(self):
+    #     """The shape of the data of this cube."""
+    #     shape = self.data_graph.shape
+    #     return shape
 
-    @property
-    def dtype(self):
-        """The :class:`numpy.dtype` of the data of this cube."""
-        return self.data_graph.dtype
+    # @property
+    # def dtype(self):
+    #     """The :class:`numpy.dtype` of the data of this cube."""
+    #     return self.data_graph.dtype
 
     @property
     def ndim(self):
@@ -1679,24 +1688,27 @@ def data(self):
 
         """
         data = self.data_graph
-        if is_lazy_data(data):
-            try:
-                data = as_concrete_data(data)
-            except MemoryError:
-                msg = "Failed to create the cube's data as there was not" \
-                      " enough memory available.\n" \
-                      "The array shape would have been {0!r} and the data" \
-                      " type {1}.\n" \
-                      "Consider freeing up variables or indexing the cube" \
-                      " before getting its data."
-                msg = msg.format(self.shape, data.dtype)
-                raise MemoryError(msg)
-            # Unmask the array only if it is filled.
-            if (isinstance(data, np.ma.masked_array) and
-                    ma.count_masked(data) == 0):
-                data = data.data
-            # data may be a numeric type, so ensure an np.ndarray is returned
-            data = np.asanyarray(data)
+        chunks = self.data_graph.chunks
+        try:
+            data = as_concrete_data(data, fill_value=self.fill_value)
+        except MemoryError:
+            msg = "Failed to create the cube's data as there was not" \
+                  " enough memory available.\n" \
+                  "The array shape would have been {0!r} and the data" \
+                  " type {1}.\n" \
+                  "Consider freeing up variables or indexing the cube" \
+                  " before getting its data."
+            msg = msg.format(self.shape, data.dtype)
+            raise MemoryError(msg)
+
+        # Unmask the array only if it is filled.
+        if (isinstance(data, np.ma.masked_array) and
+                ma.count_masked(data) == 0):
+            data = data.data
+        # data may be a numeric type, so ensure an np.ndarray is returned
+        data = np.asanyarray(data)
+        # Create a dask data_graph and link the cube to this
+        self.data_graph = da.from_array(data.data, chunks)
         return data
 
     @data.setter
@@ -1710,12 +1722,13 @@ def data(self, value):
         if self.shape or data.shape != (1,):
             raise ValueError('Require cube data with shape %r, got '
                              '%r.' % (self.shape, data.shape))
-
+        self.dtype = data.dtype
         self.data_graph = as_lazy_data(data)
 
     def has_lazy_data(self):
         # now this always returns true, new pattern needed
         return is_lazy_data(self.data_graph)
+
     @property
     def dim_coords(self):
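
Note: shape, dtype and fill_value are now recorded at construction rather
than derived from the array, and after realisation the data property
re-wraps the result in a fresh graph with the original chunking. One caution
on that re-wrap: `data.data` is the underlying ndarray only when `data` is a
masked array; on a plain ndarray it is a raw memory buffer, which
da.from_array() will not accept. A safer form of the same step (a sketch
with illustrative names, not the patch's code):

    import dask.array as da
    import numpy as np

    lazy = da.from_array(np.zeros((4, 4)), chunks=2)
    chunks = lazy.chunks                  # remember the old chunking
    realised = np.ma.masked_invalid(lazy.compute())
    # np.ma.getdata() returns the plain values for masked and unmasked
    # arrays alike, avoiding the ndarray.data buffer pitfall.
    relazied = da.from_array(np.ma.getdata(realised), chunks=chunks)
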
diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py
index 5b89c110a1..26656b0fc0 100644
--- a/lib/iris/fileformats/netcdf.py
+++ b/lib/iris/fileformats/netcdf.py
@@ -38,6 +38,7 @@
 import warnings
 
 import biggus
+import dask.array as da
 import netCDF4
 import numpy as np
 import numpy.ma as ma
@@ -56,7 +57,6 @@
 import iris.fileformats._pyke_rules
 import iris.io
 import iris.util
-import iris._lazy_data
 
 
 # Show Pyke inference engine statistics.
@@ -374,7 +374,8 @@ def _pyke_kb_engine():
 class NetCDFDataProxy(object):
     """A reference to the data payload of a single NetCDF file variable."""
 
-    __slots__ = ('shape', 'dtype', 'path', 'variable_name', 'fill_value')
+    __slots__ = ('shape', 'dtype', 'path', 'variable_name', 'fill_value',
+                 '_data_cache')
 
     def __init__(self, shape, dtype, path, variable_name, fill_value):
         self.shape = shape
@@ -382,19 +383,26 @@ def __init__(self, shape, dtype, path, variable_name, fill_value):
         self.path = path
         self.variable_name = variable_name
         self.fill_value = fill_value
+        self._data_cache = {}
 
     @property
     def ndim(self):
         return len(self.shape)
 
     def __getitem__(self, keys):
-        dataset = netCDF4.Dataset(self.path)
-        try:
-            variable = dataset.variables[self.variable_name]
-            # Get the NetCDF variable data and slice.
-            data = variable[keys]
-        finally:
-            dataset.close()
+        if str(keys) not in self._data_cache:
+            dataset = netCDF4.Dataset(self.path)
+            try:
+                variable = dataset.variables[self.variable_name]
+                # Get the NetCDF variable data and slice.
+                v = variable[keys]
+                if isinstance(v, np.ma.MaskedArray):
+                    self._data_cache[str(keys)] = v.filled(np.nan)
+                else:
+                    self._data_cache[str(keys)] = v
+            finally:
+                dataset.close()
+        data = self._data_cache[str(keys)]
         return data
 
     def __repr__(self):
@@ -501,12 +509,12 @@ def _load_cube(engine, cf, cf_var, filename):
         dummy_data = cf_var.add_offset + dummy_data
 
     # Create cube with deferred data, but no metadata
-    fill_value = getattr(cf_var.cf_data, '_FillValue',
-                         netCDF4.default_fillvals[cf_var.dtype.str[1:]])
+    fill_value = getattr(cf_var.cf_data, '_FillValue', None)
+    # netCDF4.default_fillvals[cf_var.dtype.str[1:]])
     proxy = NetCDFDataProxy(cf_var.shape, dummy_data.dtype, filename,
                             cf_var.cf_name, fill_value)
-    data = iris._lazy_data.as_lazy_data(proxy)
-    cube = iris.cube.Cube(data)
+    data = da.from_array(proxy, chunks=100)
+    cube = iris.cube.Cube(data, fill_value=fill_value)
 
     # Reset the pyke inference engine.
     engine.reset()
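
Note: da.from_array() accepts any object exposing shape, dtype, ndim and a
numpy-style __getitem__, which is why the proxy can be handed to dask
directly (chunks=100 is a per-axis chunk size, so each access pulls at most
100 elements along each dimension). A stripped-down stand-in for
NetCDFDataProxy, showing just the surface dask needs (hypothetical class,
for illustration only):

    import dask.array as da
    import numpy as np

    class ArrayLikeProxy(object):
        """Duck-typed array: just enough surface for da.from_array()."""
        def __init__(self, shape, dtype):
            self.shape = shape
            self.dtype = np.dtype(dtype)

        @property
        def ndim(self):
            return len(self.shape)

        def __getitem__(self, keys):
            # A real proxy would open the file and slice the variable here.
            return np.zeros(self.shape, self.dtype)[keys]

    lazy = da.from_array(ArrayLikeProxy((10, 10), 'f8'), chunks=5)
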
diff --git a/lib/iris/fileformats/pp.py b/lib/iris/fileformats/pp.py
index 5c928ca6f5..9f55150b65 100644
--- a/lib/iris/fileformats/pp.py
+++ b/lib/iris/fileformats/pp.py
@@ -43,7 +43,6 @@
 import iris.fileformats.rules
 import iris.fileformats.pp_rules
 import iris.coord_systems
-import iris._lazy_data
 
 try:
     import mo_pack
@@ -831,7 +830,7 @@ class PPDataProxy(object):
     """A reference to the data payload of a single PP field."""
 
     __slots__ = ('shape', 'src_dtype', 'path', 'offset', 'data_len',
-                 '_lbpack', 'boundary_packing', 'mdi', 'mask')
+                 '_lbpack', 'boundary_packing', 'mdi', 'mask', '_data_cache')
 
     def __init__(self, shape, src_dtype, path, offset, data_len,
                  lbpack, boundary_packing, mdi, mask):
@@ -844,6 +843,7 @@ def __init__(self, shape, src_dtype, path, offset, data_len,
         self.boundary_packing = boundary_packing
         self.mdi = mdi
         self.mask = mask
+        self._data_cache = None
 
     # lbpack
     def _lbpack_setter(self, value):
@@ -874,12 +874,18 @@ def __getitem__(self, keys):
         with open(self.path, 'rb') as pp_file:
             pp_file.seek(self.offset, os.SEEK_SET)
             data_bytes = pp_file.read(self.data_len)
-            data = _data_bytes_to_shaped_array(data_bytes,
-                                               self.lbpack,
-                                               self.boundary_packing,
-                                               self.shape, self.src_dtype,
-                                               self.mdi, self.mask)
-        return data.__getitem__(keys)
+            # Only read from disk if the data is not cached or
+            # if it is not the correct shape.
+            if (self._data_cache is None or
+                    not hasattr(self._data_cache, 'shape') or
+                    self._data_cache.shape != self.shape):
+                data = _data_bytes_to_shaped_array(data_bytes,
+                                                   self.lbpack,
+                                                   self.boundary_packing,
+                                                   self.shape, self.src_dtype,
+                                                   self.mdi, self.mask)
+                self._data_cache = data
+        return self._data_cache.__getitem__(keys)
 
     def __repr__(self):
         fmt = '<{self.__class__.__name__} shape={self.shape}' \
@@ -1035,9 +1041,13 @@ def _data_bytes_to_shaped_array(data_bytes, lbpack, boundary_packing,
 
     # Reform in row-column order
     data.shape = data_shape
 
+    if np.ma.is_masked(data):
+        data = data.filled(np.nan)
     # Mask the array?
     if mdi in data:
-        data = ma.masked_values(data, mdi, copy=False)
+        # data = ma.masked_values(data, mdi, copy=False)
+        # data = array_masked_to_nans(data)
+        data[data == mdi] = np.nan
 
     return data

diff --git a/lib/iris/fileformats/rules.py b/lib/iris/fileformats/rules.py
index 137aec545e..0e853f720b 100644
--- a/lib/iris/fileformats/rules.py
+++ b/lib/iris/fileformats/rules.py
@@ -909,7 +909,8 @@ def _make_cube(field, converter):
                       attributes=metadata.attributes,
                       cell_methods=metadata.cell_methods,
                       dim_coords_and_dims=metadata.dim_coords_and_dims,
-                      aux_coords_and_dims=metadata.aux_coords_and_dims)
+                      aux_coords_and_dims=metadata.aux_coords_and_dims,
+                      fill_value=field.bmdi)
 
     # Temporary code to deal with invalid standard names in the
     # translation table.
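
Note: at the PP end, missing data now travels as NaN from the moment the
bytes are decoded, instead of as a masked array, and the field's
missing-data indicator (bmdi) is threaded through to the cube as its
fill_value. The substitution itself is just the following (float payloads
assumed, since NaN has no integer representation; values here are made up):

    import numpy as np

    mdi = -1e30
    data = np.array([271.4, mdi, 274.2])
    if mdi in data:
        data[data == mdi] = np.nan
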