zarr-developers · alimanfoo · Dec 6, 2017 · Nov 30, 2017 · Nov 30, 2017 · Nov 30, 2017
diff --git a/notebooks/object_arrays.ipynb b/notebooks/object_arrays.ipynb
diff --git a/zarr/core.py b/zarr/core.py
@@ -1498,7 +1498,11 @@ def _set_selection(self, indexer, value, fields=None):
         sel_shape = indexer.shape
 
         # check value shape
-        if is_scalar(value, self._dtype):
+        if sel_shape == ():
+            # setting a single item
+            pass
+        elif is_scalar(value, self._dtype):
+            # setting a scalar value
             pass
         else:
             if not hasattr(value, 'shape'):
@@ -1509,7 +1513,9 @@ def _set_selection(self, indexer, value, fields=None):
         for chunk_coords, chunk_selection, out_selection in indexer:
 
             # extract data to store
-            if is_scalar(value, self._dtype):
+            if sel_shape == ():
+                chunk_value = value
+            elif is_scalar(value, self._dtype):
                 chunk_value = value
             else:
                 chunk_value = value[out_selection]
@@ -1676,6 +1682,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non
                 if self._fill_value is not None:
                     chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order)
                     chunk.fill(self._fill_value)
+                elif self._dtype == object:
+                    chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order)
                 else:
                     # N.B., use zeros here so any region beyond the array has consistent
                     # and compressible data
@@ -1736,6 +1744,11 @@ def _encode_chunk(self, chunk):
             for f in self._filters:
                 chunk = f.encode(chunk)
 
+        # check object encoding
+        if isinstance(chunk, np.ndarray) and chunk.dtype == object:
+            # TODO review error message
+            raise RuntimeError('object array detected without encoding')
+
         # compress
         if self._compressor:
             cdata = self._compressor.encode(chunk)

diff --git a/zarr/creation.py b/zarr/creation.py
@@ -16,7 +16,8 @@
 def create(shape, chunks=True, dtype=None, compressor='default',
            fill_value=0, order='C', store=None, synchronizer=None,
            overwrite=False, path=None, chunk_store=None, filters=None,
-           cache_metadata=True, read_only=False, **kwargs):
+           cache_metadata=True, read_only=False, object_codec=None,
+           **kwargs):
     """Create an array.
 
     Parameters
@@ -55,6 +56,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
         overhead depending on storage and data access pattern).
     read_only : bool, optional
         True if array should be protected against modification.
+    object_codec : Codec, optional
+        A codec to encode object arrays, only needed if dtype=object.
 
     Returns
     -------
@@ -82,8 +85,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
     e.g., `MsgPack` or `Pickle` from `numcodecs`::
 
         >>> from numcodecs import MsgPack
-        >>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='object',
-        ...                 filters=[MsgPack()])
+        >>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype=object,
+        ...                 object_codec=MsgPack())
         >>> z
         <zarr.core.Array (10000, 10000) object>
 
@@ -108,7 +111,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
     # initialize array metadata
     init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
                fill_value=fill_value, order=order, overwrite=overwrite, path=path,
-               chunk_store=chunk_store, filters=filters)
+               chunk_store=chunk_store, filters=filters, object_codec=object_codec)
 
     # instantiate array
     z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer,
@@ -340,7 +343,7 @@ def array(data, **kwargs):
 
 def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default',
                fill_value=0, order='C', synchronizer=None, filters=None, cache_metadata=True,
-               path=None, **kwargs):
+               path=None, object_codec=None, **kwargs):
     """Open an array using file-mode-like semantics.
 
     Parameters
@@ -376,6 +379,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
         overhead depending on storage and data access pattern).
     path : string, optional
         Array path within store.
+    object_codec : Codec, optional
+        A codec to encode object arrays, only needed if dtype=object.
 
     Returns
     -------
@@ -432,15 +437,17 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
     elif mode == 'w':
         init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                    compressor=compressor, fill_value=fill_value,
-                   order=order, filters=filters, overwrite=True, path=path)
+                   order=order, filters=filters, overwrite=True, path=path,
+                   object_codec=object_codec)
 
     elif mode == 'a':
         if contains_group(store, path=path):
             err_contains_group(path)
         elif not contains_array(store, path=path):
             init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                        compressor=compressor, fill_value=fill_value,
-                       order=order, filters=filters, path=path)
+                       order=order, filters=filters, path=path,
+                       object_codec=object_codec)
 
     elif mode in ['w-', 'x']:
         if contains_group(store, path=path):
@@ -450,7 +457,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
         else:
             init_array(store, shape=shape, chunks=chunks, dtype=dtype,
                        compressor=compressor, fill_value=fill_value,
-                       order=order, filters=filters, path=path)
+                       order=order, filters=filters, path=path,
+                       object_codec=object_codec)
 
     # determine read only status
     read_only = mode == 'r'

diff --git a/zarr/storage.py b/zarr/storage.py
@@ -173,7 +173,7 @@ def _require_parent_group(path, store, chunk_store, overwrite):
 
 def init_array(store, shape, chunks=True, dtype=None, compressor='default',
                fill_value=None, order='C', overwrite=False, path=None,
-               chunk_store=None, filters=None):
+               chunk_store=None, filters=None, object_codec=None):
     """Initialize an array store with the given configuration. Note that this is a low-level
     function and there should be no need to call this directly from user code.
 
@@ -203,6 +203,8 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
         for storage of both chunks and metadata.
     filters : sequence, optional
         Sequence of filters to use to encode chunk data prior to compression.
+    object_codec : Codec, optional
+        A codec to encode object arrays, only needed if dtype=object.
 
     Examples
     --------
@@ -284,12 +286,13 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
     _init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
                          compressor=compressor, fill_value=fill_value,
                          order=order, overwrite=overwrite, path=path,
-                         chunk_store=chunk_store, filters=filters)
+                         chunk_store=chunk_store, filters=filters,
+                         object_codec=object_codec)
 
 
 def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='default',
                          fill_value=None, order='C', overwrite=False, path=None,
-                         chunk_store=None, filters=None):
+                         chunk_store=None, filters=None, object_codec=None):
 
     # guard conditions
     if overwrite:
@@ -334,6 +337,19 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa
     if filters:
         filters_config = [f.get_config() for f in filters]
     else:
+        filters_config = []
+
+    # deal with object encoding
+    if dtype == object:
+        if object_codec is None:
+            raise ValueError('an object_codec is required for object arrays')
+        else:
+            filters_config.insert(0, object_codec.get_config())
+    elif object_codec is not None:
+        raise ValueError('an object_codec is only needed for object arrays')
+
+    # use null to indicate no filters
+    if not filters_config:
         filters_config = None
 
     # initialize metadata

diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
@@ -21,7 +21,7 @@
 from zarr.errors import PermissionError
 from zarr.compat import PY2
 from zarr.util import buffer_size
-from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2
+from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle
 
 
 # noinspection PyMethodMayBeStatic
@@ -864,6 +864,53 @@ def test_dtypes(self):
             with assert_raises(ValueError):
                 self.create_array(shape=10, dtype='timedelta64[{}]'.format(resolution))
 
+    def test_object_arrays(self):
+
+        # an object_codec is required for object arrays
+        with assert_raises(ValueError):
+            self.create_array(shape=10, chunks=3, dtype=object)
+
+        # create an object array using msgpack
+        z = self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack())
+        z[0] = 'foo'
+        assert z[0] == 'foo'
+        z[1] = b'bar'
+        assert z[1] == 'bar'  # msgpack gets this wrong
+        z[2] = 1
+        assert z[2] == 1
+        z[3] = [2, 4, 6, 'baz']
+        assert z[3] == [2, 4, 6, 'baz']
+        z[4] = {'a': 'b', 'c': 'd'}
+        assert z[4] == {'a': 'b', 'c': 'd'}
+        a = z[:]
+        assert a.dtype == object
+
+        # create an object array using pickle
+        z = self.create_array(shape=10, chunks=3, dtype=object, object_codec=Pickle())
+        z[0] = 'foo'
+        assert z[0] == 'foo'
+        z[1] = b'bar'
+        assert z[1] == b'bar'
+        z[2] = 1
+        assert z[2] == 1
+        z[3] = [2, 4, 6, 'baz']
+        assert z[3] == [2, 4, 6, 'baz']
+        z[4] = {'a': 'b', 'c': 'd'}
+        assert z[4] == {'a': 'b', 'c': 'd'}
+        a = z[:]
+        assert a.dtype == object
+
+    def test_object_arrays_danger(self):
+
+        # do something dangerous - manually force an object array with no object codec
+        z = self.create_array(shape=5, chunks=2, dtype=object, fill_value=0,
+                              object_codec=MsgPack())
+        z._filters = None  # wipe filters
+        with assert_raises(RuntimeError):
+            z[0] = 'foo'
+        with assert_raises(RuntimeError):
+            z[:] = 42
+
 
 class TestArrayWithPath(TestArray):
 
@@ -1282,11 +1329,19 @@ def test_astype(self):
         assert_array_equal(expected, z2)
 
     def test_structured_array(self):
-        # don't implement this one, cannot do delta on structured array
+        # skip this one, cannot do delta on structured array
         pass
 
     def test_dtypes(self):
-        # don't implement this one, delta messes up floats
+        # skip this one, delta messes up floats
+        pass
+
+    def test_object_arrays(self):
+        # skip this one, cannot use delta with objects
+        pass
+
+    def test_object_arrays_danger(self):
+        # skip this one, cannot use delta with objects
         pass
 
 
@@ -1380,3 +1435,7 @@ def test_cache_metadata(self):
         eq(300, a2.size)
         eq(300, a2.nbytes)
         eq(30, a2.nchunks)
+
+    def test_object_arrays_danger(self):
+        # skip this one as it only works if metadata are cached
+        pass
diff --git a/zarr/tests/test_sync.py b/zarr/tests/test_sync.py
@@ -177,6 +177,10 @@ def test_hexdigest(self):
         z.attrs['foo'] = 'bar'
         eq('05b0663ffe1785f38d3a459dec17e57a18f254af', z.hexdigest())
 
+    def test_object_arrays_danger(self):
+        # skip this one, metadata get reloaded in each process
+        pass
+
 
 def _create_group(arg):
     g, name = arg