Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
274 changes: 274 additions & 0 deletions notebooks/object_arrays.ipynb

Large diffs are not rendered by default.

17 changes: 15 additions & 2 deletions zarr/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1498,7 +1498,11 @@ def _set_selection(self, indexer, value, fields=None):
sel_shape = indexer.shape

# check value shape
if is_scalar(value, self._dtype):
if sel_shape == ():
# setting a single item
pass
elif is_scalar(value, self._dtype):
# setting a scalar value
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just to be clear, this is more of a general change to how selection works and not something that is specific to object type data (though it likely will be used for that case), correct?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is necessary to allow storing a list as an item in an object array. I.e.,

z = zarr.empty(10, dtype=object)
z[0] = ['foo', 'bar', 'baz']

...needs this change to work.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry. Meant to put that comment near the () piece above. The scalar part is clear. :)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No worries. Yes it is a general thing, although it is really an optimisation for the case of setting a single item, and shouldn't change behaviour for anything other than allowing more flexibility in what kinds of items can be set in object arrays.

pass
else:
if not hasattr(value, 'shape'):
Expand All @@ -1509,7 +1513,9 @@ def _set_selection(self, indexer, value, fields=None):
for chunk_coords, chunk_selection, out_selection in indexer:

# extract data to store
if is_scalar(value, self._dtype):
if sel_shape == ():
chunk_value = value
elif is_scalar(value, self._dtype):
chunk_value = value
else:
chunk_value = value[out_selection]
Expand Down Expand Up @@ -1676,6 +1682,8 @@ def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=Non
if self._fill_value is not None:
chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order)
chunk.fill(self._fill_value)
elif self._dtype == object:
chunk = np.empty(self._chunks, dtype=self._dtype, order=self._order)
else:
# N.B., use zeros here so any region beyond the array has consistent
# and compressible data
Expand Down Expand Up @@ -1736,6 +1744,11 @@ def _encode_chunk(self, chunk):
for f in self._filters:
chunk = f.encode(chunk)

# check object encoding
if isinstance(chunk, np.ndarray) and chunk.dtype == object:
# TODO review error message
raise RuntimeError('object array detected without encoding')

# compress
if self._compressor:
cdata = self._compressor.encode(chunk)
Expand Down
24 changes: 16 additions & 8 deletions zarr/creation.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@
def create(shape, chunks=True, dtype=None, compressor='default',
fill_value=0, order='C', store=None, synchronizer=None,
overwrite=False, path=None, chunk_store=None, filters=None,
cache_metadata=True, read_only=False, **kwargs):
cache_metadata=True, read_only=False, object_codec=None,
**kwargs):
"""Create an array.

Parameters
Expand Down Expand Up @@ -55,6 +56,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
overhead depending on storage and data access pattern).
read_only : bool, optional
True if array should be protected against modification.
object_codec : Codec, optional
A codec to encode object arrays, only needed if dtype=object.

Returns
-------
Expand Down Expand Up @@ -82,8 +85,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',
e.g., `MsgPack` or `Pickle` from `numcodecs`::

>>> from numcodecs import MsgPack
>>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype='object',
... filters=[MsgPack()])
>>> z = zarr.create((10000, 10000), chunks=(1000, 1000), dtype=object,
... object_codec=MsgPack())
>>> z
<zarr.core.Array (10000, 10000) object>

Expand All @@ -108,7 +111,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
# initialize array metadata
init_array(store, shape=shape, chunks=chunks, dtype=dtype, compressor=compressor,
fill_value=fill_value, order=order, overwrite=overwrite, path=path,
chunk_store=chunk_store, filters=filters)
chunk_store=chunk_store, filters=filters, object_codec=object_codec)

# instantiate array
z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer,
Expand Down Expand Up @@ -340,7 +343,7 @@ def array(data, **kwargs):

def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor='default',
fill_value=0, order='C', synchronizer=None, filters=None, cache_metadata=True,
path=None, **kwargs):
path=None, object_codec=None, **kwargs):
"""Open an array using file-mode-like semantics.

Parameters
Expand Down Expand Up @@ -376,6 +379,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
overhead depending on storage and data access pattern).
path : string, optional
Array path within store.
object_codec : Codec, optional
A codec to encode object arrays, only needed if dtype=object.

Returns
-------
Expand Down Expand Up @@ -432,15 +437,17 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
elif mode == 'w':
init_array(store, shape=shape, chunks=chunks, dtype=dtype,
compressor=compressor, fill_value=fill_value,
order=order, filters=filters, overwrite=True, path=path)
order=order, filters=filters, overwrite=True, path=path,
object_codec=object_codec)

elif mode == 'a':
if contains_group(store, path=path):
err_contains_group(path)
elif not contains_array(store, path=path):
init_array(store, shape=shape, chunks=chunks, dtype=dtype,
compressor=compressor, fill_value=fill_value,
order=order, filters=filters, path=path)
order=order, filters=filters, path=path,
object_codec=object_codec)

elif mode in ['w-', 'x']:
if contains_group(store, path=path):
Expand All @@ -450,7 +457,8 @@ def open_array(store, mode='a', shape=None, chunks=True, dtype=None, compressor=
else:
init_array(store, shape=shape, chunks=chunks, dtype=dtype,
compressor=compressor, fill_value=fill_value,
order=order, filters=filters, path=path)
order=order, filters=filters, path=path,
object_codec=object_codec)

# determine read only status
read_only = mode == 'r'
Expand Down
22 changes: 19 additions & 3 deletions zarr/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def _require_parent_group(path, store, chunk_store, overwrite):

def init_array(store, shape, chunks=True, dtype=None, compressor='default',
fill_value=None, order='C', overwrite=False, path=None,
chunk_store=None, filters=None):
chunk_store=None, filters=None, object_codec=None):
"""Initialize an array store with the given configuration. Note that this is a low-level
function and there should be no need to call this directly from user code.

Expand Down Expand Up @@ -203,6 +203,8 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
for storage of both chunks and metadata.
filters : sequence, optional
Sequence of filters to use to encode chunk data prior to compression.
object_codec : Codec, optional
A codec to encode object arrays, only needed if dtype=object.

Examples
--------
Expand Down Expand Up @@ -284,12 +286,13 @@ def init_array(store, shape, chunks=True, dtype=None, compressor='default',
_init_array_metadata(store, shape=shape, chunks=chunks, dtype=dtype,
compressor=compressor, fill_value=fill_value,
order=order, overwrite=overwrite, path=path,
chunk_store=chunk_store, filters=filters)
chunk_store=chunk_store, filters=filters,
object_codec=object_codec)


def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='default',
fill_value=None, order='C', overwrite=False, path=None,
chunk_store=None, filters=None):
chunk_store=None, filters=None, object_codec=None):

# guard conditions
if overwrite:
Expand Down Expand Up @@ -334,6 +337,19 @@ def _init_array_metadata(store, shape, chunks=None, dtype=None, compressor='defa
if filters:
filters_config = [f.get_config() for f in filters]
else:
filters_config = []

# deal with object encoding
if dtype == object:
if object_codec is None:
raise ValueError('an object_codec is required for object arrays')
else:
filters_config.insert(0, object_codec.get_config())
elif object_codec is not None:
raise ValueError('an object_codec is only needed for object arrays')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe this should be softened to a warning? While it certainly isn't sensible for a user to set object_codec when there is no object, there isn't any problem caused by adding it. We just ignore it in this case anyways.

We could also explicitly set object_codec to None if we make this a warning should it accidentally be used later.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I did wonder about that. In the end I figured 6 of one, half dozen of the other. Also I was being slightly lazy, I hate testing warnings, have had awful PY2/PY3 compatibility issues.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So as a friendly tip about testing warnings. Normally I having the warnings module convert them to errors and then test them in exact same way. The last half of this SO comment is useful for showing this. When we switch to pytest, they have a nicer way of doing this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, I will soften to warning. I have some other tests that test warnings by converting to errors, but I could never get the catch_warnings context manager to work in PY2 so there is a bunch of ugly calls to warnings.resetwarnings(). Maybe this is time to start using pytest if that solutions works in PY2 and PY3.


# use null to indicate no filters
if not filters_config:
filters_config = None

# initialize metadata
Expand Down
65 changes: 62 additions & 3 deletions zarr/tests/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
from zarr.errors import PermissionError
from zarr.compat import PY2
from zarr.util import buffer_size
from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2
from numcodecs import Delta, FixedScaleOffset, Zlib, Blosc, BZ2, MsgPack, Pickle


# noinspection PyMethodMayBeStatic
Expand Down Expand Up @@ -864,6 +864,53 @@ def test_dtypes(self):
with assert_raises(ValueError):
self.create_array(shape=10, dtype='timedelta64[{}]'.format(resolution))

def test_object_arrays(self):

# an object_codec is required for object arrays
with assert_raises(ValueError):
self.create_array(shape=10, chunks=3, dtype=object)

# create an object array using msgpack
z = self.create_array(shape=10, chunks=3, dtype=object, object_codec=MsgPack())
z[0] = 'foo'
assert z[0] == 'foo'
z[1] = b'bar'
assert z[1] == 'bar' # msgpack gets this wrong
z[2] = 1
assert z[2] == 1
z[3] = [2, 4, 6, 'baz']
assert z[3] == [2, 4, 6, 'baz']
z[4] = {'a': 'b', 'c': 'd'}
assert z[4] == {'a': 'b', 'c': 'd'}
a = z[:]
assert a.dtype == object

# create an object array using pickle
z = self.create_array(shape=10, chunks=3, dtype=object, object_codec=Pickle())
z[0] = 'foo'
assert z[0] == 'foo'
z[1] = b'bar'
assert z[1] == b'bar'
z[2] = 1
assert z[2] == 1
z[3] = [2, 4, 6, 'baz']
assert z[3] == [2, 4, 6, 'baz']
z[4] = {'a': 'b', 'c': 'd'}
assert z[4] == {'a': 'b', 'c': 'd'}
a = z[:]
assert a.dtype == object

def test_object_arrays_danger(self):

# do something dangerous - manually force an object array with no object codec
z = self.create_array(shape=5, chunks=2, dtype=object, fill_value=0,
object_codec=MsgPack())
z._filters = None # wipe filters
with assert_raises(RuntimeError):
z[0] = 'foo'
with assert_raises(RuntimeError):
z[:] = 42


class TestArrayWithPath(TestArray):

Expand Down Expand Up @@ -1282,11 +1329,19 @@ def test_astype(self):
assert_array_equal(expected, z2)

def test_structured_array(self):
# don't implement this one, cannot do delta on structured array
# skip this one, cannot do delta on structured array
pass

def test_dtypes(self):
# don't implement this one, delta messes up floats
# skip this one, delta messes up floats
pass

def test_object_arrays(self):
# skip this one, cannot use delta with objects
pass

def test_object_arrays_danger(self):
# skip this one, cannot use delta with objects
pass


Expand Down Expand Up @@ -1380,3 +1435,7 @@ def test_cache_metadata(self):
eq(300, a2.size)
eq(300, a2.nbytes)
eq(30, a2.nchunks)

def test_object_arrays_danger(self):
# skip this one as it only works if metadata are cached
pass
4 changes: 4 additions & 0 deletions zarr/tests/test_sync.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,10 @@ def test_hexdigest(self):
z.attrs['foo'] = 'bar'
eq('05b0663ffe1785f38d3a459dec17e57a18f254af', z.hexdigest())

def test_object_arrays_danger(self):
# skip this one, metadata get reloaded in each process
pass


def _create_group(arg):
g, name = arg
Expand Down