diff --git a/docs/api/codecs.rst b/docs/api/codecs.rst
index 6cff84f01a..e35de08b30 100644
--- a/docs/api/codecs.rst
+++ b/docs/api/codecs.rst
@@ -21,6 +21,7 @@ code of this module for details.
 .. autoclass:: BZ2
 .. autoclass:: LZMA
 .. autoclass:: Delta
+.. autoclass:: AsType
 .. autoclass:: FixedScaleOffset
 .. autoclass:: Quantize
 .. autoclass:: PackBits
diff --git a/docs/api/core.rst b/docs/api/core.rst
index 6024fdc6c4..4f2c5cc6bb 100644
--- a/docs/api/core.rst
+++ b/docs/api/core.rst
@@ -9,3 +9,4 @@ The Array class (``zarr.core``)
     .. automethod:: resize
     .. automethod:: append
     .. automethod:: view
+    .. automethod:: astype
diff --git a/zarr/codecs.py b/zarr/codecs.py
index b299bb8db5..bc4d5a3b7c 100644
--- a/zarr/codecs.py
+++ b/zarr/codecs.py
@@ -485,6 +485,89 @@ def __repr__(self):
 codec_registry[Delta.codec_id] = Delta
 
 
+class AsType(Codec):
+    """Filter to convert data between different types.
+
+    Parameters
+    ----------
+    encode_dtype : dtype
+        Data type to use for encoded data.
+    decode_dtype : dtype
+        Data type to use for decoded data.
+
+    Notes
+    -----
+    If `encode_dtype` is of lower precision than `decode_dtype`, be aware
+    that data loss can occur when writing data to disk using this filter.
+    No checks are made to ensure the cast is safe, so silent data
+    corruption may occur if it is not.
+
+    Examples
+    --------
+    >>> import zarr
+    >>> import numpy as np
+    >>> x = np.arange(100, 120, 2, dtype=np.int8)
+    >>> x
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
+    >>> f = zarr.AsType(encode_dtype=x.dtype, decode_dtype=np.int64)
+    >>> y = f.decode(x)
+    >>> y
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118])
+    >>> z = f.encode(y)
+    >>> z
+    array([100, 102, 104, 106, 108, 110, 112, 114, 116, 118], dtype=int8)
+
+    """  # flake8: noqa
+
+    codec_id = 'astype'
+
+    def __init__(self, encode_dtype, decode_dtype):
+        self.encode_dtype = np.dtype(encode_dtype)
+        self.decode_dtype = np.dtype(decode_dtype)
+
+    def encode(self, buf):
+
+        # view input data as 1D array
+        arr = _ndarray_from_buffer(buf, self.decode_dtype)
+
+        # convert and copy
+        enc = arr.astype(self.encode_dtype)
+
+        return enc
+
+    def decode(self, buf, out=None):
+
+        # view encoded data as 1D array
+        enc = _ndarray_from_buffer(buf, self.encode_dtype)
+
+        # convert and copy
+        dec = enc.astype(self.decode_dtype)
+
+        # handle output
+        out = _buffer_copy(dec, out)
+
+        return out
+
+    def get_config(self):
+        config = dict()
+        config['id'] = self.codec_id
+        config['encode_dtype'] = self.encode_dtype.str
+        config['decode_dtype'] = self.decode_dtype.str
+        return config
+
+    def __repr__(self):
+        return (
+            '%s(encode_dtype=%s, decode_dtype=%s)' % (
+                type(self).__name__,
+                self.encode_dtype,
+                self.decode_dtype
+            )
+        )
+
+
+codec_registry[AsType.codec_id] = AsType
+
+
 class FixedScaleOffset(Codec):
     """Simplified version of the scale-offset filter available in HDF5.
     Applies the transformation `(x - offset) * scale` to all chunks.
diff --git a/zarr/core.py b/zarr/core.py
index 58356b884f..b1f7303620 100644
--- a/zarr/core.py
+++ b/zarr/core.py
@@ -15,7 +15,7 @@
 from zarr.attrs import Attributes
 from zarr.errors import PermissionError, err_read_only, err_array_not_found
 from zarr.compat import reduce
-from zarr.codecs import get_codec
+from zarr.codecs import AsType, get_codec
 
 
 class Array(object):
@@ -73,6 +73,7 @@ class Array(object):
     resize
     append
     view
+    astype
 
     """  # flake8: noqa
 
@@ -1176,3 +1177,63 @@ def view(self, shape=None, chunks=None, dtype=None,
         a._filters = filters
 
         return a
+
+    def astype(self, dtype):
+        """Return a view that does on-the-fly type conversion of the data.
+
+        Parameters
+        ----------
+        dtype : string or dtype
+            NumPy dtype.
+
+        Notes
+        -----
+        This method returns a new Array object which is a view on the same
+        underlying chunk data. Modifying any data via the view is currently
+        not permitted and will result in an error. This is an experimental
+        feature and its behavior is subject to change in the future.
+
+        See Also
+        --------
+        Array.view
+
+        Examples
+        --------
+
+        >>> import zarr
+        >>> import numpy as np
+        >>> data = np.arange(100, dtype=np.uint8)
+        >>> a = zarr.array(data, chunks=10)
+        >>> a[:]
+        array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+               16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+               32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+               48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+               64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+               80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+               96, 97, 98, 99], dtype=uint8)
+        >>> v = a.astype(np.float32)
+        >>> v.is_view
+        True
+        >>> v[:]
+        array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,   8.,   9.,
+                10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,
+                20.,  21.,  22.,  23.,  24.,  25.,  26.,  27.,  28.,  29.,
+                30.,  31.,  32.,  33.,  34.,  35.,  36.,  37.,  38.,  39.,
+                40.,  41.,  42.,  43.,  44.,  45.,  46.,  47.,  48.,  49.,
+                50.,  51.,  52.,  53.,  54.,  55.,  56.,  57.,  58.,  59.,
+                60.,  61.,  62.,  63.,  64.,  65.,  66.,  67.,  68.,  69.,
+                70.,  71.,  72.,  73.,  74.,  75.,  76.,  77.,  78.,  79.,
+                80.,  81.,  82.,  83.,  84.,  85.,  86.,  87.,  88.,  89.,
+                90.,  91.,  92.,  93.,  94.,  95.,  96.,  97.,  98.,  99.],
+              dtype=float32)
+        """  # flake8: noqa
+
+        dtype = np.dtype(dtype)
+
+        filters = []
+        if self._filters:
+            filters.extend(self._filters)
+        filters.insert(0, AsType(encode_dtype=self._dtype, decode_dtype=dtype))
+
+        return self.view(filters=filters, dtype=dtype, read_only=True)
diff --git a/zarr/tests/test_codecs.py b/zarr/tests/test_codecs.py
index 7538813e6c..10412b6365 100644
--- a/zarr/tests/test_codecs.py
+++ b/zarr/tests/test_codecs.py
@@ -294,6 +294,64 @@ def test_repr(self):
         eq(expect, actual)
 
 
+class TestAsType(CodecTests, unittest.TestCase):
+
+    codec_id = 'astype'
+
+    def test_encode(self):
+        for arr in test_arrays:
+            if arr.dtype.kind in {'f', 'i', 'u'}:
+                self._test_encode(
+                    arr,
+                    encode_dtype=arr.dtype,
+                    decode_dtype=arr.dtype
+                )
+
+    def test_decode(self):
+        for arr in test_arrays:
+            if arr.dtype.kind == 'f':
+                self._test_decode_lossy(
+                    arr,
+                    decimal=10,
+                    encode_dtype=arr.dtype,
+                    decode_dtype=arr.dtype
+                )
+            elif arr.dtype.kind in {'i', 'u'}:
+                self._test_decode_lossless(
+                    arr, encode_dtype=arr.dtype, decode_dtype=arr.dtype
+                )
+
+    def test_encode_output(self):
+        encode_dtype = 'i4'
+        decode_dtype = 'i8'
+        codec = self.init_codec(
+            encode_dtype=encode_dtype, decode_dtype=decode_dtype
+        )
+        arr = np.arange(10, 20, 1, dtype=decode_dtype)
+        expect = arr.astype(encode_dtype)
+        actual = codec.encode(arr)
+        assert_array_equal(expect, actual)
+        eq(np.dtype(encode_dtype), actual.dtype)
+
+    def test_decode_input(self):
+        encode_dtype = 'i4'
+        decode_dtype = 'i8'
+        codec = self.init_codec(
+            encode_dtype=encode_dtype, decode_dtype=decode_dtype
+        )
+        arr = np.arange(10, 20, 1, dtype=encode_dtype)
+        expect = arr.astype(decode_dtype)
+        actual = codec.decode(arr)
+        assert_array_equal(expect, actual)
+        eq(np.dtype(decode_dtype), actual.dtype)
+
+    def test_repr(self):
+        codec = self.init_codec(encode_dtype='i4', decode_dtype='i8')
+        expect = 'AsType(encode_dtype=int32, decode_dtype=int64)'
+        actual = repr(codec)
+        eq(expect, actual)
+
+
 class TestFixedScaleOffset(CodecTests, unittest.TestCase):
 
     codec_id = 'fixedscaleoffset'
diff --git a/zarr/tests/test_core.py b/zarr/tests/test_core.py
index 4f899da781..670a295f2e 100644
--- a/zarr/tests/test_core.py
+++ b/zarr/tests/test_core.py
@@ -766,6 +766,40 @@ def test_repr(self):
         for l1, l2 in zip(expect.split('\n'), actual.split('\n')):
             eq(l1, l2)
 
+    def test_astype_no_filters(self):
+        shape = (100,)
+        dtype = np.dtype(np.int8)
+        astype = np.dtype(np.float32)
+
+        store = dict()
+        init_array(store, shape=shape, chunks=10, dtype=dtype)
+
+        data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+
+        z1 = Array(store)
+        z1[...] = data
+        z2 = z1.astype(astype)
+
+        expected = data.astype(astype)
+        assert_array_equal(expected, z2)
+        eq(z2.read_only, True)
+
+    def test_astype(self):
+        shape = (100,)
+        chunks = (10,)
+
+        dtype = np.dtype(np.int8)
+        astype = np.dtype(np.float32)
+
+        data = np.arange(np.prod(shape), dtype=dtype).reshape(shape)
+
+        z1 = self.create_array(shape=shape, chunks=chunks, dtype=dtype)
+        z1[...] = data
+        z2 = z1.astype(astype)
+
+        expected = data.astype(astype)
+        assert_array_equal(expected, z2)
+
 
 # custom store, does not support getsize()
 class CustomMapping(object):
diff --git a/zarr/tests/test_filters.py b/zarr/tests/test_filters.py
index ecbf3748d8..720767a057 100644
--- a/zarr/tests/test_filters.py
+++ b/zarr/tests/test_filters.py
@@ -7,7 +7,7 @@
 
 from nose.tools import eq_ as eq
 
-from zarr.codecs import Delta, FixedScaleOffset, \
+from zarr.codecs import AsType, Delta, FixedScaleOffset, \
     Quantize, PackBits, Categorize, \
     Zlib, Blosc, BZ2
 from zarr.creation import array
@@ -55,6 +55,38 @@ def test_array_with_delta_filter():
         assert_array_equal(expect, actual)
 
 
+def test_array_with_astype_filter():
+
+    # setup
+    encode_dtype = 'i1'
+    decode_dtype = 'i8'
+    filters = [AsType(encode_dtype=encode_dtype, decode_dtype=decode_dtype)]
+    chunks = 10
+    chunk_size = 10
+    shape = chunks * chunk_size
+    data = np.arange(shape, dtype=decode_dtype)
+
+    for compressor in compressors:
+        print(repr(compressor))
+
+        a = array(data, chunks=chunks, compressor=compressor, filters=filters)
+
+        # check round-trip
+        assert data.dtype == a.dtype
+        assert_array_equal(data, a[:])
+
+        # check chunks
+        for i in range(chunks):
+            cdata = a.store[str(i)]
+            if compressor:
+                chunk = compressor.decode(cdata)
+            else:
+                chunk = cdata
+            actual = np.frombuffer(chunk, dtype=encode_dtype)
+            expect = data.astype(encode_dtype)[i*chunk_size:(i+1)*chunk_size]
+            assert_array_equal(expect, actual)
+
+
 def test_array_with_scaleoffset_filter():
 
     # setup
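
A brief usage sketch, not part of the diff above: it exercises the two additions the same way the new tests do. The calls to zarr.array, zarr.codecs.AsType, Array.astype and the dtype/read_only properties are taken from the changes shown here; the sample data, dtypes and chunk size are illustrative only.

    import numpy as np
    import zarr
    from zarr.codecs import AsType

    # Store chunks as int8 on disk; decode back to int64 on read.
    filters = [AsType(encode_dtype='i1', decode_dtype='i8')]
    z = zarr.array(np.arange(100, dtype='i8'), chunks=10, filters=filters)
    print(z.dtype)    # int64 -- the dtype seen by the user
    print(z[:5])      # [0 1 2 3 4]

    # Read-only view of an existing array with on-the-fly type conversion.
    a = zarr.array(np.arange(100, dtype=np.uint8), chunks=10)
    v = a.astype(np.float32)
    print(v.dtype, v.read_only)    # float32 True
    print(v[:5])                   # [0. 1. 2. 3. 4.] (formatting varies by numpy version)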