diff --git a/.travis.yml b/.travis.yml index d797e9844bc..2c6ad370f26 100644 --- a/.travis.yml +++ b/.travis.yml @@ -43,6 +43,8 @@ matrix: env: CONDA_ENV=py36-pynio-dev - python: 3.6 env: CONDA_ENV=py36-rasterio1.0alpha + - python: 3.6 + env: CONDA_ENV=py36-zarr-dev allow_failures: - python: 3.6 env: @@ -67,6 +69,8 @@ matrix: env: CONDA_ENV=py36-pynio-dev - python: 3.6 env: CONDA_ENV=py36-rasterio1.0alpha + - python: 3.6 + env: CONDA_ENV=py36-zarr-dev before_install: - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then diff --git a/ci/requirements-py27-cdat+pynio.yml b/ci/requirements-py27-cdat+pynio.yml index 126dd416509..8bf4fe601e2 100644 --- a/ci/requirements-py27-cdat+pynio.yml +++ b/ci/requirements-py27-cdat+pynio.yml @@ -22,6 +22,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py27-windows.yml b/ci/requirements-py27-windows.yml index bb76afd60c9..a39b24b887c 100644 --- a/ci/requirements-py27-windows.yml +++ b/ci/requirements-py27-windows.yml @@ -19,3 +19,4 @@ dependencies: - seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py35.yml b/ci/requirements-py35.yml index 8c5ca993541..6f9ae2490b9 100644 --- a/ci/requirements-py35.yml +++ b/ci/requirements-py35.yml @@ -17,6 +17,7 @@ dependencies: - seaborn - toolz - rasterio + - zarr - pip: - coveralls - pytest-cov diff --git a/ci/requirements-py36-windows.yml b/ci/requirements-py36-windows.yml index 70ff3e50a1b..ea366bd04f7 100644 --- a/ci/requirements-py36-windows.yml +++ b/ci/requirements-py36-windows.yml @@ -16,3 +16,4 @@ dependencies: - seaborn - toolz - rasterio + - zarr diff --git a/ci/requirements-py36-zarr-dev.yml b/ci/requirements-py36-zarr-dev.yml new file mode 100644 index 00000000000..9be522882c5 --- /dev/null +++ b/ci/requirements-py36-zarr-dev.yml @@ -0,0 +1,20 @@ +name: test_env +channels: + - conda-forge +dependencies: + - python=3.6 + - dask + - distributed + - matplotlib + - pytest + - flake8 + - numpy + - pandas + - scipy + - seaborn + - toolz + - bottleneck + - pip: + - coveralls + - pytest-cov + - git+https://github.com/alimanfoo/zarr.git diff --git a/ci/requirements-py36.yml b/ci/requirements-py36.yml index d679c831cdc..6bf2efe2659 100644 --- a/ci/requirements-py36.yml +++ b/ci/requirements-py36.yml @@ -18,6 +18,7 @@ dependencies: - toolz - rasterio - bottleneck + - zarr - pip: - coveralls - pytest-cov diff --git a/doc/api.rst b/doc/api.rst index 7bcb844783a..60b0aa47e1f 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -415,7 +415,9 @@ Dataset methods open_dataset open_mfdataset open_rasterio + open_zarr Dataset.to_netcdf + Dataset.to_zarr save_mfdataset Dataset.to_array Dataset.to_dataframe diff --git a/doc/conf.py b/doc/conf.py index 5a62cb59733..8a6d5ae4c4d 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -23,7 +23,7 @@ print("python exec:", sys.executable) print("sys.path:", sys.path) for name in ('numpy scipy pandas matplotlib dask IPython seaborn ' - 'cartopy netCDF4 rasterio').split(): + 'cartopy netCDF4 rasterio zarr').split(): try: module = importlib.import_module(name) if name == 'matplotlib': diff --git a/doc/environment.yml b/doc/environment.yml index ae3ddb81719..45fa6417e16 100644 --- a/doc/environment.yml +++ b/doc/environment.yml @@ -16,3 +16,4 @@ dependencies: - cartopy=0.15.1 - rasterio=0.36.0 - sphinx-gallery + - zarr diff --git a/doc/installing.rst b/doc/installing.rst index 62d2a673e22..75d79c296bc 100644 --- a/doc/installing.rst +++ b/doc/installing.rst @@ -24,6 +24,7 @@ For netCDF and IO reading and writing netCDF4 
  files that does not use the netCDF-C libraries
 - `pynio `__: for reading GRIB and other geoscience specific file formats
+- `zarr `_
 if you feel the need to deviate from UTF-8, by setting the ``_Encoding`` field
 in ``encoding``. But
-`we don't recommend it`_.
+`we don't recommend it `_.
 
 .. warning::
 
@@ -502,6 +502,103 @@ longitudes and latitudes.
 
 .. _test files: https://github.com/mapbox/rasterio/blob/master/tests/data/RGB.byte.tif
 .. _pyproj: https://github.com/jswhit/pyproj
 
+.. _io.zarr:
+
+Zarr
+----
+
+`Zarr`_ is a Python package providing an implementation of chunked, compressed,
+N-dimensional arrays.
+Zarr has the ability to store arrays in a range of ways, including in memory,
+in files, and in cloud-based object storage such as `Amazon S3`_ and
+`Google Cloud Storage`_.
+Xarray's Zarr backend allows xarray to leverage these capabilities.
+
+.. warning::
+
+    Zarr support is still an experimental feature. Please report any bugs or
+    unexpected behavior via github issues.
+
+Xarray can't open just any zarr dataset, because xarray requires special
+metadata (attributes) describing the dataset dimensions and coordinates.
+At this time, xarray can only open zarr datasets that have been written by
+xarray. To write a dataset with zarr, we use the
+:py:attr:`Dataset.to_zarr ` method.
+To write to a local directory, we pass a path to a directory:
+
+.. ipython:: python
+    :suppress:
+
+    ! rm -rf path/to/directory.zarr
+
+.. ipython:: python
+
+    ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))},
+                    coords={'x': [10, 20, 30, 40],
+                            'y': pd.date_range('2000-01-01', periods=5),
+                            'z': ('x', list('abcd'))})
+    ds.to_zarr('path/to/directory.zarr')
+
+(The suffix ``.zarr`` is optional--just a reminder that a zarr store lives
+there.) If the directory does not exist, it will be created. If a zarr
+store is already present at that path, an error will be raised, preventing it
+from being overwritten. To override this behavior and overwrite an existing
+store, add ``mode='w'`` when invoking ``to_zarr``.
+
+To read back a zarr dataset that has been created this way, we use the
+:py:func:`~xarray.open_zarr` method:
+
+.. ipython:: python
+
+    ds_zarr = xr.open_zarr('path/to/directory.zarr')
+    ds_zarr
+
+Cloud Storage Buckets
+~~~~~~~~~~~~~~~~~~~~~
+
+It is possible to read and write xarray datasets directly from / to cloud
+storage buckets using zarr. This example uses the `gcsfs`_ package to provide
+a ``MutableMapping`` interface to `Google Cloud Storage`_, which we can then
+pass to xarray::
+
+    import gcsfs
+    fs = gcsfs.GCSFileSystem(project='', token=None)
+    gcsmap = gcsfs.mapping.GCSMap('', gcs=fs, check=True, create=False)
+    # write to the bucket
+    ds.to_zarr(store=gcsmap)
+    # read it back
+    ds_gcs = xr.open_zarr(gcsmap)
+
+.. _Zarr: http://zarr.readthedocs.io/
+.. _Amazon S3: https://aws.amazon.com/s3/
+.. _Google Cloud Storage: https://cloud.google.com/storage/
+.. _gcsfs: https://github.com/dask/gcsfs
+
+Zarr Compressors and Filters
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+There are many different options for compression and filtering possible with
+zarr. These are described in the
+`zarr documentation `_.
+These options can be passed to the ``to_zarr`` method as variable encoding.
+For example:
+
+.. ipython:: python
+    :suppress:
+
+    ! rm -rf foo.zarr
+
+.. ipython:: python
+
+    import zarr
+    compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
+    ds.to_zarr('foo.zarr', encoding={'foo': {'compressor': compressor}})
+
+.. note::
+
+    Not all native zarr compression and filtering options have been tested
+    with xarray.
+
 .. _io.pynio:
 
 Formats supported by PyNIO
@@ -529,6 +626,7 @@ exporting your objects to pandas and using its broad range of `IO tools`_.
 
 .. _IO tools: http://pandas.pydata.org/pandas-docs/stable/io.html
 
+
 Combining multiple files
 ------------------------
diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index a1851633fd1..10182df7831 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -28,12 +28,15 @@ Enhancements
 - Use ``pandas.Grouper`` class in xarray resample methods rather than the
   deprecated ``pandas.TimeGrouper`` class (:issue:`1766`).
   By `Joe Hamman `_.
-
-
+- Support for using `Zarr`_ as storage layer for xarray.
+  By `Ryan Abernathey `_.
 - Experimental support for parsing ENVI metadata to coordinates and attributes
   in :py:func:`xarray.open_rasterio`.
   By `Matti Eskelinen `
 
+.. _Zarr: http://zarr.readthedocs.io/
+
+
 Bug fixes
 ~~~~~~~~~
diff --git a/xarray/__init__.py b/xarray/__init__.py
index 1b344252e57..3e80acd1572 100644
--- a/xarray/__init__.py
+++ b/xarray/__init__.py
@@ -18,6 +18,7 @@ from .backends.api import (open_dataset, open_dataarray, open_mfdataset,
                            save_mfdataset)
 from .backends.rasterio_ import open_rasterio
+from .backends.zarr import open_zarr
 from .conventions import decode_cf, SerializationWarning
diff --git a/xarray/backends/__init__.py b/xarray/backends/__init__.py
index a082bd53e5e..a8a4afc359a 100644
--- a/xarray/backends/__init__.py
+++ b/xarray/backends/__init__.py
@@ -10,3 +10,4 @@ from .pynio_ import NioDataStore
 from .scipy_ import ScipyDataStore
 from .h5netcdf_ import H5NetCDFStore
+from .zarr import ZarrStore
diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 36c686e7a91..cdeb8c0c0c2 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -713,3 +713,29 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
     finally:
         for store in stores:
             store.close()
+
+
+def to_zarr(dataset, store=None, mode='w-', synchronizer=None, group=None,
+            encoding=None):
+    """This function creates an appropriate datastore for writing a dataset
+    to a zarr store.
+
+    See `Dataset.to_zarr` for full API docs.
+ """ + if isinstance(store, path_type): + store = str(store) + if encoding is None: + encoding = {} + + # validate Dataset keys, DataArray names, and attr keys/values + _validate_dataset_names(dataset) + _validate_attrs(dataset) + + store = backends.ZarrStore.open_group(store=store, mode=mode, + synchronizer=synchronizer, + group=group, writer=None) + + # I think zarr stores should always be sync'd immediately + # TODO: figure out how to properly handle unlimited_dims + dataset.dump_to_store(store, sync=True, encoding=encoding) + return store diff --git a/xarray/backends/common.py b/xarray/backends/common.py index d33bffb1c1e..fd408877f87 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -164,9 +164,10 @@ def __exit__(self, exception_type, exception_value, traceback): class ArrayWriter(object): - def __init__(self): + def __init__(self, lock=GLOBAL_LOCK): self.sources = [] self.targets = [] + self.lock = lock def add(self, source, target): if isinstance(source, dask_array_type): @@ -184,7 +185,7 @@ def sync(self): import dask.array as da import dask if LooseVersion(dask.__version__) > LooseVersion('0.8.1'): - da.store(self.sources, self.targets, lock=GLOBAL_LOCK) + da.store(self.sources, self.targets, lock=self.lock) else: da.store(self.sources, self.targets) self.sources = [] diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py new file mode 100644 index 00000000000..779d8d07886 --- /dev/null +++ b/xarray/backends/zarr.py @@ -0,0 +1,542 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from itertools import product +from base64 import b64encode + +import numpy as np + +from .. import Variable +from ..core import indexing +from ..core.utils import FrozenOrderedDict, HiddenKeyDict +from ..core.pycompat import iteritems, OrderedDict, integer_types +from .common import AbstractWritableDataStore, BackendArray, ArrayWriter +from .. 
import conventions + +# need some special secret attributes to tell us the dimensions +_DIMENSION_KEY = '_ARRAY_DIMENSIONS' + + +# zarr attributes have to be serializable as json +# many xarray datasets / variables have numpy arrays and values +# these functions handle encoding / decoding of such items +def _encode_zarr_attr_value(value): + if isinstance(value, np.ndarray): + encoded = value.tolist() + # this checks if it's a scalar number + elif isinstance(value, np.generic): + encoded = value.item() + # np.string_('X').item() returns a type `bytes` + # zarr still doesn't like that + if type(encoded) is bytes: + encoded = b64encode(encoded) + else: + encoded = value + return encoded + + +def _ensure_valid_fill_value(value, dtype): + if dtype.type == np.string_ and type(value) == bytes: + valid = b64encode(value) + else: + valid = value + return _encode_zarr_attr_value(valid) + + +def _decode_zarr_attr_value(value): + return value + + +def _decode_zarr_attrs(attrs): + return OrderedDict([(k, _decode_zarr_attr_value(v)) + for k, v in attrs.items()]) + + +def _replace_slices_with_arrays(key, shape): + """Replace slice objects in vindex with equivalent ndarray objects.""" + num_slices = sum(1 for k in key if isinstance(k, slice)) + ndims = [k.ndim for k in key if isinstance(k, np.ndarray)] + array_subspace_size = max(ndims) if ndims else 0 + assert len(key) == len(shape) + new_key = [] + slice_count = 0 + for k, size in zip(key, shape): + if isinstance(k, slice): + # the slice subspace always appears after the ndarray subspace + array = np.arange(*k.indices(size)) + sl = [np.newaxis] * len(shape) + sl[array_subspace_size + slice_count] = slice(None) + k = array[tuple(sl)] + slice_count += 1 + else: + assert isinstance(k, np.ndarray) + k = k[(slice(None),) * array_subspace_size + + (np.newaxis,) * num_slices] + new_key.append(k) + return tuple(new_key) + + +class ZarrArrayWrapper(BackendArray): + def __init__(self, variable_name, datastore): + self.datastore = datastore + self.variable_name = variable_name + + array = self.get_array() + self.shape = array.shape + + dtype = array.dtype + self.dtype = dtype + + def get_array(self): + return self.datastore.ds[self.variable_name] + + def __getitem__(self, key): + array = self.get_array() + if isinstance(key, indexing.BasicIndexer): + return array[key.tuple] + elif isinstance(key, indexing.VectorizedIndexer): + return array.vindex[_replace_slices_with_arrays(key.tuple, + self.shape)] + else: + assert isinstance(key, indexing.OuterIndexer) + return array.oindex[key.tuple] + # if self.ndim == 0: + # could possibly have a work-around for 0d data here + + +def _determine_zarr_chunks(enc_chunks, var_chunks, ndim): + """ + Given encoding chunks (possibly None) and variable chunks (possibly None) + """ + + # zarr chunk spec: + # chunks : int or tuple of ints, optional + # Chunk shape. If not provided, will be guessed from shape and dtype. 
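+    # Illustrative mapping (added examples, not taken from the zarr spec):
+    # dask chunks ((4, 4, 4, 2),) translate to the zarr chunk shape (4,),
+    # because only the final chunk may be smaller; irregular dask chunks
+    # such as ((5, 4, 3),) cannot be expressed as a single zarr chunk shape
+    # and raise ValueError in the logic below.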
+
+    # if there are no chunks in encoding and the variable data is a numpy
+    # array, then we let zarr use its own heuristics to pick the chunks
+    if var_chunks is None and enc_chunks is None:
+        return None
+
+    # if there are no chunks in encoding but there are dask chunks, we try to
+    # use the same chunks in zarr
+    # However, zarr chunks need to be uniform for each array
+    # http://zarr.readthedocs.io/en/latest/spec/v1.html#chunks
+    # while dask chunks can be variable sized
+    # http://dask.pydata.org/en/latest/array-design.html#chunks
+    if var_chunks and enc_chunks is None:
+        all_var_chunks = list(product(*var_chunks))
+        first_var_chunk = all_var_chunks[0]
+        # all but the last chunk have to match exactly
+        for this_chunk in all_var_chunks[:-1]:
+            if this_chunk != first_var_chunk:
+                raise ValueError(
+                    "Zarr requires uniform chunk sizes except for the final "
+                    "chunk. Variable %r has incompatible chunks. Consider "
+                    "rechunking using `chunk()`." % (var_chunks,))
+        # last chunk is allowed to be smaller
+        last_var_chunk = all_var_chunks[-1]
+        for len_first, len_last in zip(first_var_chunk, last_var_chunk):
+            if len_last > len_first:
+                raise ValueError(
+                    "Final chunk of Zarr array must be the same size or "
+                    "smaller than the first. Variable %r has incompatible "
+                    "chunks. Consider rechunking using `chunk()`."
+                    % var_chunks)
+        return first_var_chunk
+
+    # from here on, we are dealing with user-specified chunks in encoding
+    # zarr allows chunks to be an integer, in which case it uses the same
+    # chunk size on each dimension.
+    # Here we re-implement this expansion ourselves. That makes the logic of
+    # checking chunk compatibility easier.
+
+    if isinstance(enc_chunks, integer_types):
+        enc_chunks_tuple = ndim * (enc_chunks,)
+    else:
+        enc_chunks_tuple = tuple(enc_chunks)
+
+    if len(enc_chunks_tuple) != ndim:
+        raise ValueError("zarr chunks tuple %r must have same length as "
+                         "variable.ndim %g" %
+                         (enc_chunks_tuple, ndim))
+
+    for x in enc_chunks_tuple:
+        if not isinstance(x, int):
+            raise TypeError("zarr chunks must be an int or a tuple of ints. "
+                            "Instead found %r" % (enc_chunks_tuple,))
+
+    # if there are chunks in encoding and the variable data is a numpy array,
+    # we use the specified chunks
+    if var_chunks is None:
+        return enc_chunks_tuple
+
+    # the hard case
+    # DESIGN CHOICE: do not allow multiple dask chunks on a single zarr chunk
+    # this avoids the need to get involved in zarr synchronization / locking
+    # From zarr docs:
+    #  "If each worker in a parallel computation is writing to a separate
+    #   region of the array, and if region boundaries are perfectly aligned
+    #   with chunk boundaries, then no synchronization is required."
+    # TODO: incorporate synchronizer to allow writes from multiple dask
+    # threads
+    if var_chunks and enc_chunks_tuple:
+        for zchunk, dchunks in zip(enc_chunks_tuple, var_chunks):
+            for dchunk in dchunks:
+                if dchunk % zchunk:
+                    raise NotImplementedError(
+                        "Specified zarr chunks %r would overlap multiple "
+                        "dask chunks %r. This is not implemented in xarray "
+                        "yet. Consider rechunking the data using `chunk()` "
+                        "or specifying different chunks in encoding."
+                        % (enc_chunks_tuple, var_chunks))
+        return enc_chunks_tuple
+
+    raise AssertionError(
+        "We should never get here. Function logic must be wrong.")
+
+
+def _get_zarr_dims_and_attrs(zarr_obj, dimension_key):
+    # Zarr arrays do not have dimensions. To get around this problem, we add
+    # an attribute that specifies the dimension. We have to hide this
+    # attribute when we send the attributes to the user.
+ # zarr_obj can be either a zarr group or zarr array + try: + dimensions = zarr_obj.attrs[dimension_key] + except KeyError: + raise KeyError("Zarr object is missing the attribute `%s`, which is " + "required for xarray to determine variable dimensions." + % (dimension_key)) + attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key]) + return dimensions, attributes + + +def _extract_zarr_variable_encoding(variable, raise_on_invalid=False): + encoding = variable.encoding.copy() + + valid_encodings = set(['chunks', 'compressor', 'filters', + 'cache_metadata']) + + if raise_on_invalid: + invalid = [k for k in encoding if k not in valid_encodings] + if invalid: + raise ValueError('unexpected encoding parameters for zarr ' + 'backend: %r' % invalid) + else: + for k in list(encoding): + if k not in valid_encodings: + del encoding[k] + + chunks = _determine_zarr_chunks(encoding.get('chunks'), variable.chunks, + variable.ndim) + encoding['chunks'] = chunks + return encoding + + +# Function below is copied from conventions.encode_cf_variable. +# The only change is to raise an error for object dtypes. +def encode_zarr_variable(var, needs_copy=True, name=None): + """ + Converts an Variable into an Variable which follows some + of the CF conventions: + + - Nans are masked using _FillValue (or the deprecated missing_value) + - Rescaling via: scale_factor and add_offset + - datetimes are converted to the CF 'units since time' format + - dtype encodings are enforced. + + Parameters + ---------- + var : xarray.Variable + A variable holding un-encoded data. + + Returns + ------- + out : xarray.Variable + A variable which has been encoded as described above. + """ + + if var.dtype.kind == 'O': + raise NotImplementedError("Variable `%s` is an object. Zarr " + "store can't yet encode objects." % name) + + var = conventions.maybe_encode_datetime(var, name=name) + var = conventions.maybe_encode_timedelta(var, name=name) + var, needs_copy = conventions.maybe_encode_offset_and_scale(var, + needs_copy, + name=name) + var, needs_copy = conventions.maybe_encode_fill_value(var, needs_copy, + name=name) + var = conventions.maybe_encode_nonstring_dtype(var, name=name) + var = conventions.maybe_default_fill_value(var) + var = conventions.maybe_encode_bools(var) + var = conventions.ensure_dtype_not_object(var, name=name) + var = conventions.maybe_encode_string_dtype(var, name=name) + return var + + +class ZarrStore(AbstractWritableDataStore): + """Store for reading and writing data via zarr + """ + + @classmethod + def open_group(cls, store, mode='r', synchronizer=None, group=None, + writer=None): + import zarr + zarr_group = zarr.open_group(store=store, mode=mode, + synchronizer=synchronizer, path=group) + return cls(zarr_group, writer=writer) + + def __init__(self, zarr_group, writer=None): + self.ds = zarr_group + self._read_only = self.ds.read_only + self._synchronizer = self.ds.synchronizer + self._group = self.ds.path + + if _DIMENSION_KEY not in self.ds.attrs: + if self._read_only: + raise KeyError("Zarr group can't be read by xarray because " + "it is missing the `%s` attribute." % + _DIMENSION_KEY) + else: + # initialize hidden dimension attribute + self.ds.attrs[_DIMENSION_KEY] = {} + + if writer is None: + # by default, we should not need a lock for writing zarr because + # we do not (yet) allow overlapping chunks during write + zarr_writer = ArrayWriter(lock=False) + else: + zarr_writer = writer + + # do we need to define attributes for all of the opener keyword args? 
+ super(ZarrStore, self).__init__(zarr_writer) + + def open_store_variable(self, name, zarr_array): + data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) + dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, + _DIMENSION_KEY) + attributes = _decode_zarr_attrs(attributes) + encoding = {'chunks': zarr_array.chunks, + 'compressor': zarr_array.compressor, + 'filters': zarr_array.filters} + # _FillValue needs to be in attributes, not encoding, so it will get + # picked up by decode_cf + if getattr(zarr_array, 'fill_value') is not None: + attributes['_FillValue'] = zarr_array.fill_value + + return Variable(dimensions, data, attributes, encoding) + + def get_variables(self): + return FrozenOrderedDict((k, self.open_store_variable(k, v)) + for k, v in self.ds.arrays()) + + def get_attrs(self): + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + return _decode_zarr_attrs(attributes) + + def get_dimensions(self): + dimensions, _ = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + return dimensions + + def set_dimension(self, name, length, is_unlimited=False): + if is_unlimited: + raise NotImplementedError( + "Zarr backend doesn't know how to handle unlimited dimensions") + # consistency check + if name in self.ds.attrs[_DIMENSION_KEY]: + if self.ds.attrs[_DIMENSION_KEY][name] != length: + raise ValueError("Pre-existing array dimensions %r " + "encoded in Zarr attributes are incompatible " + "with newly specified dimension `%s`: %g" % + (self.ds.attrs[_DIMENSION_KEY], name, length)) + self.ds.attrs[_DIMENSION_KEY][name] = length + + def set_attribute(self, key, value): + _, attributes = _get_zarr_dims_and_attrs(self.ds, _DIMENSION_KEY) + attributes[key] = _encode_zarr_attr_value(value) + + def prepare_variable(self, name, variable, check_encoding=False, + unlimited_dims=None): + + attrs = variable.attrs.copy() + dims = variable.dims + dtype = variable.dtype + shape = variable.shape + + # TODO: figure out how zarr should deal with unlimited dimensions + self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims) + + fill_value = _ensure_valid_fill_value(attrs.pop('_FillValue', None), + dtype) + + # TODO: figure out what encoding is needed for zarr + encoding = _extract_zarr_variable_encoding( + variable, raise_on_invalid=check_encoding) + + # arguments for zarr.create: + # zarr.creation.create(shape, chunks=None, dtype=None, + # compressor='default', fill_value=0, order='C', store=None, + # synchronizer=None, overwrite=False, path=None, chunk_store=None, + # filters=None, cache_metadata=True, **kwargs) + zarr_array = self.ds.create(name, shape=shape, dtype=dtype, + fill_value=fill_value, **encoding) + # decided not to explicity enumerate encoding options because we + # risk overriding zarr's defaults (e.g. if we specificy + # cache_metadata=None instead of True). Alternative is to have lots of + # logic in _extract_zarr_variable encoding to duplicate zarr defaults. 
+        # chunks=encoding.get('chunks'),
+        # compressor=encoding.get('compressor'),
+        # filters=encoding.get('filters'),
+        # cache_metadata=encoding.get('cache_metadata'))
+
+        # the magic for storing the hidden dimension data
+        zarr_array.attrs[_DIMENSION_KEY] = dims
+        _, attributes = _get_zarr_dims_and_attrs(zarr_array, _DIMENSION_KEY)
+
+        for k, v in iteritems(attrs):
+            attributes[k] = _encode_zarr_attr_value(v)
+
+        return zarr_array, variable.data
+
+    def store(self, variables, attributes, *args, **kwargs):
+        new_vars = OrderedDict((k, encode_zarr_variable(v, name=k))
+                               for k, v in iteritems(variables))
+        AbstractWritableDataStore.store(self, new_vars, attributes,
+                                        *args, **kwargs)
+    # sync() and close() methods should not be needed with zarr
+
+
+# from zarr docs
+
+# Zarr arrays can be used as either the source or sink for data in parallel
+# computations. Both multi-threaded and multi-process parallelism are
+# supported. The Python global interpreter lock (GIL) is released for both
+# compression and decompression operations, so Zarr will not block other
+# Python threads from running.
+#
+# A Zarr array can be read concurrently by multiple threads or processes. No
+# synchronization (i.e., locking) is required for concurrent reads.
+#
+# A Zarr array can also be written to concurrently by multiple threads or
+# processes. Some synchronization may be required, depending on the way the
+# data is being written.
+
+# If each worker in a parallel computation is writing to a separate region of
+# the array, and if region boundaries are perfectly aligned with chunk
+# boundaries, then no synchronization is required. However, if region and
+# chunk boundaries are not perfectly aligned, then synchronization is
+# required to avoid two workers attempting to modify the same chunk at the
+# same time.
+
+
+def open_zarr(store, group=None, synchronizer=None, auto_chunk=True,
+              decode_cf=True, mask_and_scale=True, decode_times=True,
+              concat_characters=True, decode_coords=True,
+              drop_variables=None):
+    """Load and decode a dataset from a Zarr store.
+
+    .. note:: Experimental
+        The Zarr backend is new and experimental. Please report any
+        unexpected behavior via github issues.
+
+    The `store` object should be a valid store for a Zarr group. `store`
+    variables must contain dimension metadata encoded in the
+    `_ARRAY_DIMENSIONS` attribute.
+
+    Parameters
+    ----------
+    store : MutableMapping or str
+        A MutableMapping where a Zarr Group has been stored or a path to a
+        directory in file system where a Zarr DirectoryStore has been stored.
+    synchronizer : object, optional
+        Array synchronizer provided to zarr
+    group : str, optional
+        Group path. (a.k.a. `path` in zarr terminology.)
+    auto_chunk : bool, optional
+        Whether to automatically create dask chunks corresponding to each
+        variable's zarr chunks. If False, zarr array data will lazily convert
+        to numpy arrays upon access.
+    decode_cf : bool, optional
+        Whether to decode these variables, assuming they were saved according
+        to CF conventions.
+    mask_and_scale : bool, optional
+        If True, replace array values equal to `_FillValue` with NA and scale
+        values according to the formula `original_values * scale_factor +
+        add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are
+        taken from variable attributes (if they exist). If the `_FillValue`
+        or `missing_value` attribute contains multiple values a warning will
+        be issued and all array values matching one of the multiple values
+        will be replaced by NA.
+ decode_times : bool, optional + If True, decode times encoded in the standard NetCDF datetime format + into datetime objects. Otherwise, leave them encoded as numbers. + concat_characters : bool, optional + If True, concatenate along the last dimension of character arrays to + form string arrays. Dimensions will only be concatenated over (and + removed) if they have no corresponding variable and if they are only + used as the last dimension of character arrays. + decode_coords : bool, optional + If True, decode the 'coordinates' attribute to identify coordinates in + the resulting dataset. + drop_variables: string or iterable, optional + A variable or list of variables to exclude from being parsed from the + dataset. This may be useful to drop variables with problems or + inconsistent values. + + Returns + ------- + dataset : Dataset + The newly created dataset. + + See Also + -------- + open_dataset + + References + ---------- + http://zarr.readthedocs.io/ + """ + + if not decode_cf: + mask_and_scale = False + decode_times = False + concat_characters = False + decode_coords = False + + def maybe_decode_store(store, lock=False): + ds = conventions.decode_cf( + store, mask_and_scale=mask_and_scale, decode_times=decode_times, + concat_characters=concat_characters, decode_coords=decode_coords, + drop_variables=drop_variables) + + # TODO: this is where we would apply caching + + return ds + + # Zarr supports a wide range of access modes, but for now xarray either + # reads or writes from a store, never both. For open_zarr, we only read + mode = 'r' + zarr_store = ZarrStore.open_group(store, mode=mode, + synchronizer=synchronizer, + group=group) + ds = maybe_decode_store(zarr_store) + + # auto chunking needs to be here and not in ZarrStore because variable + # chunks do not survive decode_cf + if auto_chunk: + # adapted from Dataset.Chunk() + def maybe_chunk(name, var): + from dask.base import tokenize + chunks = var.encoding.get('chunks') + if (var.ndim > 0) and (chunks is not None): + # does this cause any data to be read? + token2 = tokenize(name, var._data) + name2 = 'zarr-%s-%s' % (name, token2) + return var.chunk(chunks, name=name2, lock=None) + else: + return var + + variables = OrderedDict([(k, maybe_chunk(k, v)) + for k, v in ds.variables.items()]) + return ds._replace_vars_and_dims(variables) + else: + return ds diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py index 681390f8504..82f6d768bd5 100644 --- a/xarray/core/dataset.py +++ b/xarray/core/dataset.py @@ -1087,7 +1087,7 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, Write ('w') or append ('a') mode. If mode='w', any existing file at this location will be overwritten. If mode='a', existing variables will be overwritten. - format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT', 'NETCDF3_CLASSIC'}, optional + format : {'NETCDF4', 'NETCDF4_CLASSIC', 'NETCDF3_64BIT','NETCDF3_CLASSIC'}, optional File format for the resulting netCDF file: * NETCDF4: Data is stored in an HDF5 file, using netCDF4 API @@ -1132,6 +1132,40 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None, engine=engine, encoding=encoding, unlimited_dims=unlimited_dims) + def to_zarr(self, store=None, mode='w-', synchronizer=None, group=None, + encoding=None): + """Write dataset contents to a zarr group. + + .. note:: Experimental + The Zarr backend is new and experimental. Please report any + unexpected behavior via github issues. 
+
+        Parameters
+        ----------
+        store : MutableMapping or str, optional
+            Store or path to directory in file system.
+        mode : {'w', 'w-'}
+            Persistence mode: 'w' means create (overwrite if exists);
+            'w-' means create (fail if exists).
+        synchronizer : object, optional
+            Array synchronizer
+        group : str, optional
+            Group path. (a.k.a. `path` in zarr terminology.)
+        encoding : dict, optional
+            Nested dictionary with variable names as keys and dictionaries
+            of variable specific encodings as values, e.g.,
+            ``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,}, ...}``
+        """
+        if encoding is None:
+            encoding = {}
+        if mode not in ['w', 'w-']:
+            # TODO: figure out how to handle 'r+' and 'a'
+            raise ValueError("The only supported options for mode are 'w' "
+                             "and 'w-'.")
+        from ..backends.api import to_zarr
+        return to_zarr(self, store=store, mode=mode,
+                       synchronizer=synchronizer, group=group,
+                       encoding=encoding)
+
     def __unicode__(self):
         return formatting.dataset_repr(self)
diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py
index 3aea8ca6b8a..f1bbe202fa8 100644
--- a/xarray/core/indexing.py
+++ b/xarray/core/indexing.py
@@ -376,9 +376,9 @@ class VectorizedIndexer(ExplicitIndexer):
     """Tuple for vectorized indexing.
 
     All elements should be slice or N-dimensional np.ndarray objects with an
-    integer dtype. Indexing follows proposed rules for np.ndarray.vindex, which
-    matches NumPy's advanced indexing rules (including broadcasting) except
-    sliced axes are always moved to the end:
+    integer dtype and the same number of dimensions. Indexing follows proposed
+    rules for np.ndarray.vindex, which matches NumPy's advanced indexing rules
+    (including broadcasting) except sliced axes are always moved to the end:
     https://github.com/numpy/numpy/pull/6256
     """
     def __init__(self, key):
@@ -386,6 +386,7 @@ def __init__(self, key):
             raise TypeError('key must be a tuple: {!r}'.format(key))
 
         new_key = []
+        ndim = None
         for k in key:
             if isinstance(k, slice):
                 k = as_integer_slice(k)
@@ -393,6 +394,13 @@ def __init__(self, key):
                 if not np.issubdtype(k.dtype, np.integer):
                     raise TypeError('invalid indexer array, does not have '
                                     'integer dtype: {!r}'.format(k))
+                if ndim is None:
+                    ndim = k.ndim
+                elif ndim != k.ndim:
+                    ndims = [k.ndim for k in key if isinstance(k, np.ndarray)]
+                    raise ValueError('invalid indexer key: ndarray arguments '
+                                     'have different numbers of dimensions: {}'
+                                     .format(ndims))
                 k = np.asarray(k, dtype=np.int64)
             else:
                 raise TypeError('unexpected indexer type for {}: {!r}'
diff --git a/xarray/core/utils.py b/xarray/core/utils.py
index ac70d7f7aea..46fea23b9ff 100644
--- a/xarray/core/utils.py
+++ b/xarray/core/utils.py
@@ -533,3 +533,41 @@ def ensure_us_time_resolution(val):
     elif np.issubdtype(val.dtype, np.timedelta64):
         val = val.astype('timedelta64[us]')
     return val
+
+
+class HiddenKeyDict(MutableMapping):
+    '''
+    Acts like a normal dictionary, but hides certain keys.
+    '''
+    def __init__(self, data, hidden_keys):
+        self._data = data
+        if type(hidden_keys) not in (list, tuple):
+            raise TypeError("hidden_keys must be a list or tuple")
+        self._hidden_keys = hidden_keys
+
+    def _raise_if_hidden(self, key):
+        if key in self._hidden_keys:
+            raise KeyError('Key `%r` is hidden.' % key)
+
+    # The next five methods are requirements of the ABC.
+ def __setitem__(self, key, value): + self._raise_if_hidden(key) + self._data[key] = value + + def __getitem__(self, key): + self._raise_if_hidden(key) + return self._data[key] + + def __delitem__(self, key): + self._raise_if_hidden(key) + del self._data[key] + + def __iter__(self): + for k in self._data: + if k not in self._hidden_keys: + yield k + + def __len__(self): + num_hidden = sum([k in self._hidden_keys for k in self._data]) + return len(self._data) - num_hidden diff --git a/xarray/core/variable.py b/xarray/core/variable.py index 5c1a2bbe1a7..b0c578665a4 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -1618,6 +1618,11 @@ def _unified_dims(variables): def _broadcast_compat_variables(*variables): + """Create broadcast compatible variables, with the same dimensions. + + Unlike the result of broadcast_variables(), some variables may have + dimensions of size 1 instead of the the size of the broadcast dimension. + """ dims = tuple(_unified_dims(variables)) return tuple(var.set_dims(dims) if var.dims != dims else var for var in variables) diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 1716c3ee537..235c6e9e410 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -71,6 +71,7 @@ def _importorskip(modname, minversion=None): has_bottleneck, requires_bottleneck = _importorskip('bottleneck') has_rasterio, requires_rasterio = _importorskip('rasterio') has_pathlib, requires_pathlib = _importorskip('pathlib') +has_zarr, requires_zarr = _importorskip('zarr', minversion='2.2.0') # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d6993219505..6b0cd59eb9e 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -27,12 +27,13 @@ from . 
import (TestCase, requires_scipy, requires_netCDF4, requires_pydap, requires_scipy_or_netCDF4, requires_dask, requires_h5netcdf, - requires_pynio, requires_pathlib, has_netCDF4, has_scipy, - assert_allclose, flaky, network, requires_rasterio, - assert_identical, raises_regex) + requires_pynio, requires_pathlib, requires_zarr, + requires_rasterio, has_netCDF4, has_scipy, assert_allclose, + flaky, network, assert_identical, raises_regex) + from .test_dataset import create_test_data -from xarray.tests import mock, assert_identical +from xarray.tests import mock try: import netCDF4 as nc4 @@ -507,6 +508,7 @@ def test_roundtrip_bytes_with_fill_value(self): encoding = {'_FillValue': b'X', 'dtype': 'S1'} original = Dataset({'x': ('t', values, {}, encoding)}) expected = original.copy(deep=True) + print(original) with self.roundtrip(original) as actual: self.assertDatasetIdentical(expected, actual) @@ -1080,6 +1082,249 @@ class NetCDF4ViaDaskDataTestAutocloseTrue(NetCDF4ViaDaskDataTest): autoclose = True +@requires_zarr +class BaseZarrTest(CFEncodedDataTest): + + DIMENSION_KEY = '_ARRAY_DIMENSIONS' + + @contextlib.contextmanager + def create_store(self): + with self.create_zarr_target() as store_target: + yield backends.ZarrStore.open_group(store_target, mode='w') + + @contextlib.contextmanager + def roundtrip(self, data, save_kwargs={}, open_kwargs={}, + allow_cleanup_failure=False): + with self.create_zarr_target() as store_target: + data.to_zarr(store=store_target, **save_kwargs) + yield xr.open_zarr(store_target, **open_kwargs) + + def test_auto_chunk(self): + original = create_test_data().chunk() + + with self.roundtrip( + original, open_kwargs={'auto_chunk': False}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # there should be no chunks + self.assertEqual(v.chunks, None) + + with self.roundtrip( + original, open_kwargs={'auto_chunk': True}) as actual: + for k, v in actual.variables.items(): + # only index variables should be in memory + self.assertEqual(v._in_memory, k in actual.dims) + # chunk size should be the same as original + self.assertEqual(v.chunks, original[k].chunks) + + def test_chunk_encoding(self): + # These datasets have no dask chunks. All chunking specified in + # encoding + data = create_test_data() + chunks = (5, 5) + data['var2'].encoding.update({'chunks': chunks}) + + with self.roundtrip(data) as actual: + self.assertEqual(chunks, actual['var2'].encoding['chunks']) + + # expect an error with non-integer chunks + data['var2'].encoding.update({'chunks': (5, 4.5)}) + with pytest.raises(TypeError): + with self.roundtrip(data) as actual: + pass + + def test_chunk_encoding_with_dask(self): + # These datasets DO have dask chunks. Need to check for various + # interactions between dask and zarr chunks + ds = xr.DataArray((np.arange(12)), dims='x', name='var1').to_dataset() + + # - no encoding specified - + # zarr automatically gets chunk information from dask chunks + ds_chunk4 = ds.chunk({'x': 4}) + with self.roundtrip(ds_chunk4) as actual: + self.assertEqual((4,), actual['var1'].encoding['chunks']) + + # should fail if dask_chunks are irregular... + ds_chunk_irreg = ds.chunk({'x': (5, 4, 3)}) + with pytest.raises(ValueError) as e_info: + with self.roundtrip(ds_chunk_irreg) as actual: + pass + # make sure this error message is correct and not some other error + assert e_info.match('chunks') + + # ... 
except if the last chunk is smaller than the first + ds_chunk_irreg = ds.chunk({'x': (5, 5, 2)}) + with self.roundtrip(ds_chunk_irreg) as actual: + self.assertEqual((5,), actual['var1'].encoding['chunks']) + + # - encoding specified - + # specify compatible encodings + for chunk_enc in 4, (4, ): + ds_chunk4['var1'].encoding.update({'chunks': chunk_enc}) + with self.roundtrip(ds_chunk4) as actual: + self.assertEqual((4,), actual['var1'].encoding['chunks']) + + # specify incompatible encoding + ds_chunk4['var1'].encoding.update({'chunks': (5, 5)}) + with pytest.raises(ValueError) as e_info: + with self.roundtrip(ds_chunk4) as actual: + pass + assert e_info.match('chunks') + + # TODO: remove this failure once syncronized overlapping writes are + # supported by xarray + ds_chunk4['var1'].encoding.update({'chunks': 5}) + with pytest.raises(NotImplementedError): + with self.roundtrip(ds_chunk4) as actual: + pass + + def test_vectorized_indexing(self): + self._test_vectorized_indexing(vindex_support=True) + + def test_hidden_zarr_keys(self): + expected = create_test_data() + with self.create_store() as store: + expected.dump_to_store(store) + zarr_group = store.ds + + # check that the global hidden attribute is present + assert self.DIMENSION_KEY in zarr_group.attrs + + # check that a variable hidden attribute is present and correct + # JSON only has a single array type, which maps to list in Python. + # In contrast, dims in xarray is always a tuple. + for var in expected.variables.keys(): + assert (zarr_group[var].attrs[self.DIMENSION_KEY] + == list(expected[var].dims)) + + with xr.decode_cf(store) as actual: + # make sure it is hidden + assert self.DIMENSION_KEY not in actual.attrs + for var in expected.variables.keys(): + assert self.DIMENSION_KEY not in expected[var].attrs + + # verify that the dataset fails to open if dimension key is missing + del zarr_group.attrs[self.DIMENSION_KEY] + with pytest.raises(KeyError): + with xr.decode_cf(store) as actual: + pass + + # put it back and try removing from a variable + zarr_group.attrs[self.DIMENSION_KEY] = {} + del zarr_group.var2.attrs[self.DIMENSION_KEY] + with pytest.raises(KeyError): + with xr.decode_cf(store) as actual: + pass + + def test_write_persistence_modes(self): + original = create_test_data() + + # overwrite mode + with self.roundtrip(original, save_kwargs={'mode': 'w'}) as actual: + self.assertDatasetIdentical(original, actual) + + # don't overwrite mode + with self.roundtrip(original, save_kwargs={'mode': 'w-'}) as actual: + self.assertDatasetIdentical(original, actual) + + # make sure overwriting works as expected + with self.create_zarr_target() as store: + original.to_zarr(store) + # should overwrite with no error + original.to_zarr(store, mode='w') + actual = xr.open_zarr(store) + self.assertDatasetIdentical(original, actual) + with pytest.raises(ValueError): + original.to_zarr(store, mode='w-') + + # check that we can't use other persistence modes + # TODO: reconsider whether other persistence modes should be supported + with pytest.raises(ValueError): + with self.roundtrip(original, save_kwargs={'mode': 'a'}) as actual: + pass + + def test_compressor_encoding(self): + original = create_test_data() + # specify a custom compressor + import zarr + blosc_comp = zarr.Blosc(cname='zstd', clevel=3, shuffle=2) + save_kwargs = dict(encoding={'var1': {'compressor': blosc_comp}}) + with self.roundtrip(original, save_kwargs=save_kwargs) as actual: + assert actual.var1.encoding['compressor'] == blosc_comp + + def test_group(self): + 
original = create_test_data() + group = 'some/random/path' + with self.roundtrip(original, save_kwargs={'group': group}, + open_kwargs={'group': group}) as actual: + self.assertDatasetIdentical(original, actual) + with pytest.raises(KeyError): + with self.roundtrip(original, + save_kwargs={'group': group}) as actual: + self.assertDatasetIdentical(original, actual) + + # TODO: implement zarr object encoding and make these tests pass + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_multiindex_not_implemented(self): + super(CFEncodedDataTest, self).test_multiindex_not_implemented() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_bytes_with_fill_value(self): + super(CFEncodedDataTest, self).test_roundtrip_bytes_with_fill_value() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_object_dtype(self): + super(CFEncodedDataTest, self).test_roundtrip_object_dtype() + + @pytest.mark.xfail(reason="Zarr object encoding not implemented") + def test_roundtrip_string_encoded_characters(self): + super(CFEncodedDataTest, + self).test_roundtrip_string_encoded_characters() + + # TODO: someone who understand caching figure out whether chaching + # makes sense for Zarr backend + @pytest.mark.xfail(reason="Zarr caching not implemented") + def test_dataset_caching(self): + super(CFEncodedDataTest, self).test_dataset_caching() + + +@requires_zarr +class ZarrDictStoreTest(BaseZarrTest, TestCase): + @contextlib.contextmanager + def create_zarr_target(self): + yield {} + + +@requires_zarr +class ZarrDirectoryStoreTest(BaseZarrTest, TestCase): + @contextlib.contextmanager + def create_zarr_target(self): + with create_tmp_file(suffix='.zarr') as tmp: + yield tmp + + +def test_replace_slices_with_arrays(): + (actual,) = xr.backends.zarr._replace_slices_with_arrays( + key=(slice(None),), shape=(5,)) + np.testing.assert_array_equal(actual, np.arange(5)) + + actual = xr.backends.zarr._replace_slices_with_arrays( + key=(np.arange(5),) * 3, shape=(8, 10, 12)) + expected = np.stack([np.arange(5)] * 3) + np.testing.assert_array_equal(np.stack(actual), expected) + + a, b = xr.backends.zarr._replace_slices_with_arrays( + key=(np.arange(5), slice(None)), shape=(8, 10)) + np.testing.assert_array_equal(a, np.arange(5)[:, np.newaxis]) + np.testing.assert_array_equal(b, np.arange(10)[np.newaxis, :]) + + a, b = xr.backends.zarr._replace_slices_with_arrays( + key=(slice(None), np.arange(5)), shape=(8, 10)) + np.testing.assert_array_equal(a, np.arange(8)[np.newaxis, :]) + np.testing.assert_array_equal(b, np.arange(5)[:, np.newaxis]) + + @requires_scipy class ScipyInMemoryDataTest(CFEncodedDataTest, NetCDF3Only, TestCase): engine = 'scipy' @@ -1147,7 +1392,7 @@ def create_store(self): def test_array_attrs(self): ds = Dataset(attrs={'foo': [[1, 2], [3, 4]]}) with raises_regex(ValueError, 'must be 1-dimensional'): - with self.roundtrip(ds) as roundtripped: + with self.roundtrip(ds): pass def test_roundtrip_example_1_netcdf_gz(self): diff --git a/xarray/tests/test_distributed.py b/xarray/tests/test_distributed.py index 2761f85f3af..1d0c51322a1 100644 --- a/xarray/tests/test_distributed.py +++ b/xarray/tests/test_distributed.py @@ -13,7 +13,8 @@ from xarray.tests.test_backends import create_tmp_file, ON_WINDOWS from xarray.tests.test_dataset import create_test_data -from . import assert_allclose, has_scipy, has_netCDF4, has_h5netcdf +from . 
import (assert_allclose, has_scipy, has_netCDF4, has_h5netcdf, + requires_zarr) ENGINES = [] @@ -28,7 +29,7 @@ @pytest.mark.xfail(sys.platform == 'win32', reason='https://github.com/pydata/xarray/issues/1738') @pytest.mark.parametrize('engine', ENGINES) -def test_dask_distributed_integration_test(loop, engine): +def test_dask_distributed_netcdf_integration_test(loop, engine): with cluster() as (s, _): with distributed.Client(s['address'], loop=loop): original = create_test_data() @@ -39,6 +40,17 @@ def test_dask_distributed_integration_test(loop, engine): computed = restored.compute() assert_allclose(original, computed) +@requires_zarr +def test_dask_distributed_zarr_integration_test(loop): + with cluster() as (s, _): + with distributed.Client(s['address'], loop=loop): + original = create_test_data() + with create_tmp_file(allow_cleanup_failure=ON_WINDOWS) as filename: + original.to_zarr(filename) + with xr.open_zarr(filename) as restored: + assert isinstance(restored.var1.data, da.Array) + computed = restored.compute() + assert_allclose(original, computed) @pytest.mark.skipif(distributed.__version__ <= '1.19.3', reason='Need recent distributed version to clean up get') diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 28fecdb4827..590492414b9 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -328,6 +328,9 @@ def test_vectorized_indexer(): check_slice(indexing.VectorizedIndexer) check_array1d(indexing.VectorizedIndexer) check_array2d(indexing.VectorizedIndexer) + with raises_regex(ValueError, 'numbers of dimensions'): + indexing.VectorizedIndexer((np.array(1, dtype=np.int64), + np.arange(5, dtype=np.int64))) def test_unwrap_explicit_indexer(): diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index c1df1da8c86..1813e2b6df8 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -189,3 +189,18 @@ def test_dask_array_is_scalar(): y = da.arange(8, chunks=4) assert not utils.is_scalar(y) + + +def test_hidden_key_dict(): + hidden_key = '_hidden_key' + data = {'a': 1, 'b': 2, hidden_key: 3} + data_expected = {'a': 1, 'b': 2} + hkd = utils.HiddenKeyDict(data, [hidden_key]) + assert len(hkd) == 2 + assert hidden_key not in hkd + for k, v in data_expected.items(): + assert hkd[k] == v + with pytest.raises(KeyError): + hkd[hidden_key] + with pytest.raises(KeyError): + del hkd[hidden_key]
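A minimal usage sketch of the round trip exercised by the tests above; the store name ``example.zarr``, the toy dataset, and the compressor settings are illustrative assumptions rather than part of the patch:

    import numpy as np
    import xarray as xr
    import zarr

    ds = xr.Dataset({'foo': (('x', 'y'), np.random.rand(4, 5))},
                    coords={'x': [10, 20, 30, 40]})

    # write with a custom compressor; mode='w' overwrites an existing store
    compressor = zarr.Blosc(cname='zstd', clevel=3, shuffle=2)
    ds.to_zarr('example.zarr', mode='w',
               encoding={'foo': {'compressor': compressor}})

    # read back; auto_chunk=True (the default) re-creates dask chunks
    # that match the zarr chunks on disk
    ds_roundtrip = xr.open_zarr('example.zarr')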