diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 7e3badc7143..97911ddabd8 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,6 +29,8 @@ Bug Fixes - Ensure that ``keep_attrs='drop'`` and ``keep_attrs=False`` remove attrs from result, even when there is only one xarray object given to ``apply_ufunc`` (:issue:`10982` :pull:`10997`). By `Julia Signell `_. +- Improve the robustness of ``open_zarr`` by attempting to load datasets with default dimension names if dimension name metadata is missing (:issue:`8749`). + By `Ewan Short `_. Documentation ~~~~~~~~~~~~~ diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index fe004c212b6..fda77002423 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -4,6 +4,7 @@ import json import os import struct +import warnings from collections.abc import Hashable, Iterable, Mapping from typing import TYPE_CHECKING, Any, Literal, Self, cast @@ -355,56 +356,96 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name): def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr): - # Zarr V3 explicitly stores the dimension names in the metadata + def get_default_dims(zarr_obj): + # Helper function to create default dimension names in cases where these don't + # exist in the metadata. 
Note the dimension_names parameter is optional in + # zarr version 3 + # https://zarr-specs.readthedocs.io/en/latest/v3/core/index.html#dimension-names + # Dimension name metadata is also optional in zarr version 2 + # https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes + + return tuple(f"dim_{n}" for n in range(len(zarr_obj.shape))) + + def get_nczarr_dims(zarr_obj): + # Helper function to extract dimension names from NCZarr metadata in .zarray + # https://docs.unidata.ucar.edu/netcdf/NUG/nczarr_head.html + zarray_path = os.path.join(zarr_obj.path, ".zarray") + # Check available zarr module version (not zarr version of zarr_obj) + if _zarr_v3(): + import asyncio + + zarray_str = asyncio.run(zarr_obj.store.get(zarray_path)).to_bytes() + else: + zarray_str = zarr_obj.store.get(zarray_path) + zarray = json.loads(zarray_str) + try: + # NCZarr uses Fully Qualified Names + dimensions = [ + os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"] + ] + except KeyError: + dimensions = get_default_dims(zarr_obj) + return dimensions + + # Zarr V3 specifies an optional dimension_names array metadata parameter, so + # check if this exists try: - # if this exists, we are looking at a Zarr V3 array - # convert None to empty tuple + # If dimension_names exists, we are looking at a Zarr V3 array + # Convert None to empty tuple dimensions = zarr_obj.metadata.dimension_names or () except AttributeError: - # continue to old code path + # Continue to old code path pass else: + # Check number of dimensions in metadata matches shape of array. + # If not, use defaults attributes = dict(zarr_obj.attrs) if len(zarr_obj.shape) != len(dimensions): - raise KeyError( - "Zarr object is missing the `dimension_names` metadata which is " - "required for xarray to determine variable dimensions." - ) + if not dimensions: + message = "Missing metadata dimension names." 
+ else: + message = ( + f"Metadata dimension names {dimensions} inconsistent with array " + f"shape {zarr_obj.shape}." + ) + message += " Attempting with defaults." + warnings.warn(message, UserWarning, stacklevel=2) + dimensions = get_default_dims(zarr_obj) return dimensions, attributes - # Zarr arrays do not have dimensions. To get around this problem, we add - # an attribute that specifies the dimension. We have to hide this attribute - # when we send the attributes to the user. - # zarr_obj can be either a zarr group or zarr array + # Zarr 2 arrays do not necessarily have dimension names. + # https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes + # To get around this problem, we add an attribute that specifies the + # dimension. We have to hide this attribute when we send the + # attributes to the user. Note zarr_obj can be either a zarr group + # or zarr array + + # First try to read dimension names using old xarray-zarr convention. + # dimension_key typically _ARRAY_DIMENSIONS try: # Xarray-Zarr dimensions = zarr_obj.attrs[dimension_key] - except KeyError as e: + except KeyError: + warnings.warn( + "Failed to read dimension names from xarray zarr metadata.", + UserWarning, + stacklevel=2, + ) if not try_nczarr: - raise KeyError( - f"Zarr object is missing the attribute `{dimension_key}`, which is " - "required for xarray to determine variable dimensions." 
- ) from e - - # NCZarr defines dimensions through metadata in .zarray - zarray_path = os.path.join(zarr_obj.path, ".zarray") - if _zarr_v3(): - import asyncio - - zarray_str = asyncio.run(zarr_obj.store.get(zarray_path)).to_bytes() + # Skip straight to using default dimensions + dimensions = get_default_dims(zarr_obj) else: - zarray_str = zarr_obj.store.get(zarray_path) - zarray = json.loads(zarray_str) - try: - # NCZarr uses Fully Qualified Names - dimensions = [ - os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"] - ] - except KeyError as e: - raise KeyError( - f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, " - "which are required for xarray to determine variable dimensions." - ) from e + # Try to read dimension names using NCZarr convention + try: + dimensions = get_nczarr_dims(zarr_obj) + except Exception: + # Fall back to default dimension names + warnings.warn( + "Failed to read dimension names from netcdf zarr metadata.", + UserWarning, + stacklevel=2, + ) + dimensions = get_default_dims(zarr_obj) nc_attrs = [attr for attr in zarr_obj.attrs if attr.lower().startswith("_nc")] attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs) @@ -1424,8 +1465,10 @@ def open_zarr( """Load and decode a dataset from a Zarr store. The `store` object should be a valid store for a Zarr group. `store` - variables must contain dimension metadata encoded in the - `_ARRAY_DIMENSIONS` attribute or must have NCZarr format. + variables should contain dimension metadata encoded in the + `_ARRAY_DIMENSIONS` attribute for zarr 2, the `dimension_names` attribute for zarr + 3, or the `dimrefs` parameter for NCZarr. If dimension name metadata is missing, + `open_zarr` will attempt to build the dataset using default dimension names. 
Parameters ---------- diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index 5aab153117d..be7ac06c068 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2,8 +2,10 @@ import asyncio import contextlib +import glob import gzip import itertools +import json import math import os.path import pickle @@ -3022,7 +3024,7 @@ def test_hidden_zarr_keys(self) -> None: del attrs[self.DIMENSION_KEY] zarr_group["var2"].attrs.put(attrs) - with pytest.raises(KeyError): + with pytest.warns(UserWarning, match="Failed to read dimension names"): with xr.decode_cf(store): pass @@ -4371,6 +4373,87 @@ def create_zarr_target(self): with create_tmp_file(suffix=".zarr") as tmp: yield tmp + # Helper functions for stripping dimension metadata from zarr stores + def _strip_zarr_3(self, ds_path, stripped_ds_path): + """Create a copy of a zarr 3 with dimension_names metadata removed.""" + shutil.copytree(ds_path, stripped_ds_path, dirs_exist_ok=True) + # Get all the zarr.json metadata files. + metadata_files = glob.glob(f"{stripped_ds_path}/**/zarr.json", recursive=True) + # Iterate through and remove all "dimension_names" entries + for file in metadata_files: + with open(file) as f: + metadata = json.load(f) + metadata.pop("dimension_names", None) + con_metadata = metadata.get("consolidated_metadata", None) + if con_metadata: + for k in con_metadata["metadata"].keys(): + con_metadata["metadata"][k].pop("dimension_names", None) + + with open(file, "w") as f: + json.dump(metadata, f, indent=2) + + def _strip_zarr_2(self, ds_path, stripped_ds_path): + """Create a copy of a zarr 2 with _ARRAY_DIMENSIONS metadata removed.""" + # Get all the .zattrs files. Note .zattrs are optional in zarr 2, but xarray uses + # them to store dimension name metadata. 
+ # https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes + shutil.copytree(ds_path, stripped_ds_path, dirs_exist_ok=True) + zattrs_files = glob.glob(f"{stripped_ds_path}/**/.zattrs", recursive=True) + # Iterate through and remove all "_ARRAY_DIMENSIONS" entries + for file in zattrs_files: + with open(file) as f: + metadata = json.load(f) + metadata.pop("_ARRAY_DIMENSIONS", None) + with open(file, "w") as f: + json.dump(metadata, f, indent=2) + zmetadata_file = Path(stripped_ds_path) / ".zmetadata" + if zmetadata_file.exists(): + with open(zmetadata_file) as f: + metadata = json.load(f) + for k in metadata["metadata"].keys(): + metadata["metadata"][k].pop("_ARRAY_DIMENSIONS", None) + with open(zmetadata_file, "w") as f: + json.dump(metadata, f, indent=2) + + @pytest.mark.parametrize("consolidated", [True, False]) + def test_default_dims(self, consolidated): + zarr_format = zarr.config.get("default_zarr_format") if has_zarr_v3 else 2 + # Create example data that can be read without dimension name metadata + da_a = xr.DataArray(np.arange(3 * 18).reshape(3, 18), dims=["label", "z"]) + da_b = xr.DataArray(np.arange(3), dims="label") + ds_1 = xr.Dataset({"a": da_a, "b": da_b}) + + # Specify what we expect to get when dimension name metadata is missing + expected = ds_1.rename_dims({"label": "dim_0", "z": "dim_1"}) + + def get_stripped_ds(ds, consolidated, zarr_format): + with self.create_zarr_target() as ds_target: + kwargs = {"consolidated": consolidated, "zarr_format": zarr_format} + ds.to_zarr(ds_target, **kwargs) + with self.create_zarr_target() as stripped_ds_target: + if zarr_format == 3: + self._strip_zarr_3(ds_target, stripped_ds_target) + else: + self._strip_zarr_2(ds_target, stripped_ds_target) + with pytest.warns(UserWarning, match="dimension names"): + return xr.open_zarr(stripped_ds_target, **kwargs).compute() + + stripped_ds_1 = get_stripped_ds(ds_1, consolidated, zarr_format) + assert_equal(stripped_ds_1, expected) + + # Create example data 
that cannot be read without dimension name metadata + da_c = xr.DataArray(np.arange(18), dims="z") + ds_2 = xr.Dataset({"a": da_a, "c": da_c}) + + with pytest.raises(ValueError, match="conflicting sizes for dimension"): + get_stripped_ds(ds_2, consolidated, zarr_format) + + # Failure to open_zarr on ds_2 reflects the failure to create an xarray Dataset without + # proper labeling of dimensions; variables must have consistent dimensions reading + # shape from left to right. + with pytest.raises(AlignmentError, match="cannot reindex or align along"): + xr.Dataset({"a": xr.DataArray(da_a.values), "c": xr.DataArray(da_c.values)}) + @requires_zarr class TestZarrWriteEmpty(TestZarrDirectoryStore): @@ -7587,13 +7670,13 @@ def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None: @requires_zarr @pytest.mark.usefixtures("default_zarr_format") -def test_raises_key_error_on_invalid_zarr_store(tmp_path): +def test_user_warning_on_invalid_zarr_store(tmp_path): root = zarr.open_group(tmp_path / "tmp.zarr") if Version(zarr.__version__) < Version("3.0.0"): root.create_dataset("bar", shape=(3, 5), dtype=np.float32) else: root.create_array("bar", shape=(3, 5), dtype=np.float32) - with pytest.raises(KeyError, match=r"xarray to determine variable dimensions"): + with pytest.warns(UserWarning, match=r"dimension names"): xr.open_zarr(tmp_path / "tmp.zarr", consolidated=False)