2 changes: 2 additions & 0 deletions doc/whats-new.rst
@@ -29,6 +29,8 @@ Bug Fixes
- Ensure that ``keep_attrs='drop'`` and ``keep_attrs=False`` remove attrs from result, even when there is
only one xarray object given to ``apply_ufunc`` (:issue:`10982` :pull:`10997`).
By `Julia Signell <https://github.com/jsignell>`_.
- Improve the robustness of ``open_zarr`` by attempting to load datasets with default dimension names if dimension name metadata is missing (:issue:`8749`).
By `Ewan Short <https://github.com/eshort0401>`_.

Documentation
~~~~~~~~~~~~~
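As a quick illustration of this entry, the following hedged sketch writes a small Zarr v2 store with xarray, strips the `_ARRAY_DIMENSIONS` attribute to mimic a store produced by another tool, and reopens it. The path and variable names are illustrative; `chunks=None` is passed only so the example does not require dask, and the `attrs.put` pattern mirrors the existing tests.

```python
import numpy as np
import xarray as xr
import zarr

# Illustrative path and variable names; this is a sketch, not part of the PR.
path = "missing_dims_example.zarr"
ds = xr.Dataset({"a": (("label", "z"), np.arange(6).reshape(2, 3))})
ds.to_zarr(path, zarr_format=2, consolidated=False, mode="w")

# Strip the xarray dimension-name attribute to simulate a store written by another tool.
group = zarr.open_group(path, mode="a")
attrs = dict(group["a"].attrs)
del attrs["_ARRAY_DIMENSIONS"]
group["a"].attrs.put(attrs)

# Previously this raised KeyError; with this change it warns and falls back to defaults.
reopened = xr.open_zarr(path, consolidated=False, chunks=None)
print(dict(reopened.sizes))  # expected: {'dim_0': 2, 'dim_1': 3}
```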
119 changes: 81 additions & 38 deletions xarray/backends/zarr.py
@@ -4,6 +4,7 @@
import json
import os
import struct
import warnings
from collections.abc import Hashable, Iterable, Mapping
from typing import TYPE_CHECKING, Any, Literal, Self, cast

@@ -355,56 +356,96 @@ def _determine_zarr_chunks(enc_chunks, var_chunks, ndim, name):


def _get_zarr_dims_and_attrs(zarr_obj, dimension_key, try_nczarr):
# Zarr V3 explicitly stores the dimension names in the metadata
def get_default_dims(zarr_obj):
# Helper function to create default dimension names when these are missing
# from the metadata. Note that the dimension_names field is optional in
# Zarr v3:
# https://zarr-specs.readthedocs.io/en/latest/v3/core/index.html#dimension-names
# Dimension name metadata is also optional in Zarr v2:
# https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes

return tuple(f"dim_{n}" for n in range(len(zarr_obj.shape)))

def get_nczarr_dims(zarr_obj):
# Helper function to extract dimension names from NCZarr metadata in .zarray
# https://docs.unidata.ucar.edu/netcdf/NUG/nczarr_head.html
zarray_path = os.path.join(zarr_obj.path, ".zarray")
# Check the installed zarr-python version (not the zarr format of zarr_obj)
if _zarr_v3():
import asyncio

zarray_str = asyncio.run(zarr_obj.store.get(zarray_path)).to_bytes()
else:
zarray_str = zarr_obj.store.get(zarray_path)
zarray = json.loads(zarray_str)
try:
# NCZarr uses Fully Qualified Names
dimensions = [
os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
]
except KeyError:
dimensions = get_default_dims(zarr_obj)
return dimensions

# Zarr V3 specifies an optional dimension_names array metadata parameter, so
# check if this exists
try:
# if this exists, we are looking at a Zarr V3 array
# convert None to empty tuple
# If dimension_names exists, we are looking at a Zarr V3 array
# Convert None to empty tuple
dimensions = zarr_obj.metadata.dimension_names or ()
except AttributeError:
# continue to old code path
# Continue to old code path
pass
else:
# Check that the number of dimensions in the metadata matches the shape of
# the array. If not, use defaults.
attributes = dict(zarr_obj.attrs)
if len(zarr_obj.shape) != len(dimensions):
raise KeyError(
"Zarr object is missing the `dimension_names` metadata which is "
"required for xarray to determine variable dimensions."
)
if not dimensions:
message = "Missing metadata dimension names."
else:
message = (
f"Metadata dimension names {dimensions} inconsistent with array "
f"shape {zarr_obj.shape}."
)
message += " Attempting with defaults."
warnings.warn(message, UserWarning, stacklevel=2)
dimensions = get_default_dims(zarr_obj)
return dimensions, attributes

# Zarr arrays do not have dimensions. To get around this problem, we add
# an attribute that specifies the dimension. We have to hide this attribute
# when we send the attributes to the user.
# zarr_obj can be either a zarr group or zarr array
# Zarr 2 arrays do not necessarily have dimension names.
# https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes
# To get around this problem, we add an attribute that specifies the
# dimension. We have to hide this attribute when we send the
# attributes to the user. Note zarr_obj can be either a zarr group
# or zarr array

# First try to read dimension names using the old xarray-zarr convention;
# dimension_key is typically _ARRAY_DIMENSIONS.
try:
# Xarray-Zarr
dimensions = zarr_obj.attrs[dimension_key]
except KeyError as e:
except KeyError:
warnings.warn(
"Failed to read dimension names from xarray zarr metadata.",
UserWarning,
stacklevel=2,
)
if not try_nczarr:
raise KeyError(
f"Zarr object is missing the attribute `{dimension_key}`, which is "
"required for xarray to determine variable dimensions."
) from e

# NCZarr defines dimensions through metadata in .zarray
zarray_path = os.path.join(zarr_obj.path, ".zarray")
if _zarr_v3():
import asyncio

zarray_str = asyncio.run(zarr_obj.store.get(zarray_path)).to_bytes()
# Skip straight to using default dimensions
dimensions = get_default_dims(zarr_obj)
else:
zarray_str = zarr_obj.store.get(zarray_path)
zarray = json.loads(zarray_str)
try:
# NCZarr uses Fully Qualified Names
dimensions = [
os.path.basename(dim) for dim in zarray["_NCZARR_ARRAY"]["dimrefs"]
]
except KeyError as e:
raise KeyError(
f"Zarr object is missing the attribute `{dimension_key}` and the NCZarr metadata, "
"which are required for xarray to determine variable dimensions."
) from e
# Try to read dimension names using NCZarr convention
try:
dimensions = get_nczarr_dims(zarr_obj)
except Exception:
# Fallback to default dimension names
warnings.warn(
"Failed to read dimension names from netcdf zarr metadata.",
UserWarning,
stacklevel=2,
)
dimensions = get_default_dims(zarr_obj)

nc_attrs = [attr for attr in zarr_obj.attrs if attr.lower().startswith("_nc")]
attributes = HiddenKeyDict(zarr_obj.attrs, [dimension_key] + nc_attrs)
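For reference, the sketch below summarises the three metadata conventions the reader above distinguishes, plus the default fallback. All literal names and shapes are illustrative and not taken from a real store.

```python
import os

# Zarr v2 + xarray convention: dimension names live in the array's .zattrs.
zattrs = {"_ARRAY_DIMENSIONS": ["label", "z"]}

# Zarr v3: dimension_names is an optional field of the array metadata (zarr.json).
zarr_json = {"shape": [3, 18], "dimension_names": ["label", "z"]}

# NCZarr: dimension references are fully qualified names stored in .zarray.
nczarr_zarray = {"shape": [3, 18], "_NCZARR_ARRAY": {"dimrefs": ["/label", "/z"]}}
assert [os.path.basename(d) for d in nczarr_zarray["_NCZARR_ARRAY"]["dimrefs"]] == ["label", "z"]

# When none of the above are present, defaults are generated from the array rank,
# matching get_default_dims above.
shape = (3, 18)
assert tuple(f"dim_{n}" for n in range(len(shape))) == ("dim_0", "dim_1")
```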
@@ -1424,8 +1465,10 @@ def open_zarr(
"""Load and decode a dataset from a Zarr store.

The `store` object should be a valid store for a Zarr group. `store`
variables must contain dimension metadata encoded in the
`_ARRAY_DIMENSIONS` attribute or must have NCZarr format.
variables should contain dimension metadata encoded in the
`_ARRAY_DIMENSIONS` attribute for Zarr v2, the `dimension_names` metadata field for
Zarr v3, or the `dimrefs` entry in the NCZarr metadata. If dimension name metadata is
missing, `open_zarr` will attempt to build the dataset using default dimension names.

Parameters
----------
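To complement the updated docstring, here is a hedged sketch of the normal round-trip: when xarray writes the store itself it records the dimension names, so the new fallback and warnings never trigger. The path and variable names are illustrative; `zarr_format=2` is chosen only so the v2 attribute is easy to inspect, and `chunks=None` avoids requiring dask.

```python
import numpy as np
import xarray as xr
import zarr

path = "roundtrip_example.zarr"  # illustrative path
ds = xr.Dataset({"a": (("label", "z"), np.zeros((3, 18)))})
ds.to_zarr(path, zarr_format=2, consolidated=False, mode="w")

# xarray recorded the dimension names itself, so no fallback is needed on read.
arr = zarr.open_group(path, mode="r")["a"]
print(dict(arr.attrs))  # expected to include {'_ARRAY_DIMENSIONS': ['label', 'z']}
print(dict(xr.open_zarr(path, consolidated=False, chunks=None).sizes))  # {'label': 3, 'z': 18}
```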
89 changes: 86 additions & 3 deletions xarray/tests/test_backends.py
@@ -2,8 +2,10 @@

import asyncio
import contextlib
import glob
import gzip
import itertools
import json
import math
import os.path
import pickle
@@ -3022,7 +3024,7 @@ def test_hidden_zarr_keys(self) -> None:
del attrs[self.DIMENSION_KEY]
zarr_group["var2"].attrs.put(attrs)

with pytest.raises(KeyError):
with pytest.warns(UserWarning, match="Failed to read dimension names"):
with xr.decode_cf(store):
pass

@@ -4371,6 +4373,87 @@ def create_zarr_target(self):
with create_tmp_file(suffix=".zarr") as tmp:
yield tmp

# Helper functions for stripping dimension metadata from zarr stores
def _strip_zarr_3(self, ds_path, stripped_ds_path):
"""Create a copy of a zarr 3 with dimension_names metadata removed."""
shutil.copytree(ds_path, stripped_ds_path, dirs_exist_ok=True)
# Get all the zarr.json metadata files.
metadata_files = glob.glob(f"{stripped_ds_path}/**/zarr.json", recursive=True)
# Iterate through and remove all "dimension_names" entries
for file in metadata_files:
with open(file) as f:
metadata = json.load(f)
metadata.pop("dimension_names", None)
con_metadata = metadata.get("consolidated_metadata", None)
if con_metadata:
for k in con_metadata["metadata"].keys():
con_metadata["metadata"][k].pop("dimension_names", None)

with open(file, "w") as f:
json.dump(metadata, f, indent=2)

def _strip_zarr_2(self, ds_path, stripped_ds_path):
"""Create a copy of a zarr 2 with _ARRAY_DIMENSIONS metadata removed."""
# Get all the .zattrs files. Note .zattrs are optional in zarr 2, but xarray uses
# them to store dimension name metadata.
# https://zarr-specs.readthedocs.io/en/latest/v2/v2.0.html#attributes
shutil.copytree(ds_path, stripped_ds_path, dirs_exist_ok=True)
zattrs_files = glob.glob(f"{stripped_ds_path}/**/.zattrs", recursive=True)
# Iterate through and remove all "_ARRAY_DIMENSIONS" entries
for file in zattrs_files:
with open(file) as f:
metadata = json.load(f)
metadata.pop("_ARRAY_DIMENSIONS", None)
with open(file, "w") as f:
json.dump(metadata, f, indent=2)
zmetadata_file = Path(stripped_ds_path) / ".zmetadata"
if zmetadata_file.exists():
with open(zmetadata_file) as f:
metadata = json.load(f)
for k in metadata["metadata"].keys():
metadata["metadata"][k].pop("_ARRAY_DIMENSIONS", None)
with open(zmetadata_file, "w") as f:
json.dump(metadata, f, indent=2)

@pytest.mark.parametrize("consolidated", [True, False])
def test_default_dims(self, consolidated):
zarr_format = zarr.config.get("default_zarr_format") if has_zarr_v3 else 2
# Create example data that can be read without dimension name metadata
da_a = xr.DataArray(np.arange(3 * 18).reshape(3, 18), dims=["label", "z"])
da_b = xr.DataArray(np.arange(3), dims="label")
ds_1 = xr.Dataset({"a": da_a, "b": da_b})

# Specify what we expect to get when dimension name metadata is missing
expected = ds_1.rename_dims({"label": "dim_0", "z": "dim_1"})

def get_stripped_ds(ds, consolidated, zarr_format):
with self.create_zarr_target() as ds_target:
kwargs = {"consolidated": consolidated, "zarr_format": zarr_format}
ds.to_zarr(ds_target, **kwargs)
with self.create_zarr_target() as stripped_ds_target:
if zarr_format == 3:
self._strip_zarr_3(ds_target, stripped_ds_target)
else:
self._strip_zarr_2(ds_target, stripped_ds_target)
with pytest.warns(UserWarning, match="dimension names"):
return xr.open_zarr(stripped_ds_target, **kwargs).compute()

stripped_ds_1 = get_stripped_ds(ds_1, consolidated, zarr_format)
assert_equal(stripped_ds_1, expected)

# Create example data that cannot be read without dimension name metadata
da_c = xr.DataArray(np.arange(18), dims="z")
ds_2 = xr.Dataset({"a": da_a, "c": da_c})

with pytest.raises(ValueError, match="conflicting sizes for dimension"):
get_stripped_ds(ds_2, consolidated, zarr_format)

# The failure of open_zarr on ds_2 mirrors the failure to construct an xarray Dataset
# without proper dimension labels: with default names, variables must have consistent
# dimension sizes when shapes are read from left to right.
with pytest.raises(AlignmentError, match="cannot reindex or align along"):
xr.Dataset({"a": xr.DataArray(da_a.values), "c": xr.DataArray(da_c.values)})


@requires_zarr
class TestZarrWriteEmpty(TestZarrDirectoryStore):
@@ -7587,13 +7670,13 @@ def test_zarr_create_default_indexes(tmp_path, create_default_indexes) -> None:

@requires_zarr
@pytest.mark.usefixtures("default_zarr_format")
def test_raises_key_error_on_invalid_zarr_store(tmp_path):
def test_user_warning_on_invalid_zarr_store(tmp_path):
root = zarr.open_group(tmp_path / "tmp.zarr")
if Version(zarr.__version__) < Version("3.0.0"):
root.create_dataset("bar", shape=(3, 5), dtype=np.float32)
else:
root.create_array("bar", shape=(3, 5), dtype=np.float32)
with pytest.raises(KeyError, match=r"xarray to determine variable dimensions"):
with pytest.warns(UserWarning, match=r"dimension names"):
xr.open_zarr(tmp_path / "tmp.zarr", consolidated=False)


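A self-contained sketch of the constraint exercised at the end of `test_default_dims`: under default naming every array's leading axis becomes `dim_0`, so arrays whose leading axes differ in length cannot share a dataset. The shapes mirror the test; everything else is illustrative.

```python
import numpy as np
import xarray as xr

a = xr.DataArray(np.arange(3 * 18).reshape(3, 18))  # default dims ('dim_0', 'dim_1')
c = xr.DataArray(np.arange(18))                      # default dims ('dim_0',)

try:
    xr.Dataset({"a": a, "c": c})
except Exception as err:
    # Expect an alignment/size-conflict error: dim_0 would need to be both 3 and 18.
    print(type(err).__name__, err)
```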