Skip to content

Commit

Permalink
Zarr: optimize appending
Browse files Browse the repository at this point in the history
  • Loading branch information
dcherian committed May 3, 2024
1 parent 7489aba commit 6fcd873
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 40 deletions.
43 changes: 3 additions & 40 deletions xarray/backends/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -1521,42 +1521,6 @@ def save_mfdataset(
)


def _validate_datatypes_for_zarr_append(zstore, dataset):
"""If variable exists in the store, confirm dtype of the data to append is compatible with
existing dtype.
"""

existing_vars = zstore.get_variables()

def check_dtype(vname, var):
if (
vname not in existing_vars
or np.issubdtype(var.dtype, np.number)
or np.issubdtype(var.dtype, np.datetime64)
or np.issubdtype(var.dtype, np.bool_)
or var.dtype == object
):
# We can skip dtype equality checks under two conditions: (1) if the var to append is
# new to the dataset, because in this case there is no existing var to compare it to;
# or (2) if var to append's dtype is known to be easy-to-append, because in this case
# we can be confident appending won't cause problems. Examples of dtypes which are not
# easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
# positive integer character length). For these dtypes, appending dissimilar lengths
# can result in truncation of appended data. Therefore, variables which already exist
# in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
# exact dtype equality, as checked below.
pass
elif not var.dtype == existing_vars[vname].dtype:
raise ValueError(
f"Mismatched dtypes for variable {vname} between Zarr store on disk "
f"and dataset to append. Store has dtype {existing_vars[vname].dtype} but "
f"dataset to append has dtype {var.dtype}."
)

for vname, var in dataset.data_vars.items():
check_dtype(vname, var)


# compute=True returns ZarrStore
@overload
def to_zarr(
Expand Down Expand Up @@ -1721,17 +1685,16 @@ def to_zarr(
)

if mode in ["a", "a-", "r+"]:
_validate_datatypes_for_zarr_append(zstore, dataset)
if append_dim is not None:
existing_var_names = set(zstore.zarr_group.array_keys())
if append_dim is not None and append_dim not in existing_var_names:
existing_dims = zstore.get_dimensions()
if append_dim not in existing_dims:
raise ValueError(
f"append_dim={append_dim!r} does not match any existing "
f"dataset dimensions {existing_dims}"
)
existing_var_names = set(zstore.zarr_group.array_keys())
for var_name in existing_var_names:
if var_name in encoding.keys():
if var_name in encoding:
raise ValueError(
f"variable {var_name!r} already exists, but encoding was provided"
)
Expand Down
32 changes: 32 additions & 0 deletions xarray/backends/zarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,34 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
return var


def _validate_datatypes_for_zarr_append(vname, existing_var, new_var):
"""If variable exists in the store, confirm dtype of the data to append is compatible with
existing dtype.
"""
if (
np.issubdtype(new_var.dtype, np.number)
or np.issubdtype(new_var.dtype, np.datetime64)
or np.issubdtype(new_var.dtype, np.bool_)
or new_var.dtype == object
):
# We can skip dtype equality checks under two conditions: (1) if the var to append is
# new to the dataset, because in this case there is no existing var to compare it to;
# or (2) if var to append's dtype is known to be easy-to-append, because in this case
# we can be confident appending won't cause problems. Examples of dtypes which are not
# easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
# positive integer character length). For these dtypes, appending dissimilar lengths
# can result in truncation of appended data. Therefore, variables which already exist
# in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
# exact dtype equality, as checked below.
pass
elif not new_var.dtype == existing_var.dtype:
raise ValueError(
f"Mismatched dtypes for variable {vname} between Zarr store on disk "
f"and dataset to append. Store has dtype {existing_var.dtype} but "
f"dataset to append has dtype {new_var.dtype}."
)


def _validate_and_transpose_existing_dims(
var_name, new_var, existing_var, region, append_dim
):
Expand Down Expand Up @@ -632,6 +660,10 @@ def store(
# Modified variables must use the same encoding as the store.
vars_with_encoding = {}
for vn in existing_variable_names:
if self._mode in ["a", "a-", "r+"]:
_validate_datatypes_for_zarr_append(
vn, existing_vars[vn], variables[vn]
)
vars_with_encoding[vn] = variables[vn].copy(deep=False)
vars_with_encoding[vn].encoding = existing_vars[vn].encoding
vars_with_encoding, _ = self.encode(vars_with_encoding, {})
Expand Down

0 comments on commit 6fcd873

Please sign in to comment.