@@ -324,6 +324,34 @@ def encode_zarr_variable(var, needs_copy=True, name=None):
     return var
 
 
+def _validate_datatypes_for_zarr_append(vname, existing_var, new_var):
+    """If variable exists in the store, confirm dtype of the data to append is compatible with
+    existing dtype.
+    """
+    if (
+        np.issubdtype(new_var.dtype, np.number)
+        or np.issubdtype(new_var.dtype, np.datetime64)
+        or np.issubdtype(new_var.dtype, np.bool_)
+        or new_var.dtype == object
+    ):
+        # We can skip dtype equality checks under two conditions: (1) if the var to append is
+        # new to the dataset, because in this case there is no existing var to compare it to;
+        # or (2) if var to append's dtype is known to be easy-to-append, because in this case
+        # we can be confident appending won't cause problems. Examples of dtypes which are not
+        # easy-to-append include length-specified strings of type `|S*` or `<U*` (where * is a
+        # positive integer character length). For these dtypes, appending dissimilar lengths
+        # can result in truncation of appended data. Therefore, variables which already exist
+        # in the dataset, and with dtypes which are not known to be easy-to-append, necessitate
+        # exact dtype equality, as checked below.
+        pass
+    elif not new_var.dtype == existing_var.dtype:
+        raise ValueError(
+            f"Mismatched dtypes for variable {vname} between Zarr store on disk "
+            f"and dataset to append. Store has dtype {existing_var.dtype} but "
+            f"dataset to append has dtype {new_var.dtype}."
+        )
+
+
 def _validate_and_transpose_existing_dims(
     var_name, new_var, existing_var, region, append_dim
 ):
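A standalone NumPy snippet (not part of the diff) illustrating the truncation hazard the comment above describes; the array names are illustrative only:

```python
import numpy as np

# Fixed-width string dtypes silently truncate on assignment, which is why
# appending to an existing Zarr array with a narrower string dtype is unsafe.
existing = np.array(["ab", "cd"], dtype="<U2")  # 2-character unicode slots
existing[0] = "abcdef"
print(existing[0])  # -> "ab": everything past 2 characters is dropped

# Numeric dtypes are "easy-to-append": NumPy promotes them safely instead.
print(np.result_type(np.int32, np.float64))  # -> float64
```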
@@ -612,26 +640,58 @@ def store(
         import zarr
 
         existing_keys = tuple(self.zarr_group.array_keys())
+
+        if self._mode == "r+":
+            new_names = [k for k in variables if k not in existing_keys]
+            if new_names:
+                raise ValueError(
+                    f"dataset contains non-pre-existing variables {new_names}, "
+                    "which is not allowed in ``xarray.Dataset.to_zarr()`` with "
+                    "``mode='r+'``. To allow writing new variables, set ``mode='a'``."
+                )
+
+        if self._append_dim is not None and self._append_dim not in existing_keys:
+            # For dimensions without coordinate values, we must parse
+            # the _ARRAY_DIMENSIONS attribute on *all* arrays to check if it
+            # is a valid existing dimension name.
+            # TODO: This `get_dimensions` method also does shape checking
+            # which isn't strictly necessary for our check.
+            existing_dims = self.get_dimensions()
+            if self._append_dim not in existing_dims:
+                raise ValueError(
+                    f"append_dim={self._append_dim!r} does not match any existing "
+                    f"dataset dimensions {existing_dims}"
+                )
+
         existing_variable_names = {
             vn for vn in variables if _encode_variable_name(vn) in existing_keys
         }
-        new_variables = set(variables) - existing_variable_names
-        variables_without_encoding = {vn: variables[vn] for vn in new_variables}
+        new_variable_names = set(variables) - existing_variable_names
         variables_encoded, attributes = self.encode(
-            variables_without_encoding, attributes
+            {vn: variables[vn] for vn in new_variable_names}, attributes
         )
 
         if existing_variable_names:
-            # Decode variables directly, without going via xarray.Dataset to
-            # avoid needing to load index variables into memory.
-            # TODO: consider making loading indexes lazy again?
+            # We make sure that values to be appended are encoded *exactly*
+            # as the current values in the store.
+            # To do so, we decode variables directly to access the proper encoding,
+            # without going via xarray.Dataset to avoid needing to load
+            # index variables into memory.
             existing_vars, _, _ = conventions.decode_cf_variables(
-                {k: self.open_store_variable(name=k) for k in existing_variable_names},
-                self.get_attrs(),
+                variables={
+                    k: self.open_store_variable(name=k) for k in existing_variable_names
+                },
+                # attributes={} since we don't care about parsing the global
+                # "coordinates" attribute
+                attributes={},
             )
             # Modified variables must use the same encoding as the store.
             vars_with_encoding = {}
             for vn in existing_variable_names:
+                if self._mode in ["a", "a-", "r+"]:
+                    _validate_datatypes_for_zarr_append(
+                        vn, existing_vars[vn], variables[vn]
+                    )
                 vars_with_encoding[vn] = variables[vn].copy(deep=False)
                 vars_with_encoding[vn].encoding = existing_vars[vn].encoding
             vars_with_encoding, _ = self.encode(vars_with_encoding, {})
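A minimal sketch of the user-facing behavior the new ``mode='r+'`` check enforces; the store path ``example.zarr`` is a hypothetical local path:

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [1, 2, 3])})
ds.to_zarr("example.zarr", mode="w")

# mode="r+" may only modify variables that already exist in the store,
# so writing a dataset that contains a brand-new variable raises:
ds2 = ds.assign(b=("x", [4.0, 5.0, 6.0]))
# ds2.to_zarr("example.zarr", mode="r+")  # ValueError: non-pre-existing variables ['b']

# mode="a" permits new variables alongside the existing ones:
ds2.to_zarr("example.zarr", mode="a")
```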
@@ -696,7 +756,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
 
         for vn, v in variables.items():
             name = _encode_variable_name(vn)
-            check = vn in check_encoding_set
+
             attrs = v.attrs.copy()
             dims = v.dims
             dtype = v.dtype
@@ -712,7 +772,7 @@ def set_variables(self, variables, check_encoding_set, writer, unlimited_dims=None):
             # https://github.com/pydata/xarray/issues/8371 for details.
             encoding = extract_zarr_variable_encoding(
                 v,
-                raise_on_invalid=check,
+                raise_on_invalid=vn in check_encoding_set,
                 name=vn,
                 safe_chunks=self._safe_chunks,
             )
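The temporary `check` variable is dropped above and the membership test inlined at its single use site. For context, a hedged sketch of what `raise_on_invalid` toggles, assuming xarray's usual handling of encoding keys the zarr backend does not recognize (raise when the user supplied the encoding explicitly, silently drop otherwise):

```python
import xarray as xr

ds = xr.Dataset({"a": ("x", [1.0, 2.0])})

# Passing an explicit encoding puts "a" in check_encoding_set, so
# raise_on_invalid=True and an unsupported key (e.g. the netCDF-only
# "contiguous") raises ValueError:
# ds.to_zarr("enc.zarr", mode="w", encoding={"a": {"contiguous": True}})

# ...whereas a stale key left on ds["a"].encoding by a previous backend
# is dropped without error (raise_on_invalid=False):
ds["a"].encoding["contiguous"] = True
ds.to_zarr("enc.zarr", mode="w")
```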
@@ -815,7 +875,7 @@ def _auto_detect_regions(self, ds, region):
             assert variable.dims == (dim,)
             index = pd.Index(variable.data)
             idxs = index.get_indexer(ds[dim].data)
-            if any(idxs == -1):
+            if (idxs == -1).any():
                 raise KeyError(
                     f"Not all values of coordinate '{dim}' in the new array were"
                     " found in the original store. Writing to a zarr region slice"
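For reference, a small self-contained example of why `pd.Index.get_indexer` pairs with a vectorized `-1` check here; the index values are illustrative:

```python
import numpy as np
import pandas as pd

index = pd.Index([10, 20, 30])
idxs = index.get_indexer([20, 99])  # -> array([ 1, -1]); -1 flags "not found"

# (idxs == -1).any() is the vectorized NumPy membership test; the builtin
# any(idxs == -1) gives the same answer for 1-D input but iterates in Python.
print((idxs == -1).any())  # -> True
```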