Skip to content

Commit 8bb8ee6

Browse files
committed
Remove VLenUTF8 from filters to avoid double-encoding error (pydata/xarray#3476)
1 parent 6cf0ccb commit 8bb8ee6

File tree

2 files changed

+20
-1
lines changed

2 files changed

+20
-1
lines changed

Diff for: sgkit/io/dataset.py

+10
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
from typing import Any, Dict, MutableMapping, Optional, Union
33

44
import fsspec
5+
import numcodecs
56
import xarray as xr
67
from xarray import Dataset
78

@@ -38,6 +39,15 @@ def save_dataset(
3839
for v in ds:
3940
# Workaround for https://github.com/pydata/xarray/issues/4380
4041
ds[v].encoding.pop("chunks", None)
42+
43+
# Remove VLenUTF8 from filters to avoid double encoding error https://github.com/pydata/xarray/issues/3476
44+
filters = ds[v].encoding.get("filters", None)
45+
var_len_str_codec = numcodecs.VLenUTF8()
46+
if filters is not None and var_len_str_codec in filters:
47+
filters = list(filters)
48+
filters.remove(var_len_str_codec)
49+
ds[v].encoding["filters"] = filters
50+
4151
ds.to_zarr(store, **kwargs)
4252

4353

Diff for: sgkit/tests/io/vcf/test_vcf_reader.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,15 @@
77
from numcodecs import Blosc, PackBits, VLenUTF8
88
from numpy.testing import assert_allclose, assert_array_equal
99

10-
from sgkit import load_dataset
10+
from sgkit import load_dataset, save_dataset
1111
from sgkit.io.utils import FLOAT32_FILL, INT_FILL, INT_MISSING
1212
from sgkit.io.vcf import (
1313
MaxAltAllelesExceededWarning,
1414
partition_into_regions,
1515
vcf_to_zarr,
1616
)
1717
from sgkit.io.vcf.vcf_reader import zarr_array_sizes
18+
from sgkit.tests.io.test_dataset import assert_identical
1819

1920
from .utils import path_for_test
2021

@@ -95,6 +96,14 @@ def test_vcf_to_zarr__small_vcf(shared_datadir, is_path, tmp_path):
9596
assert_array_equal(ds["call_genotype_mask"], call_genotype < 0)
9697
assert_array_equal(ds["call_genotype_phased"], call_genotype_phased)
9798

99+
# save and load again to test https://github.com/pydata/xarray/issues/3476
100+
with pytest.warns(xr.coding.variables.SerializationWarning):
101+
path2 = tmp_path / "ds2.zarr"
102+
if not is_path:
103+
path2 = str(path2)
104+
save_dataset(ds, path2)
105+
assert_identical(ds, load_dataset(path2))
106+
98107

99108
@pytest.mark.parametrize(
100109
"is_path",

0 commit comments

Comments
 (0)