Commit 1bcc78d

Support encoding on array writes.
1 parent b0b673b commit 1bcc78d

3 files changed: +116 -31 lines

lib/iris/fileformats/netcdf/_thread_safe_nc.py

Lines changed: 2 additions & 0 deletions
@@ -393,6 +393,8 @@ def __setitem__(self, keys, array_data):
         try:
             dataset = netCDF4.Dataset(self.path, "r+")
             var = dataset.variables[self.varname]
+            # **Always** disable encode/decode of bytes to strings
+            var.set_auto_chartostring(False)
             var[keys] = array_data
         finally:
             try:
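For context: netCDF4-python normally auto-converts between character variables and numpy string arrays on read and write. Disabling that here means the variable exchanges raw 'S1' byte arrays, so bytes already encoded by the saver are written untouched. A minimal standalone sketch of the difference, using a hypothetical scratch file (not from the commit):

import netCDF4
import numpy as np

with netCDF4.Dataset("example.nc", "w") as ds:  # hypothetical scratch file
    ds.createDimension("x", 1)
    ds.createDimension("string8", 8)
    var = ds.createVariable("v", "S1", ("x", "string8"))

    # Default behaviour (auto chartostring ON): a fixed-width string
    # array is converted to characters on write, and back on read.
    var[:] = np.array([b"abc"], dtype="S8")

    # With conversion disabled, the variable exchanges raw character
    # arrays of dtype 'S1' -- one byte per cell, no conversion applied.
    var.set_auto_chartostring(False)
    var[:] = netCDF4.stringtochar(np.array([b"abc"], dtype="S8"))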

lib/iris/fileformats/netcdf/saver.py

Lines changed: 64 additions & 24 deletions
@@ -14,6 +14,7 @@
 
 """
 
+import codecs
 import collections
 from itertools import repeat, zip_longest
 import os
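The new codecs import supports normalising user-supplied `_Encoding` names further down. As a rough standalone illustration (not from the commit): codecs.lookup resolves aliases to one canonical name and raises LookupError for unknown names, which is what the saver's fallback branch relies on:

import codecs

# Aliases resolve to a single canonical codec name...
print(codecs.lookup("UTF8").name)     # -> "utf-8"
print(codecs.lookup("latin-1").name)  # -> "iso8859-1"

# ...and unknown names raise LookupError, triggering the saver's
# fall-back to the utf-8 default.
try:
    codecs.lookup("not-a-real-codec")
except LookupError:
    print("unknown encoding -> fall back to utf-8")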
@@ -1801,46 +1802,85 @@ def _create_generic_cf_array_var(
         if np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
-            string_dimension_depth = data.dtype.itemsize
-            if data.dtype.kind == "U":
-                string_dimension_depth //= 4
-            string_dimension_name = "string%d" % string_dimension_depth
+
+            # Encode data into bytes, and determine the string-dimension length.
+            # * we can't work this out without first encoding the data
+            # * UNLESS the target length is given (.iris_string_dimlength)
+            # * we can't create the dimension before we know the length
+            # * we can't create the variable before creating the dim (if needed)
+            # TODO: we can keep data lazy IFF there is a user-specified string-length
+
+            # Calculate encoding to apply.
+            default_encoding = "utf-8"
+            encoding = element.attributes.get("_Encoding", None)
+            if encoding is None:
+                # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data
+                encoding = default_encoding
+            else:
+                try:
+                    # Accept + normalise naming of encodings
+                    encoding = codecs.lookup(encoding).name
+                    # NOTE: if encoding does not suit data, errors can occur.
+                    # For example, _Encoding = "ascii", with non-ascii content.
+                except LookupError:
+                    # Replace some invalid setting with "safe"(ish) fallback.
+                    encoding = default_encoding
+
+            # Convert data from an array of strings into a character array
+            # with an extra string-length dimension.
+
+            # TODO: support lazy in some cases??
+            # (N.B. can do when 'iris_string_dimlength' is provided)
+            if is_lazy_data(data):
+                data = dask.compute(data)
+
+            element_shape = data.shape
+            max_length = 1  # this is a MINIMUM - i.e. not zero!
+            data_elements = np.zeros(element_shape, dtype=object)
+            for index in np.ndindex(element_shape):
+                data_element = data[index].encode(encoding)
+                element_length = len(data_element)
+                data_elements[index] = data_element
+                if element_length > max_length:
+                    max_length = element_length
+
+            string_dimension_length = element.attributes.get(
+                "iris_string_dimlength", None
+            )
+            if string_dimension_length is None:
+                string_dimension_length = max_length
+
+            # We already encoded all the strings, but stored them in an object-array as
+            # we didn't yet know the fixed byte-length to convert to.
+            # Now convert to fixed-width char array
+            data = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+            right_pad = b"\0" * string_dimension_length
+            for index in np.ndindex(element_shape):
+                bytes = data_elements[index]
+                bytes = (bytes + right_pad)[:string_dimension_length]
+                data[index] = [bytes[i : i + 1] for i in range(string_dimension_length)]
 
             # Determine whether to create the string length dimension.
+            string_dimension_name = f"string{string_dimension_length}"
             if string_dimension_name not in self._dataset.dimensions:
                 while string_dimension_name in self._dataset.variables:
                     # Also avoid collision with variable names.
                     # See '_get_dim_names' for reason.
                     string_dimension_name = self._increment_name(string_dimension_name)
                 self._dataset.createDimension(
-                    string_dimension_name, string_dimension_depth
+                    string_dimension_name, string_dimension_length
                 )
 
             # Add the string length dimension to the variable dimensions.
             element_dims.append(string_dimension_name)
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
+            # Force to always exchange data as byte arrays
+            # TODO: ?remove when bug fixed
+            # see : https://github.com/Unidata/netcdf4-python/issues/1440
+            cf_var.set_auto_chartostring(False)
 
-            # Convert data from an array of strings into a character array
-            # with an extra string-length dimension.
-            if len(element_dims) == 1:
-                # Scalar variable (only has string dimension).
-                data_first = data[0]
-                if is_lazy_data(data_first):
-                    data_first = dask.compute(data_first)
-                data = list("%- *s" % (string_dimension_depth, data_first))
-            else:
-                # NOTE: at present, can't do this lazily??
-                orig_shape = data.shape
-                new_shape = orig_shape + (string_dimension_depth,)
-                new_data = np.zeros(new_shape, cf_var.dtype)
-                for index in np.ndindex(orig_shape):
-                    index_slice = tuple(list(index) + [slice(None, None)])
-                    new_data[index_slice] = list(
-                        "%- *s" % (string_dimension_depth, data[index])
-                    )
-                data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.
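A standalone sketch of the conversion the new saver code performs (helper and variable names here are illustrative, not from the commit). The key point is that strings must be encoded before the string dimension can be sized, because the byte length depends on the encoding:

import numpy as np

def strings_to_chararray(strings, encoding="utf-8", dim_length=None):
    """Encode strings, then pack them into a fixed-width 'S1' char array."""
    encoded = [s.encode(encoding) for s in strings]
    if dim_length is None:
        # Mirrors the saver: size from the longest encoded value, minimum 1.
        dim_length = max(1, max(len(b) for b in encoded))
    out = np.zeros((len(strings), dim_length), dtype="S1")
    pad = b"\0" * dim_length
    for i, b in enumerate(encoded):
        b = (b + pad)[:dim_length]
        out[i] = [b[j : j + 1] for j in range(dim_length)]
    return out

# "Münster" is 7 characters but 8 bytes in utf-8, so the string
# dimension must be sized from the *encoded* data, not the characters.
print(strings_to_chararray(["Münster", "London"]).shape)  # (2, 8)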

lib/iris/tests/integration/netcdf/test_chararrays.py

Lines changed: 50 additions & 7 deletions
@@ -3,20 +3,31 @@
 import pytest
 
 import iris
+from iris.coords import AuxCoord, DimCoord
+from iris.cube import Cube
 
 NX, N_STRLEN = 3, 64
 TEST_STRINGS = ["Münster", "London", "Amsterdam"]
 TEST_COORD_VALS = ["bun", "éclair", "sandwich"]
 
 
-def convert_chararray(string_array_1d, maxlen, encoding="utf-8"):
+def convert_strings_to_chararray(string_array_1d, maxlen, encoding="utf-8"):
     bbytes = [text.encode(encoding) for text in string_array_1d]
     pad = b"\0" * maxlen
     bbytes = [(x + pad)[:maxlen] for x in bbytes]
     chararray = np.array([[bb[i : i + 1] for i in range(maxlen)] for bb in bbytes])
     return chararray
 
 
+# def convert_chararray_to_strings(char_array_2d, maxlen: int | None =0, encoding="utf-8"):
+#     strings = [bytes.decode(encoding) for bytes in char_array_2d]
+#     if not maxlen:
+#         maxlen = max(len(string) for string in strings)
+#     dtype_str = f"S{maxlen}"
+#     string_array = np.array(strings, dtype=dtype_str)
+#     return string_array
+
+
 INCLUDE_COORD = True
 # INCLUDE_COORD = False
 
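The inverse helper is left commented out in the commit, and as written it would not quite work: each row of the char array is a sequence of single 'S1' cells, which must be joined into one bytes object and stripped of NUL padding before decoding. A working sketch of the inverse conversion (illustrative, not part of the commit) might look like:

import numpy as np

def convert_chararray_to_strings(char_array_2d, encoding="utf-8"):
    # Join each row's 'S1' cells into one bytes object, strip the
    # NUL right-padding, then decode with the given encoding.
    strings = [
        b"".join(row.tolist()).rstrip(b"\0").decode(encoding)
        for row in char_array_2d
    ]
    return np.array(strings)

# Round-trips with convert_strings_to_chararray above:
chars = convert_strings_to_chararray(TEST_COORD_VALS, N_STRLEN)
print(convert_chararray_to_strings(chars))  # ['bun' 'éclair' 'sandwich']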
@@ -55,6 +66,23 @@ def make_testfile(filepath, chararray, coordarray, encoding_str=None):
         v.coordinates = "v_co"
 
 
+def make_testcube(
+    dataarray,
+    coordarray,  # for now, these are always *string* arrays
+    encoding_str: str | None = None,
+):
+    cube = Cube(dataarray, var_name="v")
+    cube.add_dim_coord(DimCoord(np.arange(NX), var_name="x"), 0)
+    if encoding_str is not None:
+        cube.attributes["_Encoding"] = encoding_str
+    if INCLUDE_COORD:
+        co_x = AuxCoord(coordarray, var_name="v_co")
+        if encoding_str is not None:
+            co_x.attributes["_Encoding"] = encoding_str
+        cube.add_aux_coord(co_x, 0)
+    return cube
+
+
 def show_result(filepath):
     from pp_utils import ncdump
 
@@ -73,14 +101,14 @@ def show_result(filepath):
     # print(repr(v[:]))
     print("\nAs iris cube..")
     try:
+        iris.loading.LOAD_PROBLEMS.reset()
         cube = iris.load_cube(filepath)
         print(cube)
-        if iris.loading.LOAD_PROBLEMS._problems:
+        if iris.loading.LOAD_PROBLEMS.problems:
             print(iris.loading.LOAD_PROBLEMS)
             print(
-                "\n".join(iris.loading.LOAD_PROBLEMS._problems[0].stack_trace.format())
+                "\n".join(iris.loading.LOAD_PROBLEMS.problems[0].stack_trace.format())
             )
-        iris.loading.LOAD_PROBLEMS._problems = []
         print("-data-")
         print(repr(cube.data))
         if INCLUDE_COORD:
@@ -106,14 +134,29 @@ def show_result(filepath):
 
 
 @pytest.mark.parametrize("encoding", tsts)
-def test_encodings(encoding):
+def test_load_encodings(encoding):
     # small change
     print(f"\n=========\nTesting encoding: {encoding}")
     filepath = f"tmp_{str(encoding)}.nc"
     do_as = encoding
     if encoding != "utf-32":
         do_as = "utf-8"
-    TEST_CHARARRAY = convert_chararray(TEST_STRINGS, N_STRLEN, encoding=do_as)
-    TEST_COORDARRAY = convert_chararray(TEST_COORD_VALS, N_STRLEN, encoding=do_as)
+    TEST_CHARARRAY = convert_strings_to_chararray(
+        TEST_STRINGS, N_STRLEN, encoding=do_as
+    )
+    TEST_COORDARRAY = convert_strings_to_chararray(
+        TEST_COORD_VALS, N_STRLEN, encoding=do_as
+    )
     make_testfile(filepath, TEST_CHARARRAY, TEST_COORDARRAY, encoding_str=encoding)
     show_result(filepath)
+
+
+@pytest.mark.parametrize("encoding", tsts)
+def test_save_encodings(encoding):
+    cube = make_testcube(
+        dataarray=TEST_STRINGS, coordarray=TEST_COORD_VALS, encoding_str=encoding
+    )
+    print(cube)
+    filepath = f"tmp_save_{str(encoding)}.nc"
+    iris.save(cube, filepath)
+    show_result(filepath)
