 
 """
 
+import codecs
 import collections
 from itertools import repeat, zip_longest
 import os
@@ -1801,46 +1802,85 @@ def _create_generic_cf_array_var( |
         if np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
-            string_dimension_depth = data.dtype.itemsize
-            if data.dtype.kind == "U":
-                string_dimension_depth //= 4
-            string_dimension_name = "string%d" % string_dimension_depth
+
+            # Encode data into bytes, and determine the string-dimension length.
+            #  * we can't work this out without first encoding the data
+            #  * UNLESS the target length is given (.iris_string_dimlength)
+            #  * we can't create the dimension before we know the length
+            #  * we can't create the variable before creating the dim (if needed)
+            # TODO: we can keep data lazy IFF there is a user-specified string-length
+
+            # Calculate the encoding to apply.
+            default_encoding = "utf-8"
+            encoding = element.attributes.get("_Encoding", None)
+            if encoding is None:
+                # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data.
+                encoding = default_encoding
+            else:
+                try:
+                    # Accept + normalise the naming of encodings.
+                    encoding = codecs.lookup(encoding).name
+                    # NOTE: if the encoding does not suit the data, errors can occur.
+                    # For example, _Encoding = "ascii" with non-ascii content.
+                except LookupError:
+                    # Replace an invalid setting with the "safe"(ish) fallback.
+                    encoding = default_encoding
+
+            # Convert data from an array of strings into a character array
+            # with an extra string-length dimension.
+
+            # TODO: support lazy data in some cases??
+            #  (N.B. possible when 'iris_string_dimlength' is provided)
+            if is_lazy_data(data):
+                (data,) = dask.compute(data)  # dask.compute returns a tuple
+
+            element_shape = data.shape
+            max_length = 1  # this is a MINIMUM - i.e. not zero!
+            data_elements = np.zeros(element_shape, dtype=object)
+            for index in np.ndindex(element_shape):
+                data_element = data[index].encode(encoding)
+                element_length = len(data_element)
+                data_elements[index] = data_element
+                if element_length > max_length:
+                    max_length = element_length
+
+            string_dimension_length = element.attributes.get(
+                "iris_string_dimlength", None
+            )
+            if string_dimension_length is None:
+                string_dimension_length = max_length
+
+            # We already encoded all the strings, but stored them in an object
+            # array, as we didn't yet know the fixed byte-length to convert to.
+            # Now convert to a fixed-width char array.
+            data = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+            right_pad = b"\0" * string_dimension_length
+            for index in np.ndindex(element_shape):
+                raw = data_elements[index]
+                padded = (raw + right_pad)[:string_dimension_length]
+                data[index] = [padded[i : i + 1] for i in range(string_dimension_length)]
 
             # Determine whether to create the string length dimension.
+            string_dimension_name = f"string{string_dimension_length}"
             if string_dimension_name not in self._dataset.dimensions:
                 while string_dimension_name in self._dataset.variables:
                     # Also avoid collision with variable names.
                     # See '_get_dim_names' for reason.
                     string_dimension_name = self._increment_name(string_dimension_name)
                 self._dataset.createDimension(
-                    string_dimension_name, string_dimension_depth
+                    string_dimension_name, string_dimension_length
                 )
 
             # Add the string length dimension to the variable dimensions.
             element_dims.append(string_dimension_name)
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
+            # Force data to always be exchanged as byte arrays.
+            # TODO: remove when the bug is fixed?
+            # see: https://github.com/Unidata/netcdf4-python/issues/1440
+            cf_var.set_auto_chartostring(False)
 
-            # Convert data from an array of strings into a character array
-            # with an extra string-length dimension.
-            if len(element_dims) == 1:
-                # Scalar variable (only has string dimension).
-                data_first = data[0]
-                if is_lazy_data(data_first):
-                    data_first = dask.compute(data_first)
-                data = list("%- *s" % (string_dimension_depth, data_first))
-            else:
-                # NOTE: at present, can't do this lazily??
-                orig_shape = data.shape
-                new_shape = orig_shape + (string_dimension_depth,)
-                new_data = np.zeros(new_shape, cf_var.dtype)
-                for index in np.ndindex(orig_shape):
-                    index_slice = tuple(list(index) + [slice(None, None)])
-                    new_data[index_slice] = list(
-                        "%- *s" % (string_dimension_depth, data[index])
-                    )
-                data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.
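
For illustration, here is a minimal standalone sketch of the encoding
normalisation used above. codecs.lookup accepts any registered alias and
returns a CodecInfo whose .name is the canonical encoding name, raising
LookupError for unrecognised names (the alias strings below are arbitrary
examples):

    import codecs

    for alias in ("UTF8", "utf_8", "Latin-1", "ASCII"):
        # Every registered alias maps to a single canonical name.
        print(alias, "->", codecs.lookup(alias).name)
    # UTF8 -> utf-8, utf_8 -> utf-8, Latin-1 -> iso8859-1, ASCII -> ascii

    try:
        codecs.lookup("no-such-codec")
    except LookupError:
        # This is the case where the code above falls back to utf-8.
        print("unknown encoding, fall back to the default")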
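
The two-pass conversion can also be tried in isolation. Below is a sketch
following the same logic as the new code, with illustrative variable names
not taken from the module; note that if 'iris_string_dimlength' specified a
width smaller than the longest encoded string, the slice in the second pass
would silently truncate the over-long bytes:

    import numpy as np

    strings = np.array([["a", "bc"], ["déf", ""]], dtype=np.str_)
    encoding = "utf-8"

    # Pass 1: encode each element, recording the longest byte-length.
    encoded = np.zeros(strings.shape, dtype=object)
    max_length = 1
    for index in np.ndindex(strings.shape):
        encoded[index] = strings[index].encode(encoding)
        max_length = max(max_length, len(encoded[index]))

    # Pass 2: right-pad to the fixed width and split into single bytes.
    chars = np.zeros(strings.shape + (max_length,), dtype="S1")
    pad = b"\0" * max_length
    for index in np.ndindex(strings.shape):
        padded = (encoded[index] + pad)[:max_length]
        chars[index] = [padded[i : i + 1] for i in range(max_length)]

    # Round trip: "déf" encodes to 4 bytes in utf-8, so the string
    # dimension is 4, and join/strip/decode recovers the original value.
    assert chars.shape == (2, 2, 4)
    assert b"".join(chars[1, 0]).rstrip(b"\0").decode(encoding) == "déf"

netCDF4 also provides a stringtochar helper that does a similar packing, but
it requires a fixed-width input dtype up front, whereas the loop here
discovers the width from the data.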
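
On the set_auto_chartostring(False) call: netCDF4-python can automatically
convert a '|S1' variable with a trailing string-length dimension to and from
whole fixed-width strings on access (the conversion is on by default, and is
triggered when the variable has an _Encoding attribute). Disabling it makes
the variable exchange raw byte arrays, which is the form this writer now
produces. A rough sketch of the two modes, using an illustrative file name:

    import netCDF4
    import numpy as np

    ds = netCDF4.Dataset("scratch.nc", "w")  # illustrative file name
    ds.createDimension("x", 2)
    ds.createDimension("string4", 4)
    var = ds.createVariable("labels", "S1", ("x", "string4"))
    var._Encoding = "ascii"  # this attribute is what enables auto conversion

    # Default mode: assign a whole fixed-width string and let netCDF4
    # pack it into single characters.
    var[0] = np.array("abcd", dtype="S4")

    # Conversion disabled: the same variable exchanges raw single-byte
    # character arrays, as the saver code above now does.
    var.set_auto_chartostring(False)
    var[1] = np.array([b"w", b"x", b"y", b"z"], dtype="S1")
    ds.close()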