diff --git a/Changelog b/Changelog index f9d0de0f5..c74134c2a 100644 --- a/Changelog +++ b/Changelog @@ -1,3 +1,8 @@ + version 1.7.4 (not yet released) + ================================ + * Make sure automatic conversion of character arrays <--> string arrays works for Unicode strings (issue #1440). + (previously only worked correctly for encoding="ascii"). + version 1.7.3 (tag v1.7.3rel) ============================= * Python 3.14 wheels (issue #1432) diff --git a/include/netcdf-compat.h b/include/netcdf-compat.h index d1144d979..ccfb8322e 100644 --- a/include/netcdf-compat.h +++ b/include/netcdf-compat.h @@ -60,7 +60,7 @@ static inline int nc_get_alignment(int* thresholdp, int* alignmentp) { #else #define HAS_NCRCSET 0 static inline int nc_rc_set(const char* key, const char* value) { return NC_EINVAL; } -static inline const char *nc_rc_get(const char* key) { return NC_EINVAL; } +static inline const char *nc_rc_get(const char* key) { return NULL; } #endif #if NC_VERSION_GE(4, 4, 0) diff --git a/src/netCDF4/__init__.pyi b/src/netCDF4/__init__.pyi index 97062cf51..e27fb6370 100644 --- a/src/netCDF4/__init__.pyi +++ b/src/netCDF4/__init__.pyi @@ -699,6 +699,7 @@ def stringtoarr( def stringtochar( a: npt.NDArray[np.character], encoding: Literal["none", "None", "bytes"], + n_strlen: int | None = None, ) -> npt.NDArray[np.bytes_]: ... @overload def stringtochar( diff --git a/src/netCDF4/_netCDF4.pyx b/src/netCDF4/_netCDF4.pyx index acdfdf5a5..f0025dcd2 100644 --- a/src/netCDF4/_netCDF4.pyx +++ b/src/netCDF4/_netCDF4.pyx @@ -1066,7 +1066,7 @@ If the `_Encoding` special attribute is set for a character array (dtype `S1`) variable, the `chartostring` utility function is used to convert the array of characters to an array of strings with one less dimension (the last dimension is interpreted as the length of each string) when reading the data. The character -set (usually ascii) is specified by the `_Encoding` attribute. If `_Encoding` +set is specified by the `_Encoding` attribute. If `_Encoding` is 'none' or 'bytes', then the character array is converted to a numpy fixed-width byte string array (dtype `S#`), otherwise a numpy unicode (dtype `U#`) array is created. When writing the data, @@ -5525,11 +5525,15 @@ cannot be safely cast to variable data type""" % attname # if data is a string or a bytes object, convert to a numpy string array # whose length is equal to the rightmost dimension of the # variable. - if type(data) in [str,bytes]: data = numpy.asarray(data,dtype='S'+repr(self.shape[-1])) + if type(data) in [str,bytes]: + if encoding == 'ascii': + data = numpy.asarray(data,dtype='S'+repr(self.shape[-1])) + else: + data = numpy.asarray(data,dtype='U'+repr(self.shape[-1])) if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1: # if data is a numpy string array, convert it to an array # of characters with one more dimension. - data = stringtochar(data, encoding=encoding) + data = stringtochar(data, encoding=encoding,n_strlen=self.shape[-1]) # if structured data has strings (and _Encoding att set), create view as char arrays # (issue #773) @@ -6771,9 +6775,9 @@ returns a rank 1 numpy character array of length NUMCHARS with datatype `'S1'` arr[0:len(string)] = tuple(string) return arr -def stringtochar(a,encoding='utf-8'): +def stringtochar(a,encoding='utf-8',n_strlen=None): """ -**`stringtochar(a,encoding='utf-8')`** +**`stringtochar(a,encoding='utf-8',n_strlen=None)`** convert a string array to a character array with one extra dimension @@ -6785,16 +6789,29 @@ optional kwarg `encoding` can be used to specify character encoding (default `utf-8`). If `encoding` is 'none' or 'bytes', a `numpy.string_` the input array is treated a raw byte strings (`numpy.string_`). +optional kwarg `n_strlen` is the number of characters in each string. Default +is None, which means `n_strlen` will be set to a.itemsize (the number of bytes +used to represent each string in the input array). + returns a numpy character array with datatype `'S1'` or `'U1'` and shape `a.shape + (N,)`, where N is the length of each string in a.""" dtype = a.dtype.kind + if n_strlen is None: + n_strlen = a.dtype.itemsize if dtype not in ["S","U"]: raise ValueError("type must string or unicode ('S' or 'U')") if encoding in ['none','None','bytes']: b = numpy.array(tuple(a.tobytes()),'S1') - else: + elif encoding == 'ascii': b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1') - b.shape = a.shape + (a.itemsize,) + b.shape = a.shape + (n_strlen,) + else: + if not a.ndim: + a = numpy.array([a]) + bbytes = [text.encode(encoding) for text in a] + pad = b'\0' * n_strlen + bbytes = [(x + pad)[:n_strlen] for x in bbytes] + b = numpy.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes]) return b def chartostring(b,encoding='utf-8'): @@ -6816,15 +6833,12 @@ returns a numpy string array with datatype `'UN'` (or `'SN'`) and shape dtype = b.dtype.kind if dtype not in ["S","U"]: raise ValueError("type must be string or unicode ('S' or 'U')") - if encoding in ['none','None','bytes']: - bs = b.tobytes() - else: - bs = b.tobytes().decode(encoding) + bs = b.tobytes() slen = int(b.shape[-1]) if encoding in ['none','None','bytes']: a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'S'+repr(slen)) else: - a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'U'+repr(slen)) + a = numpy.array([bs[n1:n1+slen].decode(encoding) for n1 in range(0,len(bs),slen)],'U'+repr(slen)) a.shape = b.shape[:-1] return a diff --git a/test/test_stringarr.py b/test/test_stringarr.py index 9d4fcd909..780dd7ca7 100644 --- a/test/test_stringarr.py +++ b/test/test_stringarr.py @@ -3,6 +3,7 @@ import unittest import os from numpy.testing import assert_array_equal, assert_array_almost_equal +import numpy as np def generateString(length, alphabet=string.ascii_letters + string.digits + string.punctuation): return(''.join([random.choice(alphabet) for i in range(length)])) @@ -20,6 +21,11 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin datau = data.astype('U') datac = stringtochar(data, encoding='ascii') +nx, n_strlen = 3, 12 +unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen)) +unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen)) +unicode_strings2_bytes = [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'\xd0', b'\x9c', b'\xd0', b'\xbe', b'\xd1', b'\x81', b'\xd0', b'\xba', b'\xd0', b'\xb2', b'\xd0', b'\xb0', b'\xe6', b'\x9d', b'\xb1', b'\xe4', b'\xba', b'\xac'] + class StringArrayTestCase(unittest.TestCase): def setUp(self): @@ -28,6 +34,8 @@ def setUp(self): nc.createDimension('n1',None) nc.createDimension('n2',n2) nc.createDimension('nchar',nchar) + nc.createDimension("x", nx) + nc.createDimension("nstr", n_strlen) v = nc.createVariable('strings','S1',('n1','n2','nchar')) v2 = nc.createVariable('strings2','S1',('n1','n2','nchar')) # if _Encoding set, string array should automatically be converted @@ -44,6 +52,11 @@ def setUp(self): v2[-1,-1] = data[-1,-1].tobytes() # write single python string # _Encoding should be ignored if an array of characters is specified v3[:] = stringtochar(data, encoding='ascii') + # test unicode strings (issue #1440) + v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",)) + v4._Encoding = "UTF-8" + v4[:] = unicode_strings + v4[1] = "Москва" nc.close() def tearDown(self): @@ -57,6 +70,10 @@ def runTest(self): v = nc.variables['strings'] v2 = nc.variables['strings2'] v3 = nc.variables['strings3'] + v4 = nc.variables['strings4'] + assert np.all(v4[:]==unicode_strings2) + v4.set_auto_chartostring(False) + assert (v4[:].compressed().tolist() == unicode_strings2_bytes) assert v.dtype.str[1:] in ['S1','U1'] assert v.shape == (nrecs,n2,nchar) for nrec in range(nrecs):