Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
version 1.7.4 (not yet released)
================================
* Make sure automatic conversion of character arrays <--> string arrays works for Unicode strings (issue #1440).
(Previously this only worked correctly for encoding="ascii".)

version 1.7.3 (tag v1.7.3rel)
=============================
* Python 3.14 wheels (issue #1432)
Expand Down
2 changes: 1 addition & 1 deletion include/netcdf-compat.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ static inline int nc_get_alignment(int* thresholdp, int* alignmentp) {
#else
#define HAS_NCRCSET 0
static inline int nc_rc_set(const char* key, const char* value) { return NC_EINVAL; }
static inline const char *nc_rc_get(const char* key) { return NC_EINVAL; }
static inline const char *nc_rc_get(const char* key) { return NULL; }
#endif

#if NC_VERSION_GE(4, 4, 0)
Expand Down
1 change: 1 addition & 0 deletions src/netCDF4/__init__.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -699,6 +699,7 @@ def stringtoarr(
def stringtochar(
a: npt.NDArray[np.character],
encoding: Literal["none", "None", "bytes"],
n_strlen: int | None = None,
) -> npt.NDArray[np.bytes_]: ...
@overload
def stringtochar(
Expand Down
38 changes: 26 additions & 12 deletions src/netCDF4/_netCDF4.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -1066,7 +1066,7 @@ If the `_Encoding` special attribute is set for a character array
(dtype `S1`) variable, the `chartostring` utility function is used to convert the array of
characters to an array of strings with one less dimension (the last dimension is
interpreted as the length of each string) when reading the data. The character
set (usually ascii) is specified by the `_Encoding` attribute. If `_Encoding`
set is specified by the `_Encoding` attribute. If `_Encoding`
is 'none' or 'bytes', then the character array is converted to a numpy
fixed-width byte string array (dtype `S#`), otherwise a numpy unicode (dtype
`U#`) array is created. When writing the data,
Expand Down Expand Up @@ -5525,11 +5525,15 @@ cannot be safely cast to variable data type""" % attname
# if data is a string or a bytes object, convert to a numpy string array
# whose length is equal to the rightmost dimension of the
# variable.
if type(data) in [str,bytes]: data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
if type(data) in [str,bytes]:
if encoding == 'ascii':
data = numpy.asarray(data,dtype='S'+repr(self.shape[-1]))
else:
data = numpy.asarray(data,dtype='U'+repr(self.shape[-1]))
if data.dtype.kind in ['S','U'] and data.dtype.itemsize > 1:
# if data is a numpy string array, convert it to an array
# of characters with one more dimension.
data = stringtochar(data, encoding=encoding)
data = stringtochar(data, encoding=encoding,n_strlen=self.shape[-1])

# if structured data has strings (and _Encoding att set), create view as char arrays
# (issue #773)
Expand Down Expand Up @@ -6771,9 +6775,9 @@ returns a rank 1 numpy character array of length NUMCHARS with datatype `'S1'`
arr[0:len(string)] = tuple(string)
return arr

def stringtochar(a,encoding='utf-8'):
def stringtochar(a,encoding='utf-8',n_strlen=None):
"""
**`stringtochar(a,encoding='utf-8')`**
**`stringtochar(a,encoding='utf-8',n_strlen=None)`**

convert a string array to a character array with one extra dimension

Expand All @@ -6785,16 +6789,29 @@ optional kwarg `encoding` can be used to specify character encoding (default
`utf-8`). If `encoding` is 'none' or 'bytes', the input array
is treated as raw byte strings (`numpy.string_`).

optional kwarg `n_strlen` is the number of characters in each string. Default
is None, which means `n_strlen` will be set to a.itemsize (the number of bytes
used to represent each string in the input array).

returns a numpy character array with datatype `'S1'` or `'U1'`
and shape `a.shape + (N,)`, where N is the length of each string in a."""
dtype = a.dtype.kind
if n_strlen is None:
n_strlen = a.dtype.itemsize
if dtype not in ["S","U"]:
raise ValueError("type must string or unicode ('S' or 'U')")
if encoding in ['none','None','bytes']:
b = numpy.array(tuple(a.tobytes()),'S1')
else:
elif encoding == 'ascii':
b = numpy.array(tuple(a.tobytes().decode(encoding)),dtype+'1')
b.shape = a.shape + (a.itemsize,)
b.shape = a.shape + (n_strlen,)
else:
if not a.ndim:
a = numpy.array([a])
bbytes = [text.encode(encoding) for text in a]
pad = b'\0' * n_strlen
bbytes = [(x + pad)[:n_strlen] for x in bbytes]
b = numpy.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes])
return b

def chartostring(b,encoding='utf-8'):
Expand All @@ -6816,15 +6833,12 @@ returns a numpy string array with datatype `'UN'` (or `'SN'`) and shape
dtype = b.dtype.kind
if dtype not in ["S","U"]:
raise ValueError("type must be string or unicode ('S' or 'U')")
if encoding in ['none','None','bytes']:
bs = b.tobytes()
else:
bs = b.tobytes().decode(encoding)
bs = b.tobytes()
slen = int(b.shape[-1])
if encoding in ['none','None','bytes']:
a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'S'+repr(slen))
else:
a = numpy.array([bs[n1:n1+slen] for n1 in range(0,len(bs),slen)],'U'+repr(slen))
a = numpy.array([bs[n1:n1+slen].decode(encoding) for n1 in range(0,len(bs),slen)],'U'+repr(slen))
a.shape = b.shape[:-1]
return a

Expand Down
17 changes: 17 additions & 0 deletions test/test_stringarr.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import unittest
import os
from numpy.testing import assert_array_equal, assert_array_almost_equal
import numpy as np

def generateString(length, alphabet=string.ascii_letters + string.digits + string.punctuation):
return(''.join([random.choice(alphabet) for i in range(length)]))
Expand All @@ -20,6 +21,11 @@ def generateString(length, alphabet=string.ascii_letters + string.digits + strin
datau = data.astype('U')
datac = stringtochar(data, encoding='ascii')

nx, n_strlen = 3, 12
unicode_strings = np.array(['Münster', 'Liége', '東京'],dtype='U'+str(n_strlen))
unicode_strings2 = np.array(['Münster', 'Москва', '東京'],dtype='U'+str(n_strlen))
unicode_strings2_bytes = [b'M', b'\xc3', b'\xbc', b'n', b's', b't', b'e', b'r', b'\xd0', b'\x9c', b'\xd0', b'\xbe', b'\xd1', b'\x81', b'\xd0', b'\xba', b'\xd0', b'\xb2', b'\xd0', b'\xb0', b'\xe6', b'\x9d', b'\xb1', b'\xe4', b'\xba', b'\xac']

class StringArrayTestCase(unittest.TestCase):

def setUp(self):
Expand All @@ -28,6 +34,8 @@ def setUp(self):
nc.createDimension('n1',None)
nc.createDimension('n2',n2)
nc.createDimension('nchar',nchar)
nc.createDimension("x", nx)
nc.createDimension("nstr", n_strlen)
v = nc.createVariable('strings','S1',('n1','n2','nchar'))
v2 = nc.createVariable('strings2','S1',('n1','n2','nchar'))
# if _Encoding set, string array should automatically be converted
Expand All @@ -44,6 +52,11 @@ def setUp(self):
v2[-1,-1] = data[-1,-1].tobytes() # write single python string
# _Encoding should be ignored if an array of characters is specified
v3[:] = stringtochar(data, encoding='ascii')
# test unicode strings (issue #1440)
v4 = nc.createVariable("strings4", "S1", dimensions=("x", "nstr",))
v4._Encoding = "UTF-8"
v4[:] = unicode_strings
v4[1] = "Москва"
nc.close()

def tearDown(self):
Expand All @@ -57,6 +70,10 @@ def runTest(self):
v = nc.variables['strings']
v2 = nc.variables['strings2']
v3 = nc.variables['strings3']
v4 = nc.variables['strings4']
assert np.all(v4[:]==unicode_strings2)
v4.set_auto_chartostring(False)
assert (v4[:].compressed().tolist() == unicode_strings2_bytes)
assert v.dtype.str[1:] in ['S1','U1']
assert v.shape == (nrecs,n2,nchar)
for nrec in range(nrecs):
Expand Down
Loading