-
Notifications
You must be signed in to change notification settings - Fork 268
Closed
Description
Is this a bug?
My example is pretty simple, if maybe a bit long,
but here it is ...
>>> import netCDF4 as nc
>>> import numpy as np
>>> import sys
>>>
>>> print("Python version:", sys.version)
Python version: 3.12.12 | packaged by conda-forge | (main, Oct 13 2025, 14:34:15) [GCC 14.3.0]
>>> print("NetCDF4 version:", nc.__version__)
NetCDF4 version: 1.7.2
>>>
>>> filepath = "tmp.nc"
>>> nx, n_strlen = 3, 10
>>>
>>> ds = nc.Dataset(filepath, "w")
>>> ds.createDimension("x", nx)
"<class 'netCDF4.Dimension'>": name = 'x', size = 3
>>> ds.createDimension("nstr", n_strlen)
"<class 'netCDF4.Dimension'>": name = 'nstr', size = 10
>>> v = ds.createVariable("v", "S1", dimensions=("x", "nstr",))
>>>
>>> strings = ['Münster', 'London', 'Amsterdam']
>>> print(
... "Strings for variable content:"
... f"\n{strings}"
... "\n"
... )
Strings for variable content:
['Münster', 'London', 'Amsterdam']
>>>
>>> bbytes = [text.encode("utf-8") for text in strings]
>>> pad = b'\0' * n_strlen
>>> bbytes = [(x + pad)[:n_strlen] for x in bbytes]
>>> print(
... "Equal-length byte objects with utf-8 encoding:"
... f"\n{bbytes}"
... "\n"
... )
Equal-length byte objects with utf-8 encoding:
[b'M\xc3\xbcnster\x00\x00', b'London\x00\x00\x00\x00', b'Amsterdam\x00']
>>>
>>> bytesarray = np.array(bbytes, dtype=f"S{n_strlen}")
>>> print(
... "\nBytes(Sxx) array:"
... f"\n{bytesarray}"
... f"\n :: shape={bytesarray.shape} dtype ={bytesarray.dtype}"
... "\n"
... )
Bytes(Sxx) array:
[b'M\xc3\xbcnster' b'London' b'Amsterdam']
:: shape=(3,) dtype =|S10
>>>
>>> chararray = np.array([[bb[i:i+1] for i in range(n_strlen)] for bb in bbytes])
>>> print(
... "\nCharacter(S1) array:"
... f"\n{chararray}"
... f" :: shape={chararray.shape} dtype ={chararray.dtype}"
... "\n"
... )
Character(S1) array:
[[b'M' b'\xc3' b'\xbc' b'n' b's' b't' b'e' b'r' b'' b'']
[b'L' b'o' b'n' b'd' b'o' b'n' b'' b'' b'' b'']
[b'A' b'm' b's' b't' b'e' b'r' b'd' b'a' b'm' b'']] :: shape=(3, 10) dtype =|S1
>>>
>>> # Store chararray in variable, and mark as UTF8 encoded
>>> v._Encoding = "UTF-8"
>>> v[:] = chararray
>>>
>>> ds.close()
>>>
>>> from os import system as run
>>> print("\nNCDUMP of file:")
NCDUMP of file:
>>> run(f"ncdump {filepath}")
netcdf tmp {
dimensions:
x = 3 ;
nstr = 10 ;
variables:
char v(x, nstr) ;
v:_Encoding = "UTF-8" ;
data:
v =
"M\303\274nster",
"London",
"Amsterdam" ;
}
0
>>>
>>> ds2 = nc.Dataset(filepath)
>>> v = ds2.variables['v']
>>> data = v[:]
>>> print(
... "\nData read back from file variable:"
... f"\n{data}"
... f" :: shape={data.shape} dtype ={data.dtype}"
... "\n"
... )
Data read back from file variable:
['Münster\x00\x00L' 'ondon\x00\x00\x00\x00A' 'msterdam'] :: shape=(3,) dtype =<U10
>>>
>>> print(
... "individual elements..."
... f"{''.join(f"\n {elem}" for elem in data)}"
... "\n"
... )
individual elements...
MünsterL
ondonA
msterdam
>>>
>>> v.set_auto_chartostring(False)
>>> data = v[:]
>>>
>>> print(
... "\nSame variable data, read *without* encoding interpretation:",
... '\n', data,
... f"\n :: shape={data.shape} dtype ={data.dtype}"
... "\n"
... )
Same variable data, read *without* encoding interpretation:
[[b'M' b'\xc3' b'\xbc' b'n' b's' b't' b'e' b'r' -- --]
[b'L' b'o' b'n' b'd' b'o' b'n' -- -- -- --]
[b'A' b'm' b's' b't' b'e' b'r' b'd' b'a' b'm' --]]
:: shape=(3, 10) dtype =|S1
So what??
The key point is what happens when I try to read back the variable, with "chartostring" enabled:
Data read back from file variable:
['Münster\x00\x00L' 'ondon\x00\x00\x00\x00A' 'msterdam'] :: shape=(3,) dtype =<U10
individual elements...
MünsterL
ondonA
msterdam
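So, that does look wrong: the null padding is kept in the decoded strings, and the first letters of 'London' and 'Amsterdam' end up at the end of the preceding elements.
However, when read without decoding, the content is all OK.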
Additional
When I try to write a non-ASCII string to the variable, that also fails:
>>> with nc.Dataset(filepath, 'r+') as ds2:
... print("\nModifying variable:")
... v = ds2.variables['v']
... v[0] = "Liége"
...
Modifying variable:
Traceback (most recent call last):
File "<stdin>", line 4, in <module>
File "src/netCDF4/_netCDF4.pyx", line 5513, in netCDF4._netCDF4.Variable.__setitem__
UnicodeEncodeError: 'ascii' codec can't encode character '\xe9' in position 2: ordinal not in range(128)
>>>
So, do I just need to set "_Encoding" to something different?
Metadata
Metadata
Assignees
Labels
No labels