 
 """
 
+import codecs
 import collections
 from itertools import repeat, zip_longest
 import os
@@ -1801,46 +1802,85 @@ def _create_generic_cf_array_var( |
         if np.issubdtype(data.dtype, np.str_):
             # Deal with string-type variables.
             # Typically CF label variables, but also possibly ancil-vars ?
-            string_dimension_depth = data.dtype.itemsize
-            if data.dtype.kind == "U":
-                string_dimension_depth //= 4
-            string_dimension_name = "string%d" % string_dimension_depth
+
+            # Encode data into bytes, and determine the string-dimension length.
+            #  * we can't work this out without first encoding the data
+            #  * UNLESS the target length is given (.iris_string_dimlength)
+            #  * we can't create the dimension before we know the length
+            #  * we can't create the variable before creating the dim (if needed)
+            # TODO: we can keep data lazy IFF there is a user-specified string-length
+
+            # Calculate the encoding to apply.
+            default_encoding = "utf-8"
+            encoding = element.attributes.get("_Encoding", None)
+            if encoding is None:
+                # utf-8 is a reasonable "safe" default, equivalent to 'ascii' for ascii data.
+                encoding = default_encoding
+            else:
+                try:
+                    # Accept + normalise the naming of encodings.
+                    encoding = codecs.lookup(encoding).name
+                    # NOTE: if the encoding does not suit the data, errors can occur.
+                    # For example, _Encoding = "ascii" with non-ascii content.
+                except LookupError:
+                    # Replace an invalid setting with the "safe"(ish) fallback.
+                    encoding = default_encoding
+
+            # Convert data from an array of strings into a character array
+            # with an extra string-length dimension.
+
+            # TODO: support lazy data in some cases??
+            #  (N.B. possible when 'iris_string_dimlength' is provided)
+            if is_lazy_data(data):
+                (data,) = dask.compute(data)  # dask.compute returns a tuple
+
+            element_shape = data.shape
+            max_length = 1  # this is a MINIMUM - i.e. not zero!
+            data_elements = np.zeros(element_shape, dtype=object)
+            for index in np.ndindex(element_shape):
+                data_element = data[index].encode(encoding)
+                element_length = len(data_element)
+                data_elements[index] = data_element
+                if element_length > max_length:
+                    max_length = element_length
+
+            string_dimension_length = element.attributes.get(
+                "iris_string_dimlength", None
+            )
+            if string_dimension_length is None:
+                string_dimension_length = max_length
+
+            # We already encoded all the strings, but stored them in an object
+            # array, as we didn't yet know the fixed byte-length to convert to.
+            # Now convert to a fixed-width char array.
+            data = np.zeros(element_shape + (string_dimension_length,), dtype="S1")
+            right_pad = b"\0" * string_dimension_length
+            for index in np.ndindex(element_shape):
+                raw = data_elements[index]
+                padded = (raw + right_pad)[:string_dimension_length]
+                data[index] = [padded[i : i + 1] for i in range(string_dimension_length)]
 
             # Determine whether to create the string length dimension.
+            string_dimension_name = f"string{string_dimension_length}"
             if string_dimension_name not in self._dataset.dimensions:
                 while string_dimension_name in self._dataset.variables:
                     # Also avoid collision with variable names.
                     # See '_get_dim_names' for reason.
                     string_dimension_name = self._increment_name(string_dimension_name)
                 self._dataset.createDimension(
-                    string_dimension_name, string_dimension_depth
+                    string_dimension_name, string_dimension_length
                 )
 
             # Add the string length dimension to the variable dimensions.
             element_dims.append(string_dimension_name)
 
             # Create the label coordinate variable.
             cf_var = self._dataset.createVariable(cf_name, "|S1", element_dims)
+            # Force data to always be exchanged as byte arrays.
+            # TODO: remove when the bug is fixed?
+            # see: https://github.com/Unidata/netcdf4-python/issues/1440
+            cf_var.set_auto_chartostring(False)
 
-            # Convert data from an array of strings into a character array
-            # with an extra string-length dimension.
-            if len(element_dims) == 1:
-                # Scalar variable (only has string dimension).
-                data_first = data[0]
-                if is_lazy_data(data_first):
-                    data_first = dask.compute(data_first)
-                data = list("%- *s" % (string_dimension_depth, data_first))
-            else:
-                # NOTE: at present, can't do this lazily??
-                orig_shape = data.shape
-                new_shape = orig_shape + (string_dimension_depth,)
-                new_data = np.zeros(new_shape, cf_var.dtype)
-                for index in np.ndindex(orig_shape):
-                    index_slice = tuple(list(index) + [slice(None, None)])
-                    new_data[index_slice] = list(
-                        "%- *s" % (string_dimension_depth, data[index])
-                    )
-                data = new_data
         else:
             # A normal (numeric) variable.
             # ensure a valid datatype for the file format.
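
For illustration, here is a minimal standalone sketch of the encoding
normalisation used above. codecs.lookup accepts any registered alias and
returns a CodecInfo whose .name is the canonical encoding name, raising
LookupError for unrecognised names (the alias strings below are arbitrary
examples):

    import codecs

    for alias in ("UTF8", "utf_8", "Latin-1", "ASCII"):
        # Every registered alias maps to a single canonical name.
        print(alias, "->", codecs.lookup(alias).name)
    # UTF8 -> utf-8, utf_8 -> utf-8, Latin-1 -> iso8859-1, ASCII -> ascii

    try:
        codecs.lookup("no-such-codec")
    except LookupError:
        # This is the case where the code above falls back to utf-8.
        print("unknown encoding, fall back to the default")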
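
The two-pass conversion can also be tried in isolation. Below is a sketch
following the same logic as the new code, with illustrative variable names
not taken from the module; note that if 'iris_string_dimlength' specified a
width smaller than the longest encoded string, the slice in the second pass
would silently truncate the over-long bytes:

    import numpy as np

    strings = np.array([["a", "bc"], ["déf", ""]], dtype=np.str_)
    encoding = "utf-8"

    # Pass 1: encode each element, recording the longest byte-length.
    encoded = np.zeros(strings.shape, dtype=object)
    max_length = 1
    for index in np.ndindex(strings.shape):
        encoded[index] = strings[index].encode(encoding)
        max_length = max(max_length, len(encoded[index]))

    # Pass 2: right-pad to the fixed width and split into single bytes.
    chars = np.zeros(strings.shape + (max_length,), dtype="S1")
    pad = b"\0" * max_length
    for index in np.ndindex(strings.shape):
        padded = (encoded[index] + pad)[:max_length]
        chars[index] = [padded[i : i + 1] for i in range(max_length)]

    # Round trip: "déf" encodes to 4 bytes in utf-8, so the string
    # dimension is 4, and join/strip/decode recovers the original value.
    assert chars.shape == (2, 2, 4)
    assert b"".join(chars[1, 0]).rstrip(b"\0").decode(encoding) == "déf"

netCDF4 also provides a stringtochar helper that does a similar packing, but
it requires a fixed-width input dtype up front, whereas the loop here
discovers the width from the data.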
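
On the set_auto_chartostring(False) call: netCDF4-python can automatically
convert a '|S1' variable with a trailing string-length dimension to and from
whole fixed-width strings on access (the conversion is on by default, and is
triggered when the variable has an _Encoding attribute). Disabling it makes
the variable exchange raw byte arrays, which is the form this writer now
produces. A rough sketch of the two modes, using an illustrative file name:

    import netCDF4
    import numpy as np

    ds = netCDF4.Dataset("scratch.nc", "w")  # illustrative file name
    ds.createDimension("x", 2)
    ds.createDimension("string4", 4)
    var = ds.createVariable("labels", "S1", ("x", "string4"))
    var._Encoding = "ascii"  # this attribute is what enables auto conversion

    # Default mode: assign a whole fixed-width string and let netCDF4
    # pack it into single characters.
    var[0] = np.array("abcd", dtype="S4")

    # Conversion disabled: the same variable exchanges raw single-byte
    # character arrays, as the saver code above now does.
    var.set_auto_chartostring(False)
    var[1] = np.array([b"w", b"x", b"y", b"z"], dtype="S1")
    ds.close()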