diff --git a/lib/iris/fileformats/netcdf.py b/lib/iris/fileformats/netcdf.py index 4818f621c8..d4b7c84c44 100644 --- a/lib/iris/fileformats/netcdf.py +++ b/lib/iris/fileformats/netcdf.py @@ -1376,6 +1376,8 @@ def _create_cf_dimensions( unlimited_dim_names.append(dim_name) for dim_name in dimension_names: + # NOTE: these dim-names have been chosen by _get_dim_names, and + # were already checked+fixed to avoid any name collisions. if dim_name not in self._dataset.dimensions: if dim_name in unlimited_dim_names: size = None @@ -1468,6 +1470,10 @@ def _add_mesh(self, cube_or_mesh): last_dim = f"{cf_mesh_name}_{loc_from}_N_{loc_to}s" # Create if it does not already exist. if last_dim not in self._dataset.dimensions: + while last_dim in self._dataset.variables: + # Also avoid collision with variable names. + # See '_get_dim_names' for reason. + last_dim = self._increment_name(last_dim) length = conn.shape[1 - conn.location_axis] self._dataset.createDimension(last_dim, length) @@ -1869,8 +1875,19 @@ def record_dimension(names_list, dim_name, length, matching_coords=[]): assert dim_name is not None # Ensure it is a valid variable name. dim_name = self.cf_valid_var_name(dim_name) - # Disambiguate if it matches an existing one. - while dim_name in self._existing_dim: + # Disambiguate if it has the same name as an existing + # dimension. + # NOTE: *OR* if it matches the name of an existing file + # variable. Because there is a bug ... + # See https://github.com/Unidata/netcdf-c/issues/1772 + # N.B. the workarounds here *ONLY* function because the + # caller (write) will not create any more variables + # in between choosing dim names (here), and creating + # the new dims (via '_create_cf_dimensions'). + while ( + dim_name in self._existing_dim + or dim_name in self._dataset.variables + ): dim_name = self._increment_name(dim_name) # Record the new dimension. @@ -1915,9 +1932,15 @@ def record_dimension(names_list, dim_name, length, matching_coords=[]): dim_name = self._get_coord_variable_name( cube, coord ) + # Disambiguate if it has the same name as an + # existing dimension. + # OR if it matches an existing file variable name. + # NOTE: check against variable names is needed + # because of a netcdf bug ... see note in the + # mesh dimensions block above. while ( dim_name in self._existing_dim - or dim_name in self._name_coord_map.names + or dim_name in self._dataset.variables ): dim_name = self._increment_name(dim_name) @@ -1925,16 +1948,18 @@ def record_dimension(names_list, dim_name, length, matching_coords=[]): # No CF-netCDF coordinates describe this data dimension. # Make up a new, distinct dimension name dim_name = f"dim{dim}" - if dim_name in self._existing_dim: - # Increment name if conflicted with one already existing. - if self._existing_dim[dim_name] != cube.shape[dim]: - while ( - dim_name in self._existing_dim - and self._existing_dim[dim_name] - != cube.shape[dim] - or dim_name in self._name_coord_map.names - ): - dim_name = self._increment_name(dim_name) + # Increment name if conflicted with one already existing + # (or planned) + # NOTE: check against variable names is needed because + # of a netcdf bug ... see note in the mesh dimensions + # block above. + while ( + dim_name in self._existing_dim + and ( + self._existing_dim[dim_name] != cube.shape[dim] + ) + ) or dim_name in self._dataset.variables: + dim_name = self._increment_name(dim_name) # Record the dimension. record_dimension( @@ -2065,6 +2090,12 @@ def _create_cf_bounds(self, coord, cf_var, cf_name): if bounds_dimension_name not in self._dataset.dimensions: # Create the bounds dimension with the appropriate extent. + while bounds_dimension_name in self._dataset.variables: + # Also avoid collision with variable names. + # See '_get_dim_names' for reason. + bounds_dimension_name = self._increment_name( + bounds_dimension_name + ) self._dataset.createDimension(bounds_dimension_name, n_bounds) boundsvar_name = "{}_{}".format(cf_name, varname_extra) @@ -2345,6 +2376,12 @@ def _create_generic_cf_array_var( # Determine whether to create the string length dimension. if string_dimension_name not in self._dataset.dimensions: + while string_dimension_name in self._dataset.variables: + # Also avoid collision with variable names. + # See '_get_dim_names' for reason. + string_dimension_name = self._increment_name( + string_dimension_name + ) self._dataset.createDimension( string_dimension_name, string_dimension_depth ) diff --git a/lib/iris/tests/stock/mesh.py b/lib/iris/tests/stock/mesh.py index ca15ee1c97..da226a3790 100644 --- a/lib/iris/tests/stock/mesh.py +++ b/lib/iris/tests/stock/mesh.py @@ -61,7 +61,11 @@ def sample_mesh(n_nodes=None, n_faces=None, n_edges=None, lazy_values=False): units="degrees_east", long_name="long-name", var_name="var-name", - attributes={"a": 1, "b": "c"}, + attributes={ + # N.B. cast this so that a save-load roundtrip preserves it + "a": np.int64(1), + "b": "c", + }, ) node_y = AuxCoord(1200 + arr.arange(n_nodes), standard_name="latitude") diff --git a/lib/iris/tests/unit/fileformats/netcdf/test_save.py b/lib/iris/tests/unit/fileformats/netcdf/test_save.py index 830d8c5e52..669a3c4137 100644 --- a/lib/iris/tests/unit/fileformats/netcdf/test_save.py +++ b/lib/iris/tests/unit/fileformats/netcdf/test_save.py @@ -9,16 +9,21 @@ # importing anything else. import iris.tests as tests # isort:skip +from pathlib import Path +from shutil import rmtree +from tempfile import mkdtemp from unittest import mock import netCDF4 as nc import numpy as np import iris -from iris.coords import DimCoord +from iris.coords import AuxCoord, DimCoord from iris.cube import Cube, CubeList +from iris.experimental.ugrid import PARSE_UGRID_ON_LOAD from iris.fileformats.netcdf import CF_CONVENTIONS_VERSION, save from iris.tests.stock import lat_lon_cube +from iris.tests.stock.mesh import sample_mesh_cube class Test_conventions(tests.IrisTest): @@ -211,5 +216,145 @@ def test_multi_wrong_length(self): save(cubes, "dummy.nc", fill_value=fill_values) +class Test_HdfSaveBug(tests.IrisTest): + """ + Check for a known problem with netcdf4. + + If you create dimension with the same name as an existing variable, there + is a specific problem, relating to HDF so limited to netcdf-4 formats. + See : https://github.com/Unidata/netcdf-c/issues/1772 + + In all these testcases, a straightforward translation to the file would be + able to save [cube_2, cube_1], but *not* [cube_1, cube_2], + because the latter creates a dim of the same name as the 'cube_1' data + variable. + + Here, we are testing the specific workarounds in Iris netcdf save which + avoids that problem. + Unfortunately, owing to the complexity of the iris.fileformats.netcdf.Saver + code, there are several separate places where this had to be fixed. + + N.B. we also check that the data (mostly) survives a save-load roundtrip. + To identify the read-back cubes with the originals, we use var-names, + which works because the save code opts to adjust dimension names _instead_. + + """ + + def _check_save_and_reload(self, cubes): + tempdir = Path(mkdtemp()) + filepath = tempdir / "tmp.nc" + try: + # Save the given cubes. + save(cubes, filepath) + + # Load them back for roundtrip testing. + with PARSE_UGRID_ON_LOAD.context(): + new_cubes = iris.load(str(filepath)) + + # There should definitely still be the same number of cubes. + self.assertEqual(len(new_cubes), len(cubes)) + + # Get results in the input order, matching by var_names. + result = [new_cubes.extract_cube(cube.var_name) for cube in cubes] + + # Check that input + output match cube-for-cube. + # NB in this codeblock, before we destroy the temporary file. + for cube_in, cube_out in zip(cubes, result): + # Using special tolerant equivalence-check. + self.assertSameCubes(cube_in, cube_out) + + finally: + rmtree(tempdir) + + # Return result cubes for any additional checks. + return result + + def assertSameCubes(self, cube1, cube2): + """ + A special tolerant cube compare. + + Ignore any 'Conventions' attributes. + Ignore all var-names. + + """ + + def clean_cube(cube): + cube = cube.copy() # dont modify the original + # Remove any 'Conventions' attributes + cube.attributes.pop("Conventions", None) + # Remove var-names (as original mesh components wouldn't have them) + cube.var_name = None + for coord in cube.coords(): + coord.var_name = None + mesh = cube.mesh + if mesh: + mesh.var_name = None + for component in mesh.coords() + mesh.connectivities(): + component.var_name = None + + return cube + + self.assertEqual(clean_cube(cube1), clean_cube(cube2)) + + def test_dimcoord_varname_collision(self): + cube_2 = Cube([0, 1], var_name="cube_2") + x_dim = DimCoord([0, 1], long_name="dim_x", var_name="dimco_name") + cube_2.add_dim_coord(x_dim, 0) + # First cube has a varname which collides with the dimcoord. + cube_1 = Cube([0, 1], long_name="cube_1", var_name="dimco_name") + # Test save + loadback + reload_1, reload_2 = self._check_save_and_reload([cube_1, cube_2]) + # As re-loaded, the coord will have a different varname. + self.assertEqual(reload_2.coord("dim_x").var_name, "dimco_name_0") + + def test_anonymous_dim_varname_collision(self): + # Second cube is going to name an anonymous dim. + cube_2 = Cube([0, 1], var_name="cube_2") + # First cube has a varname which collides with the dim-name. + cube_1 = Cube([0, 1], long_name="cube_1", var_name="dim0") + # Add a dimcoord to prevent the *first* cube having an anonymous dim. + x_dim = DimCoord([0, 1], long_name="dim_x", var_name="dimco_name") + cube_1.add_dim_coord(x_dim, 0) + # Test save + loadback + self._check_save_and_reload([cube_1, cube_2]) + + def test_bounds_dim_varname_collision(self): + cube_2 = Cube([0, 1], var_name="cube_2") + x_dim = DimCoord([0, 1], long_name="dim_x", var_name="dimco_name") + x_dim.guess_bounds() + cube_2.add_dim_coord(x_dim, 0) + # First cube has a varname which collides with the bounds dimension. + cube_1 = Cube([0], long_name="cube_1", var_name="bnds") + # Test save + loadback + self._check_save_and_reload([cube_1, cube_2]) + + def test_string_dim_varname_collision(self): + cube_2 = Cube([0, 1], var_name="cube_2") + # NOTE: it *should* be possible for a cube with string data to cause + # this collision, but cubes with string data are currently not working. + # See : https://github.com/SciTools/iris/issues/4412 + x_dim = AuxCoord( + ["this", "that"], long_name="dim_x", var_name="string_auxco" + ) + cube_2.add_aux_coord(x_dim, 0) + cube_1 = Cube([0], long_name="cube_1", var_name="string4") + # Test save + loadback + self._check_save_and_reload([cube_1, cube_2]) + + def test_mesh_location_dim_varname_collision(self): + cube_2 = sample_mesh_cube() + cube_2.var_name = "cube_2" # Make it identifiable + cube_1 = Cube([0], long_name="cube_1", var_name="Mesh2d_node") + # Test save + loadback + self._check_save_and_reload([cube_1, cube_2]) + + def test_connectivity_dim_varname_collision(self): + cube_2 = sample_mesh_cube() + cube_2.var_name = "cube_2" # Make it identifiable + cube_1 = Cube([0], long_name="cube_1", var_name="Mesh_2d_face_N_nodes") + # Test save + loadback + self._check_save_and_reload([cube_1, cube_2]) + + if __name__ == "__main__": tests.main()