diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d1fb0d7f..98eb93ac 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,9 +8,6 @@ repos: - id: check-added-large-files - id: check-json - id: check-toml - - id: name-tests-test - args: [--pytest-test-first] - - id: check-docstring-first - repo: https://github.com/psf/black rev: 24.10.0 hooks: diff --git a/CHANGELOG.md b/CHANGELOG.md index 58a101e6..e931b8ce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,8 @@ * Made docs site point to latest stable release: https://hdmf-zarr.readthedocs.io/en/stable/ instead of "latest" tag. @rly [#254](https://github.com/hdmf-dev/hdmf-zarr/pull/254) * Removed GitHub Actions workflow that automatically generated GitHub pre-releases on the "latest" tag on each commit to dev to simplify maintenance. @rly [#254](https://github.com/hdmf-dev/hdmf-zarr/pull/254) * Clarified docs and updated links to say that only Zarr v2 is currently supported. @rly [#257](https://github.com/hdmf-dev/hdmf-zarr/pull/257) +* Removed `ZarrIO.get_zarr_parent_path` and `ZarrIO.is_zarr_file` methods. @rly [#256](https://github.com/hdmf-dev/hdmf-zarr/pull/256) +* Fixed bug in how links and references are stored in the Zarr file. They are now written as relative paths from the Zarr file, using "." to indicate the current file. This is how hdmf-zarr wrote internal links and references pre-0.10.0. @rly [#256](https://github.com/hdmf-dev/hdmf-zarr/pull/256) ## 0.10.0 (December 18, 2024) diff --git a/src/hdmf_zarr/backend.py b/src/hdmf_zarr/backend.py index b6394488..eddd3bcf 100644 --- a/src/hdmf_zarr/backend.py +++ b/src/hdmf_zarr/backend.py @@ -685,7 +685,7 @@ def write_attributes(self, **kwargs): raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e # Case 2: References elif isinstance(value, (Builder, ReferenceBuilder)): - refs = self._create_ref(value, self.path) + refs = self._create_ref(value, ref_link_source=self.path) tmp = {"zarr_dtype": "object", "value": refs} obj.attrs[key] = tmp # Case 3: Scalar attributes @@ -726,55 +726,16 @@ def __get_path(self, builder): path = "%s%s" % (delim, delim.join(reversed(names))) return path - @staticmethod - def get_zarr_paths(zarr_object): - """ - For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file - :param zarr_object: Object for which we are looking up the path - :type zarr_object: Zarr Group or Array - :return: Tuple of two string with: 1) path of the Zarr file and 2) full path within the zarr file to the object - """ - # In Zarr the path is a combination of the path of the store and the path of the object. So we first need to - # merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the - # directory name to get the path of the parent - fpath = ZarrIO._ZarrIO__get_store_path(zarr_object.store) - fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/") - # To determine the filepath we now iterate over the path and check if the .zgroup object exists at - # a level, indicating that we are still within the Zarr file. The first level we hit where the parent - # directory does not have a .zgroup means we have found the main file - filepath = fullpath - while os.path.exists(os.path.join(os.path.dirname(filepath), ".zgroup")): - filepath = os.path.dirname(filepath) - # From the fullpath and filepath we can now compute the objectpath within the zarr file as the relative - # path from the filepath to the object - objectpath = "/" + os.path.relpath(fullpath, filepath) - # return the result - return filepath, objectpath - @staticmethod def get_zarr_parent_path(zarr_object): """ - Get the location of the parent of a zarr_object within the file + Get the absolute Unix path to the parent of a zarr_object from the root of the Zarr file :param zarr_object: Object for which we are looking up the path :type zarr_object: Zarr Group or Array :return: String with the path """ - filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object) - parentpath = os.path.dirname(objectpath) - return parentpath - - @staticmethod - def is_zarr_file(path): - """ - Check if the given path defines a Zarr file - :param path: Full path to main directory - :return: Bool - """ - if os.path.exists(path): - if os.path.isdir(path): - if os.path.exists(os.path.join(path, ".zgroup")): - return True - return False + parent_path = "/" + os.path.dirname(zarr_object.path).replace("\\", "/") + return parent_path def __is_ref(self, dtype): if isinstance(dtype, DtypeSpec): @@ -801,12 +762,13 @@ def resolve_ref(self, zarr_ref): source_file = str(zarr_ref["path"]) else: source_file = str(zarr_ref["source"]) - # Resolve the path relative to the current file + if not self.is_remote(): if isinstance(self.source, str) and self.source.startswith(("s3://")): source_file = self.source else: - source_file = os.path.abspath(source_file) + # Join with source_file to resolve the relative path + source_file = os.path.normpath(os.path.join(self.source, source_file)) else: # get rid of extra "/" and "./" in the path root and source_file root_path = str(self.path).rstrip("/") @@ -895,7 +857,7 @@ def _create_ref(self, ref_object, ref_link_source=None): str_path = self.path.path else: str_path = self.path - rel_source = os.path.relpath(os.path.abspath(ref_link_source), os.path.dirname(os.path.abspath(str_path))) + rel_source = os.path.relpath(os.path.abspath(ref_link_source), os.path.abspath(str_path)) # Return the ZarrReference object ref = ZarrReference( @@ -965,7 +927,7 @@ def write_link(self, **kwargs): name = builder.name # Get the reference - zarr_ref = self._create_ref(builder, ref_link_source) + zarr_ref = self._create_ref(builder, ref_link_source=ref_link_source) self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name) self._written_builders.set_written(builder) # record that the builder has been written @@ -1078,9 +1040,13 @@ def write_dataset(self, **kwargs): # noqa: C901 if isinstance(data, Array): # copy the dataset data_filename = self.__get_store_path(data.store) + str_path = self.path + if not isinstance(str_path, str): # a store + str_path = self.path.path + rel_data_filename = os.path.relpath(os.path.abspath(data_filename), os.path.abspath(str_path)) if link_data: if export_source is None: # not exporting - self.__add_link__(parent, data_filename, data.name, name) + self.__add_link__(parent, rel_data_filename, data.name, name) linked = True dset = None else: # exporting @@ -1089,7 +1055,7 @@ def write_dataset(self, **kwargs): # noqa: C901 # I have three files, FileA, FileB, FileC. I want to export FileA to FileB. FileA has an # EXTERNAL link to a dataset in Filec. This case preserves the link to FileC to also be in FileB. if data_filename != export_source: - self.__add_link__(parent, data_filename, data.name, name) + self.__add_link__(parent, rel_data_filename, data.name, name) linked = True dset = None # Case 2: If the dataset is in the export source and has a DIFFERENT path as the builder, @@ -1098,7 +1064,7 @@ def write_dataset(self, **kwargs): # noqa: C901 # INTERNAL link. This case preserves the link to also be in FileB. ############### elif parent.name != data_parent: - self.__add_link__(parent, self.path, data.name, name) + self.__add_link__(parent, ".", data.name, name) linked = True dset = None diff --git a/tests/unit/base_tests_zarrio.py b/tests/unit/base_tests_zarrio.py index c30cf482..a4287105 100644 --- a/tests/unit/base_tests_zarrio.py +++ b/tests/unit/base_tests_zarrio.py @@ -345,7 +345,7 @@ def test_write_references_roundtrip(self): baz_name = "baz%d" % i expected_container = read_container.bazs[baz_name] expected_value = { - "source": "test_io.zarr", + "source": ".", "path": "/bazs/" + baz_name, "object_id": expected_container.object_id, "source_object_id": read_container.object_id, @@ -664,7 +664,7 @@ def test_write_attributes_write_reference_to_datasetbuilder(self): "attr1": { "zarr_dtype": "object", "value": { - "source": "test_io.zarr", + "source": ".", "path": "/dataset_1", "object_id": None, "source_object_id": None, @@ -689,7 +689,7 @@ def test_write_attributes_write_reference_to_referencebuilder(self): "attr1": { "zarr_dtype": "object", "value": { - "source": "test_io.zarr", + "source": ".", "path": "/dataset_1", "source_object_id": None, "object_id": None, @@ -917,7 +917,7 @@ def test_link_zarr_dataset_input(self): expected_link = { "name": "test_softlink", "path": "/test_dataset", - "source": os.path.abspath(self.store_path), + "source": ".", } self.assertEqual(len(tempf.attrs["zarr_link"]), 1) self.assertDictEqual(tempf.attrs["zarr_link"][0], expected_link) @@ -958,7 +958,7 @@ def test_link_dataset_zarrdataio_input(self): expected_link = { "name": "test_softlink", "path": "/test_dataset", - "source": os.path.abspath(self.store_path), + "source": ".", } self.assertEqual(len(tempf.attrs["zarr_link"]), 1) self.assertDictEqual(tempf.attrs["zarr_link"][0], expected_link) @@ -1029,11 +1029,11 @@ def test_basic(self): export_io.export(src_io=read_io) self.assertTrue(os.path.exists(self.store_path[1])) - self.assertEqual(os.path.relpath(foofile.container_source), self.store_path[0]) + self.assertEqual(foofile.container_source, os.path.abspath(self.store_path[0])) with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io: read_foofile = read_io.read() - self.assertEqual(os.path.relpath(read_foofile.container_source), self.store_path[1]) + self.assertEqual(read_foofile.container_source, os.path.abspath(self.store_path[1])) self.assertContainerEqual(foofile, read_foofile, ignore_hdmf_attrs=True) def test_basic_container(self): @@ -1051,11 +1051,11 @@ def test_basic_container(self): export_io.export(src_io=read_io, container=read_foofile) self.assertTrue(os.path.exists(self.store_path[1])) - self.assertEqual(os.path.relpath(foofile.container_source), self.store_path[0]) + self.assertEqual(foofile.container_source, os.path.abspath(self.store_path[0])) with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io: read_foofile = read_io.read() - self.assertEqual(os.path.relpath(read_foofile.container_source), self.store_path[1]) + self.assertEqual(read_foofile.container_source, os.path.abspath(self.store_path[1])) self.assertContainerEqual(foofile, read_foofile, ignore_hdmf_attrs=True) def test_container_part(self): @@ -1145,7 +1145,7 @@ def test_soft_link_group(self): with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io: read_foofile2 = read_io.read() if isinstance(self.store_path[1], str): - self.assertEqual(os.path.relpath(read_foofile2.foo_link.container_source), self.store_path[1]) + self.assertEqual(read_foofile2.foo_link.container_source, os.path.abspath(self.store_path[1])) else: self.assertEqual(read_foofile2.foo_link.container_source, self.store_path[1].path) @@ -1175,7 +1175,7 @@ def test_external_link_group(self): read_foofile2 = read_io.read() # make sure the linked group is read from the first file if isinstance(self.store_path[0], str): - self.assertEqual(os.path.relpath(read_foofile2.foo_link.container_source), self.store_path[0]) + self.assertEqual(read_foofile2.foo_link.container_source, os.path.abspath(self.store_path[0])) else: self.assertEqual(read_foofile2.foo_link.container_source, self.store_path[0].path) @@ -1232,7 +1232,7 @@ def test_external_link_dataset(self): with ZarrIO(self.store_path[2], manager=get_foo_buildmanager(), mode="r") as read_io: read_foofile2 = read_io.read() # make sure the linked dataset is read from the first file - self.assertEqual(os.path.relpath(read_foofile2.foofile_data.store.store.path), self.store_path[0]) + self.assertEqual(read_foofile2.foofile_data.store.store.path, os.path.abspath(self.store_path[0])) def test_external_link_link(self): """Test that exporting a written file with external links to external links maintains the links.""" @@ -1281,23 +1281,19 @@ def test_attr_reference(self): export_io.export(src_io=read_io) with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io: read_foofile2 = read_io.read() - expected = ZarrIO.get_zarr_paths(read_foofile2.foo_ref_attr.my_data) - if isinstance(self.store_path[1], str): - self.assertTupleEqual( - (os.path.normpath(expected[0]), os.path.normpath(expected[1])), - ( - os.path.normpath(os.path.abspath(self.store_path[1])), - os.path.normpath("/buckets/bucket1/foo_holder/foo1/my_data"), - ), - ) - else: - self.assertTupleEqual( - (os.path.normpath(expected[0]), os.path.normpath(expected[1])), - ( - os.path.normpath(os.path.abspath(self.store_path[1].path)), - os.path.normpath("/buckets/bucket1/foo_holder/foo1/my_data"), - ), - ) + + # check the raw zarr attribute reference + expected_attr_reference = { + "value": { + "object_id": foo1.object_id, + "path": "/buckets/bucket1/foo_holder/foo1", + "source": ".", + "source_object_id": foofile.object_id, + }, + "zarr_dtype": "object", + } + self.assertEqual(read_io.file.attrs["foo_ref_attr"], expected_attr_reference) + # make sure the attribute reference resolves to the container within the same file self.assertIs(read_foofile2.foo_ref_attr, read_foofile2.buckets["bucket1"].foos["foo1"]) diff --git a/tests/unit/test_zarrio.py b/tests/unit/test_zarrio.py index 0dfa2e35..7141b3d4 100644 --- a/tests/unit/test_zarrio.py +++ b/tests/unit/test_zarrio.py @@ -10,6 +10,7 @@ need to implement the tests separately for the different backends. """ +from unittest import TestCase from tests.unit.base_tests_zarrio import ( BaseTestZarrWriter, ZarrStoreTestCase, @@ -17,7 +18,7 @@ BaseTestExportZarrToZarr, ) from zarr.storage import DirectoryStore, NestedDirectoryStore -from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager +from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager, get_foo_buildmanager import zarr from hdmf_zarr.backend import ZarrIO @@ -29,9 +30,6 @@ import pathlib -CUR_DIR = os.path.dirname(os.path.realpath(__file__)) - - ###################################################### # Default storage backend using just a string path ###################################################### @@ -68,6 +66,48 @@ class TestExportZarrToZarrDefaultStore(BaseTestExportZarrToZarr): pass +##################################################################### +# Default storage backend using just a string path to a subdirectory +##################################################################### +class TestZarrWriterSubdirectory(BaseTestZarrWriter): + """Test writing of builder with Zarr using a custom DirectoryStore""" + + def setUp(self): + os.makedirs("test_dir") + self.store_path = "test_dir/test_io.zarr" + self.manager = get_foo_buildmanager() + + def tearDown(self): + if os.path.exists("test_dir"): + shutil.rmtree("test_dir") + + +class TestZarrWriteUnitSubdirectory(BaseTestZarrWriteUnit): + """Unit test for individual write functions using a custom DirectoryStore""" + + def setUp(self): + os.makedirs("test_dir") + self.store_path = "test_dir/test_io.zarr" + self.manager = get_foo_buildmanager() + + def tearDown(self): + if os.path.exists("test_dir"): + shutil.rmtree("test_dir") + + +class TestExportZarrToZarrSubdirectory(BaseTestExportZarrToZarr): + """Test exporting Zarr to Zarr using DirectoryStore""" + + def setUp(self): + os.makedirs("test_dir") + self.store_path = [os.path.join("test_dir", f"file{i}.zarr") for i in range(3)] + self.manager = get_foo_buildmanager() + + def tearDown(self): + if os.path.exists("test_dir"): + shutil.rmtree("test_dir") + + ######################################### # DirectoryStore tests ######################################### @@ -249,7 +289,7 @@ def test_build(self): self.assertEqual(file.bar_datas[0].data.attrs["_ARRAY_DIMENSIONS"], ["a", "b"]) -class TestDatasetofReferences(ZarrStoreTestCase): +class TestDatasetOfReferences(TestCase): def setUp(self): self.store_path = "test_io.zarr" self.store = DirectoryStore(self.store_path)