Skip to content

Commit

Permalink
Fix relative references when store not in cwd (#256)
Browse files Browse the repository at this point in the history
Co-authored-by: Matthew Avaylon <[email protected]>
  • Loading branch information
rly and mavaylon1 authored Jan 16, 2025
1 parent 5b67a1c commit 133523e
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 87 deletions.
3 changes: 0 additions & 3 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ repos:
- id: check-added-large-files
- id: check-json
- id: check-toml
- id: name-tests-test
args: [--pytest-test-first]
- id: check-docstring-first
- repo: https://github.com/psf/black
rev: 24.10.0
hooks:
Expand Down
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
* Made docs site point to latest stable release: https://hdmf-zarr.readthedocs.io/en/stable/ instead of "latest" tag. @rly [#254](https://github.com/hdmf-dev/hdmf-zarr/pull/254)
* Removed GitHub Actions workflow that automatically generated GitHub pre-releases on the "latest" tag on each commit to dev to simplify maintenance. @rly [#254](https://github.com/hdmf-dev/hdmf-zarr/pull/254)
* Clarified docs and updated links to say that only Zarr v2 is currently supported. @rly [#257](https://github.com/hdmf-dev/hdmf-zarr/pull/257)
* Removed `ZarrIO.get_zarr_parent_path` and `ZarrIO.is_zarr_file` methods. @rly [#256](https://github.com/hdmf-dev/hdmf-zarr/pull/256)
* Fixed bug in how links and references are stored in the Zarr file. They are now written as relative paths from the Zarr file, using "." to indicate the current file. This is how hdmf-zarr wrote internal links and references pre-0.10.0. @rly [#256](https://github.com/hdmf-dev/hdmf-zarr/pull/256)

## 0.10.0 (December 18, 2024)

Expand Down
66 changes: 16 additions & 50 deletions src/hdmf_zarr/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -685,7 +685,7 @@ def write_attributes(self, **kwargs):
raise TypeError(str(e) + " type=" + str(type(value)) + " data=" + str(value)) from e
# Case 2: References
elif isinstance(value, (Builder, ReferenceBuilder)):
refs = self._create_ref(value, self.path)
refs = self._create_ref(value, ref_link_source=self.path)
tmp = {"zarr_dtype": "object", "value": refs}
obj.attrs[key] = tmp
# Case 3: Scalar attributes
Expand Down Expand Up @@ -726,55 +726,16 @@ def __get_path(self, builder):
path = "%s%s" % (delim, delim.join(reversed(names)))
return path

@staticmethod
def get_zarr_paths(zarr_object):
"""
For a Zarr object find 1) the path to the main zarr file it is in and 2) the path to the object within the file
:param zarr_object: Object for which we are looking up the path
:type zarr_object: Zarr Group or Array
:return: Tuple of two string with: 1) path of the Zarr file and 2) full path within the zarr file to the object
"""
# In Zarr the path is a combination of the path of the store and the path of the object. So we first need to
# merge those two paths, then remove the path of the file, add the missing leading "/" and then compute the
# directory name to get the path of the parent
fpath = ZarrIO._ZarrIO__get_store_path(zarr_object.store)
fullpath = os.path.normpath(os.path.join(fpath, zarr_object.path)).replace("\\", "/")
# To determine the filepath we now iterate over the path and check if the .zgroup object exists at
# a level, indicating that we are still within the Zarr file. The first level we hit where the parent
# directory does not have a .zgroup means we have found the main file
filepath = fullpath
while os.path.exists(os.path.join(os.path.dirname(filepath), ".zgroup")):
filepath = os.path.dirname(filepath)
# From the fullpath and filepath we can now compute the objectpath within the zarr file as the relative
# path from the filepath to the object
objectpath = "/" + os.path.relpath(fullpath, filepath)
# return the result
return filepath, objectpath

@staticmethod
def get_zarr_parent_path(zarr_object):
"""
Get the location of the parent of a zarr_object within the file
Get the absolute Unix path to the parent of a zarr_object from the root of the Zarr file
:param zarr_object: Object for which we are looking up the path
:type zarr_object: Zarr Group or Array
:return: String with the path
"""
filepath, objectpath = ZarrIO.get_zarr_paths(zarr_object)
parentpath = os.path.dirname(objectpath)
return parentpath

@staticmethod
def is_zarr_file(path):
"""
Check if the given path defines a Zarr file
:param path: Full path to main directory
:return: Bool
"""
if os.path.exists(path):
if os.path.isdir(path):
if os.path.exists(os.path.join(path, ".zgroup")):
return True
return False
parent_path = "/" + os.path.dirname(zarr_object.path).replace("\\", "/")
return parent_path

def __is_ref(self, dtype):
if isinstance(dtype, DtypeSpec):
Expand All @@ -801,12 +762,13 @@ def resolve_ref(self, zarr_ref):
source_file = str(zarr_ref["path"])
else:
source_file = str(zarr_ref["source"])
# Resolve the path relative to the current file

if not self.is_remote():
if isinstance(self.source, str) and self.source.startswith(("s3://")):
source_file = self.source
else:
source_file = os.path.abspath(source_file)
# Join with source_file to resolve the relative path
source_file = os.path.normpath(os.path.join(self.source, source_file))
else:
# get rid of extra "/" and "./" in the path root and source_file
root_path = str(self.path).rstrip("/")
Expand Down Expand Up @@ -895,7 +857,7 @@ def _create_ref(self, ref_object, ref_link_source=None):
str_path = self.path.path
else:
str_path = self.path
rel_source = os.path.relpath(os.path.abspath(ref_link_source), os.path.dirname(os.path.abspath(str_path)))
rel_source = os.path.relpath(os.path.abspath(ref_link_source), os.path.abspath(str_path))

# Return the ZarrReference object
ref = ZarrReference(
Expand Down Expand Up @@ -965,7 +927,7 @@ def write_link(self, **kwargs):

name = builder.name
# Get the reference
zarr_ref = self._create_ref(builder, ref_link_source)
zarr_ref = self._create_ref(builder, ref_link_source=ref_link_source)

self.__add_link__(parent, zarr_ref.source, zarr_ref.path, name)
self._written_builders.set_written(builder) # record that the builder has been written
Expand Down Expand Up @@ -1078,9 +1040,13 @@ def write_dataset(self, **kwargs): # noqa: C901
if isinstance(data, Array):
# copy the dataset
data_filename = self.__get_store_path(data.store)
str_path = self.path
if not isinstance(str_path, str): # a store
str_path = self.path.path
rel_data_filename = os.path.relpath(os.path.abspath(data_filename), os.path.abspath(str_path))
if link_data:
if export_source is None: # not exporting
self.__add_link__(parent, data_filename, data.name, name)
self.__add_link__(parent, rel_data_filename, data.name, name)
linked = True
dset = None
else: # exporting
Expand All @@ -1089,7 +1055,7 @@ def write_dataset(self, **kwargs): # noqa: C901
# I have three files, FileA, FileB, FileC. I want to export FileA to FileB. FileA has an
# EXTERNAL link to a dataset in Filec. This case preserves the link to FileC to also be in FileB.
if data_filename != export_source:
self.__add_link__(parent, data_filename, data.name, name)
self.__add_link__(parent, rel_data_filename, data.name, name)
linked = True
dset = None
# Case 2: If the dataset is in the export source and has a DIFFERENT path as the builder,
Expand All @@ -1098,7 +1064,7 @@ def write_dataset(self, **kwargs): # noqa: C901
# INTERNAL link. This case preserves the link to also be in FileB.
###############
elif parent.name != data_parent:
self.__add_link__(parent, self.path, data.name, name)
self.__add_link__(parent, ".", data.name, name)
linked = True
dset = None

Expand Down
54 changes: 25 additions & 29 deletions tests/unit/base_tests_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,7 +345,7 @@ def test_write_references_roundtrip(self):
baz_name = "baz%d" % i
expected_container = read_container.bazs[baz_name]
expected_value = {
"source": "test_io.zarr",
"source": ".",
"path": "/bazs/" + baz_name,
"object_id": expected_container.object_id,
"source_object_id": read_container.object_id,
Expand Down Expand Up @@ -664,7 +664,7 @@ def test_write_attributes_write_reference_to_datasetbuilder(self):
"attr1": {
"zarr_dtype": "object",
"value": {
"source": "test_io.zarr",
"source": ".",
"path": "/dataset_1",
"object_id": None,
"source_object_id": None,
Expand All @@ -689,7 +689,7 @@ def test_write_attributes_write_reference_to_referencebuilder(self):
"attr1": {
"zarr_dtype": "object",
"value": {
"source": "test_io.zarr",
"source": ".",
"path": "/dataset_1",
"source_object_id": None,
"object_id": None,
Expand Down Expand Up @@ -917,7 +917,7 @@ def test_link_zarr_dataset_input(self):
expected_link = {
"name": "test_softlink",
"path": "/test_dataset",
"source": os.path.abspath(self.store_path),
"source": ".",
}
self.assertEqual(len(tempf.attrs["zarr_link"]), 1)
self.assertDictEqual(tempf.attrs["zarr_link"][0], expected_link)
Expand Down Expand Up @@ -958,7 +958,7 @@ def test_link_dataset_zarrdataio_input(self):
expected_link = {
"name": "test_softlink",
"path": "/test_dataset",
"source": os.path.abspath(self.store_path),
"source": ".",
}
self.assertEqual(len(tempf.attrs["zarr_link"]), 1)
self.assertDictEqual(tempf.attrs["zarr_link"][0], expected_link)
Expand Down Expand Up @@ -1029,11 +1029,11 @@ def test_basic(self):
export_io.export(src_io=read_io)

self.assertTrue(os.path.exists(self.store_path[1]))
self.assertEqual(os.path.relpath(foofile.container_source), self.store_path[0])
self.assertEqual(foofile.container_source, os.path.abspath(self.store_path[0]))

with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io:
read_foofile = read_io.read()
self.assertEqual(os.path.relpath(read_foofile.container_source), self.store_path[1])
self.assertEqual(read_foofile.container_source, os.path.abspath(self.store_path[1]))
self.assertContainerEqual(foofile, read_foofile, ignore_hdmf_attrs=True)

def test_basic_container(self):
Expand All @@ -1051,11 +1051,11 @@ def test_basic_container(self):
export_io.export(src_io=read_io, container=read_foofile)

self.assertTrue(os.path.exists(self.store_path[1]))
self.assertEqual(os.path.relpath(foofile.container_source), self.store_path[0])
self.assertEqual(foofile.container_source, os.path.abspath(self.store_path[0]))

with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io:
read_foofile = read_io.read()
self.assertEqual(os.path.relpath(read_foofile.container_source), self.store_path[1])
self.assertEqual(read_foofile.container_source, os.path.abspath(self.store_path[1]))
self.assertContainerEqual(foofile, read_foofile, ignore_hdmf_attrs=True)

def test_container_part(self):
Expand Down Expand Up @@ -1145,7 +1145,7 @@ def test_soft_link_group(self):
with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io:
read_foofile2 = read_io.read()
if isinstance(self.store_path[1], str):
self.assertEqual(os.path.relpath(read_foofile2.foo_link.container_source), self.store_path[1])
self.assertEqual(read_foofile2.foo_link.container_source, os.path.abspath(self.store_path[1]))
else:
self.assertEqual(read_foofile2.foo_link.container_source, self.store_path[1].path)

Expand Down Expand Up @@ -1175,7 +1175,7 @@ def test_external_link_group(self):
read_foofile2 = read_io.read()
# make sure the linked group is read from the first file
if isinstance(self.store_path[0], str):
self.assertEqual(os.path.relpath(read_foofile2.foo_link.container_source), self.store_path[0])
self.assertEqual(read_foofile2.foo_link.container_source, os.path.abspath(self.store_path[0]))
else:
self.assertEqual(read_foofile2.foo_link.container_source, self.store_path[0].path)

Expand Down Expand Up @@ -1232,7 +1232,7 @@ def test_external_link_dataset(self):
with ZarrIO(self.store_path[2], manager=get_foo_buildmanager(), mode="r") as read_io:
read_foofile2 = read_io.read()
# make sure the linked dataset is read from the first file
self.assertEqual(os.path.relpath(read_foofile2.foofile_data.store.store.path), self.store_path[0])
self.assertEqual(read_foofile2.foofile_data.store.store.path, os.path.abspath(self.store_path[0]))

def test_external_link_link(self):
"""Test that exporting a written file with external links to external links maintains the links."""
Expand Down Expand Up @@ -1281,23 +1281,19 @@ def test_attr_reference(self):
export_io.export(src_io=read_io)
with ZarrIO(self.store_path[1], manager=get_foo_buildmanager(), mode="r") as read_io:
read_foofile2 = read_io.read()
expected = ZarrIO.get_zarr_paths(read_foofile2.foo_ref_attr.my_data)
if isinstance(self.store_path[1], str):
self.assertTupleEqual(
(os.path.normpath(expected[0]), os.path.normpath(expected[1])),
(
os.path.normpath(os.path.abspath(self.store_path[1])),
os.path.normpath("/buckets/bucket1/foo_holder/foo1/my_data"),
),
)
else:
self.assertTupleEqual(
(os.path.normpath(expected[0]), os.path.normpath(expected[1])),
(
os.path.normpath(os.path.abspath(self.store_path[1].path)),
os.path.normpath("/buckets/bucket1/foo_holder/foo1/my_data"),
),
)

# check the raw zarr attribute reference
expected_attr_reference = {
"value": {
"object_id": foo1.object_id,
"path": "/buckets/bucket1/foo_holder/foo1",
"source": ".",
"source_object_id": foofile.object_id,
},
"zarr_dtype": "object",
}
self.assertEqual(read_io.file.attrs["foo_ref_attr"], expected_attr_reference)

# make sure the attribute reference resolves to the container within the same file
self.assertIs(read_foofile2.foo_ref_attr, read_foofile2.buckets["bucket1"].foos["foo1"])

Expand Down
50 changes: 45 additions & 5 deletions tests/unit/test_zarrio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,15 @@
need to implement the tests separately for the different backends.
"""

from unittest import TestCase
from tests.unit.base_tests_zarrio import (
BaseTestZarrWriter,
ZarrStoreTestCase,
BaseTestZarrWriteUnit,
BaseTestExportZarrToZarr,
)
from zarr.storage import DirectoryStore, NestedDirectoryStore
from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager
from tests.unit.utils import Baz, BazData, BazBucket, get_baz_buildmanager, get_foo_buildmanager

import zarr
from hdmf_zarr.backend import ZarrIO
Expand All @@ -29,9 +30,6 @@
import pathlib


CUR_DIR = os.path.dirname(os.path.realpath(__file__))


######################################################
# Default storage backend using just a string path
######################################################
Expand Down Expand Up @@ -68,6 +66,48 @@ class TestExportZarrToZarrDefaultStore(BaseTestExportZarrToZarr):
pass


#####################################################################
# Default storage backend using just a string path to a subdirectory
#####################################################################
class TestZarrWriterSubdirectory(BaseTestZarrWriter):
"""Test writing of builder with Zarr using a custom DirectoryStore"""

def setUp(self):
os.makedirs("test_dir")
self.store_path = "test_dir/test_io.zarr"
self.manager = get_foo_buildmanager()

def tearDown(self):
if os.path.exists("test_dir"):
shutil.rmtree("test_dir")


class TestZarrWriteUnitSubdirectory(BaseTestZarrWriteUnit):
"""Unit test for individual write functions using a custom DirectoryStore"""

def setUp(self):
os.makedirs("test_dir")
self.store_path = "test_dir/test_io.zarr"
self.manager = get_foo_buildmanager()

def tearDown(self):
if os.path.exists("test_dir"):
shutil.rmtree("test_dir")


class TestExportZarrToZarrSubdirectory(BaseTestExportZarrToZarr):
"""Test exporting Zarr to Zarr using DirectoryStore"""

def setUp(self):
os.makedirs("test_dir")
self.store_path = [os.path.join("test_dir", f"file{i}.zarr") for i in range(3)]
self.manager = get_foo_buildmanager()

def tearDown(self):
if os.path.exists("test_dir"):
shutil.rmtree("test_dir")


#########################################
# DirectoryStore tests
#########################################
Expand Down Expand Up @@ -249,7 +289,7 @@ def test_build(self):
self.assertEqual(file.bar_datas[0].data.attrs["_ARRAY_DIMENSIONS"], ["a", "b"])


class TestDatasetofReferences(ZarrStoreTestCase):
class TestDatasetOfReferences(TestCase):
def setUp(self):
self.store_path = "test_io.zarr"
self.store = DirectoryStore(self.store_path)
Expand Down

0 comments on commit 133523e

Please sign in to comment.