From 45117ef7ab69ab4d86e931d3298aa1b10d834479 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 01/35] added code to handle "locally-persistent-ids" --- pydra/utils/hash.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0c5f5f870..1d62490a9 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -90,10 +90,23 @@ def hash_single(obj: object, cache: Cache) -> Hash: # Handle recursion by putting a dummy value in the cache cache[objid] = Hash(b"\x00") h = blake2b(digest_size=16, person=b"pydra-hash") - for chunk in bytes_repr(obj, cache): + bytes_it = bytes_repr(obj, cache) + first = next(bytes_it) + if isinstance(first, str): + cache_id = first + try: + return cache[cache_id] + except KeyError: + pass + else: + h.update(first) + cache_id = None + for chunk in bytes_it: h.update(chunk) hsh = cache[objid] = Hash(h.digest()) logger.debug("Hash of %s object is %s", obj, hsh) + if cache_id is not None: + cache[cache_id] = hsh return cache[objid] From 2b7ca50f2b45492d01572b7a49eee9a688927619 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 02/35] implemented persistent hash cache to avoid rehashing files --- pydra/utils/hash.py | 155 ++++++++++++++++++++++++++++----- pydra/utils/tests/test_hash.py | 24 +++++ 2 files changed, 157 insertions(+), 22 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 1d62490a9..8a5f1045b 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -4,13 +4,13 @@ # import stat import struct +import tempfile import typing as ty +from pathlib import Path from collections.abc import Mapping from functools import singledispatch from hashlib import blake2b import logging - -# from pathlib import Path from typing import ( Dict, Iterator, @@ -18,7 +18,9 @@ Sequence, Set, ) +from filelock import SoftFileLock import attrs.exceptions +from 
fileformats.core import FileSet logger = logging.getLogger("pydra") @@ -52,19 +54,109 @@ ) Hash = NewType("Hash", bytes) -Cache = NewType("Cache", Dict[int, Hash]) +CacheKey = NewType("CacheKey", ty.Tuple[ty.Hashable, ty.Hashable]) + + +@attrs.define +class PersistentCache: + """Persistent cache in which to store computationally expensive hashes between nodes + and workflow/task runs + + Parameters + ---------- + location: Path + the directory in which to store the hashes cache + """ + + location: Path = attrs.field() + _hashes: ty.Dict[CacheKey, Hash] = attrs.field(factory=dict) + + @location.validator + def location_validator(self, _, location): + if not os.path.isdir(location): + raise ValueError( + f"Persistent cache location '{location}' is not a directory" + ) + + def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: + """Check whether key is present in the persistent cache store and return it if so. + Otherwise use `calculate_hash` to generate the hash and save it in the persistent + store. + + Parameters + ---------- + key : CacheKey + locally unique key (e.g. 
to the host) used to lookup the corresponding hash + in the persistent store + calculate_hash : ty.Callable + function to calculate the hash if it isn't present in the persistent store + + Returns + ------- + Hash + _description_ + """ + try: + return self._hashes[key] + except KeyError: + pass + key_path = self.location / blake2b(str(key).encode()).hexdigest() + with SoftFileLock(key_path.with_suffix(".lock")): + if key_path.exists(): + return Hash(key_path.read_bytes()) + hsh = calculate_hash() + key_path.write_bytes(hsh) + return Hash(hsh) + + +def persistent_cache_converter( + path: ty.Union[Path, str, PersistentCache, None] +) -> PersistentCache: + if isinstance(path, PersistentCache): + return path + if path is None: + path = tempfile.mkdtemp() + return PersistentCache(Path(path)) + + +@attrs.define +class Cache: + persistent: ty.Optional[PersistentCache] = attrs.field( + default=None, + converter=persistent_cache_converter, + ) + _hashes: ty.Dict[int, Hash] = attrs.field(factory=dict) + + def __getitem__(self, object_id: int) -> Hash: + return self._hashes[object_id] + + def __setitem__(self, object_id: int, hsh: Hash): + self._hashes[object_id] = hsh + + def __contains__(self, object_id): + return object_id in self._hashes class UnhashableError(ValueError): """Error for objects that cannot be hashed""" -def hash_function(obj): +def hash_function(obj, persistent_cache: ty.Optional[Path] = None): """Generate hash of object.""" - return hash_object(obj).hex() + if persistent_cache is None: + # FIXME: Ideally the default location would be inside one of the cache_locations + # but can't think of a clean way to pass the cache_locations down to this part + # of the code, so just dumping in the home directory instead + persistent_cache = (Path("~") / ".pydra-hash-cache").expanduser() + try: + if not persistent_cache.exists(): + persistent_cache.mkdir() + except Exception: + persistent_cache = None + return hash_object(obj, persistent_cache=persistent_cache).hex() 
-def hash_object(obj: object) -> Hash: +def hash_object(obj: object, persistent_cache: ty.Optional[Path] = None) -> Hash: """Hash an object Constructs a byte string that uniquely identifies the object, @@ -74,9 +166,9 @@ def hash_object(obj: object) -> Hash: dicts. Custom types can be registered with :func:`register_serializer`. """ try: - return hash_single(obj, Cache({})) + return hash_single(obj, Cache(persistent=persistent_cache)) except Exception as e: - raise UnhashableError(f"Cannot hash object {obj!r}") from e + raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: @@ -89,24 +181,31 @@ def hash_single(obj: object, cache: Cache) -> Hash: if objid not in cache: # Handle recursion by putting a dummy value in the cache cache[objid] = Hash(b"\x00") - h = blake2b(digest_size=16, person=b"pydra-hash") bytes_it = bytes_repr(obj, cache) + # Pop first element from the bytes_repr iterator and check whether it is a + # "local cache key" (e.g. 
file-system path + mtime tuple) or the first bytes + # chunk + + def calc_hash(first: ty.Optional[bytes] = None) -> Hash: + h = blake2b(digest_size=16, person=b"pydra-hash") + if first is not None: + h.update(first) + for chunk in bytes_it: # NB: bytes_it is in outer scope + h.update(chunk) + return Hash(h.digest()) + first = next(bytes_it) - if isinstance(first, str): - cache_id = first - try: - return cache[cache_id] - except KeyError: - pass + if isinstance(first, tuple): + tp = type(obj) + key = ( + tp.__module__, + tp.__name__, + ) + first + hsh = cache.persistent.get_hash(key, calc_hash) else: - h.update(first) - cache_id = None - for chunk in bytes_it: - h.update(chunk) - hsh = cache[objid] = Hash(h.digest()) + hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) - if cache_id is not None: - cache[cache_id] = hsh + cache[objid] = hsh return cache[objid] @@ -271,6 +370,18 @@ def type_name(tp): yield b")" +@register_serializer(FileSet) +def bytes_repr_fileset( + fileset: FileSet, cache: Cache +) -> Iterator[ty.Union[CacheKey, bytes]]: + fspaths = sorted(fileset.fspaths) + yield CacheKey( + tuple(repr(p) for p in fspaths) # type: ignore[arg-type] + + tuple(p.lstat().st_mtime for p in fspaths) + ) + yield from fileset.__bytes_repr__(cache) + + @register_serializer(list) @register_serializer(tuple) def bytes_repr_seq(obj: Sequence, cache: Cache) -> Iterator[bytes]: diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 8da055e11..c151b27ce 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -6,6 +6,7 @@ import pytest import typing as ty from fileformats.application import Zip, Json +from fileformats.text import TextFile from ..hash import Cache, UnhashableError, bytes_repr, hash_object, register_serializer @@ -296,3 +297,26 @@ def _(obj: MyClass, cache: Cache): register_serializer(MyNewClass, _) assert join_bytes_repr(MyNewClass(1)) == b"serializer" + + +def 
test_persistent_hash_cache(tmp_path): + cache_path = tmp_path / "hash-cache" + cache_path.mkdir() + text_file_path = tmp_path / "text-file.txt" + text_file_path.write_text("foo") + text_file = TextFile(text_file_path) + + # Test hash is stable between calls + hsh = hash_object(text_file, persistent_cache=cache_path) + assert hsh == hash_object(text_file, persistent_cache=cache_path) + + # Test that cached hash has been used + cache_files = list(cache_path.iterdir()) + assert len(cache_files) == 1 + modified_hash = "modified".encode() + cache_files[0].write_bytes(modified_hash) + assert hash_object(text_file, persistent_cache=cache_path) == modified_hash + + # Test that changes to the text file result in new hash + text_file_path.write_text("bar") + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash From 04b95ff189edf1046961eb8a6ddba331a7aa7a3a Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 03/35] touched up persistent_hash_cache test --- pydra/utils/tests/test_hash.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index c151b27ce..4a7209d78 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -320,3 +320,4 @@ def test_persistent_hash_cache(tmp_path): # Test that changes to the text file result in new hash text_file_path.write_text("bar") assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + assert len(list(cache_path.iterdir())) == 2 From 0c865f4ed6883137ab8535f3634ac7b8c3f8a578 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 04/35] replaced Cache({}) with Cache() to match new proper class --- pydra/utils/hash.py | 6 +++--- pydra/utils/tests/test_hash.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 8a5f1045b..c1c262152 100644 --- a/pydra/utils/hash.py +++ 
b/pydra/utils/hash.py @@ -190,7 +190,7 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h = blake2b(digest_size=16, person=b"pydra-hash") if first is not None: h.update(first) - for chunk in bytes_it: # NB: bytes_it is in outer scope + for chunk in bytes_it: # Note that `bytes_it` is in outer scope h.update(chunk) return Hash(h.digest()) @@ -406,7 +406,7 @@ def bytes_repr_mapping_contents(mapping: Mapping, cache: Cache) -> Iterator[byte .. code-block:: python >>> from pydra.utils.hash import bytes_repr_mapping_contents, Cache - >>> generator = bytes_repr_mapping_contents({"a": 1, "b": 2}, Cache({})) + >>> generator = bytes_repr_mapping_contents({"a": 1, "b": 2}, Cache()) >>> b''.join(generator) b'str:1:a=...str:1:b=...' """ @@ -424,7 +424,7 @@ def bytes_repr_sequence_contents(seq: Sequence, cache: Cache) -> Iterator[bytes] .. code-block:: python >>> from pydra.utils.hash import bytes_repr_sequence_contents, Cache - >>> generator = bytes_repr_sequence_contents([1, 2], Cache({})) + >>> generator = bytes_repr_sequence_contents([1, 2], Cache()) >>> list(generator) [b'\x6d...', b'\xa3...'] """ diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 4a7209d78..fabbaa550 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -16,7 +16,7 @@ def hasher(): def join_bytes_repr(obj): - return b"".join(bytes_repr(obj, Cache({}))) + return b"".join(bytes_repr(obj, Cache())) def test_bytes_repr_builtins(): From 3b3fdb7933fa3578676fa0cc8a0f5af536c61b9e Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 05/35] upped resolution of mtime to nanoseconds --- pydra/utils/hash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index c1c262152..650bd6f86 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -377,7 +377,7 @@ def bytes_repr_fileset( fspaths = sorted(fileset.fspaths) yield CacheKey( tuple(repr(p) 
for p in fspaths) # type: ignore[arg-type] - + tuple(p.lstat().st_mtime for p in fspaths) + + tuple(p.lstat().st_mtime_ns for p in fspaths) ) yield from fileset.__bytes_repr__(cache) From 81a5108e8eead74a8bcca539f334c743f0cd3e9a Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 06/35] added sleep to various tests to ensure file mtimes are different --- pydra/engine/tests/test_node_task.py | 2 ++ pydra/engine/tests/test_specs.py | 2 ++ pydra/utils/tests/test_hash.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 4e182781b..83cd91bd7 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -3,6 +3,7 @@ import attr import numpy as np import pytest +import time from .utils import ( fun_addtwo, @@ -320,6 +321,7 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file + time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" with open(file2, "w") as f: f.write("from pydra") diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 0370d92a5..7a7cac4df 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,6 +3,7 @@ import os import attrs from copy import deepcopy +import time from ..specs import ( BaseSpec, @@ -288,6 +289,7 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index fabbaa550..af8bb27e7 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -1,6 +1,7 @@ import re from hashlib import blake2b from pathlib import Path +import time import 
attrs import pytest @@ -318,6 +319,7 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash + time.sleep(2) # Need to ensure that the mtimes will be different text_file_path.write_text("bar") assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 0c4b1799541108f1d6fdabc9bc5d47843bcbc72b Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 07/35] added more sleeps to ensure mtimes of input files are different in tests --- pydra/engine/tests/test_specs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 7a7cac4df..e73688594 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -164,6 +164,7 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") @@ -194,6 +195,7 @@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") @@ -235,6 +237,7 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file + time.sleep(2) # ensure mtime is different with open(file, "w") as f: f.write("hello") @@ -326,6 +329,7 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with 
open(file_diffcontent, "w") as f: f.write("hi") From 615d59075a57972aa8bb67addcb0089e3961ef99 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 08/35] debugged setting hash cache via env var and added clean up of directory --- pydra/engine/submitter.py | 2 ++ pydra/utils/hash.py | 75 +++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py index 3906955b2..f92841c48 100644 --- a/pydra/engine/submitter.py +++ b/pydra/engine/submitter.py @@ -6,6 +6,7 @@ from .workers import WORKERS from .core import is_workflow from .helpers import get_open_loop, load_and_run_async +from ..utils.hash import PersistentCache import logging @@ -43,6 +44,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None self.loop.run_until_complete( self.submit_from_call(runnable, rerun, environment) ) + PersistentCache.clean_up() return runnable.result() async def submit_from_call(self, runnable, rerun, environment): diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 650bd6f86..10f8682fb 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -1,9 +1,8 @@ """Generic object hashing dispatch""" import os - -# import stat import struct +from datetime import datetime import tempfile import typing as ty from pathlib import Path @@ -78,7 +77,7 @@ def location_validator(self, _, location): f"Persistent cache location '{location}' is not a directory" ) - def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: + def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: """Check whether key is present in the persistent cache store and return it if so. Otherwise use `calculate_hash` to generate the hash and save it in the persistent store. 
@@ -108,22 +107,52 @@ def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: key_path.write_bytes(hsh) return Hash(hsh) - -def persistent_cache_converter( - path: ty.Union[Path, str, PersistentCache, None] -) -> PersistentCache: - if isinstance(path, PersistentCache): - return path - if path is None: - path = tempfile.mkdtemp() - return PersistentCache(Path(path)) + @classmethod + def clean_up(cls): + """Cleans up old hash caches that haven't been accessed in the last 30 days""" + now = datetime.now() + for path in cls.DEFAULT_LOCATION.iterdir(): + days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days + if days > cls.CLEAN_UP_PERIOD: + path.unlink() + + @classmethod + def from_path( + cls, path: ty.Union[Path, str, "PersistentCache", None] + ) -> "PersistentCache": + if isinstance(path, PersistentCache): + return path + if path is None: + path = Path(tempfile.mkdtemp()) + else: + path = Path(path) + if not path.exists(): + try: + Path(path).mkdir() + except Exception as e: + raise RuntimeError( + f"Could not create cache directory for file hashes at {path}, " + "please use 'PYDRA_HASH_CACHE' environment variable to control " + "where it is created by default" + ) from e + return PersistentCache(path) + + # FIXME: Ideally the default location would be inside one of the cache_locations + # but can't think of a clean way to pass the cache_locations down to this part + # of the code, so just dumping in the home directory instead by default. 
In any case, + # this needs to be documented + DEFAULT_LOCATION = Path( + os.environ.get("PYDRA_HASH_CACHE", Path("~").expanduser() / ".pydra-hash-cache") + ) + # Set the period after which old hash cache files are cleaned up in days + CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEAN_UP_PERIOD", 30) @attrs.define class Cache: persistent: ty.Optional[PersistentCache] = attrs.field( default=None, - converter=persistent_cache_converter, + converter=PersistentCache.from_path, # type: ignore[misc] ) _hashes: ty.Dict[int, Hash] = attrs.field(factory=dict) @@ -141,22 +170,14 @@ class UnhashableError(ValueError): """Error for objects that cannot be hashed""" -def hash_function(obj, persistent_cache: ty.Optional[Path] = None): +def hash_function(obj, **kwargs): """Generate hash of object.""" - if persistent_cache is None: - # FIXME: Ideally the default location would be inside one of the cache_locations - # but can't think of a clean way to pass the cache_locations down to this part - # of the code, so just dumping in the home directory instead - persistent_cache = (Path("~") / ".pydra-hash-cache").expanduser() - try: - if not persistent_cache.exists(): - persistent_cache.mkdir() - except Exception: - persistent_cache = None - return hash_object(obj, persistent_cache=persistent_cache).hex() + return hash_object(obj, **kwargs).hex() -def hash_object(obj: object, persistent_cache: ty.Optional[Path] = None) -> Hash: +def hash_object( + obj: object, persistent_cache: ty.Optional[Path] = PersistentCache.DEFAULT_LOCATION +) -> Hash: """Hash an object Constructs a byte string that uniquely identifies the object, @@ -201,7 +222,7 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: tp.__module__, tp.__name__, ) + first - hsh = cache.persistent.get_hash(key, calc_hash) + hsh = cache.persistent.get_or_calculate_hash(key, calc_hash) else: hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) From 55b660e1e422953ec8f3b3c1dff6fa475c8defcb Mon Sep 
17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 09/35] mock mtime writing instead of adding sleeps to ensure mtimes are different --- pydra/engine/tests/test_node_task.py | 9 ++++-- pydra/engine/tests/test_specs.py | 45 ++++++++++++++++++---------- pydra/utils/tests/test_hash.py | 8 +++-- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 83cd91bd7..e01659424 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -321,10 +321,13 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file - time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("from pydra") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file2, "w") as f: + f.write("from pydra") nn2 = fun_file_list(name="NA", filename_list=[file1, file2]) output_dir2 = nn2.output_dir diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index e73688594..517b2a88d 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,6 +3,7 @@ import os import attrs from copy import deepcopy +from unittest import mock import time from ..specs import ( @@ -164,10 +165,13 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -195,10 +199,12 
@@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + # Ensure that the mtime will be incremented by mocking time.time + with mock.patch("time.time") as t: + t.return_value = time.time() + 10 + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -237,9 +243,12 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file - time.sleep(2) # ensure mtime is different - with open(file, "w") as f: - f.write("hello") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file, "w") as f: + f.write("hello") hash3 = my_inp.hash # files_hash3 = deepcopy(my_inp.files_hash) @@ -292,10 +301,13 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash assert hash1 != hash3 @@ -329,10 +341,13 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to 
ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash assert hash1 != hash3 diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index af8bb27e7..048c225e6 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -2,7 +2,7 @@ from hashlib import blake2b from pathlib import Path import time - +from unittest import mock import attrs import pytest import typing as ty @@ -319,7 +319,9 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash - time.sleep(2) # Need to ensure that the mtimes will be different text_file_path.write_text("bar") - assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + # Ensure that the mtime will be incremented by mocking time.time + with mock.patch("time.time") as t: + t.return_value = time.time() + 10 + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 5d51736fff1fc827c39191e2adb4e5d81828bcfd Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 10/35] undid overzealous black --- pydra/engine/tests/test_node_task.py | 4 +--- pydra/engine/tests/test_specs.py | 16 ++++------------ 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index e01659424..7ce6ffd45 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -323,9 +323,7 @@ def test_task_init_7(tmp_path): # changing the content of the file file2 = tmp_path / "file2.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with 
open(file2, "w") as f: f.write("from pydra") diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 517b2a88d..98cbdec38 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -167,9 +167,7 @@ def test_input_file_hash_2(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash @@ -244,9 +242,7 @@ def test_input_file_hash_3(tmp_path): # recreating the file with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file, "w") as f: f.write("hello") @@ -303,9 +299,7 @@ def test_input_file_hash_4(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash @@ -343,9 +337,7 @@ def test_input_file_hash_5(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash From 
0421f854b4e952fab8387c43738b35f8999dee98 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 11/35] added missing import --- pydra/engine/tests/test_node_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 7ce6ffd45..b1418cddc 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -2,6 +2,7 @@ import shutil import attr import numpy as np +from unittest import mock import pytest import time From a864b32bc32367086031d7dd894322e499c84a36 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 12/35] Adds platformdirs dependency and use it to store the hash cache within --- pydra/utils/hash.py | 19 ++++++++++++++----- pyproject.toml | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 10f8682fb..94e276ce9 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -18,8 +18,10 @@ Set, ) from filelock import SoftFileLock +import platformdirs import attrs.exceptions from fileformats.core import FileSet +from pydra._version import __version__ logger = logging.getLogger("pydra") @@ -128,12 +130,12 @@ def from_path( path = Path(path) if not path.exists(): try: - Path(path).mkdir() + Path(path).mkdir(parents=True) except Exception as e: raise RuntimeError( f"Could not create cache directory for file hashes at {path}, " - "please use 'PYDRA_HASH_CACHE' environment variable to control " - "where it is created by default" + "please use 'PYDRA_HASH_CACHE' environment variable to manually " + "set where it is created" ) from e return PersistentCache(path) @@ -142,10 +144,17 @@ def from_path( # of the code, so just dumping in the home directory instead by default. 
In any case, # this needs to be documented DEFAULT_LOCATION = Path( - os.environ.get("PYDRA_HASH_CACHE", Path("~").expanduser() / ".pydra-hash-cache") + os.environ.get( + "PYDRA_HASH_CACHE", + platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ), + ) ) # Set the period after which old hash cache files are cleaned up in days - CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEAN_UP_PERIOD", 30) + CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEANUP_PERIOD", 30) @attrs.define diff --git a/pyproject.toml b/pyproject.toml index ad8f61ea8..6a6ad5e70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "filelock >=3.0.0", "fileformats >=0.8", "importlib_resources >=5.7; python_version < '3.11'", + "platformdirs >=2", "typing_extensions >=4.6.3; python_version < '3.10'", "typing_utils >=0.1.0; python_version < '3.10'", ] From 05ca6958009b9255482a434de94ebb224fee33ad Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 13/35] added unittests to hit exceptions in persistentcache init --- pydra/utils/hash.py | 82 +++++++++++++++++----------------- pydra/utils/tests/test_hash.py | 49 ++++++++++++++++++-- 2 files changed, 86 insertions(+), 45 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 94e276ce9..a34901e41 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -3,7 +3,6 @@ import os import struct from datetime import datetime -import tempfile import typing as ty from pathlib import Path from collections.abc import Mapping @@ -58,6 +57,12 @@ CacheKey = NewType("CacheKey", ty.Tuple[ty.Hashable, ty.Hashable]) +def location_converter(path: ty.Union[Path, str, None]) -> Path: + if path is None: + path = PersistentCache.location_default() + return Path(path) + + @attrs.define class PersistentCache: """Persistent cache in which to store computationally expensive hashes between nodes @@ -69,9 +74,31 @@ class 
PersistentCache: the directory in which to store the hashes cache """ - location: Path = attrs.field() + location: Path = attrs.field(converter=location_converter) # type: ignore[misc] + cleanup_period: int = attrs.field() _hashes: ty.Dict[CacheKey, Hash] = attrs.field(factory=dict) + # Set the location of the persistent hash cache + LOCATION_ENV_VAR = "PYDRA_HASH_CACHE" + CLEANUP_ENV_VAR = "PYDRA_HASH_CACHE_CLEANUP_PERIOD" + + @classmethod + def location_default(cls): + try: + location = os.environ[cls.LOCATION_ENV_VAR] + except KeyError: + location = platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ) + return location + + # the default needs to be an instance method + @location.default + def _location_default(self): + return self.location_default() + @location.validator def location_validator(self, _, location): if not os.path.isdir(location): @@ -79,6 +106,10 @@ def location_validator(self, _, location): f"Persistent cache location '{location}' is not a directory" ) + @cleanup_period.default + def cleanup_period_default(self): + return int(os.environ.get(self.CLEANUP_ENV_VAR, 30)) + def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: """Check whether key is present in the persistent cache store and return it if so. 
Otherwise use `calculate_hash` to generate the hash and save it in the persistent @@ -109,13 +140,12 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H key_path.write_bytes(hsh) return Hash(hsh) - @classmethod - def clean_up(cls): + def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() - for path in cls.DEFAULT_LOCATION.iterdir(): + for path in self.location.iterdir(): days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days - if days > cls.CLEAN_UP_PERIOD: + if days > self.cleanup_period: path.unlink() @classmethod @@ -124,38 +154,8 @@ def from_path( ) -> "PersistentCache": if isinstance(path, PersistentCache): return path - if path is None: - path = Path(tempfile.mkdtemp()) - else: - path = Path(path) - if not path.exists(): - try: - Path(path).mkdir(parents=True) - except Exception as e: - raise RuntimeError( - f"Could not create cache directory for file hashes at {path}, " - "please use 'PYDRA_HASH_CACHE' environment variable to manually " - "set where it is created" - ) from e return PersistentCache(path) - # FIXME: Ideally the default location would be inside one of the cache_locations - # but can't think of a clean way to pass the cache_locations down to this part - # of the code, so just dumping in the home directory instead by default. 
In any case, - # this needs to be documented - DEFAULT_LOCATION = Path( - os.environ.get( - "PYDRA_HASH_CACHE", - platformdirs.user_cache_dir( - appname="pydra", - appauthor="nipype", - version=__version__, - ), - ) - ) - # Set the period after which old hash cache files are cleaned up in days - CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEANUP_PERIOD", 30) - @attrs.define class Cache: @@ -185,7 +185,7 @@ def hash_function(obj, **kwargs): def hash_object( - obj: object, persistent_cache: ty.Optional[Path] = PersistentCache.DEFAULT_LOCATION + obj: object, persistent_cache: ty.Union[PersistentCache, Path, None] = None ) -> Hash: """Hash an object @@ -195,10 +195,10 @@ def hash_object( Base Python types are implemented, including recursive lists and dicts. Custom types can be registered with :func:`register_serializer`. """ - try: - return hash_single(obj, Cache(persistent=persistent_cache)) - except Exception as e: - raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e + # try: + return hash_single(obj, Cache(persistent=persistent_cache)) + # except Exception as e: + # raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 048c225e6..7ed93c6ed 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -1,4 +1,5 @@ import re +import os from hashlib import blake2b from pathlib import Path import time @@ -8,7 +9,14 @@ import typing as ty from fileformats.application import Zip, Json from fileformats.text import TextFile -from ..hash import Cache, UnhashableError, bytes_repr, hash_object, register_serializer +from ..hash import ( + Cache, + UnhashableError, + bytes_repr, + hash_object, + register_serializer, + PersistentCache, +) @pytest.fixture @@ -300,13 +308,21 @@ def _(obj: MyClass, cache: Cache): assert join_bytes_repr(MyNewClass(1)) == b"serializer" -def 
test_persistent_hash_cache(tmp_path): +@pytest.fixture +def cache_path(tmp_path): cache_path = tmp_path / "hash-cache" cache_path.mkdir() + return cache_path + + +@pytest.fixture +def text_file(tmp_path): text_file_path = tmp_path / "text-file.txt" text_file_path.write_text("foo") - text_file = TextFile(text_file_path) + return TextFile(text_file_path) + +def test_persistent_hash_cache(cache_path, text_file): # Test hash is stable between calls hsh = hash_object(text_file, persistent_cache=cache_path) assert hsh == hash_object(text_file, persistent_cache=cache_path) @@ -319,9 +335,34 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash - text_file_path.write_text("bar") + text_file.fspath.write_text("bar") # Ensure that the mtime will be incremented by mocking time.time with mock.patch("time.time") as t: t.return_value = time.time() + 10 assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 + + +def test_persistent_hash_cache_cleanup(cache_path, text_file): + with mock.patch.dict( + os.environ, + {"PYDRA_HASH_CACHE": str(cache_path), "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-1"}, + ): + persistent_cache = PersistentCache() + hsh = hash_object(text_file, persistent_cache=persistent_cache) + assert len(list(cache_path.iterdir())) == 1 + persistent_cache.clean_up() + assert len(list(cache_path.iterdir())) == 0 + + +def test_persistent_hash_cache_badpath(cache_path, text_file): + persistent_cache = PersistentCache(cache_path, cleanup_period=-1) + hsh = hash_object(text_file, persistent_cache=persistent_cache) + assert len(list(cache_path.iterdir())) == 1 + persistent_cache.clean_up() + assert len(list(cache_path.iterdir())) == 0 + + +def test_persistent_hash_cache_not_dir(text_file): + with pytest.raises(ValueError, match="is not a directory"): + PersistentCache(text_file.fspath) From 
52ef03fc5aed32484ea142ad4c293494f971fcba Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 14/35] added mkdir to location converter --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index a34901e41..25f46450e 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -60,7 +60,9 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() - return Path(path) + path = Path(path) + path.mkdir(parents=True) + return path @attrs.define From 021623630665fa527679ea4f9b8901c10e2d0217 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 15/35] debugged mkdir of persistent cache --- pydra/engine/submitter.py | 2 +- pydra/utils/hash.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py index f92841c48..25e54dae7 100644 --- a/pydra/engine/submitter.py +++ b/pydra/engine/submitter.py @@ -44,7 +44,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None self.loop.run_until_complete( self.submit_from_call(runnable, rerun, environment) ) - PersistentCache.clean_up() + PersistentCache().clean_up() return runnable.result() async def submit_from_call(self, runnable, rerun, environment): diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 25f46450e..40050b692 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -61,7 +61,7 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() path = Path(path) - path.mkdir(parents=True) + path.mkdir(parents=True, exist_ok=True) return path From bad261bbbc3cca428291597b02752353b53e62b3 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 16/35] bug fixes in persistentcache location 
init --- pydra/utils/hash.py | 3 ++- pydra/utils/tests/test_hash.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 40050b692..5b4ae600d 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -61,7 +61,8 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() path = Path(path) - path.mkdir(parents=True, exist_ok=True) + if not path.exists(): + path.mkdir(parents=True) return path diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 7ed93c6ed..91333690a 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -335,11 +335,9 @@ def test_persistent_hash_cache(cache_path, text_file): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash + time.sleep(2) # Need to ensure that the mtimes will be different text_file.fspath.write_text("bar") - # Ensure that the mtime will be incremented by mocking time.time - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 - assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 2fbee2b27177f92c5a95eed4dda3c76b19d31b04 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 17/35] Revert "mock mtime writing instead of adding sleeps to ensure mtimes are different" This reverts commit 089596526df085fac65c96b787cd2e49d47c3331. 
--- pydra/engine/tests/test_node_task.py | 7 +++--- pydra/engine/tests/test_specs.py | 37 +++++++++++----------------- pydra/utils/tests/test_hash.py | 1 - 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index b1418cddc..f8b961907 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -322,11 +322,10 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file + time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file2, "w") as f: - f.write("from pydra") + with open(file2, "w") as f: + f.write("from pydra") nn2 = fun_file_list(name="NA", filename_list=[file1, file2]) output_dir2 = nn2.output_dir diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 98cbdec38..e73688594 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,7 +3,6 @@ import os import attrs from copy import deepcopy -from unittest import mock import time from ..specs import ( @@ -165,11 +164,10 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -197,12 +195,10 @@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" 
- # Ensure that the mtime will be incremented by mocking time.time - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -241,10 +237,9 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file, "w") as f: - f.write("hello") + time.sleep(2) # ensure mtime is different + with open(file, "w") as f: + f.write("hello") hash3 = my_inp.hash # files_hash3 = deepcopy(my_inp.files_hash) @@ -297,11 +292,10 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash assert hash1 != hash3 @@ -335,11 +329,10 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash assert hash1 != hash3 diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 91333690a..ed2432b72 100644 --- 
a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -3,7 +3,6 @@ from hashlib import blake2b from pathlib import Path import time -from unittest import mock import attrs import pytest import typing as ty From 91948f0f9773f0fcd8c8ef4afbdbd05dbdd450f1 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 18/35] skip lock files in directory clean up --- pydra/utils/hash.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 5b4ae600d..601c5af80 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -7,7 +7,7 @@ from pathlib import Path from collections.abc import Mapping from functools import singledispatch -from hashlib import blake2b +from hashlib import blake2b, blake2s import logging from typing import ( Dict, @@ -135,7 +135,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return self._hashes[key] except KeyError: pass - key_path = self.location / blake2b(str(key).encode()).hexdigest() + key_path = self.location / blake2s(str(key).encode()).hexdigest() with SoftFileLock(key_path.with_suffix(".lock")): if key_path.exists(): return Hash(key_path.read_bytes()) @@ -147,6 +147,8 @@ def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() for path in self.location.iterdir(): + if path.endswith(".lock"): + continue days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days if days > self.cleanup_period: path.unlink() @@ -198,10 +200,10 @@ def hash_object( Base Python types are implemented, including recursive lists and dicts. Custom types can be registered with :func:`register_serializer`. 
""" - # try: - return hash_single(obj, Cache(persistent=persistent_cache)) - # except Exception as e: - # raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e + try: + return hash_single(obj, Cache(persistent=persistent_cache)) + except Exception as e: + raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: From e0584085675d9a9b03e1784e91693848096e94aa Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 19/35] fixed clean-up bug --- pydra/utils/hash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 601c5af80..f0eae477a 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -147,7 +147,7 @@ def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() for path in self.location.iterdir(): - if path.endswith(".lock"): + if path.name.endswith(".lock"): continue days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days if days > self.cleanup_period: From f1ded7a7d1e6c98658785c80ea250fcb924a73f7 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 20/35] added mock import --- pydra/utils/tests/test_hash.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index ed2432b72..91333690a 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -3,6 +3,7 @@ from hashlib import blake2b from pathlib import Path import time +from unittest import mock import attrs import pytest import typing as ty From bb11067153a3dfd16b074093e4e8890a17d82429 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 21/35] added another sleep to trigger atime change --- pydra/utils/tests/test_hash.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 
deletions(-) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 91333690a..c753f2095 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -341,10 +341,13 @@ def test_persistent_hash_cache(cache_path, text_file): assert len(list(cache_path.iterdir())) == 2 -def test_persistent_hash_cache_cleanup(cache_path, text_file): +def test_persistent_hash_cache_cleanup1(cache_path, text_file): with mock.patch.dict( os.environ, - {"PYDRA_HASH_CACHE": str(cache_path), "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-1"}, + { + "PYDRA_HASH_CACHE": str(cache_path), + "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-100", + }, ): persistent_cache = PersistentCache() hsh = hash_object(text_file, persistent_cache=persistent_cache) @@ -353,10 +356,11 @@ def test_persistent_hash_cache_cleanup(cache_path, text_file): assert len(list(cache_path.iterdir())) == 0 -def test_persistent_hash_cache_badpath(cache_path, text_file): - persistent_cache = PersistentCache(cache_path, cleanup_period=-1) +def test_persistent_hash_cache_cleanup2(cache_path, text_file): + persistent_cache = PersistentCache(cache_path, cleanup_period=-100) hsh = hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 + time.sleep(2) persistent_cache.clean_up() assert len(list(cache_path.iterdir())) == 0 From a031ea5dfc1571d2171e0df4157bb97114b9b1eb Mon Sep 17 00:00:00 2001 From: Tom Close Date: Thu, 29 Feb 2024 19:11:06 +1100 Subject: [PATCH 22/35] implementing @effigies suggestions --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index f0eae477a..7d09f8255 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -129,7 +129,8 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H Returns ------- Hash - _description_ + the hash corresponding to the key, which is either retrieved from the persistent + store or calculated 
using `calculate_hash` if not present """ try: return self._hashes[key] @@ -141,6 +142,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return Hash(key_path.read_bytes()) hsh = calculate_hash() key_path.write_bytes(hsh) + self._hashes[key] = Hash(hsh) return Hash(hsh) def clean_up(self): From f2f70a6a0c45b68d521818c472a906e226694206 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Thu, 29 Feb 2024 19:26:57 +1100 Subject: [PATCH 23/35] added comments and doc strings to explain the use of the persistent cache --- pydra/utils/hash.py | 19 +++++++++++++++++++ pydra/utils/tests/test_hash.py | 24 +++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 7d09f8255..959d4eb60 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -224,13 +224,32 @@ def hash_single(obj: object, cache: Cache) -> Hash: # chunk def calc_hash(first: ty.Optional[bytes] = None) -> Hash: + """ + Calculate the hash of the object + + Parameters + ---------- + first : ty.Optional[bytes] + the first bytes chunk from the bytes_repr iterator, passed if the first + chunk wasn't a local cache key + """ h = blake2b(digest_size=16, person=b"pydra-hash") + # We want to use the first chunk that was popped to check for a cache-key + # if present if first is not None: h.update(first) for chunk in bytes_it: # Note that `bytes_it` is in outer scope h.update(chunk) return Hash(h.digest()) + # Read the first chunk of the bytes_repr iterator, check to see whether it returns + # a "cache-key" tuple instead of a bytes chunk for the type of the object to cache + # (i.e. file objects). If it does use that key to check the persistent cache for + # a precomputed hash and otherwise calculate the hash and store it in the + # persistent cache with that key. + + # If the first chunk is a bytes chunk (i.e. 
the object type doesn't have an associated + # 'cache-key'), then simply calculate the hash of the object. first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index c753f2095..56a7d9e68 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -323,11 +323,18 @@ def text_file(tmp_path): def test_persistent_hash_cache(cache_path, text_file): + """ + Test the persistent hash cache with a text file + + The cache is used to store the hash of the text file, and the hash is + retrieved from the cache when the file is unchanged. + """ # Test hash is stable between calls hsh = hash_object(text_file, persistent_cache=cache_path) assert hsh == hash_object(text_file, persistent_cache=cache_path) - # Test that cached hash has been used + # Test that cached hash has been used by explicitly modifying it and seeing that the + # hash is the same as the modified hash cache_files = list(cache_path.iterdir()) assert len(cache_files) == 1 modified_hash = "modified".encode() @@ -342,6 +349,10 @@ def test_persistent_hash_cache(cache_path, text_file): def test_persistent_hash_cache_cleanup1(cache_path, text_file): + """ + Test the persistent hash is cleaned up after use if the periods between cleanups + is greater than the environment variable PYDRA_HASH_CACHE_CLEANUP_PERIOD + """ with mock.patch.dict( os.environ, { @@ -350,15 +361,19 @@ def test_persistent_hash_cache_cleanup1(cache_path, text_file): }, ): persistent_cache = PersistentCache() - hsh = hash_object(text_file, persistent_cache=persistent_cache) + hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 persistent_cache.clean_up() assert len(list(cache_path.iterdir())) == 0 def test_persistent_hash_cache_cleanup2(cache_path, text_file): + """ + Test the persistent hash is cleaned up after use if the periods between cleanups + is greater than the explicitly provided 
cleanup_period + """ persistent_cache = PersistentCache(cache_path, cleanup_period=-100) - hsh = hash_object(text_file, persistent_cache=persistent_cache) + hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 time.sleep(2) persistent_cache.clean_up() @@ -366,5 +381,8 @@ def test_persistent_hash_cache_cleanup2(cache_path, text_file): def test_persistent_hash_cache_not_dir(text_file): + """ + Test that an error is raised if the provided cache path is not a directory + """ with pytest.raises(ValueError, match="is not a directory"): PersistentCache(text_file.fspath) From 191aa9c604d5d47cc4a8d0fe12695a5481c87e17 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:17:41 +1100 Subject: [PATCH 24/35] touched up comments --- pydra/utils/hash.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 959d4eb60..89d81a317 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,14 +242,12 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first chunk of the bytes_repr iterator, check to see whether it returns - # a "cache-key" tuple instead of a bytes chunk for the type of the object to cache + # Read the first item of the bytes_repr iterator, check to see whether it returns + # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached # (i.e. file objects). If it does use that key to check the persistent cache for - # a precomputed hash and otherwise calculate the hash and store it in the - # persistent cache with that key. - - # If the first chunk is a bytes chunk (i.e. the object type doesn't have an associated - # 'cache-key'), then simply calculate the hash of the object. 
+ # a precomputed hash and return it if it is, otherwise calculate the hash and + # store it in the persistent cache with that hash of that key (not to be confused + # with the hash of the object that is saved/retrieved). first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) @@ -259,6 +257,10 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: ) + first hsh = cache.persistent.get_or_calculate_hash(key, calc_hash) else: + # If the first item is a bytes chunk (i.e. the object type doesn't have an + # associated 'cache-key'), then simply calculate the hash of the object, + # passing the first chunk to the `calc_hash` function so it can be included + # in the hash calculation hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) cache[objid] = hsh From 3076fea489b1d3f84c6daa2a82ff8277bb1f1c2d Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:19:02 +1100 Subject: [PATCH 25/35] another comment touch up --- pydra/utils/hash.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 89d81a317..e3214008d 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,12 +242,12 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first item of the bytes_repr iterator, check to see whether it returns + # Read the first item of the bytes_repr iterator and check to see whether it returns # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached - # (i.e. file objects). If it does use that key to check the persistent cache for - # a precomputed hash and return it if it is, otherwise calculate the hash and - # store it in the persistent cache with that hash of that key (not to be confused - # with the hash of the object that is saved/retrieved). + # (e.g. fileformats.core.FileSet objects). 
If it does use that key to check the + # persistent cache for a precomputed hash and return it if it is, otherwise + # calculate the hash and store it in the persistent cache with that hash of + # that key (not to be confused with the hash of the object that is saved/retrieved). first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) From a094fbc572274a90fab56e058801d0cf68dca010 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:44:55 +1100 Subject: [PATCH 26/35] touch up comments again --- pydra/utils/hash.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index e3214008d..dcf2695b7 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,12 +242,13 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first item of the bytes_repr iterator and check to see whether it returns + # Read the first item of the bytes_repr iterator and check to see whether it yields # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached - # (e.g. fileformats.core.FileSet objects). If it does use that key to check the - # persistent cache for a precomputed hash and return it if it is, otherwise - # calculate the hash and store it in the persistent cache with that hash of - # that key (not to be confused with the hash of the object that is saved/retrieved). + # (e.g. file-system path + mtime for fileformats.core.FileSet objects). If it + # does, use that key to check the persistent cache for a precomputed hash and + # return it if it is, otherwise calculate the hash and store it in the persistent + # cache with that hash of that key (not to be confused with the hash of the + # object that is saved/retrieved). 
first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) From 291f29f91bf13ba921d6d928af20afcaffb33d8f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 06:12:47 +0000 Subject: [PATCH 27/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0352b1397..906ec0814 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -192,7 +192,9 @@ def hash_function(obj, **kwargs): def hash_object( - obj: object, cache: ty.Optional[Cache] = None, persistent_cache: ty.Union[PersistentCache, Path, None] = None + obj: object, + cache: ty.Optional[Cache] = None, + persistent_cache: ty.Union[PersistentCache, Path, None] = None, ) -> Hash: """Hash an object From 0a10f6c04391098073315a762710fb050fa2875b Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:14:56 +1100 Subject: [PATCH 28/35] added in @djarecka's test for moving file cache locations --- pydra/engine/tests/test_node_task.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 7e3ae15e2..611116a02 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -1563,3 +1563,39 @@ def test_task_state_cachelocations_updated(plugin, tmp_path): # both workflows should be run assert all([dir.exists() for dir in nn.output_dir]) assert all([dir.exists() for dir in nn2.output_dir]) + + +def test_task_files_cachelocations(plugin_dask_opt, tmp_path): + """ + Two identical tasks with provided cache_dir that use file as an input; + the second task has cache_locations and should not recompute the results + """ + cache_dir = tmp_path / "test_task_nostate" + cache_dir.mkdir() + cache_dir2 = tmp_path / 
"test_task_nostate2" + cache_dir2.mkdir() + input_dir = tmp_path / "input" + input_dir.mkdir() + + input1 = input_dir / "input1.txt" + input1.write_text("test") + input2 = input_dir / "input2.txt" + input2.write_text("test") + + nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir) + with Submitter(plugin=plugin_dask_opt) as sub: + sub(nn) + + nn2 = fun_file( + name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir + ) + with Submitter(plugin=plugin_dask_opt) as sub: + sub(nn2) + + # checking the results + results2 = nn2.result() + assert results2.output.out == "test" + + # checking if the second task didn't run the interface again + assert nn.output_dir.exists() + assert not nn2.output_dir.exists() From 311e3ddeff100c2b6fc314d867f91cf4b4b0b406 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:18:17 +1100 Subject: [PATCH 29/35] updated cache initialisation --- pydra/engine/specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/engine/specs.py b/pydra/engine/specs.py index 16cd925ce..feeeb9665 100644 --- a/pydra/engine/specs.py +++ b/pydra/engine/specs.py @@ -102,7 +102,7 @@ def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]: if "container_path" in field.metadata: continue inp_dict[field.name] = getattr(self, field.name) - hash_cache = Cache({}) + hash_cache = Cache() field_hashes = { k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items() } From 482736569a4063c8f1ab6fa1ce19ba10b35d08e4 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:27:43 +1100 Subject: [PATCH 30/35] switched to use blake2b isntead of blake2s --- pydra/utils/hash.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 906ec0814..553aec18c 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -7,7 +7,7 @@ from pathlib import Path from collections.abc import Mapping from functools import singledispatch -from 
hashlib import blake2b, blake2s +from hashlib import blake2b import logging from typing import ( Dict, @@ -136,7 +136,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return self._hashes[key] except KeyError: pass - key_path = self.location / blake2s(str(key).encode()).hexdigest() + key_path = self.location / blake2b(str(key).encode()).hexdigest() with SoftFileLock(key_path.with_suffix(".lock")): if key_path.exists(): return Hash(key_path.read_bytes()) From b6799b6972fb80c0d3fc8059e47a4af61935ab34 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 19:54:09 +1100 Subject: [PATCH 31/35] [skip ci] deleted already commented-out code --- pydra/utils/hash.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 553aec18c..0f20a90a1 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -524,39 +524,3 @@ def bytes_repr_numpy(obj: numpy.ndarray, cache: Cache) -> Iterator[bytes]: NUMPY_CHUNK_LEN = 8192 - - -# class MtimeCachingHash: -# """Hashing object that stores a cache of hash values for PathLikes - -# The cache only stores values for PathLikes pointing to existing files, -# and the mtime is checked to validate the cache. If the mtime differs, -# the old hash is discarded and a new mtime-tagged hash is stored. - -# The cache can grow without bound; we may want to consider using an LRU -# cache. 
-# """ - -# def __init__(self) -> None: -# self.cache: ty.Dict[os.PathLike, ty.Tuple[float, Hash]] = {} - -# def __call__(self, obj: object) -> Hash: -# if isinstance(obj, os.PathLike): -# path = Path(obj) -# try: -# stat_res = path.stat() -# mode, mtime = stat_res.st_mode, stat_res.st_mtime -# except FileNotFoundError: -# # Only attempt to cache existing files -# pass -# else: -# if stat.S_ISREG(mode) and obj in self.cache: -# # Cache (and hash) the actual object, as different pathlikes will have -# # different serializations -# save_mtime, save_hash = self.cache[obj] -# if mtime == save_mtime: -# return save_hash -# new_hash = hash_object(obj) -# self.cache[obj] = (mtime, new_hash) -# return new_hash -# return hash_object(obj) From 2bb86fe5ac7d2b8d5105e849f90c4c10ed77c72c Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 9 Mar 2024 00:08:17 +1100 Subject: [PATCH 32/35] additional doc strings for hash cache objects --- pydra/utils/hash.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0f20a90a1..abfd908a6 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -69,7 +69,16 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: @attrs.define class PersistentCache: """Persistent cache in which to store computationally expensive hashes between nodes - and workflow/task runs + and workflow/task runs. It does this in via the `get_or_calculate_hash` method, which + takes a locally unique key (e.g. file-system path + mtime) and a function to + calculate the hash if it isn't present in the persistent store. + + The locally unique key is hashed (cheaply) using hashlib cryptography and this + "local hash" is use to name the entry of the (potentially expensive) hash of the + object itself (e.g. the contents of a file). 
This entry is saved as a text file + within a user-specific cache directory (see `platformdirs.user_cache_dir`), with + the name of the file being the "local hash" of the key and the contents of the + file being the "globally unique hash" of the object itself. Parameters ---------- @@ -166,6 +175,17 @@ def from_path( @attrs.define class Cache: + """Cache for hashing objects, used to avoid infinite recursion caused by circular + references between objects, and to store hashes of objects that have already been + hashed to avoid recomputation. + + This concept is extended to persistent caching of hashes for certain object types, + for which calculating the hash is a potentially expensive operation (e.g. + File/Directory types). For these classes the `bytes_repr` override function yields a + "locally unique cache key" (e.g. file-system path + mtime) as the first item of its + iterator. + """ + persistent: ty.Optional[PersistentCache] = attrs.field( default=None, converter=PersistentCache.from_path, # type: ignore[misc] From 1f601e184f9caaf327c7be99e4568607768e44e5 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 16 Mar 2024 11:22:07 +1100 Subject: [PATCH 33/35] added test to see that persistent cache is used in the running of tasks --- pydra/engine/tests/test_node_task.py | 64 ++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 611116a02..bceaf9740 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -1,10 +1,15 @@ import os import shutil import attr +import typing as ty import numpy as np +import time from unittest import mock +from pathlib import Path import pytest import time +from fileformats.generic import File +import pydra.mark from .utils import ( fun_addtwo, @@ -1599,3 +1604,62 @@ def test_task_files_cachelocations(plugin_dask_opt, tmp_path): # checking if the second task didn't run the interface again assert 
nn.output_dir.exists() assert not nn2.output_dir.exists() + + +class OverriddenContentsFile(File): + """A class for testing purposes that enables you to override the contents + of the file, allowing you to check whether the persistent cache is used.""" + + def __init__( + self, + fspaths: ty.Iterator[Path], + contents: ty.Optional[bytes] = None, + metadata: ty.Dict[str, ty.Any] = None, + ): + super().__init__(fspaths, metadata=metadata) + self._contents = contents + + def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]: + if self._contents is not None: + yield (str(self.fspath), iter([self._contents])) + else: + yield from super().byte_chunks(**kwargs) + + @property + def contents(self): + if self._contents is not None: + return self._contents + return super().contents + + +def test_task_files_persistentcache(tmp_path): + """ + Runs the same file-reading task repeatedly with the same cache_dir; later runs + should hit the persistent hash cache until the file's mtime is updated + """ + test_file_path = tmp_path / "test_file.txt" + test_file_path.write_bytes(b"foo") + cache_dir = tmp_path / "cache-dir" + cache_dir.mkdir() + test_file = OverriddenContentsFile(test_file_path) + + @pydra.mark.task + def read_contents(x: OverriddenContentsFile) -> bytes: + return x.contents + + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"foo" + ) + test_file._contents = b"bar" + # should return result from the first run using the persistent cache + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"foo" + ) + time.sleep(2) # Windows has a 2-second resolution for mtime + test_file_path.touch() # update the mtime to invalidate the persistent cache value + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"bar" + ) # returns the overridden value From 7e60c41a834de6df90374790bd5a79dd316a0c7e Mon Sep 17 00:00:00 2001 From: Tom Close
Date: Sun, 17 Mar 2024 11:04:49 +1100 Subject: [PATCH 34/35] moved persistent hash cache within "hash_cache" subdirectory of the pydra user cache dir --- pydra/utils/__init__.py | 11 +++++++++++ pydra/utils/hash.py | 9 ++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pydra/utils/__init__.py b/pydra/utils/__init__.py index e69de29bb..4c7dcd71b 100644 --- a/pydra/utils/__init__.py +++ b/pydra/utils/__init__.py @@ -0,0 +1,11 @@ +from pathlib import Path +import platformdirs +from pydra import __version__ + +user_cache_dir = Path( + platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ) +) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index abfd908a6..81e4e773f 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -17,10 +17,9 @@ Set, ) from filelock import SoftFileLock -import platformdirs import attrs.exceptions from fileformats.core import FileSet -from pydra._version import __version__ +from . import user_cache_dir logger = logging.getLogger("pydra") @@ -99,11 +98,7 @@ def location_default(cls): try: location = os.environ[cls.LOCATION_ENV_VAR] except KeyError: - location = platformdirs.user_cache_dir( - appname="pydra", - appauthor="nipype", - version=__version__, - ) + location = user_cache_dir / "hash_cache" return location # the default needs to be an instance method From 921979c5387ec23e51d53a47c0cf6b097c4cc383 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sun, 17 Mar 2024 11:13:52 +1100 Subject: [PATCH 35/35] fixed import issue --- pydra/utils/__init__.py | 2 +- pydra/utils/hash.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/utils/__init__.py b/pydra/utils/__init__.py index 4c7dcd71b..7fe4b8595 100644 --- a/pydra/utils/__init__.py +++ b/pydra/utils/__init__.py @@ -1,6 +1,6 @@ from pathlib import Path import platformdirs -from pydra import __version__ +from pydra._version import __version__ user_cache_dir = Path( platformdirs.user_cache_dir( diff 
--git a/pydra/utils/hash.py b/pydra/utils/hash.py index 81e4e773f..90e132d1e 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -98,7 +98,7 @@ def location_default(cls): try: location = os.environ[cls.LOCATION_ENV_VAR] except KeyError: - location = user_cache_dir / "hash_cache" + location = user_cache_dir / "hashes" return location # the default needs to be an instance method