From 45117ef7ab69ab4d86e931d3298aa1b10d834479 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 01/35] added code to handle "locally-persistent-ids" --- pydra/utils/hash.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0c5f5f870..1d62490a9 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -90,10 +90,23 @@ def hash_single(obj: object, cache: Cache) -> Hash: # Handle recursion by putting a dummy value in the cache cache[objid] = Hash(b"\x00") h = blake2b(digest_size=16, person=b"pydra-hash") - for chunk in bytes_repr(obj, cache): + bytes_it = bytes_repr(obj, cache) + first = next(bytes_it) + if isinstance(first, str): + cache_id = first + try: + return cache[cache_id] + except KeyError: + pass + else: + h.update(first) + cache_id = None + for chunk in bytes_it: h.update(chunk) hsh = cache[objid] = Hash(h.digest()) logger.debug("Hash of %s object is %s", obj, hsh) + if cache_id is not None: + cache[cache_id] = hsh return cache[objid] From 2b7ca50f2b45492d01572b7a49eee9a688927619 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 02/35] implemented persistent hash cache to avoid rehashing files --- pydra/utils/hash.py | 155 ++++++++++++++++++++++++++++----- pydra/utils/tests/test_hash.py | 24 +++++ 2 files changed, 157 insertions(+), 22 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 1d62490a9..8a5f1045b 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -4,13 +4,13 @@ # import stat import struct +import tempfile import typing as ty +from pathlib import Path from collections.abc import Mapping from functools import singledispatch from hashlib import blake2b import logging - -# from pathlib import Path from typing import ( Dict, Iterator, @@ -18,7 +18,9 @@ Sequence, Set, ) +from filelock import SoftFileLock import attrs.exceptions +from 
fileformats.core import FileSet logger = logging.getLogger("pydra") @@ -52,19 +54,109 @@ ) Hash = NewType("Hash", bytes) -Cache = NewType("Cache", Dict[int, Hash]) +CacheKey = NewType("CacheKey", ty.Tuple[ty.Hashable, ty.Hashable]) + + +@attrs.define +class PersistentCache: + """Persistent cache in which to store computationally expensive hashes between nodes + and workflow/task runs + + Parameters + ---------- + location: Path + the directory in which to store the hashes cache + """ + + location: Path = attrs.field() + _hashes: ty.Dict[CacheKey, Hash] = attrs.field(factory=dict) + + @location.validator + def location_validator(self, _, location): + if not os.path.isdir(location): + raise ValueError( + f"Persistent cache location '{location}' is not a directory" + ) + + def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: + """Check whether key is present in the persistent cache store and return it if so. + Otherwise use `calculate_hash` to generate the hash and save it in the persistent + store. + + Parameters + ---------- + key : CacheKey + locally unique key (e.g. 
to the host) used to lookup the corresponding hash + in the persistent store + calculate_hash : ty.Callable + function to calculate the hash if it isn't present in the persistent store + + Returns + ------- + Hash + _description_ + """ + try: + return self._hashes[key] + except KeyError: + pass + key_path = self.location / blake2b(str(key).encode()).hexdigest() + with SoftFileLock(key_path.with_suffix(".lock")): + if key_path.exists(): + return Hash(key_path.read_bytes()) + hsh = calculate_hash() + key_path.write_bytes(hsh) + return Hash(hsh) + + +def persistent_cache_converter( + path: ty.Union[Path, str, PersistentCache, None] +) -> PersistentCache: + if isinstance(path, PersistentCache): + return path + if path is None: + path = tempfile.mkdtemp() + return PersistentCache(Path(path)) + + +@attrs.define +class Cache: + persistent: ty.Optional[PersistentCache] = attrs.field( + default=None, + converter=persistent_cache_converter, + ) + _hashes: ty.Dict[int, Hash] = attrs.field(factory=dict) + + def __getitem__(self, object_id: int) -> Hash: + return self._hashes[object_id] + + def __setitem__(self, object_id: int, hsh: Hash): + self._hashes[object_id] = hsh + + def __contains__(self, object_id): + return object_id in self._hashes class UnhashableError(ValueError): """Error for objects that cannot be hashed""" -def hash_function(obj): +def hash_function(obj, persistent_cache: ty.Optional[Path] = None): """Generate hash of object.""" - return hash_object(obj).hex() + if persistent_cache is None: + # FIXME: Ideally the default location would be inside one of the cache_locations + # but can't think of a clean way to pass the cache_locations down to this part + # of the code, so just dumping in the home directory instead + persistent_cache = (Path("~") / ".pydra-hash-cache").expanduser() + try: + if not persistent_cache.exists(): + persistent_cache.mkdir() + except Exception: + persistent_cache = None + return hash_object(obj, persistent_cache=persistent_cache).hex() 
-def hash_object(obj: object) -> Hash: +def hash_object(obj: object, persistent_cache: ty.Optional[Path] = None) -> Hash: """Hash an object Constructs a byte string that uniquely identifies the object, @@ -74,9 +166,9 @@ def hash_object(obj: object) -> Hash: dicts. Custom types can be registered with :func:`register_serializer`. """ try: - return hash_single(obj, Cache({})) + return hash_single(obj, Cache(persistent=persistent_cache)) except Exception as e: - raise UnhashableError(f"Cannot hash object {obj!r}") from e + raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: @@ -89,24 +181,31 @@ def hash_single(obj: object, cache: Cache) -> Hash: if objid not in cache: # Handle recursion by putting a dummy value in the cache cache[objid] = Hash(b"\x00") - h = blake2b(digest_size=16, person=b"pydra-hash") bytes_it = bytes_repr(obj, cache) + # Pop first element from the bytes_repr iterator and check whether it is a + # "local cache key" (e.g. 
file-system path + mtime tuple) or the first bytes + # chunk + + def calc_hash(first: ty.Optional[bytes] = None) -> Hash: + h = blake2b(digest_size=16, person=b"pydra-hash") + if first is not None: + h.update(first) + for chunk in bytes_it: # NB: bytes_it is in outer scope + h.update(chunk) + return Hash(h.digest()) + first = next(bytes_it) - if isinstance(first, str): - cache_id = first - try: - return cache[cache_id] - except KeyError: - pass + if isinstance(first, tuple): + tp = type(obj) + key = ( + tp.__module__, + tp.__name__, + ) + first + hsh = cache.persistent.get_hash(key, calc_hash) else: - h.update(first) - cache_id = None - for chunk in bytes_it: - h.update(chunk) - hsh = cache[objid] = Hash(h.digest()) + hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) - if cache_id is not None: - cache[cache_id] = hsh + cache[objid] = hsh return cache[objid] @@ -271,6 +370,18 @@ def type_name(tp): yield b")" +@register_serializer(FileSet) +def bytes_repr_fileset( + fileset: FileSet, cache: Cache +) -> Iterator[ty.Union[CacheKey, bytes]]: + fspaths = sorted(fileset.fspaths) + yield CacheKey( + tuple(repr(p) for p in fspaths) # type: ignore[arg-type] + + tuple(p.lstat().st_mtime for p in fspaths) + ) + yield from fileset.__bytes_repr__(cache) + + @register_serializer(list) @register_serializer(tuple) def bytes_repr_seq(obj: Sequence, cache: Cache) -> Iterator[bytes]: diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 8da055e11..c151b27ce 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -6,6 +6,7 @@ import pytest import typing as ty from fileformats.application import Zip, Json +from fileformats.text import TextFile from ..hash import Cache, UnhashableError, bytes_repr, hash_object, register_serializer @@ -296,3 +297,26 @@ def _(obj: MyClass, cache: Cache): register_serializer(MyNewClass, _) assert join_bytes_repr(MyNewClass(1)) == b"serializer" + + +def 
test_persistent_hash_cache(tmp_path): + cache_path = tmp_path / "hash-cache" + cache_path.mkdir() + text_file_path = tmp_path / "text-file.txt" + text_file_path.write_text("foo") + text_file = TextFile(text_file_path) + + # Test hash is stable between calls + hsh = hash_object(text_file, persistent_cache=cache_path) + assert hsh == hash_object(text_file, persistent_cache=cache_path) + + # Test that cached hash has been used + cache_files = list(cache_path.iterdir()) + assert len(cache_files) == 1 + modified_hash = "modified".encode() + cache_files[0].write_bytes(modified_hash) + assert hash_object(text_file, persistent_cache=cache_path) == modified_hash + + # Test that changes to the text file result in new hash + text_file_path.write_text("bar") + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash From 04b95ff189edf1046961eb8a6ddba331a7aa7a3a Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 03/35] touched up persistent_hash_cache test --- pydra/utils/tests/test_hash.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index c151b27ce..4a7209d78 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -320,3 +320,4 @@ def test_persistent_hash_cache(tmp_path): # Test that changes to the text file result in new hash text_file_path.write_text("bar") assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + assert len(list(cache_path.iterdir())) == 2 From 0c865f4ed6883137ab8535f3634ac7b8c3f8a578 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 04/35] replaced Cache({}) with Cache() to match new proper class --- pydra/utils/hash.py | 6 +++--- pydra/utils/tests/test_hash.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 8a5f1045b..c1c262152 100644 --- a/pydra/utils/hash.py +++ 
b/pydra/utils/hash.py @@ -190,7 +190,7 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h = blake2b(digest_size=16, person=b"pydra-hash") if first is not None: h.update(first) - for chunk in bytes_it: # NB: bytes_it is in outer scope + for chunk in bytes_it: # Note that `bytes_it` is in outer scope h.update(chunk) return Hash(h.digest()) @@ -406,7 +406,7 @@ def bytes_repr_mapping_contents(mapping: Mapping, cache: Cache) -> Iterator[byte .. code-block:: python >>> from pydra.utils.hash import bytes_repr_mapping_contents, Cache - >>> generator = bytes_repr_mapping_contents({"a": 1, "b": 2}, Cache({})) + >>> generator = bytes_repr_mapping_contents({"a": 1, "b": 2}, Cache()) >>> b''.join(generator) b'str:1:a=...str:1:b=...' """ @@ -424,7 +424,7 @@ def bytes_repr_sequence_contents(seq: Sequence, cache: Cache) -> Iterator[bytes] .. code-block:: python >>> from pydra.utils.hash import bytes_repr_sequence_contents, Cache - >>> generator = bytes_repr_sequence_contents([1, 2], Cache({})) + >>> generator = bytes_repr_sequence_contents([1, 2], Cache()) >>> list(generator) [b'\x6d...', b'\xa3...'] """ diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 4a7209d78..fabbaa550 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -16,7 +16,7 @@ def hasher(): def join_bytes_repr(obj): - return b"".join(bytes_repr(obj, Cache({}))) + return b"".join(bytes_repr(obj, Cache())) def test_bytes_repr_builtins(): From 3b3fdb7933fa3578676fa0cc8a0f5af536c61b9e Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 05/35] upped resolution of mtime to nanoseconds --- pydra/utils/hash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index c1c262152..650bd6f86 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -377,7 +377,7 @@ def bytes_repr_fileset( fspaths = sorted(fileset.fspaths) yield CacheKey( tuple(repr(p) 
for p in fspaths) # type: ignore[arg-type] - + tuple(p.lstat().st_mtime for p in fspaths) + + tuple(p.lstat().st_mtime_ns for p in fspaths) ) yield from fileset.__bytes_repr__(cache) From 81a5108e8eead74a8bcca539f334c743f0cd3e9a Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 06/35] added sleep to various tests to ensure file mtimes are different --- pydra/engine/tests/test_node_task.py | 2 ++ pydra/engine/tests/test_specs.py | 2 ++ pydra/utils/tests/test_hash.py | 2 ++ 3 files changed, 6 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 4e182781b..83cd91bd7 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -3,6 +3,7 @@ import attr import numpy as np import pytest +import time from .utils import ( fun_addtwo, @@ -320,6 +321,7 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file + time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" with open(file2, "w") as f: f.write("from pydra") diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 0370d92a5..7a7cac4df 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,6 +3,7 @@ import os import attrs from copy import deepcopy +import time from ..specs import ( BaseSpec, @@ -288,6 +289,7 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index fabbaa550..af8bb27e7 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -1,6 +1,7 @@ import re from hashlib import blake2b from pathlib import Path +import time import 
attrs import pytest @@ -318,6 +319,7 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash + time.sleep(2) # Need to ensure that the mtimes will be different text_file_path.write_text("bar") assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 0c4b1799541108f1d6fdabc9bc5d47843bcbc72b Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:05:57 +1100 Subject: [PATCH 07/35] added more sleeps to ensure mtimes of input files are different in tests --- pydra/engine/tests/test_specs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 7a7cac4df..e73688594 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -164,6 +164,7 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") @@ -194,6 +195,7 @@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with open(file_diffcontent, "w") as f: f.write("hi") @@ -235,6 +237,7 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file + time.sleep(2) # ensure mtime is different with open(file, "w") as f: f.write("hello") @@ -326,6 +329,7 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" with 
open(file_diffcontent, "w") as f: f.write("hi") From 615d59075a57972aa8bb67addcb0089e3961ef99 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 08/35] debugged setting hash cache via env var and added clean up of directory --- pydra/engine/submitter.py | 2 ++ pydra/utils/hash.py | 75 +++++++++++++++++++++++++-------------- 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py index 3906955b2..f92841c48 100644 --- a/pydra/engine/submitter.py +++ b/pydra/engine/submitter.py @@ -6,6 +6,7 @@ from .workers import WORKERS from .core import is_workflow from .helpers import get_open_loop, load_and_run_async +from ..utils.hash import PersistentCache import logging @@ -43,6 +44,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None self.loop.run_until_complete( self.submit_from_call(runnable, rerun, environment) ) + PersistentCache.clean_up() return runnable.result() async def submit_from_call(self, runnable, rerun, environment): diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 650bd6f86..10f8682fb 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -1,9 +1,8 @@ """Generic object hashing dispatch""" import os - -# import stat import struct +from datetime import datetime import tempfile import typing as ty from pathlib import Path @@ -78,7 +77,7 @@ def location_validator(self, _, location): f"Persistent cache location '{location}' is not a directory" ) - def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: + def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: """Check whether key is present in the persistent cache store and return it if so. Otherwise use `calculate_hash` to generate the hash and save it in the persistent store. 
@@ -108,22 +107,52 @@ def get_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: key_path.write_bytes(hsh) return Hash(hsh) - -def persistent_cache_converter( - path: ty.Union[Path, str, PersistentCache, None] -) -> PersistentCache: - if isinstance(path, PersistentCache): - return path - if path is None: - path = tempfile.mkdtemp() - return PersistentCache(Path(path)) + @classmethod + def clean_up(cls): + """Cleans up old hash caches that haven't been accessed in the last 30 days""" + now = datetime.now() + for path in cls.DEFAULT_LOCATION.iterdir(): + days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days + if days > cls.CLEAN_UP_PERIOD: + path.unlink() + + @classmethod + def from_path( + cls, path: ty.Union[Path, str, "PersistentCache", None] + ) -> "PersistentCache": + if isinstance(path, PersistentCache): + return path + if path is None: + path = Path(tempfile.mkdtemp()) + else: + path = Path(path) + if not path.exists(): + try: + Path(path).mkdir() + except Exception as e: + raise RuntimeError( + f"Could not create cache directory for file hashes at {path}, " + "please use 'PYDRA_HASH_CACHE' environment variable to control " + "where it is created by default" + ) from e + return PersistentCache(path) + + # FIXME: Ideally the default location would be inside one of the cache_locations + # but can't think of a clean way to pass the cache_locations down to this part + # of the code, so just dumping in the home directory instead by default. 
In any case, + # this needs to be documented + DEFAULT_LOCATION = Path( + os.environ.get("PYDRA_HASH_CACHE", Path("~").expanduser() / ".pydra-hash-cache") + ) + # Set the period after which old hash cache files are cleaned up in days + CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEAN_UP_PERIOD", 30) @attrs.define class Cache: persistent: ty.Optional[PersistentCache] = attrs.field( default=None, - converter=persistent_cache_converter, + converter=PersistentCache.from_path, # type: ignore[misc] ) _hashes: ty.Dict[int, Hash] = attrs.field(factory=dict) @@ -141,22 +170,14 @@ class UnhashableError(ValueError): """Error for objects that cannot be hashed""" -def hash_function(obj, persistent_cache: ty.Optional[Path] = None): +def hash_function(obj, **kwargs): """Generate hash of object.""" - if persistent_cache is None: - # FIXME: Ideally the default location would be inside one of the cache_locations - # but can't think of a clean way to pass the cache_locations down to this part - # of the code, so just dumping in the home directory instead - persistent_cache = (Path("~") / ".pydra-hash-cache").expanduser() - try: - if not persistent_cache.exists(): - persistent_cache.mkdir() - except Exception: - persistent_cache = None - return hash_object(obj, persistent_cache=persistent_cache).hex() + return hash_object(obj, **kwargs).hex() -def hash_object(obj: object, persistent_cache: ty.Optional[Path] = None) -> Hash: +def hash_object( + obj: object, persistent_cache: ty.Optional[Path] = PersistentCache.DEFAULT_LOCATION +) -> Hash: """Hash an object Constructs a byte string that uniquely identifies the object, @@ -201,7 +222,7 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: tp.__module__, tp.__name__, ) + first - hsh = cache.persistent.get_hash(key, calc_hash) + hsh = cache.persistent.get_or_calculate_hash(key, calc_hash) else: hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) From 55b660e1e422953ec8f3b3c1dff6fa475c8defcb Mon Sep 
17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 09/35] mock mtime writing instead of adding sleeps to ensure mtimes are different --- pydra/engine/tests/test_node_task.py | 9 ++++-- pydra/engine/tests/test_specs.py | 45 ++++++++++++++++++---------- pydra/utils/tests/test_hash.py | 8 +++-- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 83cd91bd7..e01659424 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -321,10 +321,13 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file - time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" - with open(file2, "w") as f: - f.write("from pydra") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file2, "w") as f: + f.write("from pydra") nn2 = fun_file_list(name="NA", filename_list=[file1, file2]) output_dir2 = nn2.output_dir diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index e73688594..517b2a88d 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,6 +3,7 @@ import os import attrs from copy import deepcopy +from unittest import mock import time from ..specs import ( @@ -164,10 +165,13 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -195,10 +199,12 
@@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + # Ensure that the mtime will be incremented by mocking time.time + with mock.patch("time.time") as t: + t.return_value = time.time() + 10 + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -237,9 +243,12 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file - time.sleep(2) # ensure mtime is different - with open(file, "w") as f: - f.write("hello") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file, "w") as f: + f.write("hello") hash3 = my_inp.hash # files_hash3 = deepcopy(my_inp.files_hash) @@ -292,10 +301,13 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash assert hash1 != hash3 @@ -329,10 +341,13 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash - time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with open(file_diffcontent, "w") as f: - f.write("hi") + with mock.patch("time.time") as t: + t.return_value = ( + time.time() + 10 + ) # mock mtime writing to 
ensure it is different + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash assert hash1 != hash3 diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index af8bb27e7..048c225e6 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -2,7 +2,7 @@ from hashlib import blake2b from pathlib import Path import time - +from unittest import mock import attrs import pytest import typing as ty @@ -319,7 +319,9 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash - time.sleep(2) # Need to ensure that the mtimes will be different text_file_path.write_text("bar") - assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + # Ensure that the mtime will be incremented by mocking time.time + with mock.patch("time.time") as t: + t.return_value = time.time() + 10 + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 5d51736fff1fc827c39191e2adb4e5d81828bcfd Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 10/35] undid overzealous black --- pydra/engine/tests/test_node_task.py | 4 +--- pydra/engine/tests/test_specs.py | 16 ++++------------ 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index e01659424..7ce6ffd45 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -323,9 +323,7 @@ def test_task_init_7(tmp_path): # changing the content of the file file2 = tmp_path / "file2.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with 
open(file2, "w") as f: f.write("from pydra") diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 517b2a88d..98cbdec38 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -167,9 +167,7 @@ def test_input_file_hash_2(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash @@ -244,9 +242,7 @@ def test_input_file_hash_3(tmp_path): # recreating the file with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file, "w") as f: f.write("hello") @@ -303,9 +299,7 @@ def test_input_file_hash_4(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash @@ -343,9 +337,7 @@ def test_input_file_hash_5(tmp_path): # checking if different content (the same name) affects the hash file_diffcontent = tmp_path / "in_file_1.txt" with mock.patch("time.time") as t: - t.return_value = ( - time.time() + 10 - ) # mock mtime writing to ensure it is different + t.return_value = time.time() + 10 # mock mtime to ensure different with open(file_diffcontent, "w") as f: f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash From 
0421f854b4e952fab8387c43738b35f8999dee98 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 11/35] added missing import --- pydra/engine/tests/test_node_task.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 7ce6ffd45..b1418cddc 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -2,6 +2,7 @@ import shutil import attr import numpy as np +from unittest import mock import pytest import time From a864b32bc32367086031d7dd894322e499c84a36 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 12/35] Adds platformdirs dependency and use it to store the hash cache within --- pydra/utils/hash.py | 19 ++++++++++++++----- pyproject.toml | 1 + 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 10f8682fb..94e276ce9 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -18,8 +18,10 @@ Set, ) from filelock import SoftFileLock +import platformdirs import attrs.exceptions from fileformats.core import FileSet +from pydra._version import __version__ logger = logging.getLogger("pydra") @@ -128,12 +130,12 @@ def from_path( path = Path(path) if not path.exists(): try: - Path(path).mkdir() + Path(path).mkdir(parents=True) except Exception as e: raise RuntimeError( f"Could not create cache directory for file hashes at {path}, " - "please use 'PYDRA_HASH_CACHE' environment variable to control " - "where it is created by default" + "please use 'PYDRA_HASH_CACHE' environment variable to manually " + "set where it is created" ) from e return PersistentCache(path) @@ -142,10 +144,17 @@ def from_path( # of the code, so just dumping in the home directory instead by default. 
In any case, # this needs to be documented DEFAULT_LOCATION = Path( - os.environ.get("PYDRA_HASH_CACHE", Path("~").expanduser() / ".pydra-hash-cache") + os.environ.get( + "PYDRA_HASH_CACHE", + platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ), + ) ) # Set the period after which old hash cache files are cleaned up in days - CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEAN_UP_PERIOD", 30) + CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEANUP_PERIOD", 30) @attrs.define diff --git a/pyproject.toml b/pyproject.toml index ad8f61ea8..6a6ad5e70 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,7 @@ dependencies = [ "filelock >=3.0.0", "fileformats >=0.8", "importlib_resources >=5.7; python_version < '3.11'", + "platformdirs >=2", "typing_extensions >=4.6.3; python_version < '3.10'", "typing_utils >=0.1.0; python_version < '3.10'", ] From 05ca6958009b9255482a434de94ebb224fee33ad Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 13/35] added unittests to hit exceptions in persistentcache init --- pydra/utils/hash.py | 82 +++++++++++++++++----------------- pydra/utils/tests/test_hash.py | 49 ++++++++++++++++++-- 2 files changed, 86 insertions(+), 45 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 94e276ce9..a34901e41 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -3,7 +3,6 @@ import os import struct from datetime import datetime -import tempfile import typing as ty from pathlib import Path from collections.abc import Mapping @@ -58,6 +57,12 @@ CacheKey = NewType("CacheKey", ty.Tuple[ty.Hashable, ty.Hashable]) +def location_converter(path: ty.Union[Path, str, None]) -> Path: + if path is None: + path = PersistentCache.location_default() + return Path(path) + + @attrs.define class PersistentCache: """Persistent cache in which to store computationally expensive hashes between nodes @@ -69,9 +74,31 @@ class 
PersistentCache: the directory in which to store the hashes cache """ - location: Path = attrs.field() + location: Path = attrs.field(converter=location_converter) # type: ignore[misc] + cleanup_period: int = attrs.field() _hashes: ty.Dict[CacheKey, Hash] = attrs.field(factory=dict) + # Set the location of the persistent hash cache + LOCATION_ENV_VAR = "PYDRA_HASH_CACHE" + CLEANUP_ENV_VAR = "PYDRA_HASH_CACHE_CLEANUP_PERIOD" + + @classmethod + def location_default(cls): + try: + location = os.environ[cls.LOCATION_ENV_VAR] + except KeyError: + location = platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ) + return location + + # the default needs to be an instance method + @location.default + def _location_default(self): + return self.location_default() + @location.validator def location_validator(self, _, location): if not os.path.isdir(location): @@ -79,6 +106,10 @@ def location_validator(self, _, location): f"Persistent cache location '{location}' is not a directory" ) + @cleanup_period.default + def cleanup_period_default(self): + return int(os.environ.get(self.CLEANUP_ENV_VAR, 30)) + def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> Hash: """Check whether key is present in the persistent cache store and return it if so. 
Otherwise use `calculate_hash` to generate the hash and save it in the persistent @@ -109,13 +140,12 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H key_path.write_bytes(hsh) return Hash(hsh) - @classmethod - def clean_up(cls): + def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() - for path in cls.DEFAULT_LOCATION.iterdir(): + for path in self.location.iterdir(): days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days - if days > cls.CLEAN_UP_PERIOD: + if days > self.cleanup_period: path.unlink() @classmethod @@ -124,38 +154,8 @@ def from_path( ) -> "PersistentCache": if isinstance(path, PersistentCache): return path - if path is None: - path = Path(tempfile.mkdtemp()) - else: - path = Path(path) - if not path.exists(): - try: - Path(path).mkdir(parents=True) - except Exception as e: - raise RuntimeError( - f"Could not create cache directory for file hashes at {path}, " - "please use 'PYDRA_HASH_CACHE' environment variable to manually " - "set where it is created" - ) from e return PersistentCache(path) - # FIXME: Ideally the default location would be inside one of the cache_locations - # but can't think of a clean way to pass the cache_locations down to this part - # of the code, so just dumping in the home directory instead by default. 
In any case, - # this needs to be documented - DEFAULT_LOCATION = Path( - os.environ.get( - "PYDRA_HASH_CACHE", - platformdirs.user_cache_dir( - appname="pydra", - appauthor="nipype", - version=__version__, - ), - ) - ) - # Set the period after which old hash cache files are cleaned up in days - CLEAN_UP_PERIOD = os.environ.get("PYDRA_HASH_CACHE_CLEANUP_PERIOD", 30) - @attrs.define class Cache: @@ -185,7 +185,7 @@ def hash_function(obj, **kwargs): def hash_object( - obj: object, persistent_cache: ty.Optional[Path] = PersistentCache.DEFAULT_LOCATION + obj: object, persistent_cache: ty.Union[PersistentCache, Path, None] = None ) -> Hash: """Hash an object @@ -195,10 +195,10 @@ def hash_object( Base Python types are implemented, including recursive lists and dicts. Custom types can be registered with :func:`register_serializer`. """ - try: - return hash_single(obj, Cache(persistent=persistent_cache)) - except Exception as e: - raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e + # try: + return hash_single(obj, Cache(persistent=persistent_cache)) + # except Exception as e: + # raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 048c225e6..7ed93c6ed 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -1,4 +1,5 @@ import re +import os from hashlib import blake2b from pathlib import Path import time @@ -8,7 +9,14 @@ import typing as ty from fileformats.application import Zip, Json from fileformats.text import TextFile -from ..hash import Cache, UnhashableError, bytes_repr, hash_object, register_serializer +from ..hash import ( + Cache, + UnhashableError, + bytes_repr, + hash_object, + register_serializer, + PersistentCache, +) @pytest.fixture @@ -300,13 +308,21 @@ def _(obj: MyClass, cache: Cache): assert join_bytes_repr(MyNewClass(1)) == b"serializer" -def 
test_persistent_hash_cache(tmp_path): +@pytest.fixture +def cache_path(tmp_path): cache_path = tmp_path / "hash-cache" cache_path.mkdir() + return cache_path + + +@pytest.fixture +def text_file(tmp_path): text_file_path = tmp_path / "text-file.txt" text_file_path.write_text("foo") - text_file = TextFile(text_file_path) + return TextFile(text_file_path) + +def test_persistent_hash_cache(cache_path, text_file): # Test hash is stable between calls hsh = hash_object(text_file, persistent_cache=cache_path) assert hsh == hash_object(text_file, persistent_cache=cache_path) @@ -319,9 +335,34 @@ def test_persistent_hash_cache(tmp_path): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash - text_file_path.write_text("bar") + text_file.fspath.write_text("bar") # Ensure that the mtime will be incremented by mocking time.time with mock.patch("time.time") as t: t.return_value = time.time() + 10 assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 + + +def test_persistent_hash_cache_cleanup(cache_path, text_file): + with mock.patch.dict( + os.environ, + {"PYDRA_HASH_CACHE": str(cache_path), "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-1"}, + ): + persistent_cache = PersistentCache() + hsh = hash_object(text_file, persistent_cache=persistent_cache) + assert len(list(cache_path.iterdir())) == 1 + persistent_cache.clean_up() + assert len(list(cache_path.iterdir())) == 0 + + +def test_persistent_hash_cache_badpath(cache_path, text_file): + persistent_cache = PersistentCache(cache_path, cleanup_period=-1) + hsh = hash_object(text_file, persistent_cache=persistent_cache) + assert len(list(cache_path.iterdir())) == 1 + persistent_cache.clean_up() + assert len(list(cache_path.iterdir())) == 0 + + +def test_persistent_hash_cache_not_dir(text_file): + with pytest.raises(ValueError, match="is not a directory"): + PersistentCache(text_file.fspath) From 
52ef03fc5aed32484ea142ad4c293494f971fcba Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:06:38 +1100 Subject: [PATCH 14/35] added mkdir to location converter --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index a34901e41..25f46450e 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -60,7 +60,9 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() - return Path(path) + path = Path(path) + path.mkdir(parents=True) + return path @attrs.define From 021623630665fa527679ea4f9b8901c10e2d0217 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 15/35] debugged mkdir of persistent cache --- pydra/engine/submitter.py | 2 +- pydra/utils/hash.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/engine/submitter.py b/pydra/engine/submitter.py index f92841c48..25e54dae7 100644 --- a/pydra/engine/submitter.py +++ b/pydra/engine/submitter.py @@ -44,7 +44,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None self.loop.run_until_complete( self.submit_from_call(runnable, rerun, environment) ) - PersistentCache.clean_up() + PersistentCache().clean_up() return runnable.result() async def submit_from_call(self, runnable, rerun, environment): diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 25f46450e..40050b692 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -61,7 +61,7 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() path = Path(path) - path.mkdir(parents=True) + path.mkdir(parents=True, exist_ok=True) return path From bad261bbbc3cca428291597b02752353b53e62b3 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 16/35] bug fixes in persistentcache location 
init --- pydra/utils/hash.py | 3 ++- pydra/utils/tests/test_hash.py | 6 ++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 40050b692..5b4ae600d 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -61,7 +61,8 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: if path is None: path = PersistentCache.location_default() path = Path(path) - path.mkdir(parents=True, exist_ok=True) + if not path.exists(): + path.mkdir(parents=True) return path diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 7ed93c6ed..91333690a 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -335,11 +335,9 @@ def test_persistent_hash_cache(cache_path, text_file): assert hash_object(text_file, persistent_cache=cache_path) == modified_hash # Test that changes to the text file result in new hash + time.sleep(2) # Need to ensure that the mtimes will be different text_file.fspath.write_text("bar") - # Ensure that the mtime will be incremented by mocking time.time - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 - assert hash_object(text_file, persistent_cache=cache_path) != modified_hash + assert hash_object(text_file, persistent_cache=cache_path) != modified_hash assert len(list(cache_path.iterdir())) == 2 From 2fbee2b27177f92c5a95eed4dda3c76b19d31b04 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 17/35] Revert "mock mtime writing instead of adding sleeps to ensure mtimes are different" This reverts commit 089596526df085fac65c96b787cd2e49d47c3331. 
--- pydra/engine/tests/test_node_task.py | 7 +++--- pydra/engine/tests/test_specs.py | 37 +++++++++++----------------- pydra/utils/tests/test_hash.py | 1 - 3 files changed, 18 insertions(+), 27 deletions(-) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index b1418cddc..f8b961907 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -322,11 +322,10 @@ def test_task_init_7(tmp_path): output_dir1 = nn1.output_dir # changing the content of the file + time.sleep(2) # need the mtime to be different file2 = tmp_path / "file2.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file2, "w") as f: - f.write("from pydra") + with open(file2, "w") as f: + f.write("from pydra") nn2 = fun_file_list(name="NA", filename_list=[file1, file2]) output_dir2 = nn2.output_dir diff --git a/pydra/engine/tests/test_specs.py b/pydra/engine/tests/test_specs.py index 98cbdec38..e73688594 100644 --- a/pydra/engine/tests/test_specs.py +++ b/pydra/engine/tests/test_specs.py @@ -3,7 +3,6 @@ import os import attrs from copy import deepcopy -from unittest import mock import time from ..specs import ( @@ -165,11 +164,10 @@ def test_input_file_hash_2(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -197,12 +195,10 @@ def test_input_file_hash_2a(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" 
- # Ensure that the mtime will be incremented by mocking time.time - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=file_diffcontent).hash assert hash1 != hash3 @@ -241,10 +237,9 @@ def test_input_file_hash_3(tmp_path): # assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename]) # recreating the file - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file, "w") as f: - f.write("hello") + time.sleep(2) # ensure mtime is different + with open(file, "w") as f: + f.write("hello") hash3 = my_inp.hash # files_hash3 = deepcopy(my_inp.files_hash) @@ -297,11 +292,10 @@ def test_input_file_hash_4(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # need the mtime to be different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[[file_diffcontent, 3]]).hash assert hash1 != hash3 @@ -335,11 +329,10 @@ def test_input_file_hash_5(tmp_path): assert hash1 == hash2 # checking if different content (the same name) affects the hash + time.sleep(2) # ensure mtime is different file_diffcontent = tmp_path / "in_file_1.txt" - with mock.patch("time.time") as t: - t.return_value = time.time() + 10 # mock mtime to ensure different - with open(file_diffcontent, "w") as f: - f.write("hi") + with open(file_diffcontent, "w") as f: + f.write("hi") hash3 = inputs(in_file=[{"file": file_diffcontent, "int": 3}]).hash assert hash1 != hash3 diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 91333690a..ed2432b72 100644 --- 
a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -3,7 +3,6 @@ from hashlib import blake2b from pathlib import Path import time -from unittest import mock import attrs import pytest import typing as ty From 91948f0f9773f0fcd8c8ef4afbdbd05dbdd450f1 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 18/35] skip lock files in directory clean up --- pydra/utils/hash.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 5b4ae600d..601c5af80 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -7,7 +7,7 @@ from pathlib import Path from collections.abc import Mapping from functools import singledispatch -from hashlib import blake2b +from hashlib import blake2b, blake2s import logging from typing import ( Dict, @@ -135,7 +135,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return self._hashes[key] except KeyError: pass - key_path = self.location / blake2b(str(key).encode()).hexdigest() + key_path = self.location / blake2s(str(key).encode()).hexdigest() with SoftFileLock(key_path.with_suffix(".lock")): if key_path.exists(): return Hash(key_path.read_bytes()) @@ -147,6 +147,8 @@ def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() for path in self.location.iterdir(): + if path.endswith(".lock"): + continue days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days if days > self.cleanup_period: path.unlink() @@ -198,10 +200,10 @@ def hash_object( Base Python types are implemented, including recursive lists and dicts. Custom types can be registered with :func:`register_serializer`. 
""" - # try: - return hash_single(obj, Cache(persistent=persistent_cache)) - # except Exception as e: - # raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e + try: + return hash_single(obj, Cache(persistent=persistent_cache)) + except Exception as e: + raise UnhashableError(f"Cannot hash object {obj!r} due to '{e}'") from e def hash_single(obj: object, cache: Cache) -> Hash: From e0584085675d9a9b03e1784e91693848096e94aa Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 19/35] fixed clean-up bug --- pydra/utils/hash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 601c5af80..f0eae477a 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -147,7 +147,7 @@ def clean_up(self): """Cleans up old hash caches that haven't been accessed in the last 30 days""" now = datetime.now() for path in self.location.iterdir(): - if path.endswith(".lock"): + if path.name.endswith(".lock"): continue days = (now - datetime.fromtimestamp(path.lstat().st_atime)).days if days > self.cleanup_period: From f1ded7a7d1e6c98658785c80ea250fcb924a73f7 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 20/35] added mock import --- pydra/utils/tests/test_hash.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index ed2432b72..91333690a 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -3,6 +3,7 @@ from hashlib import blake2b from pathlib import Path import time +from unittest import mock import attrs import pytest import typing as ty From bb11067153a3dfd16b074093e4e8890a17d82429 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 24 Feb 2024 22:07:17 +1100 Subject: [PATCH 21/35] added another sleep to trigger atime change --- pydra/utils/tests/test_hash.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 
deletions(-) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index 91333690a..c753f2095 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -341,10 +341,13 @@ def test_persistent_hash_cache(cache_path, text_file): assert len(list(cache_path.iterdir())) == 2 -def test_persistent_hash_cache_cleanup(cache_path, text_file): +def test_persistent_hash_cache_cleanup1(cache_path, text_file): with mock.patch.dict( os.environ, - {"PYDRA_HASH_CACHE": str(cache_path), "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-1"}, + { + "PYDRA_HASH_CACHE": str(cache_path), + "PYDRA_HASH_CACHE_CLEANUP_PERIOD": "-100", + }, ): persistent_cache = PersistentCache() hsh = hash_object(text_file, persistent_cache=persistent_cache) @@ -353,10 +356,11 @@ def test_persistent_hash_cache_cleanup(cache_path, text_file): assert len(list(cache_path.iterdir())) == 0 -def test_persistent_hash_cache_badpath(cache_path, text_file): - persistent_cache = PersistentCache(cache_path, cleanup_period=-1) +def test_persistent_hash_cache_cleanup2(cache_path, text_file): + persistent_cache = PersistentCache(cache_path, cleanup_period=-100) hsh = hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 + time.sleep(2) persistent_cache.clean_up() assert len(list(cache_path.iterdir())) == 0 From a031ea5dfc1571d2171e0df4157bb97114b9b1eb Mon Sep 17 00:00:00 2001 From: Tom Close Date: Thu, 29 Feb 2024 19:11:06 +1100 Subject: [PATCH 22/35] implementing @effigies suggestions --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index f0eae477a..7d09f8255 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -129,7 +129,8 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H Returns ------- Hash - _description_ + the hash corresponding to the key, which is either retrieved from the persistent + store or calculated 
using `calculate_hash` if not present """ try: return self._hashes[key] @@ -141,6 +142,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return Hash(key_path.read_bytes()) hsh = calculate_hash() key_path.write_bytes(hsh) + self._hashes[key] = Hash(hsh) return Hash(hsh) def clean_up(self): From f2f70a6a0c45b68d521818c472a906e226694206 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Thu, 29 Feb 2024 19:26:57 +1100 Subject: [PATCH 23/35] added comments and doc strings to explain the use of the persistent cache --- pydra/utils/hash.py | 19 +++++++++++++++++++ pydra/utils/tests/test_hash.py | 24 +++++++++++++++++++++--- 2 files changed, 40 insertions(+), 3 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 7d09f8255..959d4eb60 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -224,13 +224,32 @@ def hash_single(obj: object, cache: Cache) -> Hash: # chunk def calc_hash(first: ty.Optional[bytes] = None) -> Hash: + """ + Calculate the hash of the object + + Parameters + ---------- + first : ty.Optional[bytes] + the first bytes chunk from the bytes_repr iterator, passed if the first + chunk wasn't a local cache key + """ h = blake2b(digest_size=16, person=b"pydra-hash") + # We want to use the first chunk that was popped to check for a cache-key + # if present if first is not None: h.update(first) for chunk in bytes_it: # Note that `bytes_it` is in outer scope h.update(chunk) return Hash(h.digest()) + # Read the first chunk of the bytes_repr iterator, check to see whether it returns + # a "cache-key" tuple instead of a bytes chunk for the type of the object to cache + # (i.e. file objects). If it does use that key to check the persistent cache for + # a precomputed hash and otherwise calculate the hash and store it in the + # persistent cache with that key. + + # If the first chunk is a bytes chunk (i.e. 
the object type doesn't have an associated + # 'cache-key'), then simply calculate the hash of the object. first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) diff --git a/pydra/utils/tests/test_hash.py b/pydra/utils/tests/test_hash.py index c753f2095..56a7d9e68 100644 --- a/pydra/utils/tests/test_hash.py +++ b/pydra/utils/tests/test_hash.py @@ -323,11 +323,18 @@ def text_file(tmp_path): def test_persistent_hash_cache(cache_path, text_file): + """ + Test the persistent hash cache with a text file + + The cache is used to store the hash of the text file, and the hash is + retrieved from the cache when the file is unchanged. + """ # Test hash is stable between calls hsh = hash_object(text_file, persistent_cache=cache_path) assert hsh == hash_object(text_file, persistent_cache=cache_path) - # Test that cached hash has been used + # Test that cached hash has been used by explicitly modifying it and seeing that the + # hash is the same as the modified hash cache_files = list(cache_path.iterdir()) assert len(cache_files) == 1 modified_hash = "modified".encode() @@ -342,6 +349,10 @@ def test_persistent_hash_cache(cache_path, text_file): def test_persistent_hash_cache_cleanup1(cache_path, text_file): + """ + Test the persistent hash is cleaned up after use if the periods between cleanups + is greater than the environment variable PYDRA_HASH_CACHE_CLEANUP_PERIOD + """ with mock.patch.dict( os.environ, { @@ -350,15 +361,19 @@ def test_persistent_hash_cache_cleanup1(cache_path, text_file): }, ): persistent_cache = PersistentCache() - hsh = hash_object(text_file, persistent_cache=persistent_cache) + hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 persistent_cache.clean_up() assert len(list(cache_path.iterdir())) == 0 def test_persistent_hash_cache_cleanup2(cache_path, text_file): + """ + Test the persistent hash is cleaned up after use if the periods between cleanups + is greater than the explicitly provided 
cleanup_period + """ persistent_cache = PersistentCache(cache_path, cleanup_period=-100) - hsh = hash_object(text_file, persistent_cache=persistent_cache) + hash_object(text_file, persistent_cache=persistent_cache) assert len(list(cache_path.iterdir())) == 1 time.sleep(2) persistent_cache.clean_up() @@ -366,5 +381,8 @@ def test_persistent_hash_cache_cleanup2(cache_path, text_file): def test_persistent_hash_cache_not_dir(text_file): + """ + Test that an error is raised if the provided cache path is not a directory + """ with pytest.raises(ValueError, match="is not a directory"): PersistentCache(text_file.fspath) From 191aa9c604d5d47cc4a8d0fe12695a5481c87e17 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:17:41 +1100 Subject: [PATCH 24/35] touched up comments --- pydra/utils/hash.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 959d4eb60..89d81a317 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,14 +242,12 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first chunk of the bytes_repr iterator, check to see whether it returns - # a "cache-key" tuple instead of a bytes chunk for the type of the object to cache + # Read the first item of the bytes_repr iterator, check to see whether it returns + # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached # (i.e. file objects). If it does use that key to check the persistent cache for - # a precomputed hash and otherwise calculate the hash and store it in the - # persistent cache with that key. - - # If the first chunk is a bytes chunk (i.e. the object type doesn't have an associated - # 'cache-key'), then simply calculate the hash of the object. 
+ # a precomputed hash and return it if it is, otherwise calculate the hash and + # store it in the persistent cache with that hash of that key (not to be confused + # with the hash of the object that is saved/retrieved). first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) @@ -259,6 +257,10 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: ) + first hsh = cache.persistent.get_or_calculate_hash(key, calc_hash) else: + # If the first item is a bytes chunk (i.e. the object type doesn't have an + # associated 'cache-key'), then simply calculate the hash of the object, + # passing the first chunk to the `calc_hash` function so it can be included + # in the hash calculation hsh = calc_hash(first=first) logger.debug("Hash of %s object is %s", obj, hsh) cache[objid] = hsh From 3076fea489b1d3f84c6daa2a82ff8277bb1f1c2d Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:19:02 +1100 Subject: [PATCH 25/35] another comment touch up --- pydra/utils/hash.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 89d81a317..e3214008d 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,12 +242,12 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first item of the bytes_repr iterator, check to see whether it returns + # Read the first item of the bytes_repr iterator and check to see whether it returns # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached - # (i.e. file objects). If it does use that key to check the persistent cache for - # a precomputed hash and return it if it is, otherwise calculate the hash and - # store it in the persistent cache with that hash of that key (not to be confused - # with the hash of the object that is saved/retrieved). + # (e.g. fileformats.core.FileSet objects). 
If it does use that key to check the + # persistent cache for a precomputed hash and return it if it is, otherwise + # calculate the hash and store it in the persistent cache with that hash of + # that key (not to be confused with the hash of the object that is saved/retrieved). first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) From a094fbc572274a90fab56e058801d0cf68dca010 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 1 Mar 2024 09:44:55 +1100 Subject: [PATCH 26/35] touch up comments again --- pydra/utils/hash.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index e3214008d..dcf2695b7 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -242,12 +242,13 @@ def calc_hash(first: ty.Optional[bytes] = None) -> Hash: h.update(chunk) return Hash(h.digest()) - # Read the first item of the bytes_repr iterator and check to see whether it returns + # Read the first item of the bytes_repr iterator and check to see whether it yields # a "cache-key" tuple instead of a bytes chunk for the type of the object to be cached - # (e.g. fileformats.core.FileSet objects). If it does use that key to check the - # persistent cache for a precomputed hash and return it if it is, otherwise - # calculate the hash and store it in the persistent cache with that hash of - # that key (not to be confused with the hash of the object that is saved/retrieved). + # (e.g. file-system path + mtime for fileformats.core.FileSet objects). If it + # does, use that key to check the persistent cache for a precomputed hash and + # return it if it is, otherwise calculate the hash and store it in the persistent + # cache with that hash of that key (not to be confused with the hash of the + # object that is saved/retrieved). 
first = next(bytes_it) if isinstance(first, tuple): tp = type(obj) From 291f29f91bf13ba921d6d928af20afcaffb33d8f Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 8 Mar 2024 06:12:47 +0000 Subject: [PATCH 27/35] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- pydra/utils/hash.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0352b1397..906ec0814 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -192,7 +192,9 @@ def hash_function(obj, **kwargs): def hash_object( - obj: object, cache: ty.Optional[Cache] = None, persistent_cache: ty.Union[PersistentCache, Path, None] = None + obj: object, + cache: ty.Optional[Cache] = None, + persistent_cache: ty.Union[PersistentCache, Path, None] = None, ) -> Hash: """Hash an object From 0a10f6c04391098073315a762710fb050fa2875b Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:14:56 +1100 Subject: [PATCH 28/35] added in @djarecka's test for moving file cache locations --- pydra/engine/tests/test_node_task.py | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 7e3ae15e2..611116a02 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -1563,3 +1563,39 @@ def test_task_state_cachelocations_updated(plugin, tmp_path): # both workflows should be run assert all([dir.exists() for dir in nn.output_dir]) assert all([dir.exists() for dir in nn2.output_dir]) + + +def test_task_files_cachelocations(plugin_dask_opt, tmp_path): + """ + Two identical tasks with provided cache_dir that use file as an input; + the second task has cache_locations and should not recompute the results + """ + cache_dir = tmp_path / "test_task_nostate" + cache_dir.mkdir() + cache_dir2 = tmp_path / 
"test_task_nostate2" + cache_dir2.mkdir() + input_dir = tmp_path / "input" + input_dir.mkdir() + + input1 = input_dir / "input1.txt" + input1.write_text("test") + input2 = input_dir / "input2.txt" + input2.write_text("test") + + nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir) + with Submitter(plugin=plugin_dask_opt) as sub: + sub(nn) + + nn2 = fun_file( + name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir + ) + with Submitter(plugin=plugin_dask_opt) as sub: + sub(nn2) + + # checking the results + results2 = nn2.result() + assert results2.output.out == "test" + + # checking if the second task didn't run the interface again + assert nn.output_dir.exists() + assert not nn2.output_dir.exists() From 311e3ddeff100c2b6fc314d867f91cf4b4b0b406 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:18:17 +1100 Subject: [PATCH 29/35] updated cache initialisation --- pydra/engine/specs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pydra/engine/specs.py b/pydra/engine/specs.py index 16cd925ce..feeeb9665 100644 --- a/pydra/engine/specs.py +++ b/pydra/engine/specs.py @@ -102,7 +102,7 @@ def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]: if "container_path" in field.metadata: continue inp_dict[field.name] = getattr(self, field.name) - hash_cache = Cache({}) + hash_cache = Cache() field_hashes = { k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items() } From 482736569a4063c8f1ab6fa1ce19ba10b35d08e4 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 17:27:43 +1100 Subject: [PATCH 30/35] switched to use blake2b isntead of blake2s --- pydra/utils/hash.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 906ec0814..553aec18c 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -7,7 +7,7 @@ from pathlib import Path from collections.abc import Mapping from functools import singledispatch -from 
hashlib import blake2b, blake2s +from hashlib import blake2b import logging from typing import ( Dict, @@ -136,7 +136,7 @@ def get_or_calculate_hash(self, key: CacheKey, calculate_hash: ty.Callable) -> H return self._hashes[key] except KeyError: pass - key_path = self.location / blake2s(str(key).encode()).hexdigest() + key_path = self.location / blake2b(str(key).encode()).hexdigest() with SoftFileLock(key_path.with_suffix(".lock")): if key_path.exists(): return Hash(key_path.read_bytes()) From b6799b6972fb80c0d3fc8059e47a4af61935ab34 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Fri, 8 Mar 2024 19:54:09 +1100 Subject: [PATCH 31/35] [skip ci] deleted already commented-out code --- pydra/utils/hash.py | 36 ------------------------------------ 1 file changed, 36 deletions(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 553aec18c..0f20a90a1 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -524,39 +524,3 @@ def bytes_repr_numpy(obj: numpy.ndarray, cache: Cache) -> Iterator[bytes]: NUMPY_CHUNK_LEN = 8192 - - -# class MtimeCachingHash: -# """Hashing object that stores a cache of hash values for PathLikes - -# The cache only stores values for PathLikes pointing to existing files, -# and the mtime is checked to validate the cache. If the mtime differs, -# the old hash is discarded and a new mtime-tagged hash is stored. - -# The cache can grow without bound; we may want to consider using an LRU -# cache. 
-# """ - -# def __init__(self) -> None: -# self.cache: ty.Dict[os.PathLike, ty.Tuple[float, Hash]] = {} - -# def __call__(self, obj: object) -> Hash: -# if isinstance(obj, os.PathLike): -# path = Path(obj) -# try: -# stat_res = path.stat() -# mode, mtime = stat_res.st_mode, stat_res.st_mtime -# except FileNotFoundError: -# # Only attempt to cache existing files -# pass -# else: -# if stat.S_ISREG(mode) and obj in self.cache: -# # Cache (and hash) the actual object, as different pathlikes will have -# # different serializations -# save_mtime, save_hash = self.cache[obj] -# if mtime == save_mtime: -# return save_hash -# new_hash = hash_object(obj) -# self.cache[obj] = (mtime, new_hash) -# return new_hash -# return hash_object(obj) From 2bb86fe5ac7d2b8d5105e849f90c4c10ed77c72c Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 9 Mar 2024 00:08:17 +1100 Subject: [PATCH 32/35] additional doc strings for hash cache objects --- pydra/utils/hash.py | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index 0f20a90a1..abfd908a6 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -69,7 +69,16 @@ def location_converter(path: ty.Union[Path, str, None]) -> Path: @attrs.define class PersistentCache: """Persistent cache in which to store computationally expensive hashes between nodes - and workflow/task runs + and workflow/task runs. It does this in via the `get_or_calculate_hash` method, which + takes a locally unique key (e.g. file-system path + mtime) and a function to + calculate the hash if it isn't present in the persistent store. + + The locally unique key is hashed (cheaply) using hashlib cryptography and this + "local hash" is use to name the entry of the (potentially expensive) hash of the + object itself (e.g. the contents of a file). 
This entry is saved as a text file + within a user-specific cache directory (see `platformdirs.user_cache_dir`), with + the name of the file being the "local hash" of the key and the contents of the + file being the "globally unique hash" of the object itself. Parameters ---------- @@ -166,6 +175,17 @@ def from_path( @attrs.define class Cache: + """Cache for hashing objects, used to avoid infinite recursion caused by circular + references between objects, and to store hashes of objects that have already been + hashed to avoid recomputation. + + This concept is extended to persistent caching of hashes for certain object types, + for which calculating the hash is a potentially expensive operation (e.g. + File/Directory types). For these classes the `bytes_repr` override function yields a + "locally unique cache key" (e.g. file-system path + mtime) as the first item of its + iterator. + """ + persistent: ty.Optional[PersistentCache] = attrs.field( default=None, converter=PersistentCache.from_path, # type: ignore[misc] From 1f601e184f9caaf327c7be99e4568607768e44e5 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sat, 16 Mar 2024 11:22:07 +1100 Subject: [PATCH 33/35] added test to see that persistent cache is used in the running of tasks --- pydra/engine/tests/test_node_task.py | 64 ++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/pydra/engine/tests/test_node_task.py b/pydra/engine/tests/test_node_task.py index 611116a02..bceaf9740 100644 --- a/pydra/engine/tests/test_node_task.py +++ b/pydra/engine/tests/test_node_task.py @@ -1,10 +1,15 @@ import os import shutil import attr +import typing as ty import numpy as np +import time from unittest import mock +from pathlib import Path import pytest import time +from fileformats.generic import File +import pydra.mark from .utils import ( fun_addtwo, @@ -1599,3 +1604,62 @@ def test_task_files_cachelocations(plugin_dask_opt, tmp_path): # checking if the second task didn't run the interface again assert 
nn.output_dir.exists() assert not nn2.output_dir.exists() + + +class OverriddenContentsFile(File): + """A class for testing purposes that enables you to override the contents + of the file, allowing you to check whether the persistent cache is used.""" + + def __init__( + self, + fspaths: ty.Iterator[Path], + contents: ty.Optional[bytes] = None, + metadata: ty.Dict[str, ty.Any] = None, + ): + super().__init__(fspaths, metadata=metadata) + self._contents = contents + + def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]: + if self._contents is not None: + yield (str(self.fspath), iter([self._contents])) + else: + yield from super().byte_chunks(**kwargs) + + @property + def contents(self): + if self._contents is not None: + return self._contents + return super().contents + + +def test_task_files_persistentcache(tmp_path): + """ + Runs the same file-reading task repeatedly with the same cache_dir; later runs + should hit the persistent hash cache until the file's mtime is updated + """ + test_file_path = tmp_path / "test_file.txt" + test_file_path.write_bytes(b"foo") + cache_dir = tmp_path / "cache-dir" + cache_dir.mkdir() + test_file = OverriddenContentsFile(test_file_path) + + @pydra.mark.task + def read_contents(x: OverriddenContentsFile) -> bytes: + return x.contents + + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"foo" + ) + test_file._contents = b"bar" + # should return result from the first run using the persistent cache + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"foo" + ) + time.sleep(2) # Windows has a 2-second resolution for mtime + test_file_path.touch() # update the mtime to invalidate the persistent cache value + assert ( + read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out + == b"bar" + ) # returns the overridden value From 7e60c41a834de6df90374790bd5a79dd316a0c7e Mon Sep 17 00:00:00 2001 From: Tom Close
Date: Sun, 17 Mar 2024 11:04:49 +1100 Subject: [PATCH 34/35] moved persistent hash cache within "hash_cache" subdirectory of the pydra user cache dir --- pydra/utils/__init__.py | 11 +++++++++++ pydra/utils/hash.py | 9 ++------- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/pydra/utils/__init__.py b/pydra/utils/__init__.py index e69de29bb..4c7dcd71b 100644 --- a/pydra/utils/__init__.py +++ b/pydra/utils/__init__.py @@ -0,0 +1,11 @@ +from pathlib import Path +import platformdirs +from pydra import __version__ + +user_cache_dir = Path( + platformdirs.user_cache_dir( + appname="pydra", + appauthor="nipype", + version=__version__, + ) +) diff --git a/pydra/utils/hash.py b/pydra/utils/hash.py index abfd908a6..81e4e773f 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -17,10 +17,9 @@ Set, ) from filelock import SoftFileLock -import platformdirs import attrs.exceptions from fileformats.core import FileSet -from pydra._version import __version__ +from . import user_cache_dir logger = logging.getLogger("pydra") @@ -99,11 +98,7 @@ def location_default(cls): try: location = os.environ[cls.LOCATION_ENV_VAR] except KeyError: - location = platformdirs.user_cache_dir( - appname="pydra", - appauthor="nipype", - version=__version__, - ) + location = user_cache_dir / "hash_cache" return location # the default needs to be an instance method From 921979c5387ec23e51d53a47c0cf6b097c4cc383 Mon Sep 17 00:00:00 2001 From: Tom Close Date: Sun, 17 Mar 2024 11:13:52 +1100 Subject: [PATCH 35/35] fixed import issue --- pydra/utils/__init__.py | 2 +- pydra/utils/hash.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pydra/utils/__init__.py b/pydra/utils/__init__.py index 4c7dcd71b..7fe4b8595 100644 --- a/pydra/utils/__init__.py +++ b/pydra/utils/__init__.py @@ -1,6 +1,6 @@ from pathlib import Path import platformdirs -from pydra import __version__ +from pydra._version import __version__ user_cache_dir = Path( platformdirs.user_cache_dir( diff 
--git a/pydra/utils/hash.py b/pydra/utils/hash.py index 81e4e773f..90e132d1e 100644 --- a/pydra/utils/hash.py +++ b/pydra/utils/hash.py @@ -98,7 +98,7 @@ def location_default(cls): try: location = os.environ[cls.LOCATION_ENV_VAR] except KeyError: - location = user_cache_dir / "hash_cache" + location = user_cache_dir / "hashes" return location # the default needs to be an instance method