Skip to content

Commit

Permalink
Merge pull request #700 from tclose/local-cache-ids
Browse files Browse the repository at this point in the history
Caching of file-set hashes by local path and mtimes
  • Loading branch information
djarecka authored Mar 17, 2024
2 parents ff01e4c + 921979c commit 811dc45
Show file tree
Hide file tree
Showing 8 changed files with 433 additions and 56 deletions.
2 changes: 1 addition & 1 deletion pydra/engine/specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def _compute_hashes(self) -> ty.Tuple[bytes, ty.Dict[str, bytes]]:
if "container_path" in field.metadata:
continue
inp_dict[field.name] = getattr(self, field.name)
hash_cache = Cache({})
hash_cache = Cache()
field_hashes = {
k: hash_function(v, cache=hash_cache) for k, v in inp_dict.items()
}
Expand Down
2 changes: 2 additions & 0 deletions pydra/engine/submitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from .workers import Worker, WORKERS
from .core import is_workflow
from .helpers import get_open_loop, load_and_run_async
from ..utils.hash import PersistentCache

import logging

Expand Down Expand Up @@ -54,6 +55,7 @@ def __call__(self, runnable, cache_locations=None, rerun=False, environment=None
self.loop.run_until_complete(
self.submit_from_call(runnable, rerun, environment)
)
PersistentCache().clean_up()
return runnable.result()

async def submit_from_call(self, runnable, rerun, environment):
Expand Down
103 changes: 103 additions & 0 deletions pydra/engine/tests/test_node_task.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
import os
import shutil
import attr
import typing as ty
import numpy as np
import time
from unittest import mock
from pathlib import Path
import pytest
import time
from fileformats.generic import File
import pydra.mark

from .utils import (
fun_addtwo,
Expand Down Expand Up @@ -306,6 +313,7 @@ def test_task_init_7(tmp_path):
output_dir1 = nn1.output_dir

# changing the content of the file
time.sleep(2) # need the mtime to be different
file2 = tmp_path / "file2.txt"
with open(file2, "w") as f:
f.write("from pydra")
Expand Down Expand Up @@ -1560,3 +1568,98 @@ def test_task_state_cachelocations_updated(plugin, tmp_path):
# both workflows should be run
assert all([dir.exists() for dir in nn.output_dir])
assert all([dir.exists() for dir in nn2.output_dir])


def test_task_files_cachelocations(plugin_dask_opt, tmp_path):
"""
Two identical tasks with provided cache_dir that use file as an input;
the second task has cache_locations and should not recompute the results
"""
cache_dir = tmp_path / "test_task_nostate"
cache_dir.mkdir()
cache_dir2 = tmp_path / "test_task_nostate2"
cache_dir2.mkdir()
input_dir = tmp_path / "input"
input_dir.mkdir()

input1 = input_dir / "input1.txt"
input1.write_text("test")
input2 = input_dir / "input2.txt"
input2.write_text("test")

nn = fun_file(name="NA", filename=input1, cache_dir=cache_dir)
with Submitter(plugin=plugin_dask_opt) as sub:
sub(nn)

nn2 = fun_file(
name="NA", filename=input2, cache_dir=cache_dir2, cache_locations=cache_dir
)
with Submitter(plugin=plugin_dask_opt) as sub:
sub(nn2)

# checking the results
results2 = nn2.result()
assert results2.output.out == "test"

# checking if the second task didn't run the interface again
assert nn.output_dir.exists()
assert not nn2.output_dir.exists()


class OverriddenContentsFile(File):
"""A class for testing purposes, to that enables you to override the contents
of the file to allow you to check whether the persistent cache is used."""

def __init__(
self,
fspaths: ty.Iterator[Path],
contents: ty.Optional[bytes] = None,
metadata: ty.Dict[str, ty.Any] = None,
):
super().__init__(fspaths, metadata=metadata)
self._contents = contents

def byte_chunks(self, **kwargs) -> ty.Generator[ty.Tuple[str, bytes], None, None]:
if self._contents is not None:
yield (str(self.fspath), iter([self._contents]))
else:
yield from super().byte_chunks(**kwargs)

@property
def contents(self):
if self._contents is not None:
return self._contents
return super().contents


def test_task_files_persistentcache(tmp_path):
"""
Two identical tasks with provided cache_dir that use file as an input;
the second task has cache_locations and should not recompute the results
"""
test_file_path = tmp_path / "test_file.txt"
test_file_path.write_bytes(b"foo")
cache_dir = tmp_path / "cache-dir"
cache_dir.mkdir()
test_file = OverriddenContentsFile(test_file_path)

@pydra.mark.task
def read_contents(x: OverriddenContentsFile) -> bytes:
return x.contents

assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"foo"
)
test_file._contents = b"bar"
# should return result from the first run using the persistent cache
assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"foo"
)
time.sleep(2) # Windows has a 2-second resolution for mtime
test_file_path.touch() # update the mtime to invalidate the persistent cache value
assert (
read_contents(x=test_file, cache_dir=cache_dir)(plugin="serial").output.out
== b"bar"
) # returns the overridden value
6 changes: 6 additions & 0 deletions pydra/engine/tests/test_specs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import attrs
from copy import deepcopy
import time

from ..specs import (
BaseSpec,
Expand Down Expand Up @@ -163,6 +164,7 @@ def test_input_file_hash_2(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -193,6 +195,7 @@ def test_input_file_hash_2a(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -234,6 +237,7 @@ def test_input_file_hash_3(tmp_path):
# assert id(files_hash1["in_file"][filename]) == id(files_hash2["in_file"][filename])

# recreating the file
time.sleep(2) # ensure mtime is different
with open(file, "w") as f:
f.write("hello")

Expand Down Expand Up @@ -288,6 +292,7 @@ def test_input_file_hash_4(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # need the mtime to be different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down Expand Up @@ -324,6 +329,7 @@ def test_input_file_hash_5(tmp_path):
assert hash1 == hash2

# checking if different content (the same name) affects the hash
time.sleep(2) # ensure mtime is different
file_diffcontent = tmp_path / "in_file_1.txt"
with open(file_diffcontent, "w") as f:
f.write("hi")
Expand Down
11 changes: 11 additions & 0 deletions pydra/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pathlib import Path
import platformdirs
from pydra._version import __version__

user_cache_dir = Path(
platformdirs.user_cache_dir(
appname="pydra",
appauthor="nipype",
version=__version__,
)
)
Loading

0 comments on commit 811dc45

Please sign in to comment.