Skip to content

Commit

Permalink
Fix dir_hash vs. bytecode compilation races.
Browse files Browse the repository at this point in the history
Previously `dir_hash` (and `pex_hash`) were able to observe in-flight
bytecode compilation which would lead to observe-delete-failedhash
sequencing.

Fixes pex-tool#1051
  • Loading branch information
jsirois committed Oct 17, 2020
1 parent d70512c commit 62d4f34
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 18 deletions.
24 changes: 15 additions & 9 deletions pex/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

import contextlib
import os
import re
import sys
import tempfile
from hashlib import sha1
Expand Down Expand Up @@ -133,20 +134,25 @@ def _compute_hash(cls, names, stream_factory):
return digest.hexdigest()

@classmethod
def _iter_files(cls, directory):
def _iter_non_pyc_files(cls, directory):
# type: (str) -> Iterator[str]
normpath = os.path.realpath(os.path.normpath(directory))
for root, _, files in os.walk(normpath):
for root, dirs, files in os.walk(normpath):
dirs[:] = [d for d in dirs if d != "__pycache__"]
for f in files:
yield os.path.relpath(os.path.join(root, f), normpath)
# For Python 2.7, `.pyc` files are compiled as siblings to `.py` files (there is no
# __pycache__ dir. We rely on the fact that the temporary files created by CPython
# have object id (integer) suffixes to avoid picking up either finished `.pyc` files
# or files where Python bytecode compilation is in-flight; i.e.:
# `.pyc.0123456789`-style files.
if not re.search(r"\.pyc(?:\.[0-9]+)?$", f):
yield os.path.relpath(os.path.join(root, f), normpath)

@classmethod
def pex_hash(cls, d):
# type: (str) -> str
"""Return a reproducible hash of the contents of a directory."""
names = sorted(
f for f in cls._iter_files(d) if not (f.endswith(".pyc") or f.startswith("."))
)
"""Return a reproducible hash of the contents of a loose PEX; excluding all `.pyc` files."""
names = sorted(f for f in cls._iter_non_pyc_files(d) if not f.startswith("."))

def stream_factory(name):
# type: (str) -> IO
Expand All @@ -157,8 +163,8 @@ def stream_factory(name):
@classmethod
def dir_hash(cls, d):
# type: (str) -> str
"""Return a reproducible hash of the contents of a directory."""
names = sorted(f for f in cls._iter_files(d) if not f.endswith(".pyc"))
"""Return a reproducible hash of the contents of a directory; excluding all `.pyc` files."""
names = sorted(cls._iter_non_pyc_files(d))

def stream_factory(name):
# type: (str) -> IO
Expand Down
42 changes: 33 additions & 9 deletions tests/test_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from hashlib import sha1
from textwrap import dedent

from pex.common import safe_mkdir, temporary_dir
from pex.common import safe_mkdir, safe_open, temporary_dir, touch
from pex.compatibility import nested, to_bytes
from pex.pex import PEX
from pex.pex_builder import PEXBuilder
Expand Down Expand Up @@ -74,14 +74,38 @@ def test_hash():
assert hash_output == empty_hash.hexdigest()


CONTENT = {
"__main__.py": 200,
".deps/morfgorf": 10000,
"twitter/__init__.py": 0,
"twitter/common/python/foo.py": 4000,
"twitter/common/python/bar.py": 8000,
"twitter/common/python/bar.pyc": 6000,
}
def test_dir_hash():
# type: () -> None
with temporary_dir() as tmp_dir:
safe_mkdir(os.path.join(tmp_dir, "a", "b"))
with safe_open(os.path.join(tmp_dir, "c", "d", "e.py"), "w") as fp:
fp.write("contents1")
with safe_open(os.path.join(tmp_dir, "f.py"), "w") as fp:
fp.write("contents2")
hash1 = CacheHelper.dir_hash(tmp_dir)

os.rename(os.path.join(tmp_dir, "c"), os.path.join(tmp_dir, "c-renamed"))
assert hash1 != CacheHelper.dir_hash(tmp_dir)

os.rename(os.path.join(tmp_dir, "c-renamed"), os.path.join(tmp_dir, "c"))
assert hash1 == CacheHelper.dir_hash(tmp_dir)

touch(os.path.join(tmp_dir, "c", "d", "e.pyc"))
assert hash1 == CacheHelper.dir_hash(tmp_dir)
touch(os.path.join(tmp_dir, "c", "d", "e.pyc.123456789"))
assert hash1 == CacheHelper.dir_hash(tmp_dir)

pycache_dir = os.path.join(tmp_dir, "__pycache__")
safe_mkdir(pycache_dir)
touch(os.path.join(pycache_dir, "f.pyc"))
assert hash1 == CacheHelper.dir_hash(tmp_dir)
touch(os.path.join(pycache_dir, "f.pyc.123456789"))
assert hash1 == CacheHelper.dir_hash(tmp_dir)

touch(os.path.join(pycache_dir, "f.py"))
assert hash1 == CacheHelper.dir_hash(
tmp_dir
), "All content under __pycache__ directories should be ignored."


try:
Expand Down

0 comments on commit 62d4f34

Please sign in to comment.