diff --git a/pex/util.py b/pex/util.py index ca4bf5273..ae54d7662 100644 --- a/pex/util.py +++ b/pex/util.py @@ -5,6 +5,7 @@ import contextlib import os +import re import sys import tempfile from hashlib import sha1 @@ -133,20 +134,25 @@ def _compute_hash(cls, names, stream_factory): return digest.hexdigest() @classmethod - def _iter_files(cls, directory): + def _iter_non_pyc_files(cls, directory): # type: (str) -> Iterator[str] normpath = os.path.realpath(os.path.normpath(directory)) - for root, _, files in os.walk(normpath): + for root, dirs, files in os.walk(normpath): + dirs[:] = [d for d in dirs if d != "__pycache__"] for f in files: - yield os.path.relpath(os.path.join(root, f), normpath) + # For Python 2.7, `.pyc` files are compiled as siblings to `.py` files (there is no + # __pycache__ dir. We rely on the fact that the temporary files created by CPython + # have object id (integer) suffixes to avoid picking up either finished `.pyc` files + # or files where Python bytecode compilation is in-flight; i.e.: + # `.pyc.0123456789`-style files. + if not re.search(r"\.pyc(?:\.[0-9]+)?$", f): + yield os.path.relpath(os.path.join(root, f), normpath) @classmethod def pex_hash(cls, d): # type: (str) -> str - """Return a reproducible hash of the contents of a directory.""" - names = sorted( - f for f in cls._iter_files(d) if not (f.endswith(".pyc") or f.startswith(".")) - ) + """Return a reproducible hash of the contents of a loose PEX; excluding all `.pyc` files.""" + names = sorted(f for f in cls._iter_non_pyc_files(d) if not f.startswith(".")) def stream_factory(name): # type: (str) -> IO @@ -157,8 +163,8 @@ def stream_factory(name): @classmethod def dir_hash(cls, d): # type: (str) -> str - """Return a reproducible hash of the contents of a directory.""" - names = sorted(f for f in cls._iter_files(d) if not f.endswith(".pyc")) + """Return a reproducible hash of the contents of a directory; excluding all `.pyc` files.""" + names = sorted(cls._iter_non_pyc_files(d)) def stream_factory(name): # type: (str) -> IO diff --git a/tests/test_util.py b/tests/test_util.py index 26bc5de67..171e0ac81 100644 --- a/tests/test_util.py +++ b/tests/test_util.py @@ -6,7 +6,7 @@ from hashlib import sha1 from textwrap import dedent -from pex.common import safe_mkdir, temporary_dir +from pex.common import safe_mkdir, safe_open, temporary_dir, touch from pex.compatibility import nested, to_bytes from pex.pex import PEX from pex.pex_builder import PEXBuilder @@ -74,14 +74,38 @@ def test_hash(): assert hash_output == empty_hash.hexdigest() -CONTENT = { - "__main__.py": 200, - ".deps/morfgorf": 10000, - "twitter/__init__.py": 0, - "twitter/common/python/foo.py": 4000, - "twitter/common/python/bar.py": 8000, - "twitter/common/python/bar.pyc": 6000, -} +def test_dir_hash(): + # type: () -> None + with temporary_dir() as tmp_dir: + safe_mkdir(os.path.join(tmp_dir, "a", "b")) + with safe_open(os.path.join(tmp_dir, "c", "d", "e.py"), "w") as fp: + fp.write("contents1") + with safe_open(os.path.join(tmp_dir, "f.py"), "w") as fp: + fp.write("contents2") + hash1 = CacheHelper.dir_hash(tmp_dir) + + os.rename(os.path.join(tmp_dir, "c"), os.path.join(tmp_dir, "c-renamed")) + assert hash1 != CacheHelper.dir_hash(tmp_dir) + + os.rename(os.path.join(tmp_dir, "c-renamed"), os.path.join(tmp_dir, "c")) + assert hash1 == CacheHelper.dir_hash(tmp_dir) + + touch(os.path.join(tmp_dir, "c", "d", "e.pyc")) + assert hash1 == CacheHelper.dir_hash(tmp_dir) + touch(os.path.join(tmp_dir, "c", "d", "e.pyc.123456789")) + assert hash1 == CacheHelper.dir_hash(tmp_dir) + + pycache_dir = os.path.join(tmp_dir, "__pycache__") + safe_mkdir(pycache_dir) + touch(os.path.join(pycache_dir, "f.pyc")) + assert hash1 == CacheHelper.dir_hash(tmp_dir) + touch(os.path.join(pycache_dir, "f.pyc.123456789")) + assert hash1 == CacheHelper.dir_hash(tmp_dir) + + touch(os.path.join(pycache_dir, "f.py")) + assert hash1 == CacheHelper.dir_hash( + tmp_dir + ), "All content under __pycache__ directories should be ignored." try: