Skip to content

Commit

Permalink
hookutils: adjust behavior of collect_data_files with include_py_files
Browse files Browse the repository at this point in the history
Adjust the behavior of `collect_data_files` when the `include_py_files`
flag is enabled: collect only `.py` and `.pyc` files, and never
collect `.pyc` files from the `__pycache__` directory.

Update the description to note that module collection mode is
the preferable way of ensuring that source .py files are collected
(in addition to or in lieu of byte-compiled modules in PYZ).

Adjust the `test_collect_data_all_included` accordingly.
  • Loading branch information
rokm committed Sep 21, 2023
1 parent deb6613 commit 984545a
Show file tree
Hide file tree
Showing 4 changed files with 98 additions and 56 deletions.
25 changes: 17 additions & 8 deletions PyInstaller/utils/hooks/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -754,15 +754,22 @@ def collect_data_files(
includes: list | None = None,
):
r"""
This function produces a list of ``(source, dest)`` non-Python (i.e., data) files that reside in ``package``.
This function produces a list of ``(source, dest)`` entries for data files that reside in ``package``.
Its output can be directly assigned to ``datas`` in a hook script; for example, see ``hook-sphinx.py``.
The data files are all files that are not shared libraries / binary python extensions (based on extension
check) and are not python source (.py) files or byte-compiled modules (.pyc). Collection of the .py and .pyc
files can be toggled via the ``include_py_files`` flag.
Parameters:
- The ``package`` parameter is a string which names the package.
- By default, all Python executable files (those ending in ``.py``, ``.pyc``, and so on) will NOT be collected;
setting the ``include_py_files`` argument to ``True`` collects these files as well. This is typically used with
Python functions (such as those in ``pkgutil``) that search a given directory for Python executable files and
load them as extensions or plugins.
- By default, python source files and byte-compiled modules (files with ``.py`` and ``.pyc`` suffix) are not
collected; setting the ``include_py_files`` argument to ``True`` collects these files as well. This is typically
used when a package requires source .py files to be available; for example, JIT compilation used in
deep-learning frameworks, code that requires access to .py files (for example, to check their date), or code
that tries to extend `sys.path` with subpackage paths in a way that is incompatible with PyInstaller's frozen
importer. However, in contemporary PyInstaller versions, the preferred way of collecting source .py files is by
using the **module collection mode** setting (which enables collection of source .py files in addition to or
in lieu of collecting byte-compiled modules into PYZ archive).
- The ``subdir`` argument gives a subdirectory relative to ``package`` to search, which is helpful when submodules
are imported at run-time from a directory lacking ``__init__.py``.
- The ``excludes`` argument contains a sequence of strings or Paths. These provide a list of
Expand Down Expand Up @@ -800,10 +807,12 @@ def collect_data_files(
# do not modify ``excludes_len``.
if not include_py_files:
excludes += ['**/*' + s for s in compat.ALL_SUFFIXES]
else:
# include_py_files should collect only .py and .pyc files, and not the extensions / shared libs.
excludes += ['**/*' + s for s in compat.ALL_SUFFIXES if s not in {'.py', '.pyc'}]

# Exclude .pyo files if include_py_files is False.
if not include_py_files and ".pyo" not in compat.ALL_SUFFIXES:
excludes.append('**/*.pyo')
# Never, ever, collect .pyc files from __pycache__.
excludes.append('**/__pycache__/*.pyc')

# If not specified, include all files. Follow the same process as the excludes.
includes = list(includes) if includes else ["**/*"]
Expand Down
7 changes: 7 additions & 0 deletions news/7943.breaking.1.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
The collection of "py files", enabled by the ``include_py_files=True``
argument to the :func:`PyInstaller.utils.hooks.collect_data_files` hook
utility function, is now restricted to only ``.py`` and ``.pyc`` files.
Previously, all suffixes from ``importlib.machinery.all_suffixes()`` were
enabled, which resulted in spurious collection of dynamic libraries and
extensions (due to ``.so``, ``.abi3.so``, ``.pyd``, etc. being among
those suffixes).
3 changes: 3 additions & 0 deletions news/7943.breaking.2.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The :func:`PyInstaller.utils.hooks.collect_data_files` hook utility
helper no longer collects ``.pyc`` files from ``__pycache__`` directories,
even with the ``include_py_files=True`` argument.
119 changes: 71 additions & 48 deletions tests/unit/test_hookutils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
import os
import pytest
import shutil
from os.path import join
import re

from PyInstaller.utils.hooks import collect_data_files, collect_submodules, \
Expand Down Expand Up @@ -271,55 +270,79 @@ def test_collect_data_module():
'dynamiclib.dll',
'dynamiclib.dylib',
'nine.dat',
join('py_files_not_in_package', 'data', 'eleven.dat'),
join('py_files_not_in_package', 'ten.dat'),
os.path.join('py_files_not_in_package', 'data', 'eleven.dat'),
os.path.join('py_files_not_in_package', 'ten.dat'),
# Not backwards! On Windows, ``.so`` files are just data and vice versa.
'pyextension.so' if is_win else 'pyextension.pyd',
join('subpkg', 'thirteen.txt'),
)
os.path.join('subpkg', 'thirteen.txt'),
),
),
# Test collecting from a subpackage.
([TEST_MOD + '.subpkg'], {}, (join('subpkg', 'thirteen.txt'),)),
([TEST_MOD], dict(include_py_files=True, excludes=['**/__pycache__']), (
'__init__.py',
'dynamiclib.dll',
'dynamiclib.dylib',
'nine.dat',
join('py_files_not_in_package', 'data', 'eleven.dat'),
join('py_files_not_in_package', 'one.py'),
join('py_files_not_in_package', 'sub_pkg', '__init__.py'),
join('py_files_not_in_package', 'sub_pkg', 'three.py'),
join('py_files_not_in_package', 'ten.dat'),
'pyextension.pyd',
'pyextension.so',
join('raises_error_on_import_1', '__init__.py'),
join('raises_error_on_import_1', 'foo.py'),
join('raises_error_on_import_2', '__init__.py'),
join('raises_error_on_import_2', 'foo.py'),
join('subpkg', '__init__.py'),
join('subpkg', 'thirteen.txt'),
join('subpkg', 'twelve.py'),
'two.py',
)),
([TEST_MOD], dict(excludes=['py_files_not_in_package', '**/__pycache__']), (
'dynamiclib.dll',
'dynamiclib.dylib',
'nine.dat',
'pyextension.so' if is_win else 'pyextension.pyd',
join('subpkg', 'thirteen.txt'),
)),
([TEST_MOD], dict(includes=['**/*.dat', '**/*.txt']), (
'nine.dat',
join('py_files_not_in_package', 'data', 'eleven.dat'),
join('py_files_not_in_package', 'ten.dat'),
join('subpkg', 'thirteen.txt'),
)),
([TEST_MOD], dict(includes=['*.dat']), ('nine.dat',)),
([TEST_MOD], dict(subdir="py_files_not_in_package", excludes=['**/__pycache__']), (
join('py_files_not_in_package', 'data', 'eleven.dat'),
join('py_files_not_in_package', 'ten.dat'),
)),
], # yapf: disable
(
[TEST_MOD + '.subpkg'],
{},
(os.path.join('subpkg', 'thirteen.txt'),),
),
(
[TEST_MOD],
dict(include_py_files=True, excludes=['**/__pycache__']),
(
'__init__.py',
'dynamiclib.dll',
'dynamiclib.dylib',
'nine.dat',
os.path.join('py_files_not_in_package', 'data', 'eleven.dat'),
os.path.join('py_files_not_in_package', 'one.py'),
os.path.join('py_files_not_in_package', 'sub_pkg', '__init__.py'),
os.path.join('py_files_not_in_package', 'sub_pkg', 'three.py'),
os.path.join('py_files_not_in_package', 'ten.dat'),
# Not backwards! On Windows, ``.so`` files are just data and vice versa.
'pyextension.so' if is_win else 'pyextension.pyd',
os.path.join('raises_error_on_import_1', '__init__.py'),
os.path.join('raises_error_on_import_1', 'foo.py'),
os.path.join('raises_error_on_import_2', '__init__.py'),
os.path.join('raises_error_on_import_2', 'foo.py'),
os.path.join('subpkg', '__init__.py'),
os.path.join('subpkg', 'thirteen.txt'),
os.path.join('subpkg', 'twelve.py'),
'two.py',
),
),
(
[TEST_MOD],
dict(excludes=['py_files_not_in_package', '**/__pycache__']),
(
'dynamiclib.dll',
'dynamiclib.dylib',
'nine.dat',
'pyextension.so' if is_win else 'pyextension.pyd',
os.path.join('subpkg', 'thirteen.txt'),
),
),
(
[TEST_MOD],
dict(includes=['**/*.dat', '**/*.txt']),
(
'nine.dat',
os.path.join('py_files_not_in_package', 'data', 'eleven.dat'),
os.path.join('py_files_not_in_package', 'ten.dat'),
os.path.join('subpkg', 'thirteen.txt'),
),
),
(
[TEST_MOD],
dict(includes=['*.dat']),
('nine.dat',),
),
(
[TEST_MOD],
dict(subdir="py_files_not_in_package", excludes=['**/__pycache__']),
(
os.path.join('py_files_not_in_package', 'data', 'eleven.dat'),
os.path.join('py_files_not_in_package', 'ten.dat'),
),
),
],
ids=['package', 'subpackage', 'package with py files', 'excludes', '** includes', 'includes', 'subdir']
)
def data_lists(monkeypatch, request):
Expand All @@ -345,8 +368,8 @@ def _sort(sequence):
def test_collect_data_all_included(data_lists):
subfiles, src, dst = data_lists
# Check the source and dest lists against the correct values in subfiles.
src_compare = tuple([join(TEST_MOD_PATH, TEST_MOD, subpath) for subpath in subfiles])
dst_compare = [os.path.dirname(join(TEST_MOD, subpath)) for subpath in subfiles]
src_compare = tuple([os.path.join(TEST_MOD_PATH, TEST_MOD, subpath) for subpath in subfiles])
dst_compare = [os.path.dirname(os.path.join(TEST_MOD, subpath)) for subpath in subfiles]
dst_compare.sort()
dst_compare = tuple(dst_compare)
assert src == src_compare
Expand Down

0 comments on commit 984545a

Please sign in to comment.