Clean up script/split_tests.py (collect tests via pytest) and CI job names

edenhaus · edenhaus · commit 787040e88634 · 2024-03-28T17:39:19.000+01:00
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -706,10 +706,10 @@ jobs:
           key: >-
             ${{ runner.os }}-${{ steps.python.outputs.python-version }}-${{
             needs.info.outputs.python_cache_key }}
-      - name: Run split_tests.py
+      - name: Run split_tests.py
         run: |
           . venv/bin/activate
-          python -m script.split_tests_pytest ${{ needs.info.outputs.test_group_count }}
+          python -m script.split_tests ${{ needs.info.outputs.test_group_count }}
       - name: Upload pytest_buckets
         uses: actions/upload-artifact@v4.3.1
         with:
@@ -1178,7 +1178,7 @@ jobs:
           ./script/check_dirty
 
   coverage-partial:
-    name: Upload test coverage to Codecov
+    name: Upload test coverage to Codecov (partial suite)
     if: needs.info.outputs.skip_coverage != 'true'
     runs-on: ubuntu-22.04
     needs:
@@ -1215,7 +1215,7 @@ jobs:
           attempt_delay: 30000
 
   coverage-full:
-    name: Upload test coverage to Codecov
+    name: Upload test coverage to Codecov (full suite)
     if: needs.info.outputs.skip_coverage != 'true'
     runs-on: ubuntu-22.04
     needs:
diff --git a/.gitignore b/.gitignore
@@ -132,3 +132,6 @@ tmp_cache
 
 # python-language-server / Rope
 .ropeproject
+
+# Generated by script/split_tests.py
+pytest_buckets.txt
diff --git a/script/split_tests.py b/script/split_tests.py
@@ -7,56 +7,28 @@
 from dataclasses import dataclass, field
 from math import ceil
 import os
-import re
+import subprocess
+import sys
 
 
-@dataclass
-class TestFile:
-    """Class to hold test information."""
+class Bucket:
+    """Class to hold the test paths and the running test count of one bucket."""
 
-    path: str
-    total_tests: int
+    def __init__(
+        self,
+    ):
+        """Initialize bucket."""
+        self.tests = 0
+        self._paths = []
 
+    def add(self, part: TestFolder | TestFile):
+        """Add tests to bucket."""
+        self.tests += part.total_tests
+        self._paths.append(part.path)
 
-@dataclass
-class TestFolder:
-    """Class to hold test information."""
-
-    path: str
-    children: list[TestFolder | TestFile] = field(default_factory=list)
-
-    @property
-    def total_tests(self) -> int:
-        """Return total tests."""
-        return sum([test.total_tests for test in self.children])
-
-    def __repr__(self):
-        """Return representation."""
-        return f"TestFolder(path='{self.path}', total={self.total_tests}, children={len(self.children)})"
-
-
-def count_tests(test_folder: TestFolder) -> int:
-    """Count tests in folder."""
-    max_tests_in_file = 0
-    for entry in os.listdir(test_folder.path):
-        if entry in ("__pycache__", "__init__.py", "conftest.py"):
-            continue
-
-        entry_path = os.path.join(test_folder.path, entry)
-        if os.path.isdir(entry_path):
-            sub_folder = TestFolder(entry_path)
-            test_folder.children.append(sub_folder)
-            max_tests_in_file = max(max_tests_in_file, count_tests(sub_folder))
-        elif os.path.isfile(entry_path) and entry.startswith("test_"):
-            tests = 0
-            with open(entry_path) as file:
-                for line in file:
-                    if re.match(r"^(async\s+)?def\s+test_\w+\(", line):
-                        tests += 1
-            test_folder.children.append(TestFile(entry_path, tests))
-            max_tests_in_file = max(max_tests_in_file, tests)
-
-    return max_tests_in_file
+    def get_paths_line(self) -> str:
+        """Return the collected paths as one space-separated line ending in a newline."""
+        return " ".join(self._paths) + "\n"
 
 
 class BucketHolder:
@@ -66,38 +38,119 @@ def __init__(self, tests_per_bucket: int, bucket_count: int) -> None:
         """Initialize bucket holder."""
         self._tests_per_bucket = tests_per_bucket
         self._bucket_count = bucket_count
-        self._current_bucket = []
-        self._current_tests = 0
-        self._buckets: list[list[str]] = [self._current_bucket]
+        self._current_bucket = Bucket()
+        self._buckets: list[Bucket] = [self._current_bucket]
+        self._last_bucket = False
 
     def split_tests(self, tests: TestFolder | TestFile) -> None:
         """Split tests into buckets."""
-        if self._current_tests + tests.total_tests < self._tests_per_bucket:
-            self._current_bucket.append(tests.path)
-            self._current_tests += tests.total_tests
+        if (
+            self._current_bucket.tests + tests.total_tests < self._tests_per_bucket
+        ) or self._last_bucket:
+            self._current_bucket.add(tests)
             return
 
         if isinstance(tests, TestFolder):
-            for test in tests.children:
+            for test in tests.children.values():
                 self.split_tests(test)
             return
 
         # Create new bucket
-        self._current_tests = 0
-
-        # The last bucket is lightly bigger (max the maximum number of tests in a single file)
-        if len(self._buckets) != self._bucket_count:
-            self._current_bucket = []
+        if len(self._buckets) == self._bucket_count:
+            # Last bucket, add all tests to it
+            self._last_bucket = True
+        else:
+            self._current_bucket = Bucket()
             self._buckets.append(self._current_bucket)
 
         # Add test to new bucket
         self.split_tests(tests)
 
-    def create_ouput_files(self) -> None:
-        """Create output files."""
+    def create_ouput_file(self) -> None:
+        """Create output file."""
         with open("pytest_buckets.txt", "w") as file:
             for bucket in self._buckets:
-                file.write(" ".join(bucket) + "\n")
+                print(f"Bucket has {bucket.tests} tests")
+                file.write(bucket.get_paths_line())
+
+
+@dataclass
+class TestFile:
+    """Class to hold a test file's path and its number of tests."""
+
+    path: str
+    total_tests: int
+
+    def __gt__(self, other):
+        """Return if greater than."""
+        return self.total_tests > other.total_tests
+
+
+@dataclass
+class TestFolder:
+    """Class to hold test information."""
+
+    path: str
+    children: dict[str, TestFolder | TestFile] = field(default_factory=dict)
+
+    @property
+    def total_tests(self) -> int:
+        """Return total tests."""
+        return sum([test.total_tests for test in self.children.values()])
+
+    def __repr__(self):
+        """Return representation."""
+        return f"TestFolder(total={self.total_tests}, children={len(self.children)})"
+
+
+def insert_at_correct_position(
+    test_holder: TestFolder, test_path: str, total_tests: int
+) -> None:
+    """Insert a test file into the folder tree, creating intermediate folders as needed."""
+    current_path = test_holder
+    for part in test_path.split("/")[1:]:
+        if part.endswith(".py"):
+            current_path.children[part] = TestFile(test_path, total_tests)
+        else:
+            current_path = current_path.children.setdefault(
+                part, TestFolder(os.path.join(current_path.path, part))
+            )
+
+
+def collect_tests(path: str) -> tuple[TestFolder, TestFile]:
+    """Collect tests with pytest and return the folder tree and the largest test file."""
+    result = subprocess.run(
+        ["pytest", "--collect-only", "-qq", "-p", "no:warnings", path],
+        check=False,
+        capture_output=True,
+        text=True,
+    )
+
+    if result.returncode != 0:
+        print("Failed to collect tests:")
+        print(result.stderr)
+        print(result.stdout)
+        sys.exit(1)
+
+    folder = TestFolder(path.split("/")[0])
+    insert_at_correct_position(folder, path, 0)
+    max_tests_in_file = TestFile("", 0)
+
+    for line in result.stdout.splitlines():
+        if not line.strip():
+            continue
+        parts = [x.strip() for x in line.split(":")]
+        if len(parts) != 2:
+            print(f"Unexpected line: {line}")
+            sys.exit(1)
+
+        path = parts[0]
+        total_tests = int(parts[1])
+        max_tests_in_file = max(max_tests_in_file, TestFile(path, total_tests))
+
+        insert_at_correct_position(folder, path, total_tests)
+
+    return (folder, max_tests_in_file)
 
 
 def main() -> None:
@@ -120,22 +173,23 @@ def check_greater_0(value: str) -> int:
 
     arguments = parser.parse_args()
 
-    tests = TestFolder("tests")
-    max_tests_in_file = count_tests(tests)
-    print(f"Maximum tests in a single file: {max_tests_in_file}")
+    (tests, max_tests_in_file) = collect_tests("tests")
+    print(
+        f"Maximum tests in a single file are {max_tests_in_file.total_tests} tests (in {max_tests_in_file.path})"
+    )
     print(f"Total tests: {tests.total_tests}")
 
     tests_per_bucket = ceil(tests.total_tests / arguments.bucket_count)
     print(f"Estimated tests per bucket: {tests_per_bucket}")
 
-    if max_tests_in_file > tests_per_bucket:
+    if max_tests_in_file.total_tests > tests_per_bucket:
         raise ValueError(
             f"There are more tests in a single file ({max_tests_in_file}) than tests per bucket ({tests_per_bucket})"
         )
 
     bucket_holder = BucketHolder(tests_per_bucket, arguments.bucket_count)
     bucket_holder.split_tests(tests)
-    bucket_holder.create_ouput_files()
+    bucket_holder.create_ouput_file()
 
 
 if __name__ == "__main__":
diff --git a/script/split_tests_pytest.py b/script/split_tests_pytest.py