perf: bulk narrowing to avoid N**2. #2048

nedbat · nedbat · commit 0d5a112fc54c · 2025-09-21T07:19:33.000-04:00
diff --git a/CHANGES.rst b/CHANGES.rst
@@ -23,11 +23,16 @@ upgrading your version of coverage.py.
 Unreleased
 ----------
 
+- Performance: with branch coverage in large files, generating HTML, JSON, or
+  LCOV reports could take far too long due to some quadratic behavior.  This is
+  now fixed, closing `issue 2048`_.  Thanks to Daniel Diniz for help diagnosing
+  the problem.
+
 - Most warnings and a few errors now have links to a page in the docs
   explaining the specific message.  Closes `issue 1921`_.
 
 .. _issue 1921: https://github.com/nedbat/coveragepy/issues/1921
-
+.. _issue 2048: https://github.com/nedbat/coveragepy/issues/2048
 
 
 .. start-releases
diff --git a/coverage/html.py b/coverage/html.py
@@ -32,7 +32,7 @@
     stdout_link,
 )
 from coverage.report_core import get_analysis_to_report
-from coverage.results import Analysis, Numbers
+from coverage.results import Analysis, AnalysisNarrower, Numbers
 from coverage.templite import Templite
 from coverage.types import TLineNo, TMorf
 from coverage.version import __url__
@@ -582,13 +582,21 @@ def write_region_index_pages(self, files_to_report: Iterable[FileToReport]) -> N
 
             for noun in region_nouns:
                 page_data = self.index_pages[noun]
-                outside_lines = set(range(1, num_lines + 1))
 
+                outside_lines = set(range(1, num_lines + 1))
                 for region in regions:
                     if region.kind != noun:
                         continue
                     outside_lines -= region.lines
-                    analysis = ftr.analysis.narrow(region.lines)
+
+                narrower = AnalysisNarrower(ftr.analysis)
+                narrower.add_regions(r.lines for r in regions if r.kind == noun)
+                narrower.add_regions([outside_lines])
+
+                for region in regions:
+                    if region.kind != noun:
+                        continue
+                    analysis = narrower.narrow(region.lines)
                     if not self.should_report(analysis, page_data):
                         continue
                     sorting_name = region.name.rpartition(".")[-1].lstrip("_")
@@ -605,7 +613,7 @@ def write_region_index_pages(self, files_to_report: Iterable[FileToReport]) -> N
                         )
                     )
 
-                analysis = ftr.analysis.narrow(outside_lines)
+                analysis = narrower.narrow(outside_lines)
                 if self.should_report(analysis, page_data):
                     page_data.summaries.append(
                         IndexItem(
diff --git a/coverage/jsonreport.py b/coverage/jsonreport.py
@@ -13,7 +13,7 @@
 
 from coverage import __version__
 from coverage.report_core import get_analysis_to_report
-from coverage.results import Analysis, Numbers
+from coverage.results import Analysis, AnalysisNarrower, Numbers
 from coverage.types import TLineNo, TMorf
 
 if TYPE_CHECKING:
@@ -128,21 +128,30 @@ def report_one_file(
             )
 
         num_lines = len(file_reporter.source().splitlines())
+        regions = file_reporter.code_regions()
         for noun, plural in file_reporter.code_region_kinds():
-            reported_file[plural] = region_data = {}
             outside_lines = set(range(1, num_lines + 1))
-            for region in file_reporter.code_regions():
+            for region in regions:
                 if region.kind != noun:
                     continue
                 outside_lines -= region.lines
+
+            narrower = AnalysisNarrower(analysis)
+            narrower.add_regions(r.lines for r in regions if r.kind == noun)
+            narrower.add_regions([outside_lines])
+
+            reported_file[plural] = region_data = {}
+            for region in regions:
+                if region.kind != noun:
+                    continue
                 region_data[region.name] = self.make_region_data(
                     coverage_data,
-                    analysis.narrow(region.lines),
+                    narrower.narrow(region.lines),
                 )
 
             region_data[""] = self.make_region_data(
                 coverage_data,
-                analysis.narrow(outside_lines),
+                narrower.narrow(outside_lines),
             )
         return reported_file
 
diff --git a/coverage/lcovreport.py b/coverage/lcovreport.py
@@ -13,7 +13,7 @@
 
 from coverage.plugin import FileReporter
 from coverage.report_core import get_analysis_to_report
-from coverage.results import Analysis, Numbers
+from coverage.results import Analysis, AnalysisNarrower, Numbers
 from coverage.types import TMorf
 
 if TYPE_CHECKING:
@@ -81,12 +81,15 @@ def lcov_functions(
     if not functions:
         return
 
+    narrower = AnalysisNarrower(file_analysis)
+    narrower.add_regions(r.lines for _, _, r in functions)
+
     functions.sort()
     functions_hit = 0
     for first_line, last_line, region in functions:
         # A function counts as having been executed if any of it has been
         # executed.
-        analysis = file_analysis.narrow(region.lines)
+        analysis = narrower.narrow(region.lines)
         hit = int(analysis.numbers.n_executed > 0)
         functions_hit += hit
 
diff --git a/coverage/results.py b/coverage/results.py
@@ -7,7 +7,7 @@
 
 import collections
 import dataclasses
-from collections.abc import Container, Iterable
+from collections.abc import Iterable
 from typing import TYPE_CHECKING
 
 from coverage.exceptions import ConfigError
@@ -113,45 +113,6 @@ def __post_init__(self) -> None:
             n_missing_branches=n_missing_branches,
         )
 
-    def narrow(self, lines: Container[TLineNo]) -> Analysis:
-        """Create a narrowed Analysis.
-
-        The current analysis is copied to make a new one that only considers
-        the lines in `lines`.
-        """
-
-        statements = {lno for lno in self.statements if lno in lines}
-        excluded = {lno for lno in self.excluded if lno in lines}
-        executed = {lno for lno in self.executed if lno in lines}
-
-        if self.has_arcs:
-            arc_possibilities_set = {
-                (a, b) for a, b in self.arc_possibilities_set if a in lines or b in lines
-            }
-            arcs_executed_set = {
-                (a, b) for a, b in self.arcs_executed_set if a in lines or b in lines
-            }
-            exit_counts = {lno: num for lno, num in self.exit_counts.items() if lno in lines}
-            no_branch = {lno for lno in self.no_branch if lno in lines}
-        else:
-            arc_possibilities_set = set()
-            arcs_executed_set = set()
-            exit_counts = {}
-            no_branch = set()
-
-        return Analysis(
-            precision=self.precision,
-            filename=self.filename,
-            has_arcs=self.has_arcs,
-            statements=statements,
-            excluded=excluded,
-            executed=executed,
-            arc_possibilities_set=arc_possibilities_set,
-            arcs_executed_set=arcs_executed_set,
-            exit_counts=exit_counts,
-            no_branch=no_branch,
-        )
-
     def missing_formatted(self, branches: bool = False) -> str:
         """The missing line numbers, formatted nicely.
 
@@ -236,6 +197,104 @@ def branch_stats(self) -> dict[TLineNo, tuple[int, int]]:
         return stats
 
 
+TRegionLines = frozenset[TLineNo]
+
+
+class AnalysisNarrower:
+    """
+    For reducing an `Analysis` to a subset of its lines.
+
+    Originally this was a simpler method on Analysis, but scanning every arc on
+    each call led to quadratic behavior.  This class does that arc work in bulk,
+    up-front, to provide the same results in close to linear time.
+
+    Create an AnalysisNarrower from an Analysis, bulk-add region lines to it
+    with `add_regions`, then individually request new narrowed Analysis objects
+    for each region with `narrow`.  Doing most of the work in limited calls to
+    `add_regions` lets us avoid poor performance.
+    """
+
+    # In this class, regions are represented by a frozenset of their lines.
+
+    def __init__(self, analysis: Analysis) -> None:
+        self.analysis = analysis
+        self.region2arc_possibilities: dict[TRegionLines, set[TArc]] = collections.defaultdict(set)
+        self.region2arc_executed: dict[TRegionLines, set[TArc]] = collections.defaultdict(set)
+        self.region2exit_counts: dict[TRegionLines, dict[TLineNo, int]] = collections.defaultdict(
+            dict
+        )
+
+    def add_regions(self, liness: Iterable[set[TLineNo]]) -> None:
+        """
+        Pre-process sets of line numbers so that `narrow` can later be called with any one
+        of them.  If sets in one call share a line, it is attributed only to the last such set.
+        """
+        if self.analysis.has_arcs:
+            line2region: dict[TLineNo, TRegionLines] = {}
+
+            for lines in liness:
+                fzlines = frozenset(lines)
+                for line in lines:
+                    line2region[line] = fzlines
+
+            def collect_arcs(
+                arc_set: set[TArc],
+                region2arcs: dict[TRegionLines, set[TArc]],
+            ) -> None:
+                for a, b in arc_set:
+                    if r := line2region.get(a):
+                        region2arcs[r].add((a, b))
+                    if r := line2region.get(b):
+                        region2arcs[r].add((a, b))
+
+            collect_arcs(self.analysis.arc_possibilities_set, self.region2arc_possibilities)
+            collect_arcs(self.analysis.arcs_executed_set, self.region2arc_executed)
+
+            for lno, num in self.analysis.exit_counts.items():
+                if r := line2region.get(lno):
+                    self.region2exit_counts[r][lno] = num
+
+    def narrow(self, lines: set[TLineNo]) -> Analysis:
+        """Create a narrowed Analysis.
+
+        The current analysis is copied to make a new one that only considers the lines in
+        `lines`.  Arc data is empty unless `lines` was previously given to `add_regions`.
+        """
+
+        # Technically, the set intersections in this method are still O(N**2)
+        # since this method is called N times, but they're very fast and moving
+        # them to `add_regions` won't avoid the quadratic time.
+
+        statements = self.analysis.statements & lines
+        excluded = self.analysis.excluded & lines
+        executed = self.analysis.executed & lines
+
+        if self.analysis.has_arcs:
+            fzlines = frozenset(lines)
+            arc_possibilities_set = self.region2arc_possibilities[fzlines]
+            arcs_executed_set = self.region2arc_executed[fzlines]
+            exit_counts = self.region2exit_counts[fzlines]
+            no_branch = self.analysis.no_branch & lines
+        else:
+            arc_possibilities_set = set()
+            arcs_executed_set = set()
+            exit_counts = {}
+            no_branch = set()
+
+        return Analysis(
+            precision=self.analysis.precision,
+            filename=self.analysis.filename,
+            has_arcs=self.analysis.has_arcs,
+            statements=statements,
+            excluded=excluded,
+            executed=executed,
+            arc_possibilities_set=arc_possibilities_set,
+            arcs_executed_set=arcs_executed_set,
+            exit_counts=exit_counts,
+            no_branch=no_branch,
+        )
+
+
 @dataclasses.dataclass
 class Numbers:
     """The numerical results of measuring coverage.