Commit b666f3a

perf: it's faster in all versions if we don't cache tokenize #1791
1 parent: a2b4929

2 files changed: +10 -10 lines changed


CHANGES.rst (+6)

@@ -28,6 +28,12 @@ Unreleased
   extreme case of combining 700+ data files, the time dropped from more than
   three hours to seven minutes. Thanks to Kraken Tech for funding the fix.
 
+- Performance improvements for generating HTML reports, with a side benefit of
+  reducing memory use, closing `issue 1791`_. Thanks to Daniel Diniz for
+  helping to diagnose the problem.
+
+.. _issue 1791: https://github.com/nedbat/coveragepy/issues/1791
+
 .. scriv-start-here

coverage/phystokens.py (+4 -10)

@@ -6,7 +6,6 @@
 from __future__ import annotations
 
 import ast
-import functools
 import io
 import keyword
 import re
@@ -163,20 +162,15 @@ def source_token_lines(source: str) -> TSourceTokenLines:
         yield line
 
 
-@functools.lru_cache(maxsize=100)
 def generate_tokens(text: str) -> TokenInfos:
-    """A cached version of `tokenize.generate_tokens`.
+    """A helper around `tokenize.generate_tokens`.
 
-    When reporting, coverage.py tokenizes files twice, once to find the
-    structure of the file, and once to syntax-color it. Tokenizing is
-    expensive, and easily cached.
+    Originally this was used to cache the results, but it didn't seem to make
+    reporting go faster, and caused issues with using too much memory.
 
-    Unfortunately, the HTML report code tokenizes all the files the first time
-    before then tokenizing them a second time, so we cache many. Ideally we'd
-    rearrange the code to tokenize each file twice before moving onto the next.
     """
     readline = io.StringIO(text).readline
-    return list(tokenize.generate_tokens(readline))
+    return tokenize.generate_tokens(readline)
 
 
 def source_encoding(source: bytes) -> str:
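
As context for the change, here is a small standalone sketch (not coverage.py's code; the sample source string is made up for illustration) contrasting the removed approach of caching fully materialized token lists with the new approach of returning the raw generator from tokenize.generate_tokens. With the lru_cache, up to 100 token lists stay alive at once, which is the memory cost the docstring mentions; the generator version produces tokens on demand and retains nothing once the caller finishes iterating.

# Standalone illustration, not part of coverage.py.
import functools
import io
import tokenize

SAMPLE = "def f(x):\n    return x + 1\n" * 200  # hypothetical source text

@functools.lru_cache(maxsize=100)
def tokens_cached(text: str) -> list[tokenize.TokenInfo]:
    # Old style: materialize and cache the whole token list; the cache keeps
    # up to 100 such lists (plus their source strings) alive at once.
    return list(tokenize.generate_tokens(io.StringIO(text).readline))

def tokens_lazy(text: str):
    # New style (what this commit does): hand back the generator, so tokens
    # are produced as they are consumed and nothing is retained afterwards.
    return tokenize.generate_tokens(io.StringIO(text).readline)

if __name__ == "__main__":
    cached = tokens_cached(SAMPLE)
    lazy = list(tokens_lazy(SAMPLE))
    assert [t.string for t in cached] == [t.string for t in lazy]
    print(len(cached), "tokens;", tokens_cached.cache_info())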
