dbt-labs · peterallenwebb · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
@@ -0,0 +1,6 @@
+kind: Features
+body: Accelerate block tag iteration.
+time: 2024-10-15T17:48:41.299686-04:00
+custom:
+  Author: peterallenwebb
+  Issue: "205"
@@ -1,6 +1,7 @@
+import dataclasses
 import re
 from collections import namedtuple
-from typing import Iterator, List, Optional, Set, Union
+from typing import Dict, Iterator, List, Optional, Set, Union
 
 from dbt_common.exceptions import (
     BlockDefinitionNotAtTopError,
@@ -104,11 +105,25 @@ def end_pat(self) -> re.Pattern:
 QUOTE_START_PATTERN = regex(r"""(?P<quote>(['"]))""")
 
 
+@dataclasses.dataclass
+class PositionedMatch:
+    """This class is used to cache search information, accelerating TagIterator.
+    It records the result of searching a string from the start_pos and also
+    the position of the first match, or None if there is no match."""
+
+    start_pos: int
+    match: Optional[re.Match]
+
+
 class TagIterator:
     def __init__(self, text: str) -> None:
         self.text: str = text
         self.pos: int = 0
 
+        # A cache of the most recent matches seen for each pattern, maintained
+        # in order to avoid slowly re-searching long inputs many times.
+        self._past_matches: Dict[re.Pattern, PositionedMatch] = {}
+
     def linepos(self, end: Optional[int] = None) -> str:
         """Return relative position in line.
 
@@ -130,7 +145,31 @@ def rewind(self, amount: int = 1) -> None:
         self.pos -= amount
 
     def _search(self, pattern: re.Pattern) -> Optional[re.Match]:
-        return pattern.search(self.text, self.pos)
+        # Check to see if we have cached a search for this pattern already.
+        positioned_match = self._past_matches.get(pattern)
+
+        if positioned_match is None or positioned_match.start_pos > self.pos:
+            # We did not have a cached search, or we did, but it was done at a location
+            # further along in the string and can't be used. Do a search and cache it.
+            match = pattern.search(self.text, self.pos)
+            self._past_matches[pattern] = PositionedMatch(self.pos, match)
+        else:
+            # We have a cached search and its start position falls before (or at) the
+            # current search position...
+            if positioned_match.match is None:
+                # ...but there is no match in the rest of the text.
+                match = None
+            elif positioned_match.match.start() >= self.pos:
+                # ...and there is a match we can reuse, because we have not yet passed
+                # the start position of the match. It's still the next match.
+                match = positioned_match.match
+            else:
+                # ...but we have passed the start of the cached match, and need to do a
+                # new search from our current position and cache it.
+                match = pattern.search(self.text, self.pos)
+                self._past_matches[pattern] = PositionedMatch(self.pos, match)
+
+        return match
 
     def _match(self, pattern: re.Pattern) -> Optional[re.Match]:
         return pattern.match(self.text, self.pos)