From d34b758c591751bd18bcc5c0588a2abf22f5e523 Mon Sep 17 00:00:00 2001
From: Penumbra Forge <shadoe@penumbraforge.com>
Date: Sun, 29 Mar 2026 15:01:55 -0700
Subject: [PATCH 1/2] perf(reasoning): replace O(N) text scanning with O(1)
 state machine in streaming parser
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The streaming reasoning parser (BaseThinkingReasoningParser) scans the full
accumulated output text for <think>/<think> on every token via `in` checks on
previous_text and current_text. This is O(N) per token and O(N²) over a full
generation, becoming measurable at longer outputs (5ms+ at 2k tokens, 141ms
at 10k tokens).

Replace with a three-phase state machine (pre_think → thinking → content) that
tracks transitions using only the delta text. Each token is now O(1) regardless
of output length.

Benchmark results (streaming parser overhead, simulated server loop):

  Tokens   Old (scan)   New (state)   Speedup
  ------   ----------   -----------   -------
     500     0.37ms       0.04ms       8.6x
    1000     1.38ms       0.10ms      13.5x
    2000     5.28ms       0.28ms      19.1x
    5000    34.03ms       2.05ms      16.6x
   10000   141.26ms      10.16ms      13.9x

At 50 tok/s decode on Apple Silicon, each token has a 20ms budget. The old parser
consumed 0.3ms/tok at 2k tokens and 1.4ms/tok at 10k — up to 7% of the budget
on overhead alone. The new parser is <0.01ms/tok at any length.

Changes:
- think_parser.py: Rewrote extract_reasoning_streaming() as a state machine with
  _phase tracking. reset_state() initializes the phase. All three scenarios
  preserved (explicit tags, implicit mode, no tags). Method signature unchanged
  for backward compatibility.
- benchmarks/bench_reasoning_parser.py: Added streaming parser benchmark.

No changes to extract_reasoning() (non-streaming path) — it only runs once per
request and is not on the hot path.
---
 benchmarks/bench_reasoning_parser.py |  55 ++++++++
 vllm_mlx/reasoning/think_parser.py   | 203 ++++++++++++---------------
 2 files changed, 146 insertions(+), 112 deletions(-)
 create mode 100644 benchmarks/bench_reasoning_parser.py
diff --git a/benchmarks/bench_reasoning_parser.py b/benchmarks/bench_reasoning_parser.py
new file mode 100644
index 000000000..c7a2ba13f
--- /dev/null
+++ b/benchmarks/bench_reasoning_parser.py
@@ -0,0 +1,55 @@
+"""Benchmark: reasoning parser streaming performance.
+
+Measures per-token overhead of extract_reasoning_streaming() at various
+output lengths. Demonstrates the difference between O(N²) accumulated
+text scanning and O(1) state-machine tracking.
+
+Usage:
+    python benchmarks/bench_reasoning_parser.py
+"""
+
+import time
+
+from vllm_mlx.reasoning.qwen3_parser import Qwen3ReasoningParser
+
+
+def bench_streaming(parser, n_tokens: int, label: str) -> float:
+    """Simulate n_tokens of streaming through the parser. Returns total ms."""
+    parser.reset_state()
+
+    # Simulate: <think> + N reasoning tokens + </think> + 10 content tokens
+    tokens = ["<think>"]
+    tokens += [f"word{i} " for i in range(n_tokens)]
+    tokens += ["</think>"]
+    tokens += [f"answer{i} " for i in range(10)]
+
+    accumulated = ""
+    start = time.perf_counter()
+    for tok in tokens:
+        prev = accumulated
+        accumulated += tok
+        parser.extract_reasoning_streaming(prev, accumulated, tok)
+    elapsed = (time.perf_counter() - start) * 1000
+
+    print(f"  {label}: {n_tokens:>6} tokens -> {elapsed:>8.2f}ms "
+          f"({elapsed / (n_tokens + 11):.3f}ms/tok)")
+    return elapsed
+
+
+def main():
+    parser = Qwen3ReasoningParser()
+
+    print("Reasoning parser streaming benchmark")
+    print("=" * 60)
+    print()
+
+    for n in [50, 100, 200, 500, 1000, 2000, 5000]:
+        bench_streaming(parser, n, f"{n} tokens")
+
+    print()
+    print("At 50 tok/s, per-token budget is 20ms.")
+    print("Parser overhead should be <0.1ms/tok to be negligible.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/vllm_mlx/reasoning/think_parser.py b/vllm_mlx/reasoning/think_parser.py
index 136348206..dad085f15 100644
--- a/vllm_mlx/reasoning/think_parser.py
+++ b/vllm_mlx/reasoning/think_parser.py
@@ -9,6 +9,11 @@
 1. Both tags in output: <think>reasoning</think>content
 2. Only closing tag (think injected in prompt): reasoning</think>content
 3. No tags: pure content
+
+Performance: The streaming parser uses a simple state machine to track the
+current phase (pre-think / thinking / content). Each token is classified in
+O(1) by checking only the delta text — the accumulated output is never
+rescanned. This keeps per-token overhead constant regardless of output length.
 """
 
 from abc import abstractmethod
@@ -27,8 +32,12 @@ class BaseThinkingReasoningParser(ReasoningParser):
     and only </think> appears in the model output. This is common with AI agents
     like OpenCode that force models to reason by injecting thinking tags.
 
-    The parser tracks state during streaming to correctly separate reasoning
-    from content as tokens arrive incrementally.
+    The streaming parser uses a state machine with three phases:
+
+        pre_think -> thinking -> content
+
+    Transitions happen when start/end tokens are detected in the delta text.
+    No accumulated text scanning is performed — each token is O(1).
     """
 
     @property
@@ -43,6 +52,12 @@ def end_token(self) -> str:
 
     def __init__(self, tokenizer=None):
         super().__init__(tokenizer)
+        # Streaming state — reset per request via reset_state()
+        self._phase: str = "pre_think"  # "pre_think" | "thinking" | "content"
+
+    def reset_state(self):
+        """Reset state machine for a new streaming request."""
+        self._phase = "pre_think"
 
     def extract_reasoning(
         self,
@@ -66,14 +81,11 @@ def extract_reasoning(
 
         # Case 1: Both tags present (normal case)
         if self.start_token in text and self.end_token in text:
-            # Get everything after start token
             _, _, after_start = text.partition(self.start_token)
-            # Split on end token
             reasoning, _, content = after_start.partition(self.end_token)
             return reasoning.strip() or None, content.strip() or None
 
         # Case 2: Only closing tag (think was injected in prompt)
-        # Everything before </think> is reasoning
         if self.end_token in text:
             reasoning, _, content = text.partition(self.end_token)
             return reasoning.strip() or None, content.strip() or None
@@ -83,7 +95,7 @@ def extract_reasoning(
             _, _, reasoning = text.partition(self.start_token)
             return reasoning.strip() or None, None
 
-        # Case 4: No tags at all - pure content
+        # Case 4: No tags at all — pure content
         return None, model_output
 
     def extract_reasoning_streaming(
@@ -93,123 +105,90 @@ def extract_reasoning_streaming(
         delta_text: str,
     ) -> DeltaMessage | None:
         """
-        Extract reasoning from streaming delta using text-based detection.
+        Extract reasoning from a streaming delta using state-machine tracking.
+
+        Instead of rescanning the full accumulated text on every token, this
+        method tracks the current phase (pre_think / thinking / content) and
+        only inspects the delta for tag transitions. This makes each call O(1)
+        regardless of how much text has been generated.
+
+        The method signature is kept compatible with the base class — previous_text
+        and current_text are accepted but not used for phase detection (they remain
+        available for subclasses that need them).
 
-        Handles implicit reasoning mode where <think> was in the prompt
-        and only </think> appears in the output.
+        Handles three scenarios:
+        1. Explicit <think>...</think> in model output
+        2. Implicit mode (<think> in prompt, only </think> in output)
+        3. No tags at all (pure content after first token with no reasoning)
 
         Args:
-            previous_text: Text accumulated before this delta.
-            current_text: Text including this delta.
-            delta_text: Just the new text.
+            previous_text: Text accumulated before this delta (unused by state machine).
+            current_text: Text including this delta (unused by state machine).
+            delta_text: Just the new text in this chunk.
 
         Returns:
-            DeltaMessage with reasoning/content, or None to skip.
+            DeltaMessage with reasoning and/or content, or None to skip.
         """
-        # Skip if delta is just the special tokens themselves
-        stripped_delta = delta_text.strip()
-        if stripped_delta == self.start_token:
-            return None
-        if stripped_delta == self.end_token:
+        if not delta_text:
             return None
 
-        # Check token positions in text (stateless text-based detection)
-        start_in_prev = self.start_token in previous_text
-        start_in_current = self.start_token in current_text
-        end_in_prev = self.end_token in previous_text
-        end_in_delta = self.end_token in delta_text
-
-        # Case 1: Explicit <think> found in text - standard behavior
-        if start_in_current:
-            return self._handle_explicit_think(
-                previous_text, delta_text, start_in_prev, end_in_prev, end_in_delta
-            )
-
-        # Case 2: No <think> but </think> found - implicit reasoning mode
-        # This handles when <think> was injected in the prompt
-        if self.end_token in current_text:
-            return self._handle_implicit_think(delta_text, end_in_prev, end_in_delta)
-
-        # Case 3: No think tags seen yet
-        # We can't know if <think> was in the prompt, so we must make a choice:
-        # - Treat as content (safe, but loses reasoning if think was in prompt)
-        # - Treat as reasoning (risky, wrong if no thinking at all)
-        # We choose to treat as reasoning IF we haven't seen </think> yet,
-        # because if think was in prompt, we want to capture the reasoning.
-        # This will be corrected once </think> is seen.
-        return DeltaMessage(reasoning=delta_text)
-
-    def _handle_explicit_think(
-        self,
-        previous_text: str,
-        delta_text: str,
-        start_in_prev: bool,
-        end_in_prev: bool,
-        end_in_delta: bool,
-    ) -> DeltaMessage | None:
-        """Handle case where <think> tag is explicitly in the output."""
-        start_in_delta = self.start_token in delta_text
-
-        if start_in_prev:
-            # We're after the start token
-            if end_in_delta:
-                # Transition: end token in this delta
-                idx = delta_text.find(self.end_token)
-                reasoning_part = delta_text[:idx]
-                content_part = delta_text[idx + len(self.end_token) :]
+        start_tok = self.start_token
+        end_tok = self.end_token
+
+        # ── Phase: pre_think ──────────────────────────────────────
+        # Haven't seen any tags yet. Could be:
+        # - About to see <think> (explicit reasoning)
+        # - Already inside implicit reasoning (think was in prompt)
+        # - No reasoning at all (pure content model)
+        if self._phase == "pre_think":
+            # Check for start tag in this delta
+            if start_tok in delta_text:
+                self._phase = "thinking"
+                idx = delta_text.find(start_tok) + len(start_tok)
+                after = delta_text[idx:]
+                # Edge case: both tags in same delta
+                if end_tok in after:
+                    self._phase = "content"
+                    eidx = after.find(end_tok)
+                    reasoning = after[:eidx]
+                    content = after[eidx + len(end_tok):]
+                    return DeltaMessage(
+                        reasoning=reasoning or None,
+                        content=content or None,
+                    )
+                return DeltaMessage(reasoning=after) if after else None
+
+            # Check for end tag (implicit mode — think was in prompt)
+            if end_tok in delta_text:
+                self._phase = "content"
+                idx = delta_text.find(end_tok)
+                reasoning = delta_text[:idx]
+                content = delta_text[idx + len(end_tok):]
                 return DeltaMessage(
-                    reasoning=reasoning_part if reasoning_part else None,
-                    content=content_part if content_part else None,
+                    reasoning=reasoning or None,
+                    content=content or None,
                 )
-            elif end_in_prev:
-                # Already past reasoning phase - pure content
-                return DeltaMessage(content=delta_text)
-            else:
-                # Still in reasoning phase
-                return DeltaMessage(reasoning=delta_text)
-
-        elif start_in_delta:
-            # Start token is in this delta
-            start_idx = delta_text.find(self.start_token)
-
-            if end_in_delta:
-                # Both tokens in this delta
-                end_idx = delta_text.find(self.end_token)
-                reasoning_part = delta_text[start_idx + len(self.start_token) : end_idx]
-                content_part = delta_text[end_idx + len(self.end_token) :]
-                return DeltaMessage(
-                    reasoning=reasoning_part if reasoning_part else None,
-                    content=content_part if content_part else None,
-                )
-            else:
-                # Only start token - beginning of reasoning
-                reasoning_part = delta_text[start_idx + len(self.start_token) :]
+
+            # No tags — default to reasoning (implicit mode assumption).
+            # If the model doesn't use thinking at all, the server's
+            # non-parser path handles it. This path only activates when
+            # a reasoning parser is explicitly configured.
+            return DeltaMessage(reasoning=delta_text)
+
+        # ── Phase: thinking ───────────────────────────────────────
+        # Inside a reasoning block, waiting for end tag.
+        if self._phase == "thinking":
+            if end_tok in delta_text:
+                self._phase = "content"
+                idx = delta_text.find(end_tok)
+                reasoning = delta_text[:idx]
+                content = delta_text[idx + len(end_tok):]
                 return DeltaMessage(
-                    reasoning=reasoning_part if reasoning_part else None
+                    reasoning=reasoning or None,
+                    content=content or None,
                 )
+            return DeltaMessage(reasoning=delta_text)
 
-        # Fallback - treat as content
+        # ── Phase: content ────────────────────────────────────────
+        # Past the reasoning block — everything is content.
         return DeltaMessage(content=delta_text)
-
-    def _handle_implicit_think(
-        self,
-        delta_text: str,
-        end_in_prev: bool,
-        end_in_delta: bool,
-    ) -> DeltaMessage | None:
-        """Handle case where <think> was in prompt (only </think> in output)."""
-        if end_in_delta:
-            # Transition: end token in this delta
-            idx = delta_text.find(self.end_token)
-            reasoning_part = delta_text[:idx]
-            content_part = delta_text[idx + len(self.end_token) :]
-            return DeltaMessage(
-                reasoning=reasoning_part if reasoning_part else None,
-                content=content_part if content_part else None,
-            )
-        elif end_in_prev:
-            # Already past reasoning phase - pure content
-            return DeltaMessage(content=delta_text)
-        else:
-            # Still in implicit reasoning phase
-            return DeltaMessage(reasoning=delta_text)

From 3c00c44c1e117b46bcade0afb18581b29e6c0a5e Mon Sep 17 00:00:00 2001
From: Thump604 <thump@cosmiccooler.org>
Date: Fri, 10 Apr 2026 21:36:57 -0500
Subject: [PATCH 2/2] fix(reasoning): detect split think tags across deltas

---
 vllm_mlx/reasoning/think_parser.py | 64 +++++++++++++++++-------------
 1 file changed, 37 insertions(+), 27 deletions(-)

diff --git a/vllm_mlx/reasoning/think_parser.py b/vllm_mlx/reasoning/think_parser.py
index dad085f15..a2e9cb727 100644
--- a/vllm_mlx/reasoning/think_parser.py
+++ b/vllm_mlx/reasoning/think_parser.py
@@ -11,9 +11,10 @@
 3. No tags: pure content
 
 Performance: The streaming parser uses a simple state machine to track the
-current phase (pre-think / thinking / content). Each token is classified in
-O(1) by checking only the delta text — the accumulated output is never
-rescanned. This keeps per-token overhead constant regardless of output length.
+current phase (pre-think / thinking / content). Tag completion is detected
+against the accumulated text for correctness when `<think>` / `</think>` are
+split across delta boundaries, but phase tracking still avoids the old
+whole-output rescanning behavior.
 """
 
 from abc import abstractmethod
@@ -36,8 +37,8 @@ class BaseThinkingReasoningParser(ReasoningParser):
 
         pre_think -> thinking -> content
 
-    Transitions happen when start/end tokens are detected in the delta text.
-    No accumulated text scanning is performed — each token is O(1).
+    Transitions are tracked by parser state. Accumulated text is consulted only
+    to detect when a start/end tag has completed across delta boundaries.
     """
 
     @property
@@ -109,12 +110,8 @@ def extract_reasoning_streaming(
 
         Instead of rescanning the full accumulated text on every token, this
         method tracks the current phase (pre_think / thinking / content) and
-        only inspects the delta for tag transitions. This makes each call O(1)
-        regardless of how much text has been generated.
-
-        The method signature is kept compatible with the base class — previous_text
-        and current_text are accepted but not used for phase detection (they remain
-        available for subclasses that need them).
+        only consults accumulated text to detect completed start/end tags that
+        were split across delta boundaries.
 
         Handles three scenarios:
         1. Explicit <think>...</think> in model output
@@ -122,8 +119,8 @@ def extract_reasoning_streaming(
         3. No tags at all (pure content after first token with no reasoning)
 
         Args:
-            previous_text: Text accumulated before this delta (unused by state machine).
-            current_text: Text including this delta (unused by state machine).
+            previous_text: Text accumulated before this delta.
+            current_text: Text including this delta.
             delta_text: Just the new text in this chunk.
 
         Returns:
@@ -136,34 +133,41 @@ def extract_reasoning_streaming(
         end_tok = self.end_token
 
         # ── Phase: pre_think ──────────────────────────────────────
-        # Haven't seen any tags yet. Could be:
+        # Haven't seen a completed tag yet. Could be:
         # - About to see <think> (explicit reasoning)
         # - Already inside implicit reasoning (think was in prompt)
         # - No reasoning at all (pure content model)
         if self._phase == "pre_think":
-            # Check for start tag in this delta
-            if start_tok in delta_text:
+            if start_tok in current_text:
                 self._phase = "thinking"
-                idx = delta_text.find(start_tok) + len(start_tok)
-                after = delta_text[idx:]
-                # Edge case: both tags in same delta
+                idx = delta_text.find(start_tok)
+                after = delta_text[idx + len(start_tok) :] if idx >= 0 else delta_text
+
                 if end_tok in after:
                     self._phase = "content"
                     eidx = after.find(end_tok)
                     reasoning = after[:eidx]
-                    content = after[eidx + len(end_tok):]
+                    content = after[eidx + len(end_tok) :]
+                    if not reasoning and not content:
+                        return None
                     return DeltaMessage(
                         reasoning=reasoning or None,
                         content=content or None,
                     )
                 return DeltaMessage(reasoning=after) if after else None
 
-            # Check for end tag (implicit mode — think was in prompt)
-            if end_tok in delta_text:
+            # Implicit mode: </think> completed without an explicit <think>.
+            if end_tok in current_text:
                 self._phase = "content"
                 idx = delta_text.find(end_tok)
-                reasoning = delta_text[:idx]
-                content = delta_text[idx + len(end_tok):]
+                if idx >= 0:
+                    reasoning = delta_text[:idx]
+                    content = delta_text[idx + len(end_tok) :]
+                else:
+                    reasoning = None
+                    content = delta_text
+                if not reasoning and not content:
+                    return None
                 return DeltaMessage(
                     reasoning=reasoning or None,
                     content=content or None,
@@ -178,11 +182,17 @@ def extract_reasoning_streaming(
         # ── Phase: thinking ───────────────────────────────────────
         # Inside a reasoning block, waiting for end tag.
         if self._phase == "thinking":
-            if end_tok in delta_text:
+            if end_tok in current_text and end_tok not in previous_text:
                 self._phase = "content"
                 idx = delta_text.find(end_tok)
-                reasoning = delta_text[:idx]
-                content = delta_text[idx + len(end_tok):]
+                if idx >= 0:
+                    reasoning = delta_text[:idx]
+                    content = delta_text[idx + len(end_tok) :]
+                else:
+                    reasoning = delta_text
+                    content = None
+                if not reasoning and not content:
+                    return None
                 return DeltaMessage(
                     reasoning=reasoning or None,
                     content=content or None,