diff --git a/benchmarks/bench_reasoning_parser.py b/benchmarks/bench_reasoning_parser.py new file mode 100644 index 000000000..c7a2ba13f --- /dev/null +++ b/benchmarks/bench_reasoning_parser.py @@ -0,0 +1,55 @@ +"""Benchmark: reasoning parser streaming performance. + +Measures per-token overhead of extract_reasoning_streaming() at various +output lengths. Demonstrates the difference between O(N²) accumulated +text scanning and O(1) state-machine tracking. + +Usage: + python benchmarks/bench_reasoning_parser.py +""" + +import time + +from vllm_mlx.reasoning.qwen3_parser import Qwen3ReasoningParser + + +def bench_streaming(parser, n_tokens: int, label: str) -> float: + """Simulate n_tokens of streaming through the parser. Returns total ms.""" + parser.reset_state() + + # Simulate: + N reasoning tokens + + 10 content tokens + tokens = [""] + tokens += [f"word{i} " for i in range(n_tokens)] + tokens += [""] + tokens += [f"answer{i} " for i in range(10)] + + accumulated = "" + start = time.perf_counter() + for tok in tokens: + prev = accumulated + accumulated += tok + parser.extract_reasoning_streaming(prev, accumulated, tok) + elapsed = (time.perf_counter() - start) * 1000 + + print(f" {label}: {n_tokens:>6} tokens -> {elapsed:>8.2f}ms " + f"({elapsed / (n_tokens + 11):.3f}ms/tok)") + return elapsed + + +def main(): + parser = Qwen3ReasoningParser() + + print("Reasoning parser streaming benchmark") + print("=" * 60) + print() + + for n in [50, 100, 200, 500, 1000, 2000, 5000]: + bench_streaming(parser, n, f"{n} tokens") + + print() + print("At 50 tok/s, per-token budget is 20ms.") + print("Parser overhead should be <0.1ms/tok to be negligible.") + + +if __name__ == "__main__": + main() diff --git a/vllm_mlx/reasoning/think_parser.py b/vllm_mlx/reasoning/think_parser.py index 136348206..a2e9cb727 100644 --- a/vllm_mlx/reasoning/think_parser.py +++ b/vllm_mlx/reasoning/think_parser.py @@ -9,6 +9,12 @@ 1. Both tags in output: reasoningcontent 2. Only closing tag (think injected in prompt): reasoningcontent 3. No tags: pure content + +Performance: The streaming parser uses a simple state machine to track the +current phase (pre-think / thinking / content). Tag completion is detected +against the accumulated text for correctness when `` / `` are +split across delta boundaries, but phase tracking still avoids the old +whole-output rescanning behavior. """ from abc import abstractmethod @@ -27,8 +33,12 @@ class BaseThinkingReasoningParser(ReasoningParser): and only appears in the model output. This is common with AI agents like OpenCode that force models to reason by injecting thinking tags. - The parser tracks state during streaming to correctly separate reasoning - from content as tokens arrive incrementally. + The streaming parser uses a state machine with three phases: + + pre_think -> thinking -> content + + Transitions are tracked by parser state. Accumulated text is consulted only + to detect when a start/end tag has completed across delta boundaries. """ @property @@ -43,6 +53,12 @@ def end_token(self) -> str: def __init__(self, tokenizer=None): super().__init__(tokenizer) + # Streaming state — reset per request via reset_state() + self._phase: str = "pre_think" # "pre_think" | "thinking" | "content" + + def reset_state(self): + """Reset state machine for a new streaming request.""" + self._phase = "pre_think" def extract_reasoning( self, @@ -66,14 +82,11 @@ def extract_reasoning( # Case 1: Both tags present (normal case) if self.start_token in text and self.end_token in text: - # Get everything after start token _, _, after_start = text.partition(self.start_token) - # Split on end token reasoning, _, content = after_start.partition(self.end_token) return reasoning.strip() or None, content.strip() or None # Case 2: Only closing tag (think was injected in prompt) - # Everything before is reasoning if self.end_token in text: reasoning, _, content = text.partition(self.end_token) return reasoning.strip() or None, content.strip() or None @@ -83,7 +96,7 @@ def extract_reasoning( _, _, reasoning = text.partition(self.start_token) return reasoning.strip() or None, None - # Case 4: No tags at all - pure content + # Case 4: No tags at all — pure content return None, model_output def extract_reasoning_streaming( @@ -93,123 +106,99 @@ def extract_reasoning_streaming( delta_text: str, ) -> DeltaMessage | None: """ - Extract reasoning from streaming delta using text-based detection. + Extract reasoning from a streaming delta using state-machine tracking. + + Instead of rescanning the full accumulated text on every token, this + method tracks the current phase (pre_think / thinking / content) and + only consults accumulated text to detect completed start/end tags that + were split across delta boundaries. - Handles implicit reasoning mode where was in the prompt - and only appears in the output. + Handles three scenarios: + 1. Explicit ... in model output + 2. Implicit mode ( in prompt, only in output) + 3. No tags at all (pure content after first token with no reasoning) Args: previous_text: Text accumulated before this delta. current_text: Text including this delta. - delta_text: Just the new text. + delta_text: Just the new text in this chunk. Returns: - DeltaMessage with reasoning/content, or None to skip. + DeltaMessage with reasoning and/or content, or None to skip. """ - # Skip if delta is just the special tokens themselves - stripped_delta = delta_text.strip() - if stripped_delta == self.start_token: - return None - if stripped_delta == self.end_token: + if not delta_text: return None - # Check token positions in text (stateless text-based detection) - start_in_prev = self.start_token in previous_text - start_in_current = self.start_token in current_text - end_in_prev = self.end_token in previous_text - end_in_delta = self.end_token in delta_text - - # Case 1: Explicit found in text - standard behavior - if start_in_current: - return self._handle_explicit_think( - previous_text, delta_text, start_in_prev, end_in_prev, end_in_delta - ) - - # Case 2: No but found - implicit reasoning mode - # This handles when was injected in the prompt - if self.end_token in current_text: - return self._handle_implicit_think(delta_text, end_in_prev, end_in_delta) - - # Case 3: No think tags seen yet - # We can't know if was in the prompt, so we must make a choice: - # - Treat as content (safe, but loses reasoning if think was in prompt) - # - Treat as reasoning (risky, wrong if no thinking at all) - # We choose to treat as reasoning IF we haven't seen yet, - # because if think was in prompt, we want to capture the reasoning. - # This will be corrected once is seen. - return DeltaMessage(reasoning=delta_text) - - def _handle_explicit_think( - self, - previous_text: str, - delta_text: str, - start_in_prev: bool, - end_in_prev: bool, - end_in_delta: bool, - ) -> DeltaMessage | None: - """Handle case where tag is explicitly in the output.""" - start_in_delta = self.start_token in delta_text - - if start_in_prev: - # We're after the start token - if end_in_delta: - # Transition: end token in this delta - idx = delta_text.find(self.end_token) - reasoning_part = delta_text[:idx] - content_part = delta_text[idx + len(self.end_token) :] + start_tok = self.start_token + end_tok = self.end_token + + # ── Phase: pre_think ────────────────────────────────────── + # Haven't seen a completed tag yet. Could be: + # - About to see (explicit reasoning) + # - Already inside implicit reasoning (think was in prompt) + # - No reasoning at all (pure content model) + if self._phase == "pre_think": + if start_tok in current_text: + self._phase = "thinking" + idx = delta_text.find(start_tok) + after = delta_text[idx + len(start_tok) :] if idx >= 0 else delta_text + + if end_tok in after: + self._phase = "content" + eidx = after.find(end_tok) + reasoning = after[:eidx] + content = after[eidx + len(end_tok) :] + if not reasoning and not content: + return None + return DeltaMessage( + reasoning=reasoning or None, + content=content or None, + ) + return DeltaMessage(reasoning=after) if after else None + + # Implicit mode: completed without an explicit . + if end_tok in current_text: + self._phase = "content" + idx = delta_text.find(end_tok) + if idx >= 0: + reasoning = delta_text[:idx] + content = delta_text[idx + len(end_tok) :] + else: + reasoning = None + content = delta_text + if not reasoning and not content: + return None return DeltaMessage( - reasoning=reasoning_part if reasoning_part else None, - content=content_part if content_part else None, + reasoning=reasoning or None, + content=content or None, ) - elif end_in_prev: - # Already past reasoning phase - pure content - return DeltaMessage(content=delta_text) - else: - # Still in reasoning phase - return DeltaMessage(reasoning=delta_text) - - elif start_in_delta: - # Start token is in this delta - start_idx = delta_text.find(self.start_token) - - if end_in_delta: - # Both tokens in this delta - end_idx = delta_text.find(self.end_token) - reasoning_part = delta_text[start_idx + len(self.start_token) : end_idx] - content_part = delta_text[end_idx + len(self.end_token) :] - return DeltaMessage( - reasoning=reasoning_part if reasoning_part else None, - content=content_part if content_part else None, - ) - else: - # Only start token - beginning of reasoning - reasoning_part = delta_text[start_idx + len(self.start_token) :] + + # No tags — default to reasoning (implicit mode assumption). + # If the model doesn't use thinking at all, the server's + # non-parser path handles it. This path only activates when + # a reasoning parser is explicitly configured. + return DeltaMessage(reasoning=delta_text) + + # ── Phase: thinking ─────────────────────────────────────── + # Inside a reasoning block, waiting for end tag. + if self._phase == "thinking": + if end_tok in current_text and end_tok not in previous_text: + self._phase = "content" + idx = delta_text.find(end_tok) + if idx >= 0: + reasoning = delta_text[:idx] + content = delta_text[idx + len(end_tok) :] + else: + reasoning = delta_text + content = None + if not reasoning and not content: + return None return DeltaMessage( - reasoning=reasoning_part if reasoning_part else None + reasoning=reasoning or None, + content=content or None, ) + return DeltaMessage(reasoning=delta_text) - # Fallback - treat as content + # ── Phase: content ──────────────────────────────────────── + # Past the reasoning block — everything is content. return DeltaMessage(content=delta_text) - - def _handle_implicit_think( - self, - delta_text: str, - end_in_prev: bool, - end_in_delta: bool, - ) -> DeltaMessage | None: - """Handle case where was in prompt (only in output).""" - if end_in_delta: - # Transition: end token in this delta - idx = delta_text.find(self.end_token) - reasoning_part = delta_text[:idx] - content_part = delta_text[idx + len(self.end_token) :] - return DeltaMessage( - reasoning=reasoning_part if reasoning_part else None, - content=content_part if content_part else None, - ) - elif end_in_prev: - # Already past reasoning phase - pure content - return DeltaMessage(content=delta_text) - else: - # Still in implicit reasoning phase - return DeltaMessage(reasoning=delta_text)