diff --git a/tests/test_anthropic_stream_scrubber.py b/tests/test_anthropic_stream_scrubber.py
index d436b7fb..eda4f4ac 100644
--- a/tests/test_anthropic_stream_scrubber.py
+++ b/tests/test_anthropic_stream_scrubber.py
@@ -293,15 +293,11 @@ class TestScrubberParameterTags:
def test_parameter_tag_single_delta(self):
scrubber = _AnthropicStreamScrubber()
result = scrubber.feed("Before NYC After")
- # parameter= maps to IN_FUNCTION mode, closes on
- # But is consumed as stray closing tag in TEXT mode...
- # Actually
- # So won't close IN_FUNCTION. Let's verify behavior.
- # The close for IN_FUNCTION is , so content after
- # stays suppressed until is seen or stream ends.
result += scrubber.flush()
assert " tag should be stripped."""
@@ -685,10 +681,10 @@ def test_function_back_to_text(self):
scrubber.feed("body")
assert scrubber.mode == "TEXT"
- def test_text_to_parameter_enters_function_mode(self):
+ def test_text_to_parameter_enters_parameter_mode(self):
scrubber = _AnthropicStreamScrubber()
scrubber.feed("text")
- assert scrubber.mode == "IN_FUNCTION"
+ assert scrubber.mode == "IN_PARAMETER"
def test_stray_close_stays_in_text(self):
"""Stray closing tags should not change mode."""
@@ -1337,3 +1333,136 @@ def test_thinking_disabled_model(self):
thinking=AnthropicThinkingConfig(type="disabled"),
)
assert _is_thinking_enabled(req) is False
+
+
+# =============================================================================
+# Review Fix Tests
+# =============================================================================
+
+
+class TestScrubberCarryLimit:
+ """Fix 1: Carry buffer must not grow without bound when '>' never arrives."""
+
+ def test_long_false_prefix_emitted(self):
+ """' is literal text."""
+ scrubber = _AnthropicStreamScrubber()
+ fake = "' must flush carry at some point."""
+ scrubber = _AnthropicStreamScrubber()
+ scrubber.feed(" must close on , not ."""
+
+ def test_standalone_parameter_closes_on_parameter_close(self):
+ scrubber = _AnthropicStreamScrubber()
+ text = "beforehiddenafter"
+ result = scrubber.feed(text) + scrubber.flush()
+ assert result == "beforeafter"
+
+ def test_standalone_parameter_does_not_eat_until_function_close(self):
+ """ must NOT suppress until ."""
+ scrubber = _AnthropicStreamScrubber()
+ text = "beforehiddenmiddleafter"
+ result = scrubber.feed(text) + scrubber.flush()
+ assert "before" in result
+ assert "middle" in result
+ assert "after" in result
+ assert "hidden" not in result
+
+ def test_parameter_inside_function_still_suppressed(self):
+ """Within , everything is suppressed by the outer block."""
+ scrubber = _AnthropicStreamScrubber()
+ text = "okvdone"
+ result = scrubber.feed(text) + scrubber.flush()
+ assert result == "okdone"
+
+
+class TestRouterCarryLimit:
+ """Fix 1: Router carry buffer limit."""
+
+ def test_long_false_prefix_emitted(self):
+ router = _AnthropicStreamRouter()
+ fake = "hiddenafter")
+ pieces += router.flush()
+ text = "".join(t for k, t in pieces if k == "text")
+ assert text == "beforeafter"
+
+
+class TestRouterDeadCode:
+ """Fix 4: _implicit_think attribute should not exist."""
+
+ def test_no_implicit_think_attribute(self):
+ router = _AnthropicStreamRouter()
+ assert not hasattr(router, "_implicit_think")
+
+ def test_no_implicit_think_when_start_true(self):
+ router = _AnthropicStreamRouter(start_in_thinking=True)
+ assert not hasattr(router, "_implicit_think")
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
index a6e9ebb5..4ac501e3 100644
--- a/vllm_mlx/server.py
+++ b/vllm_mlx/server.py
@@ -42,6 +42,7 @@
import json
import logging
import os
+import re
import secrets
import tempfile
import threading
@@ -1690,6 +1691,7 @@ class _AnthropicStreamScrubber:
* **IN_THINK** – suppress until ````.
* **IN_TOOLCALL** – suppress until ````.
* **IN_FUNCTION** – suppress until ````.
+ * **IN_PARAMETER** – suppress until ````.
"""
# --- Fixed (exact-match) tags ----------------------------------------
@@ -1729,7 +1731,7 @@ class _AnthropicStreamScrubber:
THINK_OPEN: "IN_THINK",
TOOL_OPEN: "IN_TOOLCALL",
FUNC_PREFIX: "IN_FUNCTION",
- PARAM_PREFIX: "IN_FUNCTION", # parameters inside function blocks
+ PARAM_PREFIX: "IN_PARAMETER",
}
# Map from suppression mode → closing tag
@@ -1737,6 +1739,7 @@ class _AnthropicStreamScrubber:
"IN_THINK": THINK_CLOSE,
"IN_TOOLCALL": TOOL_CLOSE,
"IN_FUNCTION": FUNC_CLOSE,
+ "IN_PARAMETER": PARAM_CLOSE,
}
def __init__(self) -> None:
@@ -1817,9 +1820,16 @@ def feed(self, delta: str) -> str:
if consume < 0:
# Prefix tag found but closing '>' missing – truncated.
+ carry_candidate = s[pos:]
+ if len(carry_candidate) > self.MAX_TAG * 2:
+ # Carry grew too large — this is a literal '<',
+ # not a real tag. Emit everything and move on.
+ out.append(s[i : pos + len(marker)])
+ i = pos + len(marker)
+ continue
if pos > i:
out.append(s[i:pos])
- self.carry = s[pos:]
+ self.carry = carry_candidate
return "".join(out)
tag_end = pos + consume
@@ -1870,10 +1880,11 @@ def flush(self) -> str:
for tag in self._EXACT_TAGS:
result = result.replace(tag, "")
# Strip any residual prefix tags (e.g. ````).
- import re
-
result = re.sub(r"]*>", "", result)
result = re.sub(r"]*>", "", result)
+ # Strip partial prefix tags at end of stream (no closing '>').
+ result = re.sub(r"]*$", "", result)
+ result = re.sub(r"]*$", "", result)
self.carry = ""
return result
self.carry = ""
@@ -1912,6 +1923,7 @@ class _AnthropicStreamRouter:
_EXACT_TAGS = _AnthropicStreamScrubber._EXACT_TAGS
_PREFIX_TAGS = _AnthropicStreamScrubber._PREFIX_TAGS
+ MAX_TAG = _AnthropicStreamScrubber.MAX_TAG
CARRY_N = _AnthropicStreamScrubber.CARRY_N
_MODE_MAP = _AnthropicStreamScrubber._MODE_MAP
@@ -1923,7 +1935,6 @@ def __init__(self, start_in_thinking: bool = False) -> None:
# into the prompt, so the first output IS thinking content).
self.mode: str = "IN_THINK" if start_in_thinking else "TEXT"
self.carry: str = ""
- self._implicit_think = start_in_thinking
# Delegate marker scanning to a scrubber instance.
self._scanner = _AnthropicStreamScrubber()
@@ -1959,9 +1970,15 @@ def feed(self, delta: str) -> list[tuple[str, str]]:
pos, marker, consume = hit
if consume < 0:
+ carry_candidate = s[pos:]
+ if len(carry_candidate) > self.MAX_TAG * 2:
+ # Carry grew too large — emit as literal text.
+ pieces.append(("text", s[i : pos + len(marker)]))
+ i = pos + len(marker)
+ continue
if pos > i:
pieces.append(("text", s[i:pos]))
- self.carry = s[pos:]
+ self.carry = carry_candidate
return pieces
tag_end = pos + consume
@@ -2001,7 +2018,7 @@ def feed(self, delta: str) -> list[tuple[str, str]]:
self.mode = "TEXT"
else:
- # IN_TOOLCALL or IN_FUNCTION – suppress content.
+ # IN_TOOLCALL, IN_FUNCTION, or IN_PARAMETER – suppress.
close_tag = self._CLOSE_MAP[self.mode]
close_pos = s.find(close_tag, i)
if close_pos == -1:
@@ -2025,10 +2042,11 @@ def flush(self) -> list[tuple[str, str]]:
result = self.carry
for tag in self._EXACT_TAGS:
result = result.replace(tag, "")
- import re
-
result = re.sub(r"]*>", "", result)
result = re.sub(r"]*>", "", result)
+ # Strip partial prefix tags at end of stream (no closing '>').
+ result = re.sub(r"]*$", "", result)
+ result = re.sub(r"]*$", "", result)
if result:
pieces.append(("text", result))
# IN_TOOLCALL/IN_FUNCTION – discard.
@@ -2116,6 +2134,10 @@ async def _stream_anthropic_messages(
if thinking_enabled:
# Use the stream router which yields typed (kind, text) pieces
# that separate thinking content from user-facing text.
+ # start_in_thinking=True is correct for current models (Qwen3/3.5
+ # with --reasoning-parser qwen3): the chat template injects
+ # into the prompt, so the model's first output is thinking content
+ # WITHOUT an explicit tag.
router: _AnthropicStreamRouter | None = _AnthropicStreamRouter(
start_in_thinking=True
)