diff --git a/vllm_mlx/api/anthropic_adapter.py b/vllm_mlx/api/anthropic_adapter.py
index dbb94200f..7e05ff35b 100644
--- a/vllm_mlx/api/anthropic_adapter.py
+++ b/vllm_mlx/api/anthropic_adapter.py
@@ -9,6 +9,7 @@
"""
import json
+import re
import uuid
from .anthropic_models import (
@@ -26,6 +27,13 @@
ToolDefinition,
)
+# Compiled patterns for stripping streaming artifacts from assistant message history.
+# In streaming mode, vllm-mlx emits all model output (thinking and tool-call XML) as
+# text deltas before extracting tool_use blocks. These patterns strip that leaked content
+# from text blocks when rebuilding conversation history for subsequent turns.
+_STREAMING_THINK_STRIP = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
+_STREAMING_TOOL_CALL_STRIP = re.compile(r"<tool_call>.*?</tool_call>\s*", re.DOTALL)
+
def anthropic_to_openai(request: AnthropicRequest) -> ChatCompletionRequest:
"""
@@ -178,7 +186,18 @@ def _convert_message(msg: AnthropicMessage) -> list[Message]:
for block in msg.content:
if block.type == "text":
- text_parts.append(block.text or "")
+ text = block.text or ""
+            # Strip <think> tags and <tool_call> XML that leaked from streaming mode.
+ # In streaming mode, all model output (including thinking and tool call XML)
+ # is emitted as text deltas before tool_use blocks are parsed. If not stripped
+ # here, the chat template on the next turn renders duplicate tool calls,
+ # causing an infinite reasoning loop (900+ second stalls observed with
+ # Qwen3.5 models).
+ text = _STREAMING_THINK_STRIP.sub("", text)
+ text = _STREAMING_TOOL_CALL_STRIP.sub("", text)
+ text = text.strip()
+ if text:
+ text_parts.append(text)
elif block.type == "tool_use":
# Assistant message with tool calls
diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py
index f0328d4e6..695304cb9 100644
--- a/vllm_mlx/server.py
+++ b/vllm_mlx/server.py
@@ -1735,20 +1735,27 @@ async def _stream_anthropic_messages(
completion_tokens = output.completion_tokens
if delta_text:
- # Filter special tokens
+ # Filter special tokens but BUFFER — do NOT stream yet.
+            # Tool call XML (<tool_call>…</tool_call>) and <think> tags must not
+ # leak into text content blocks. We parse and clean the full accumulated
+ # text after the model finishes, then emit only the cleaned version.
content = SPECIAL_TOKENS_PATTERN.sub("", delta_text)
if content:
accumulated_text += content
- delta_event = {
- "type": "content_block_delta",
- "index": 0,
- "delta": {"type": "text_delta", "text": content},
- }
- yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
- # Check for tool calls in accumulated text
- _, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)
+            # Parse tool calls and get cleaned text (strips <think> and <tool_call> XML).
+ # IMPORTANT: use cleaned_text here, not the raw accumulated_text.
+ cleaned_text, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)
+
+ # Emit cleaned text as a single text delta (raw streaming would leak XML).
+ if cleaned_text:
+ delta_event = {
+ "type": "content_block_delta",
+ "index": 0,
+ "delta": {"type": "text_delta", "text": cleaned_text},
+ }
+ yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"
# Emit content_block_stop for text block
yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"