Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion vllm_mlx/api/anthropic_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

import json
import re
import uuid

from .anthropic_models import (
Expand All @@ -26,6 +27,13 @@
ToolDefinition,
)

# Compiled patterns for stripping streaming artifacts from assistant message history.
# In streaming mode, vllm-mlx emits all model output (thinking and tool-call XML) as
# text deltas before extracting tool_use blocks. These patterns strip that leaked content
# from text blocks when rebuilding conversation history for subsequent turns.
#
# Both patterns use non-greedy matching (.*?) so that each tag pair is stripped
# individually rather than everything between the first opening and last closing tag,
# and re.DOTALL so that tag bodies spanning multiple lines are matched. The trailing
# \s* also consumes whitespace/newlines left behind after the tag is removed.
# NOTE(review): an unclosed <think> or <tool_call> tag (no matching close tag) is
# deliberately left untouched — presumably truncated output; confirm that is intended.
_STREAMING_THINK_STRIP = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
_STREAMING_TOOL_CALL_STRIP = re.compile(r"<tool_call>.*?</tool_call>\s*", re.DOTALL)


def anthropic_to_openai(request: AnthropicRequest) -> ChatCompletionRequest:
"""
Expand Down Expand Up @@ -178,7 +186,18 @@ def _convert_message(msg: AnthropicMessage) -> list[Message]:

for block in msg.content:
if block.type == "text":
text_parts.append(block.text or "")
text = block.text or ""
# Strip <think> tags and <tool_call> XML that leaked from streaming mode.
# In streaming mode, all model output (including thinking and tool call XML)
# is emitted as text deltas before tool_use blocks are parsed. If not stripped
# here, the chat template on the next turn renders duplicate tool calls,
# causing an infinite reasoning loop (900+ second stalls observed with
# Qwen3.5 models).
text = _STREAMING_THINK_STRIP.sub("", text)
text = _STREAMING_TOOL_CALL_STRIP.sub("", text)
text = text.strip()
if text:
text_parts.append(text)

elif block.type == "tool_use":
# Assistant message with tool calls
Expand Down
25 changes: 16 additions & 9 deletions vllm_mlx/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1735,20 +1735,27 @@ async def _stream_anthropic_messages(
completion_tokens = output.completion_tokens

if delta_text:
# Filter special tokens
# Filter special tokens but BUFFER — do NOT stream yet.
# Tool call XML (<tool_call>…</tool_call>) and <think> tags must not
# leak into text content blocks. We parse and clean the full accumulated
# text after the model finishes, then emit only the cleaned version.
content = SPECIAL_TOKENS_PATTERN.sub("", delta_text)

if content:
accumulated_text += content
delta_event = {
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": content},
}
yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"

# Check for tool calls in accumulated text
_, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)
# Parse tool calls and get cleaned text (strips <tool_call> and <think> XML).
# IMPORTANT: use cleaned_text here, not the raw accumulated_text.
cleaned_text, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)

# Emit cleaned text as a single text delta (raw streaming would leak XML).
if cleaned_text:
delta_event = {
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": cleaned_text},
}
yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"

# Emit content_block_stop for text block
yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
Expand Down