Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 20 additions & 1 deletion vllm_mlx/api/anthropic_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
"""

import json
import re
import uuid

from .anthropic_models import (
Expand All @@ -26,6 +27,13 @@
ToolDefinition,
)

# Compiled patterns for stripping streaming artifacts from assistant message history.
# In streaming mode, vllm-mlx emits all model output (thinking and tool-call XML) as
# text deltas before extracting tool_use blocks. These patterns strip that leaked content
# from text blocks when rebuilding conversation history for subsequent turns.
#
# Both patterns use non-greedy matching (.*?) so that each tag pair is stripped
# individually rather than everything between the first opening and last closing tag,
# and re.DOTALL so that tag bodies spanning multiple lines are matched. The trailing
# \s* also consumes whitespace/newlines left behind after the tag is removed.
# NOTE(review): an unclosed <think> or <tool_call> tag (no matching close tag) is
# deliberately left untouched — presumably truncated output; confirm that is intended.
_STREAMING_THINK_STRIP = re.compile(r"<think>.*?</think>\s*", re.DOTALL)
_STREAMING_TOOL_CALL_STRIP = re.compile(r"<tool_call>.*?</tool_call>\s*", re.DOTALL)


def anthropic_to_openai(request: AnthropicRequest) -> ChatCompletionRequest:
"""
Expand Down Expand Up @@ -178,7 +186,18 @@ def _convert_message(msg: AnthropicMessage) -> list[Message]:

for block in msg.content:
if block.type == "text":
text_parts.append(block.text or "")
text = block.text or ""
# Strip <think> tags and <tool_call> XML that leaked from streaming mode.
# In streaming mode, all model output (including thinking and tool call XML)
# is emitted as text deltas before tool_use blocks are parsed. If not stripped
# here, the chat template on the next turn renders duplicate tool calls,
# causing an infinite reasoning loop (900+ second stalls observed with
# Qwen3.5 models).
text = _STREAMING_THINK_STRIP.sub("", text)
text = _STREAMING_TOOL_CALL_STRIP.sub("", text)
text = text.strip()
if text:
text_parts.append(text)

elif block.type == "tool_use":
# Assistant message with tool calls
Expand Down
25 changes: 16 additions & 9 deletions vllm_mlx/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1735,20 +1735,27 @@ async def _stream_anthropic_messages(
completion_tokens = output.completion_tokens

if delta_text:
# Filter special tokens
# Filter special tokens but BUFFER — do NOT stream yet.
# Tool call XML (<tool_call>…</tool_call>) and <think> tags must not
# leak into text content blocks. We parse and clean the full accumulated
# text after the model finishes, then emit only the cleaned version.
content = SPECIAL_TOKENS_PATTERN.sub("", delta_text)

if content:
accumulated_text += content
delta_event = {
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": content},
}
yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"

# Check for tool calls in accumulated text
_, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)
# Parse tool calls and get cleaned text (strips <tool_call> and <think> XML).
# IMPORTANT: use cleaned_text here, not the raw accumulated_text.
cleaned_text, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request)

# Emit cleaned text as a single text delta (raw streaming would leak XML).
if cleaned_text:
delta_event = {
"type": "content_block_delta",
"index": 0,
"delta": {"type": "text_delta", "text": cleaned_text},
}
yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n"

# Emit content_block_stop for text block
yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"
Expand Down