diff --git a/vllm_mlx/api/anthropic_adapter.py b/vllm_mlx/api/anthropic_adapter.py index dbb94200f..7e05ff35b 100644 --- a/vllm_mlx/api/anthropic_adapter.py +++ b/vllm_mlx/api/anthropic_adapter.py @@ -9,6 +9,7 @@ """ import json +import re import uuid from .anthropic_models import ( @@ -26,6 +27,13 @@ ToolDefinition, ) +# Compiled patterns for stripping streaming artifacts from assistant message history. +# In streaming mode, vllm-mlx emits all model output (thinking and tool-call XML) as +# text deltas before extracting tool_use blocks. These patterns strip that leaked content +# from text blocks when rebuilding conversation history for subsequent turns. +_STREAMING_THINK_STRIP = re.compile(r"<think>.*?</think>\s*", re.DOTALL) +_STREAMING_TOOL_CALL_STRIP = re.compile(r"<tool_call>.*?</tool_call>\s*", re.DOTALL) + def anthropic_to_openai(request: AnthropicRequest) -> ChatCompletionRequest: """ @@ -178,7 +186,18 @@ def _convert_message(msg: AnthropicMessage) -> list[Message]: for block in msg.content: if block.type == "text": - text_parts.append(block.text or "") + text = block.text or "" + # Strip <think> tags and <tool_call> XML that leaked from streaming mode. + # In streaming mode, all model output (including thinking and tool call XML) + # is emitted as text deltas before tool_use blocks are parsed. If not stripped + # here, the chat template on the next turn renders duplicate tool calls, + # causing an infinite reasoning loop (900+ second stalls observed with + # Qwen3.5 models).
+ text = _STREAMING_THINK_STRIP.sub("", text) + text = _STREAMING_TOOL_CALL_STRIP.sub("", text) + text = text.strip() + if text: + text_parts.append(text) elif block.type == "tool_use": # Assistant message with tool calls diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index f0328d4e6..695304cb9 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1735,20 +1735,27 @@ async def _stream_anthropic_messages( completion_tokens = output.completion_tokens if delta_text: - # Filter special tokens + # Filter special tokens but BUFFER — do NOT stream yet. + # Tool call XML (<tool_call>...</tool_call>) and <think> tags must not + # leak into text content blocks. We parse and clean the full accumulated + # text after the model finishes, then emit only the cleaned version. content = SPECIAL_TOKENS_PATTERN.sub("", delta_text) if content: accumulated_text += content - delta_event = { - "type": "content_block_delta", - "index": 0, - "delta": {"type": "text_delta", "text": content}, - } - yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n" - # Check for tool calls in accumulated text - _, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request) + # Parse tool calls and get cleaned text (strips <think> and <tool_call> XML). + # IMPORTANT: use cleaned_text here, not the raw accumulated_text. + cleaned_text, tool_calls = _parse_tool_calls_with_parser(accumulated_text, openai_request) + + # Emit cleaned text as a single text delta (raw streaming would leak XML). + if cleaned_text: + delta_event = { + "type": "content_block_delta", + "index": 0, + "delta": {"type": "text_delta", "text": cleaned_text}, + } + yield f"event: content_block_delta\ndata: {json.dumps(delta_event)}\n\n" # Emit content_block_stop for text block yield f"event: content_block_stop\ndata: {json.dumps({'type': 'content_block_stop', 'index': 0})}\n\n"