From 4e9e73b76e26246d06dd4942281af56f3987964f Mon Sep 17 00:00:00 2001 From: Thump604 Date: Mon, 16 Mar 2026 11:15:53 -0500 Subject: [PATCH 1/2] fix: enable tool call parsing in streaming when reasoning parser is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both --reasoning-parser and --tool-call-parser are configured, streaming tool calls are completely broken. The reasoning parser branch in stream_chat_completion() yields chunks directly without ever passing content through the tool parser. Tool call XML appears as raw text in the content field, tool_calls is null in all chunks, and finish_reason is "stop" instead of "tool_calls". Root cause: The reasoning parser branch (if _reasoning_parser) and tool parser branch are on opposite sides of an if/else — mutually exclusive. Every model that uses both reasoning and tool calling (Qwen3, Qwen3.5, DeepSeek-R1 with tools, etc.) is affected in streaming mode. Fix: After reasoning extraction produces delta_msg.content, pass that content through the tool parser before emitting the chunk. This creates a sequential pipeline matching upstream vLLM's architecture: Phase 1: Reasoning extraction (separate thinking from content) Phase 2: Tool call parsing on content portion Phase 3: Emit chunk with reasoning + structured tool_calls Handles three tool parser states: - None (inside XML markup): suppress content, still emit reasoning - tool_calls detected: emit structured tool_calls with reasoning - Normal content: pass through as before Non-streaming path is unaffected (already works correctly). Tested with Qwen3.5-122B-A10B + qwen3_xml parser + qwen3 reasoning parser on Apple Silicon (M2 Ultra). Streaming now returns structured tool_calls, finish_reason: "tool_calls", with no raw XML in content. 
Fixes #107 --- vllm_mlx/server.py | 72 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index f0328d4e..ed3c6214 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1934,16 +1934,82 @@ async def stream_chat_completion( # Skip this chunk (e.g., token itself) continue + # Phase 2: Tool call parsing on content from reasoning parser. + # Without this, tool XML passes through as plain content text + # when both --reasoning-parser and --tool-call-parser are set. + content = delta_msg.content + reasoning_part = delta_msg.reasoning + + if tool_parser and content: + # Fast path: skip full parsing until '<' is seen + if not tool_markup_possible and "<" not in content: + tool_accumulated_text += content + # Fall through to normal chunk emission + else: + if not tool_markup_possible: + tool_markup_possible = True + tool_previous = tool_accumulated_text + tool_accumulated_text += content + tool_result = tool_parser.extract_tool_calls_streaming( + tool_previous, tool_accumulated_text, content + ) + + if tool_result is None: + # Inside tool markup — suppress content, + # emit reasoning only + if reasoning_part: + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + reasoning=reasoning_part, + ), + finish_reason=None, + ) + ], + ) + yield f"data: {chunk.model_dump_json()}\n\n" + continue + + if "tool_calls" in tool_result: + # Emit structured tool calls with reasoning + tool_calls_detected = True + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + reasoning=reasoning_part, + tool_calls=tool_result["tool_calls"], + ), + finish_reason="tool_calls" if output.finished else None, + ) + ], + usage=get_usage(output) if output.finished else None, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + continue + + # Normal content from tool parser + content = tool_result.get("content", "") + chunk = ChatCompletionChunk( id=response_id, model=request.model, choices=[ ChatCompletionChunkChoice( delta=ChatCompletionChunkDelta( - content=delta_msg.content, - reasoning=delta_msg.reasoning, + content=content if content else None, + reasoning=reasoning_part, + ), + finish_reason=( + "tool_calls" + if (output.finished and tool_calls_detected) + else (output.finish_reason if output.finished else None) ), - finish_reason=output.finish_reason if output.finished else None, ) ], usage=get_usage(output) if output.finished else None, From 5fc3de091c21ee222e96be0097987411e95afdb3 Mon Sep 17 00:00:00 2001 From: Thump604 Date: Mon, 16 Mar 2026 13:23:21 -0500 Subject: [PATCH 2/2] style: black formatting fix for server.py Line-length wrap on finish_reason ternary expression. --- vllm_mlx/server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index ed3c6214..8006fc20 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1985,7 +1985,9 @@ async def stream_chat_completion( reasoning=reasoning_part, tool_calls=tool_result["tool_calls"], ), - finish_reason="tool_calls" if output.finished else None, + finish_reason=( + "tool_calls" if output.finished else None + ), ) ], usage=get_usage(output) if output.finished else None,