From 798096482a2d728635386820a21b6b10c0f370a2 Mon Sep 17 00:00:00 2001 From: Dynamic LLM Date: Mon, 9 Mar 2026 01:22:41 -0500 Subject: [PATCH] fix: integrate tool call parsing with reasoning parser in streaming mode When both --reasoning-parser and --tool-call-parser are enabled, the reasoning parser branch in stream_chat_completion would consume all tokens without routing them through the tool call parser. This meant XML was emitted as raw text in reasoning or content fields instead of being parsed into structured tool_calls. This fix feeds the reasoning parser's output (whether content or reasoning text) through the tool call parser to detect and emit structured tool calls. Tested with Qwen3-Coder-Next-8bit using --reasoning-parser qwen3 --tool-call-parser qwen3_coder under concurrent 4-agent load. --- vllm_mlx/server.py | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index f0328d4e..b0c08032 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1934,6 +1934,59 @@ async def stream_chat_completion( # Skip this chunk (e.g., </think> token itself) continue + # Check if tool call markup appears in reasoning or content. + # Some models (e.g. Qwen3-Coder) emit <tool_call> directly + # inside reasoning without a transition, so we need to + # intercept tool call tokens regardless of which field they land in. 
+ effective_text = delta_msg.content or delta_msg.reasoning or "" + if tool_parser and effective_text: + if not tool_markup_possible and "<" not in effective_text: + tool_accumulated_text += effective_text + # No tool markup yet — emit the delta as-is + else: + if not tool_markup_possible: + tool_markup_possible = True + tool_previous = tool_accumulated_text + tool_accumulated_text += effective_text + tool_result = tool_parser.extract_tool_calls_streaming( + tool_previous, tool_accumulated_text, effective_text, + ) + + if tool_result is None: + # Inside tool markup — suppress output entirely + continue + + if "tool_calls" in tool_result: + # Emit structured tool calls instead of reasoning/content + tool_calls_detected = True + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + tool_calls=tool_result["tool_calls"] + ), + finish_reason=( + "tool_calls" if output.finished else None + ), + ) + ], + usage=get_usage(output) if output.finished else None, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + continue + + # Tool parser returned content (not a tool call) — use it + tool_content = tool_result.get("content", "") + if tool_content: + if delta_msg.reasoning: + delta_msg.reasoning = tool_content + delta_msg.content = None + else: + delta_msg.content = tool_content + delta_msg.reasoning = None + chunk = ChatCompletionChunk( id=response_id, model=request.model,