From 4e9e73b76e26246d06dd4942281af56f3987964f Mon Sep 17 00:00:00 2001 From: Thump604 Date: Mon, 16 Mar 2026 11:15:53 -0500 Subject: [PATCH 1/2] fix: enable tool call parsing in streaming when reasoning parser is active MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both --reasoning-parser and --tool-call-parser are configured, streaming tool calls are completely broken. The reasoning parser branch in stream_chat_completion() yields chunks directly without ever passing content through the tool parser. Tool call XML appears as raw text in the content field, tool_calls is null in all chunks, and finish_reason is "stop" instead of "tool_calls". Root cause: The reasoning parser branch (if _reasoning_parser) and tool parser branch are on opposite sides of an if/else — mutually exclusive. Every model that uses both reasoning and tool calling (Qwen3, Qwen3.5, DeepSeek-R1 with tools, etc.) is affected in streaming mode. Fix: After reasoning extraction produces delta_msg.content, pass that content through the tool parser before emitting the chunk. This creates a sequential pipeline matching upstream vLLM's architecture: Phase 1: Reasoning extraction (separate thinking from content) Phase 2: Tool call parsing on content portion Phase 3: Emit chunk with reasoning + structured tool_calls Handles three tool parser states: - None (inside XML markup): suppress content, still emit reasoning - tool_calls detected: emit structured tool_calls with reasoning - Normal content: pass through as before Non-streaming path is unaffected (already works correctly). Tested with Qwen3.5-122B-A10B + qwen3_xml parser + qwen3 reasoning parser on Apple Silicon (M2 Ultra). Streaming now returns structured tool_calls, finish_reason: "tool_calls", with no raw XML in content. 
Fixes #107 --- vllm_mlx/server.py | 72 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index f0328d4e..ed3c6214 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1934,16 +1934,82 @@ async def stream_chat_completion( # Skip this chunk (e.g., token itself) continue + # Phase 2: Tool call parsing on content from reasoning parser. + # Without this, tool XML passes through as plain content text + # when both --reasoning-parser and --tool-call-parser are set. + content = delta_msg.content + reasoning_part = delta_msg.reasoning + + if tool_parser and content: + # Fast path: skip full parsing until '<' is seen + if not tool_markup_possible and "<" not in content: + tool_accumulated_text += content + # Fall through to normal chunk emission + else: + if not tool_markup_possible: + tool_markup_possible = True + tool_previous = tool_accumulated_text + tool_accumulated_text += content + tool_result = tool_parser.extract_tool_calls_streaming( + tool_previous, tool_accumulated_text, content + ) + + if tool_result is None: + # Inside tool markup — suppress content, + # emit reasoning only + if reasoning_part: + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + reasoning=reasoning_part, + ), + finish_reason=None, + ) + ], + ) + yield f"data: {chunk.model_dump_json()}\n\n" + continue + + if "tool_calls" in tool_result: + # Emit structured tool calls with reasoning + tool_calls_detected = True + chunk = ChatCompletionChunk( + id=response_id, + model=request.model, + choices=[ + ChatCompletionChunkChoice( + delta=ChatCompletionChunkDelta( + reasoning=reasoning_part, + tool_calls=tool_result["tool_calls"], + ), + finish_reason="tool_calls" if output.finished else None, + ) + ], + usage=get_usage(output) if output.finished else None, + ) + yield f"data: {chunk.model_dump_json()}\n\n" + continue + + # Normal content from tool parser + content = tool_result.get("content", "") + chunk = ChatCompletionChunk( id=response_id, model=request.model, choices=[ ChatCompletionChunkChoice( delta=ChatCompletionChunkDelta( - content=delta_msg.content, - reasoning=delta_msg.reasoning, + content=content if content else None, + reasoning=reasoning_part, + ), + finish_reason=( + "tool_calls" + if (output.finished and tool_calls_detected) + else (output.finish_reason if output.finished else None) ), - finish_reason=output.finish_reason if output.finished else None, ) ], usage=get_usage(output) if output.finished else None, From 5fc3de091c21ee222e96be0097987411e95afdb3 Mon Sep 17 00:00:00 2001 From: Thump604 Date: Mon, 16 Mar 2026 13:23:21 -0500 Subject: [PATCH 2/2] style: black formatting fix for server.py Line-length wrap on finish_reason ternary expression. --- vllm_mlx/server.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm_mlx/server.py b/vllm_mlx/server.py index ed3c6214..8006fc20 100644 --- a/vllm_mlx/server.py +++ b/vllm_mlx/server.py @@ -1985,7 +1985,9 @@ async def stream_chat_completion( reasoning=reasoning_part, tool_calls=tool_result["tool_calls"], ), - finish_reason="tool_calls" if output.finished else None, + finish_reason=( + "tool_calls" if output.finished else None + ), ) ], usage=get_usage(output) if output.finished else None,