Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 53 additions & 0 deletions vllm_mlx/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -1934,6 +1934,59 @@ async def stream_chat_completion(
# Skip this chunk (e.g., <think> token itself)
continue

# Check if tool call markup appears in reasoning or content.
# Some models (e.g. Qwen3-Coder) emit <tool_call> directly
# inside reasoning without a </think> transition, so we need to
# intercept tool call tokens regardless of which field they land in.
effective_text = delta_msg.content or delta_msg.reasoning or ""
if tool_parser and effective_text:
if not tool_markup_possible and "<" not in effective_text:
tool_accumulated_text += effective_text
# No tool markup yet — emit the delta as-is
else:
if not tool_markup_possible:
tool_markup_possible = True
tool_previous = tool_accumulated_text
tool_accumulated_text += effective_text
tool_result = tool_parser.extract_tool_calls_streaming(
tool_previous, tool_accumulated_text, effective_text,
)

if tool_result is None:
# Inside tool markup — suppress output entirely
continue

if "tool_calls" in tool_result:
# Emit structured tool calls instead of reasoning/content
tool_calls_detected = True
chunk = ChatCompletionChunk(
id=response_id,
model=request.model,
choices=[
ChatCompletionChunkChoice(
delta=ChatCompletionChunkDelta(
tool_calls=tool_result["tool_calls"]
),
finish_reason=(
"tool_calls" if output.finished else None
),
)
],
usage=get_usage(output) if output.finished else None,
)
yield f"data: {chunk.model_dump_json()}\n\n"
continue

# Tool parser returned content (not a tool call) — use it
tool_content = tool_result.get("content", "")
if tool_content:
if delta_msg.reasoning:
delta_msg.reasoning = tool_content
delta_msg.content = None
else:
delta_msg.content = tool_content
delta_msg.reasoning = None

chunk = ChatCompletionChunk(
id=response_id,
model=request.model,
Expand Down
Loading