74 changes: 71 additions & 3 deletions vllm_mlx/server.py
@@ -1934,16 +1934,84 @@ async def stream_chat_completion(
                 # Skip this chunk (e.g., <think> token itself)
                 continue
 
+            # Phase 2: Tool call parsing on content from reasoning parser.
+            # Without this, tool XML passes through as plain content text
+            # when both --reasoning-parser and --tool-call-parser are set.
+            content = delta_msg.content
+            reasoning_part = delta_msg.reasoning
+
+            if tool_parser and content:
+                # Fast path: skip full parsing until '<' is seen
+                if not tool_markup_possible and "<" not in content:
+                    tool_accumulated_text += content
+                    # Fall through to normal chunk emission
+                else:
+                    if not tool_markup_possible:
+                        tool_markup_possible = True
+                    tool_previous = tool_accumulated_text
+                    tool_accumulated_text += content
+                    tool_result = tool_parser.extract_tool_calls_streaming(
+                        tool_previous, tool_accumulated_text, content
+                    )
+
+                    if tool_result is None:
+                        # Inside tool markup — suppress content,
+                        # emit reasoning only
+                        if reasoning_part:
+                            chunk = ChatCompletionChunk(
+                                id=response_id,
+                                model=request.model,
+                                choices=[
+                                    ChatCompletionChunkChoice(
+                                        delta=ChatCompletionChunkDelta(
+                                            reasoning=reasoning_part,
+                                        ),
+                                        finish_reason=None,
+                                    )
+                                ],
+                            )
+                            yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    if "tool_calls" in tool_result:
+                        # Emit structured tool calls with reasoning
+                        tool_calls_detected = True
+                        chunk = ChatCompletionChunk(
+                            id=response_id,
+                            model=request.model,
+                            choices=[
+                                ChatCompletionChunkChoice(
+                                    delta=ChatCompletionChunkDelta(
+                                        reasoning=reasoning_part,
+                                        tool_calls=tool_result["tool_calls"],
+                                    ),
+                                    finish_reason=(
+                                        "tool_calls" if output.finished else None
+                                    ),
+                                )
+                            ],
+                            usage=get_usage(output) if output.finished else None,
+                        )
+                        yield f"data: {chunk.model_dump_json()}\n\n"
+                        continue
+
+                    # Normal content from tool parser
+                    content = tool_result.get("content", "")
+
             chunk = ChatCompletionChunk(
                 id=response_id,
                 model=request.model,
                 choices=[
                     ChatCompletionChunkChoice(
                         delta=ChatCompletionChunkDelta(
-                            content=delta_msg.content,
-                            reasoning=delta_msg.reasoning,
+                            content=content if content else None,
+                            reasoning=reasoning_part,
                         ),
-                        finish_reason=output.finish_reason if output.finished else None,
+                        finish_reason=(
+                            "tool_calls"
+                            if (output.finished and tool_calls_detected)
+                            else (output.finish_reason if output.finished else None)
+                        ),
                     )
                 ],
                 usage=get_usage(output) if output.finished else None,
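For context when reading the hunk: it relies on per-request streaming state (tool_calls_detected, tool_markup_possible, tool_accumulated_text) that is initialized outside the excerpt, while tool_previous is assigned inside the loop just before the parser call. A minimal sketch of the assumed setup before the generation loop; the placement is an assumption, and only the names and their roles are taken from the diff:

# Sketch only: per-request streaming state assumed to be initialized
# before the async generation loop (placement is an assumption; the
# names and their roles come from the hunk above).
tool_calls_detected = False    # switches the final finish_reason to "tool_calls"
tool_markup_possible = False   # set once a "<" has been seen in the content stream
tool_accumulated_text = ""     # all content text fed to the tool parser so far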
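The branching on tool_result also implies a three-way contract for tool_parser.extract_tool_calls_streaming(previous_text, current_text, delta_text): return None while the stream is still inside tool markup, a dict containing "tool_calls" once a complete call can be emitted, and a dict containing "content" for ordinary pass-through text. Below is a minimal sketch of a parser honoring that contract; the class name, the <tool_call> tag format, and the emitted tool-call shape are illustrative assumptions, not the repository's actual parser.

# Sketch of the streaming contract assumed by the hunk above. The
# <tool_call> tag format and the class name are illustrative only.
import json
from typing import Optional


class SketchToolParser:
    OPEN = "<tool_call>"
    CLOSE = "</tool_call>"

    def extract_tool_calls_streaming(
        self, previous_text: str, current_text: str, delta_text: str
    ) -> Optional[dict]:
        if self.OPEN not in current_text:
            if "<" in current_text:
                # Possibly a partially received opening tag: hold output back.
                # (A real parser would disambiguate; this sketch just waits.)
                return None
            # No tool markup: pass the delta through as normal content.
            return {"content": delta_text}

        end = current_text.find(self.CLOSE)
        if end == -1:
            # Inside tool markup: suppress output until the call is complete.
            return None

        start = current_text.find(self.OPEN) + len(self.OPEN)
        call = json.loads(current_text[start:end])
        return {
            "tool_calls": [
                {
                    "index": 0,
                    "type": "function",
                    "function": {
                        "name": call["name"],
                        "arguments": json.dumps(call.get("arguments", {})),
                    },
                }
            ]
        }


if __name__ == "__main__":
    parser = SketchToolParser()
    text = '<tool_call>{"name": "get_weather", "arguments": {"city": "Paris"}}</tool_call>'
    accumulated = ""
    for token in text:
        previous, accumulated = accumulated, accumulated + token
        result = parser.extract_tool_calls_streaming(previous, accumulated, token)
        if result is not None:
            print(result)

With a contract like this, the hunk's fast path (the "<" check) only defers invoking the parser; once markup is possible, the parser itself decides per delta whether output is suppressed, emitted as structured tool_calls, or passed through as content.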