Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion tests/entrypoints/openai/parser/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from openai_harmony import Message, Role
from openai_harmony import Author, Message, Role

from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import (
Expand All @@ -12,6 +12,7 @@
has_custom_tools,
parse_chat_input_to_harmony_message,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.responses.harmony import (
response_previous_input_to_harmony,
Expand Down Expand Up @@ -841,3 +842,50 @@ def test_all_standard_channels_present(self) -> None:
assert channel in valid_channels, (
f"{channel} missing when with_custom_tools={with_tools}"
)


class TestRenderForCompletion:
    """Tests that render_for_completion keeps analysis (reasoning) messages
    in the rendered token stream instead of letting the encoder drop them.
    """

    def test_preserves_analysis(self) -> None:
        """render_for_completion must not strip analysis messages —
        vLLM handles that via auto_drop_analysis_messages()."""
        messages = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What is 2+2?"),
            Message.from_role_and_content(Role.ASSISTANT, "Let me think.").with_channel(
                "analysis"
            ),
            Message.from_role_and_content(
                Role.ASSISTANT, "The answer is 4."
            ).with_channel("final"),
        ]
        token_ids = render_for_completion(messages)
        decoded = get_encoding().decode(token_ids)
        assert "Let me think." in decoded

    def test_preserves_reasoning_across_tool_turns(self) -> None:
        """Reasoning before a tool call must survive rendering even when
        the conversation ends with a final message (which triggers the
        encoder's auto_drop_analysis)."""
        messages = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What's the weather?"),
            Message.from_role_and_content(
                Role.ASSISTANT, "I should call the weather API."
            ).with_channel("analysis"),
            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
            .with_channel("commentary")
            .with_recipient("functions.get_weather")
            .with_content_type("json"),
            Message.from_author_and_content(
                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
            )
            .with_channel("commentary")
            .with_recipient("assistant"),
            # Final message triggers the encoder's auto_drop_analysis
            Message.from_role_and_content(
                Role.ASSISTANT, "It is 72F and sunny in SF."
            ).with_channel("final"),
        ]
        token_ids = render_for_completion(messages)
        decoded = get_encoding().decode(token_ids)
        assert "I should call the weather API." in decoded
13 changes: 13 additions & 0 deletions tests/entrypoints/openai/responses/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
assert output_items[0].name == "custom_tool"
assert output_items[0].server_label == "custom_tool"

def test_analysis_with_function_recipient_creates_function_call(self) -> None:
    """GPT-OSS models sometimes emit tool calls on the analysis channel.

    A message on the analysis channel whose recipient is a ``functions.*``
    name should produce a function-call output item, not an MCP call.
    """
    # Build the message with the chained-builder idiom used elsewhere in
    # these tests instead of repeated reassignment.
    message = (
        Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
        .with_channel("analysis")
        .with_recipient("functions.get_weather")
    )

    output_items = harmony_to_response_output(message)

    assert len(output_items) == 1
    assert isinstance(output_items[0], ResponseFunctionToolCall)
    # Name is the recipient with the "functions." prefix stripped.
    assert output_items[0].name == "get_weather"

def test_analysis_channel_creates_reasoning(self):
"""Test that analysis channel creates reasoning items."""
message = Message.from_role_and_content(
Expand Down
8 changes: 7 additions & 1 deletion vllm/entrypoints/openai/parser/harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
HarmonyEncodingName,
Message,
ReasoningEffort,
RenderConversationConfig,
Role,
StreamableParser,
SystemContent,
Expand Down Expand Up @@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(

def render_for_completion(messages: list[Message]) -> list[int]:
    """Render a harmony conversation into prompt token ids for completion.

    Args:
        messages: Harmony messages forming the conversation so far.

    Returns:
        Token ids ready to be used as the completion prompt.
    """
    conversation = Conversation.from_messages(messages)
    # Disable auto_drop_analysis: vLLM handles analysis filtering via
    # auto_drop_analysis_messages(). Letting the encoder also drop causes
    # double-filtering that strips reasoning the model needs between
    # tool-calling turns.
    config = RenderConversationConfig(auto_drop_analysis=False)
    token_ids = get_encoding().render_conversation_for_completion(
        conversation, Role.ASSISTANT, config=config
    )
    return token_ids

Expand Down
7 changes: 5 additions & 2 deletions vllm/entrypoints/openai/responses/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
if recipient.startswith("browser."):
output_items.append(_parse_browser_tool_call(message, recipient))

# Function calls (should only happen on commentary channel)
elif message.channel == "commentary" and recipient.startswith("functions."):
# Function calls (commentary or analysis channel — GPT-OSS models
# sometimes emit tool calls on analysis channel)
elif message.channel in ("commentary", "analysis") and recipient.startswith(
"functions."
):
output_items.extend(_parse_function_call(message, recipient))

# Built-in MCP tools (python, browser, container)
Expand Down