Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion tests/entrypoints/openai/parser/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from openai_harmony import Message, Role
from openai_harmony import Author, Message, Role

from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import (
Expand All @@ -12,6 +12,7 @@
has_custom_tools,
parse_chat_input_to_harmony_message,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.responses.harmony import (
response_input_to_harmony,
Expand Down Expand Up @@ -844,6 +845,53 @@ def test_all_standard_channels_present(self) -> None:
)


class TestRenderForCompletion:
    """Checks that render_for_completion keeps analysis (reasoning) content
    in the rendered prompt instead of letting the encoder drop it."""

    def test_preserves_analysis(self):
        """render_for_completion must not strip analysis messages —
        vLLM handles that via auto_drop_analysis_messages()."""
        analysis_msg = Message.from_role_and_content(
            Role.ASSISTANT, "Let me think."
        ).with_channel("analysis")
        final_msg = Message.from_role_and_content(
            Role.ASSISTANT, "The answer is 4."
        ).with_channel("final")
        conversation = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What is 2+2?"),
            analysis_msg,
            final_msg,
        ]
        rendered = get_encoding().decode(render_for_completion(conversation))
        assert "Let me think." in rendered

    def test_preserves_reasoning_across_tool_turns(self):
        """Reasoning before a tool call must survive rendering even when
        the conversation ends with a final message (which triggers the
        encoder's auto_drop_analysis)."""
        reasoning_msg = Message.from_role_and_content(
            Role.ASSISTANT, "I should call the weather API."
        ).with_channel("analysis")
        tool_call = (
            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
            .with_channel("commentary")
            .with_recipient("functions.get_weather")
            .with_content_type("json")
        )
        tool_result = (
            Message.from_author_and_content(
                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
            )
            .with_channel("commentary")
            .with_recipient("assistant")
        )
        # Final message triggers the encoder's auto_drop_analysis
        final_msg = Message.from_role_and_content(
            Role.ASSISTANT, "It is 72F and sunny in SF."
        ).with_channel("final")
        conversation = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What's the weather?"),
            reasoning_msg,
            tool_call,
            tool_result,
            final_msg,
        ]
        rendered = get_encoding().decode(render_for_completion(conversation))
        assert "I should call the weather API." in rendered


class TestResponseInputToHarmonyReasoningItem:
"""Tests for response_input_to_harmony handling of reasoning input items.

Expand Down
13 changes: 13 additions & 0 deletions tests/entrypoints/openai/responses/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
assert output_items[0].name == "custom_tool"
assert output_items[0].server_label == "custom_tool"

def test_analysis_with_function_recipient_creates_function_call(self):
    """GPT-OSS models sometimes emit tool calls on analysis channel.
    Should produce function call, not MCP call."""
    message = (
        Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
        .with_channel("analysis")
        .with_recipient("functions.get_weather")
    )

    output_items = harmony_to_response_output(message)

    # Exactly one item, and it must be a function call (not an MCP call),
    # with the "functions." prefix stripped from the tool name.
    assert len(output_items) == 1
    only_item = output_items[0]
    assert isinstance(only_item, ResponseFunctionToolCall)
    assert only_item.name == "get_weather"

def test_analysis_channel_creates_reasoning(self):
"""Test that analysis channel creates reasoning items."""
message = Message.from_role_and_content(
Expand Down
8 changes: 7 additions & 1 deletion vllm/entrypoints/openai/parser/harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
HarmonyEncodingName,
Message,
ReasoningEffort,
RenderConversationConfig,
Role,
StreamableParser,
SystemContent,
Expand Down Expand Up @@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(

def render_for_completion(messages: list[Message]) -> list[int]:
    """Render a Harmony conversation into prompt token ids for completion.

    Args:
        messages: Full Harmony message history (system/developer/user/
            assistant/tool messages) to render as the prompt.

    Returns:
        Token ids ready to be fed to the model for completion sampling.
    """
    conversation = Conversation.from_messages(messages)
    # Disable auto_drop_analysis: vLLM handles analysis filtering via
    # auto_drop_analysis_messages(). Letting the encoder also drop causes
    # double-filtering that strips reasoning the model needs between
    # tool-calling turns.
    config = RenderConversationConfig(auto_drop_analysis=False)
    token_ids = get_encoding().render_conversation_for_completion(
        conversation, Role.ASSISTANT, config=config
    )
    return token_ids

Expand Down
58 changes: 56 additions & 2 deletions vllm/entrypoints/openai/responses/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,52 @@ def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem
return output_items


def _try_extract_embedded_function_call(
    message: Message,
) -> list[ResponseOutputItem] | None:
    """Try to extract a function call embedded in a preamble message's content.

    When the model outputs a preamble (<|channel|>commentary<|message|>) and
    then immediately embeds a function-call channel sequence as content, the
    harmony parser stores the raw channel tokens in the message content. This
    helper detects that pattern and re-parses it as a function call.

    Returns a list of output items if an embedded call was detected, else None.
    """
    if not message.content:
        return None
    raw = message.content[0].text
    msg_sep = "<|message|>"
    # The embedded sequence looks like:
    #   <|channel|>(commentary|analysis) to=functions.NAME<|message|>ARGS
    candidate_prefixes = (
        "<|channel|>commentary to=functions.",
        "<|channel|>analysis to=functions.",
    )
    for prefix in candidate_prefixes:
        if not raw.startswith(prefix):
            continue
        remainder = raw[len(prefix):]
        if msg_sep not in remainder:
            continue
        name_part, arguments = remainder.split(msg_sep, 1)
        function_name = name_part.strip()
        if not function_name:
            continue
        # Drop a trailing <|end|> token if the args carried one.
        arguments = arguments.removesuffix("<|end|>")
        call_uuid = random_uuid()
        return [
            ResponseFunctionToolCall(
                arguments=arguments,
                call_id=f"call_{call_uuid}",
                type="function_call",
                name=function_name,
                id=f"fc_{call_uuid}",
            )
        ]
    return None


def _parse_message_no_recipient(
message: Message,
) -> list[ResponseOutputItem]:
Expand All @@ -398,6 +444,11 @@ def _parse_message_no_recipient(
# Per Harmony format, preambles (commentary with no recipient) and
# final channel content are both intended to be shown to end-users.
# See: https://cookbook.openai.com/articles/openai-harmony
# But first check if the content is an embedded function call
# (model output a preamble whose content contains the tool call tokens).
embedded = _try_extract_embedded_function_call(message)
if embedded is not None:
return embedded
return [_parse_final_message(message)]

raise ValueError(f"Unknown channel: {message.channel}")
Expand Down Expand Up @@ -427,8 +478,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
if recipient.startswith("browser."):
output_items.append(_parse_browser_tool_call(message, recipient))

# Function calls (should only happen on commentary channel)
elif message.channel == "commentary" and recipient.startswith("functions."):
# Function calls (commentary or analysis channel — GPT-OSS models
# sometimes emit tool calls on analysis channel)
elif message.channel in ("commentary", "analysis") and recipient.startswith(
"functions."
):
output_items.extend(_parse_function_call(message, recipient))

# Built-in MCP tools (python, browser, container)
Expand Down
30 changes: 6 additions & 24 deletions vllm/entrypoints/openai/responses/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
auto_drop_analysis_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
Expand Down Expand Up @@ -1149,30 +1150,11 @@ def _construct_input_messages_with_harmony(
# instructions are ignored.
prev_msgs = self.msg_store[prev_response.id]

# FIXME(woosuk): The slice-delete-reappend cycle below is
# currently a no-op --- it removes messages then puts them all
# back unfiltered. It may be intentionally deferred (see FIXME
# above) or redundant if the Harmony encoder already strips
# analysis messages at render time. If analysis messages need
# to be dropped here, add a channel != "analysis" filter when
# re-appending, similar to auto_drop_analysis_messages in
# harmony_utils.py.
if len(prev_msgs) > 0:
last_msg = prev_msgs[-1]
assert isinstance(last_msg, OpenAIHarmonyMessage)
if last_msg.channel == "final":
prev_final_msg_idx = -1
for i in range(len(prev_msgs) - 2, -1, -1):
prev_msg_i = prev_msgs[i]
assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
if prev_msg_i.channel == "final":
prev_final_msg_idx = i
break
recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
del prev_msgs[prev_final_msg_idx + 1 :]
for msg in recent_turn_msgs:
assert isinstance(msg, OpenAIHarmonyMessage)
prev_msgs.append(msg)
# Drop analysis messages from completed turns so they don't
# bloat the prompt in multi-turn conversations. This mirrors
# what parse_chat_inputs_to_harmony_messages does for the
# Chat Completions path.
prev_msgs = auto_drop_analysis_messages(prev_msgs)
messages.extend(prev_msgs)
# Append the new input.
# Responses API supports simple text inputs without chat format.
Expand Down
Loading