diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index 21b53dff1507..a2c3d9f9f48e 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from openai_harmony import Message, Role
+from openai_harmony import Author, Message, Role
 
 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -12,6 +12,7 @@
     has_custom_tools,
     parse_chat_input_to_harmony_message,
     parse_chat_output,
+    render_for_completion,
 )
 from vllm.entrypoints.openai.responses.harmony import (
     response_input_to_harmony,
@@ -844,6 +845,53 @@ def test_all_standard_channels_present(self) -> None:
         )
 
 
+class TestRenderForCompletion:
+    def test_preserves_analysis(self):
+        """render_for_completion must not strip analysis messages —
+        vLLM handles that via auto_drop_analysis_messages()."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What is 2+2?"),
+            Message.from_role_and_content(Role.ASSISTANT, "Let me think.").with_channel(
+                "analysis"
+            ),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "The answer is 4."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "Let me think." in decoded
+
+    def test_preserves_reasoning_across_tool_turns(self):
+        """Reasoning before a tool call must survive rendering even when
+        the conversation ends with a final message (which triggers the
+        encoder's auto_drop_analysis)."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What's the weather?"),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "I should call the weather API."
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_weather")
+            .with_content_type("json"),
+            Message.from_author_and_content(
+                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
+            )
+            .with_channel("commentary")
+            .with_recipient("assistant"),
+            # Final message triggers the encoder's auto_drop_analysis
+            Message.from_role_and_content(
+                Role.ASSISTANT, "It is 72F and sunny in SF."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "I should call the weather API." in decoded
+
+
 class TestResponseInputToHarmonyReasoningItem:
     """Tests for response_input_to_harmony handling of reasoning input items.
 
diff --git a/tests/entrypoints/openai/responses/test_harmony_utils.py b/tests/entrypoints/openai/responses/test_harmony_utils.py
index e51538298ff9..d555194322be 100644
--- a/tests/entrypoints/openai/responses/test_harmony_utils.py
+++ b/tests/entrypoints/openai/responses/test_harmony_utils.py
@@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
         assert output_items[0].name == "custom_tool"
         assert output_items[0].server_label == "custom_tool"
 
+    def test_analysis_with_function_recipient_creates_function_call(self):
+        """GPT-OSS models sometimes emit tool calls on analysis channel.
+        Should produce function call, not MCP call."""
+        message = Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+        message = message.with_channel("analysis")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseFunctionToolCall)
+        assert output_items[0].name == "get_weather"
+
     def test_analysis_channel_creates_reasoning(self):
         """Test that analysis channel creates reasoning items."""
         message = Message.from_role_and_content(
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 9b4264456c51..72e5d8820c29 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -13,6 +13,7 @@
     HarmonyEncodingName,
     Message,
     ReasoningEffort,
+    RenderConversationConfig,
     Role,
     StreamableParser,
     SystemContent,
@@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(
 
 def render_for_completion(messages: list[Message]) -> list[int]:
     conversation = Conversation.from_messages(messages)
+    # Disable auto_drop_analysis: vLLM handles analysis filtering via
+    # auto_drop_analysis_messages(). Letting the encoder also drop causes
+    # double-filtering that strips reasoning the model needs between
+    # tool-calling turns.
+    config = RenderConversationConfig(auto_drop_analysis=False)
     token_ids = get_encoding().render_conversation_for_completion(
-        conversation, Role.ASSISTANT
+        conversation, Role.ASSISTANT, config=config
     )
     return token_ids
 
diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py
index faab2f7f4cc7..3cbc2bd4af16 100644
--- a/vllm/entrypoints/openai/responses/harmony.py
+++ b/vllm/entrypoints/openai/responses/harmony.py
@@ -387,6 +387,52 @@ def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem
     return output_items
 
 
+def _try_extract_embedded_function_call(
+    message: Message,
+) -> list[ResponseOutputItem] | None:
+    """Try to extract a function call embedded in a preamble message's content.
+
+    When the model outputs a preamble (<|channel|>commentary<|message|>) and
+    then immediately embeds a function-call channel sequence as content, the
+    harmony parser stores the raw channel tokens in the message content. This
+    helper detects that pattern and re-parses it as a function call.
+
+    Returns a list of output items if an embedded call was detected, else None.
+    """
+    if not message.content:
+        return None
+    text = message.content[0].text
+    # Match: <|channel|>(commentary|analysis) to=functions.NAME<|message|>ARGS
+    for channel_prefix in (
+        "<|channel|>commentary to=functions.",
+        "<|channel|>analysis to=functions.",
+    ):
+        if not text.startswith(channel_prefix):
+            continue
+        rest = text[len(channel_prefix) :]
+        msg_sep = "<|message|>"
+        if msg_sep not in rest:
+            continue
+        name_part, args_part = rest.split(msg_sep, 1)
+        function_name = name_part.strip()
+        if not function_name:
+            continue
+        # Strip trailing <|end|> if present in the args
+        if args_part.endswith("<|end|>"):
+            args_part = args_part[: -len("<|end|>")]
+        random_id = random_uuid()
+        return [
+            ResponseFunctionToolCall(
+                arguments=args_part,
+                call_id=f"call_{random_id}",
+                type="function_call",
+                name=function_name,
+                id=f"fc_{random_id}",
+            )
+        ]
+    return None
+
+
 def _parse_message_no_recipient(
     message: Message,
 ) -> list[ResponseOutputItem]:
@@ -398,6 +444,11 @@
         # Per Harmony format, preambles (commentary with no recipient) and
         # final channel content are both intended to be shown to end-users.
         # See: https://cookbook.openai.com/articles/openai-harmony
+        # But first check if the content is an embedded function call
+        # (model output a preamble whose content contains the tool call tokens).
+        embedded = _try_extract_embedded_function_call(message)
+        if embedded is not None:
+            return embedded
         return [_parse_final_message(message)]
 
     raise ValueError(f"Unknown channel: {message.channel}")
@@ -427,8 +478,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
 
     if recipient.startswith("browser."):
         output_items.append(_parse_browser_tool_call(message, recipient))
 
-    # Function calls (should only happen on commentary channel)
-    elif message.channel == "commentary" and recipient.startswith("functions."):
+    # Function calls (commentary or analysis channel — GPT-OSS models
+    # sometimes emit tool calls on analysis channel)
+    elif message.channel in ("commentary", "analysis") and recipient.startswith(
+        "functions."
+    ):
         output_items.extend(_parse_function_call(message, recipient))
 
     # Built-in MCP tools (python, browser, container)
diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py
index 574282c4cdc6..5775c0872b1e 100644
--- a/vllm/entrypoints/openai/responses/serving.py
+++ b/vllm/entrypoints/openai/responses/serving.py
@@ -61,6 +61,7 @@
 )
 from vllm.entrypoints.openai.models.serving import OpenAIServingModels
 from vllm.entrypoints.openai.parser.harmony_utils import (
+    auto_drop_analysis_messages,
     get_developer_message,
     get_stop_tokens_for_assistant_actions,
     get_system_message,
@@ -1149,30 +1150,11 @@ def _construct_input_messages_with_harmony(
 
             # instructions are ignored.
             prev_msgs = self.msg_store[prev_response.id]
-            # FIXME(woosuk): The slice-delete-reappend cycle below is
-            # currently a no-op --- it removes messages then puts them all
-            # back unfiltered. It may be intentionally deferred (see FIXME
-            # above) or redundant if the Harmony encoder already strips
-            # analysis messages at render time. If analysis messages need
-            # to be dropped here, add a channel != "analysis" filter when
-            # re-appending, similar to auto_drop_analysis_messages in
-            # harmony_utils.py.
-            if len(prev_msgs) > 0:
-                last_msg = prev_msgs[-1]
-                assert isinstance(last_msg, OpenAIHarmonyMessage)
-                if last_msg.channel == "final":
-                    prev_final_msg_idx = -1
-                    for i in range(len(prev_msgs) - 2, -1, -1):
-                        prev_msg_i = prev_msgs[i]
-                        assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
-                        if prev_msg_i.channel == "final":
-                            prev_final_msg_idx = i
-                            break
-                    recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
-                    del prev_msgs[prev_final_msg_idx + 1 :]
-                    for msg in recent_turn_msgs:
-                        assert isinstance(msg, OpenAIHarmonyMessage)
-                        prev_msgs.append(msg)
+            # Drop analysis messages from completed turns so they don't
+            # bloat the prompt in multi-turn conversations. This mirrors
+            # what parse_chat_inputs_to_harmony_messages does for the
+            # Chat Completions path.
+            prev_msgs = auto_drop_analysis_messages(prev_msgs)
             messages.extend(prev_msgs)
             # Append the new input.
             # Responses API supports simple text inputs without chat format.