From 3c7c6749cc3205ad55b71061b0c01d8f49c87a99 Mon Sep 17 00:00:00 2001
From: Will Deines
Date: Tue, 3 Mar 2026 11:59:02 -0500
Subject: [PATCH] [Harmony] Fix analysis-channel tool calls and preserve
 reasoning across turns

Two fixes for GPT-OSS Harmony model behavior:

1. Accept function calls on analysis channel in harmony_to_response_output()
   to match the streaming/in-progress parsers that already handle both
   channels.

2. Disable openai_harmony encoder's auto_drop_analysis to prevent
   double-filtering with vLLM's auto_drop_analysis_messages(), preserving
   reasoning context between tool-calling turns.
---
 .../openai/parser/test_harmony_utils.py       | 50 ++++++++++++++++++-
 .../openai/responses/test_harmony_utils.py    | 13 +++++
 .../openai/parser/harmony_utils.py            |  8 ++-
 vllm/entrypoints/openai/responses/harmony.py  |  7 ++-
 4 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index 7842a1fcd757..32f55b21db71 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from openai_harmony import Message, Role
+from openai_harmony import Author, Message, Role
 
 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -12,6 +12,7 @@
     has_custom_tools,
     parse_chat_input_to_harmony_message,
     parse_chat_output,
+    render_for_completion,
 )
 from vllm.entrypoints.openai.responses.harmony import (
     response_previous_input_to_harmony,
@@ -841,3 +842,50 @@ def test_all_standard_channels_present(self) -> None:
             assert channel in valid_channels, (
                 f"{channel} missing when with_custom_tools={with_tools}"
             )
+
+
+class TestRenderForCompletion:
+    def test_preserves_analysis(self):
+        """render_for_completion must not strip analysis messages —
+        vLLM handles that via auto_drop_analysis_messages()."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What is 2+2?"),
+            Message.from_role_and_content(Role.ASSISTANT, "Let me think.").with_channel(
+                "analysis"
+            ),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "The answer is 4."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "Let me think." in decoded
+
+    def test_preserves_reasoning_across_tool_turns(self):
+        """Reasoning before a tool call must survive rendering even when
+        the conversation ends with a final message (which triggers the
+        encoder's auto_drop_analysis)."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What's the weather?"),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "I should call the weather API."
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_weather")
+            .with_content_type("json"),
+            Message.from_author_and_content(
+                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
+            )
+            .with_channel("commentary")
+            .with_recipient("assistant"),
+            # Final message triggers the encoder's auto_drop_analysis
+            Message.from_role_and_content(
+                Role.ASSISTANT, "It is 72F and sunny in SF."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "I should call the weather API." in decoded
diff --git a/tests/entrypoints/openai/responses/test_harmony_utils.py b/tests/entrypoints/openai/responses/test_harmony_utils.py
index e51538298ff9..d555194322be 100644
--- a/tests/entrypoints/openai/responses/test_harmony_utils.py
+++ b/tests/entrypoints/openai/responses/test_harmony_utils.py
@@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
         assert output_items[0].name == "custom_tool"
         assert output_items[0].server_label == "custom_tool"
 
+    def test_analysis_with_function_recipient_creates_function_call(self):
+        """GPT-OSS models sometimes emit tool calls on analysis channel.
+        Should produce function call, not MCP call."""
+        message = Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+        message = message.with_channel("analysis")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseFunctionToolCall)
+        assert output_items[0].name == "get_weather"
+
     def test_analysis_channel_creates_reasoning(self):
         """Test that analysis channel creates reasoning items."""
         message = Message.from_role_and_content(
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 9b4264456c51..72e5d8820c29 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -13,6 +13,7 @@
     HarmonyEncodingName,
     Message,
     ReasoningEffort,
+    RenderConversationConfig,
     Role,
     StreamableParser,
     SystemContent,
@@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(
 
 def render_for_completion(messages: list[Message]) -> list[int]:
     conversation = Conversation.from_messages(messages)
+    # Disable auto_drop_analysis: vLLM handles analysis filtering via
+    # auto_drop_analysis_messages(). Letting the encoder also drop causes
+    # double-filtering that strips reasoning the model needs between
+    # tool-calling turns.
+    config = RenderConversationConfig(auto_drop_analysis=False)
     token_ids = get_encoding().render_conversation_for_completion(
-        conversation, Role.ASSISTANT
+        conversation, Role.ASSISTANT, config=config
     )
     return token_ids
 
diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py
index 460f310926ad..543f1c1b7d0c 100644
--- a/vllm/entrypoints/openai/responses/harmony.py
+++ b/vllm/entrypoints/openai/responses/harmony.py
@@ -419,8 +419,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
 
         if recipient.startswith("browser."):
             output_items.append(_parse_browser_tool_call(message, recipient))
-        # Function calls (should only happen on commentary channel)
-        elif message.channel == "commentary" and recipient.startswith("functions."):
+        # Function calls (commentary or analysis channel — GPT-OSS models
+        # sometimes emit tool calls on analysis channel)
+        elif message.channel in ("commentary", "analysis") and recipient.startswith(
+            "functions."
+        ):
             output_items.extend(_parse_function_call(message, recipient))
 
         # Built-in MCP tools (python, browser, container)