From 3c7c6749cc3205ad55b71061b0c01d8f49c87a99 Mon Sep 17 00:00:00 2001
From: Will Deines
Date: Tue, 3 Mar 2026 11:59:02 -0500
Subject: [PATCH] [Harmony] Fix analysis-channel tool calls and preserve
 reasoning across turns

Two fixes for GPT-OSS Harmony model behavior:

1. Accept function calls on analysis channel in harmony_to_response_output()
   to match the streaming/in-progress parsers that already handle both
   channels.

2. Disable openai_harmony encoder's auto_drop_analysis to prevent
   double-filtering with vLLM's auto_drop_analysis_messages(), preserving
   reasoning context between tool-calling turns.
---
 .../openai/parser/test_harmony_utils.py       | 50 ++++++++++++++++++-
 .../openai/responses/test_harmony_utils.py    | 13 +++++
 .../openai/parser/harmony_utils.py            |  8 ++-
 vllm/entrypoints/openai/responses/harmony.py  |  7 ++-
 4 files changed, 74 insertions(+), 4 deletions(-)

diff --git a/tests/entrypoints/openai/parser/test_harmony_utils.py b/tests/entrypoints/openai/parser/test_harmony_utils.py
index 7842a1fcd757..32f55b21db71 100644
--- a/tests/entrypoints/openai/parser/test_harmony_utils.py
+++ b/tests/entrypoints/openai/parser/test_harmony_utils.py
@@ -2,7 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 import pytest
-from openai_harmony import Message, Role
+from openai_harmony import Author, Message, Role
 
 from tests.entrypoints.openai.utils import verify_harmony_messages
 from vllm.entrypoints.openai.parser.harmony_utils import (
@@ -12,6 +12,7 @@
     has_custom_tools,
     parse_chat_input_to_harmony_message,
     parse_chat_output,
+    render_for_completion,
 )
 from vllm.entrypoints.openai.responses.harmony import (
     response_previous_input_to_harmony,
@@ -841,3 +842,50 @@ def test_all_standard_channels_present(self) -> None:
             assert channel in valid_channels, (
                 f"{channel} missing when with_custom_tools={with_tools}"
             )
+
+
+class TestRenderForCompletion:
+    def test_preserves_analysis(self):
+        """render_for_completion must not strip analysis messages —
+        vLLM handles that via auto_drop_analysis_messages()."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What is 2+2?"),
+            Message.from_role_and_content(Role.ASSISTANT, "Let me think.").with_channel(
+                "analysis"
+            ),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "The answer is 4."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "Let me think." in decoded
+
+    def test_preserves_reasoning_across_tool_turns(self):
+        """Reasoning before a tool call must survive rendering even when
+        the conversation ends with a final message (which triggers the
+        encoder's auto_drop_analysis)."""
+        messages = [
+            get_system_message(),
+            Message.from_role_and_content(Role.USER, "What's the weather?"),
+            Message.from_role_and_content(
+                Role.ASSISTANT, "I should call the weather API."
+            ).with_channel("analysis"),
+            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+            .with_channel("commentary")
+            .with_recipient("functions.get_weather")
+            .with_content_type("json"),
+            Message.from_author_and_content(
+                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
+            )
+            .with_channel("commentary")
+            .with_recipient("assistant"),
+            # Final message triggers the encoder's auto_drop_analysis
+            Message.from_role_and_content(
+                Role.ASSISTANT, "It is 72F and sunny in SF."
+            ).with_channel("final"),
+        ]
+        token_ids = render_for_completion(messages)
+        decoded = get_encoding().decode(token_ids)
+        assert "I should call the weather API." in decoded
diff --git a/tests/entrypoints/openai/responses/test_harmony_utils.py b/tests/entrypoints/openai/responses/test_harmony_utils.py
index e51538298ff9..d555194322be 100644
--- a/tests/entrypoints/openai/responses/test_harmony_utils.py
+++ b/tests/entrypoints/openai/responses/test_harmony_utils.py
@@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
         assert output_items[0].name == "custom_tool"
         assert output_items[0].server_label == "custom_tool"
 
+    def test_analysis_with_function_recipient_creates_function_call(self):
+        """GPT-OSS models sometimes emit tool calls on analysis channel.
+        Should produce function call, not MCP call."""
+        message = Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
+        message = message.with_channel("analysis")
+        message = message.with_recipient("functions.get_weather")
+
+        output_items = harmony_to_response_output(message)
+
+        assert len(output_items) == 1
+        assert isinstance(output_items[0], ResponseFunctionToolCall)
+        assert output_items[0].name == "get_weather"
+
     def test_analysis_channel_creates_reasoning(self):
         """Test that analysis channel creates reasoning items."""
         message = Message.from_role_and_content(
diff --git a/vllm/entrypoints/openai/parser/harmony_utils.py b/vllm/entrypoints/openai/parser/harmony_utils.py
index 9b4264456c51..72e5d8820c29 100644
--- a/vllm/entrypoints/openai/parser/harmony_utils.py
+++ b/vllm/entrypoints/openai/parser/harmony_utils.py
@@ -13,6 +13,7 @@
     HarmonyEncodingName,
     Message,
     ReasoningEffort,
+    RenderConversationConfig,
     Role,
     StreamableParser,
     SystemContent,
@@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(
 
 def render_for_completion(messages: list[Message]) -> list[int]:
     conversation = Conversation.from_messages(messages)
+    # Disable auto_drop_analysis: vLLM handles analysis filtering via
+    # auto_drop_analysis_messages(). Letting the encoder also drop causes
+    # double-filtering that strips reasoning the model needs between
+    # tool-calling turns.
+    config = RenderConversationConfig(auto_drop_analysis=False)
     token_ids = get_encoding().render_conversation_for_completion(
-        conversation, Role.ASSISTANT
+        conversation, Role.ASSISTANT, config=config
     )
     return token_ids
 
diff --git a/vllm/entrypoints/openai/responses/harmony.py b/vllm/entrypoints/openai/responses/harmony.py
index 460f310926ad..543f1c1b7d0c 100644
--- a/vllm/entrypoints/openai/responses/harmony.py
+++ b/vllm/entrypoints/openai/responses/harmony.py
@@ -419,8 +419,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
 
         if recipient.startswith("browser."):
             output_items.append(_parse_browser_tool_call(message, recipient))
-        # Function calls (should only happen on commentary channel)
-        elif message.channel == "commentary" and recipient.startswith("functions."):
+        # Function calls (commentary or analysis channel — GPT-OSS models
+        # sometimes emit tool calls on analysis channel)
+        elif message.channel in ("commentary", "analysis") and recipient.startswith(
+            "functions."
+        ):
             output_items.extend(_parse_function_call(message, recipient))
 
         # Built-in MCP tools (python, browser, container)