Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion tests/entrypoints/openai/parser/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from openai_harmony import Message, Role
from openai_harmony import Author, Message, Role

from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import (
Expand All @@ -12,6 +12,7 @@
has_custom_tools,
parse_chat_input_to_harmony_message,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.responses.harmony import (
response_previous_input_to_harmony,
Expand Down Expand Up @@ -841,3 +842,50 @@ def test_all_standard_channels_present(self) -> None:
assert channel in valid_channels, (
f"{channel} missing when with_custom_tools={with_tools}"
)


class TestRenderForCompletion:
    """Tests that render_for_completion keeps analysis (reasoning) messages
    in the rendered token stream instead of letting the encoder drop them.
    """

    def test_preserves_analysis(self) -> None:
        """render_for_completion must not strip analysis messages —
        vLLM handles that via auto_drop_analysis_messages()."""
        messages = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What is 2+2?"),
            Message.from_role_and_content(Role.ASSISTANT, "Let me think.").with_channel(
                "analysis"
            ),
            Message.from_role_and_content(
                Role.ASSISTANT, "The answer is 4."
            ).with_channel("final"),
        ]
        token_ids = render_for_completion(messages)
        decoded = get_encoding().decode(token_ids)
        assert "Let me think." in decoded

    def test_preserves_reasoning_across_tool_turns(self) -> None:
        """Reasoning before a tool call must survive rendering even when
        the conversation ends with a final message (which triggers the
        encoder's auto_drop_analysis)."""
        messages = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What's the weather?"),
            Message.from_role_and_content(
                Role.ASSISTANT, "I should call the weather API."
            ).with_channel("analysis"),
            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
            .with_channel("commentary")
            .with_recipient("functions.get_weather")
            .with_content_type("json"),
            Message.from_author_and_content(
                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
            )
            .with_channel("commentary")
            .with_recipient("assistant"),
            # Final message triggers the encoder's auto_drop_analysis
            Message.from_role_and_content(
                Role.ASSISTANT, "It is 72F and sunny in SF."
            ).with_channel("final"),
        ]
        token_ids = render_for_completion(messages)
        decoded = get_encoding().decode(token_ids)
        assert "I should call the weather API." in decoded
13 changes: 13 additions & 0 deletions tests/entrypoints/openai/responses/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
assert output_items[0].name == "custom_tool"
assert output_items[0].server_label == "custom_tool"

def test_analysis_with_function_recipient_creates_function_call(self) -> None:
    """GPT-OSS models sometimes emit tool calls on the analysis channel.

    A message on the analysis channel whose recipient is a ``functions.*``
    name should produce a function-call output item, not an MCP call.
    """
    # Build the message with the chained-builder idiom used elsewhere in
    # these tests instead of repeated reassignment.
    message = (
        Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
        .with_channel("analysis")
        .with_recipient("functions.get_weather")
    )

    output_items = harmony_to_response_output(message)

    assert len(output_items) == 1
    assert isinstance(output_items[0], ResponseFunctionToolCall)
    # Name is the recipient with the "functions." prefix stripped.
    assert output_items[0].name == "get_weather"

def test_analysis_channel_creates_reasoning(self):
"""Test that analysis channel creates reasoning items."""
message = Message.from_role_and_content(
Expand Down
8 changes: 7 additions & 1 deletion vllm/entrypoints/openai/parser/harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
HarmonyEncodingName,
Message,
ReasoningEffort,
RenderConversationConfig,
Role,
StreamableParser,
SystemContent,
Expand Down Expand Up @@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(

def render_for_completion(messages: list[Message]) -> list[int]:
    """Render a harmony conversation into prompt token ids for completion.

    Args:
        messages: Harmony messages forming the conversation so far.

    Returns:
        Token ids ready to be used as the completion prompt.
    """
    conversation = Conversation.from_messages(messages)
    # Disable auto_drop_analysis: vLLM handles analysis filtering via
    # auto_drop_analysis_messages(). Letting the encoder also drop causes
    # double-filtering that strips reasoning the model needs between
    # tool-calling turns.
    config = RenderConversationConfig(auto_drop_analysis=False)
    token_ids = get_encoding().render_conversation_for_completion(
        conversation, Role.ASSISTANT, config=config
    )
    return token_ids

Expand Down
7 changes: 5 additions & 2 deletions vllm/entrypoints/openai/responses/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,8 +419,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
if recipient.startswith("browser."):
output_items.append(_parse_browser_tool_call(message, recipient))

# Function calls (should only happen on commentary channel)
elif message.channel == "commentary" and recipient.startswith("functions."):
# Function calls (commentary or analysis channel — GPT-OSS models
# sometimes emit tool calls on analysis channel)
elif message.channel in ("commentary", "analysis") and recipient.startswith(
"functions."
):
output_items.extend(_parse_function_call(message, recipient))

# Built-in MCP tools (python, browser, container)
Expand Down