Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 49 additions & 1 deletion tests/entrypoints/openai/parser/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import pytest
from openai_harmony import Message, Role
from openai_harmony import Author, Message, Role

from tests.entrypoints.openai.utils import verify_harmony_messages
from vllm.entrypoints.openai.parser.harmony_utils import (
Expand All @@ -12,6 +12,7 @@
has_custom_tools,
parse_chat_input_to_harmony_message,
parse_chat_output,
render_for_completion,
)
from vllm.entrypoints.openai.responses.harmony import (
response_input_to_harmony,
Expand Down Expand Up @@ -844,6 +845,53 @@ def test_all_standard_channels_present(self) -> None:
)


class TestRenderForCompletion:
    """Checks that render_for_completion keeps analysis (reasoning) content
    in the rendered prompt instead of letting the encoder drop it."""

    def test_preserves_analysis(self):
        """render_for_completion must not strip analysis messages —
        vLLM handles that via auto_drop_analysis_messages()."""
        analysis_msg = Message.from_role_and_content(
            Role.ASSISTANT, "Let me think."
        ).with_channel("analysis")
        final_msg = Message.from_role_and_content(
            Role.ASSISTANT, "The answer is 4."
        ).with_channel("final")
        conversation = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What is 2+2?"),
            analysis_msg,
            final_msg,
        ]
        rendered = get_encoding().decode(render_for_completion(conversation))
        assert "Let me think." in rendered

    def test_preserves_reasoning_across_tool_turns(self):
        """Reasoning before a tool call must survive rendering even when
        the conversation ends with a final message (which triggers the
        encoder's auto_drop_analysis)."""
        reasoning_msg = Message.from_role_and_content(
            Role.ASSISTANT, "I should call the weather API."
        ).with_channel("analysis")
        tool_call = (
            Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
            .with_channel("commentary")
            .with_recipient("functions.get_weather")
            .with_content_type("json")
        )
        tool_result = (
            Message.from_author_and_content(
                Author.new(Role.TOOL, "functions.get_weather"), "72F, sunny"
            )
            .with_channel("commentary")
            .with_recipient("assistant")
        )
        # Final message triggers the encoder's auto_drop_analysis
        final_msg = Message.from_role_and_content(
            Role.ASSISTANT, "It is 72F and sunny in SF."
        ).with_channel("final")
        conversation = [
            get_system_message(),
            Message.from_role_and_content(Role.USER, "What's the weather?"),
            reasoning_msg,
            tool_call,
            tool_result,
            final_msg,
        ]
        rendered = get_encoding().decode(render_for_completion(conversation))
        assert "I should call the weather API." in rendered


class TestResponseInputToHarmonyReasoningItem:
"""Tests for response_input_to_harmony handling of reasoning input items.

Expand Down
13 changes: 13 additions & 0 deletions tests/entrypoints/openai/responses/test_harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,19 @@ def test_commentary_with_unknown_recipient_creates_mcp_call(self):
assert output_items[0].name == "custom_tool"
assert output_items[0].server_label == "custom_tool"

def test_analysis_with_function_recipient_creates_function_call(self):
    """GPT-OSS models sometimes emit tool calls on analysis channel.
    Should produce function call, not MCP call."""
    message = (
        Message.from_role_and_content(Role.ASSISTANT, '{"location": "SF"}')
        .with_channel("analysis")
        .with_recipient("functions.get_weather")
    )

    output_items = harmony_to_response_output(message)

    # Exactly one item, and it must be a function call (not an MCP call),
    # with the "functions." prefix stripped from the tool name.
    assert len(output_items) == 1
    only_item = output_items[0]
    assert isinstance(only_item, ResponseFunctionToolCall)
    assert only_item.name == "get_weather"

def test_analysis_channel_creates_reasoning(self):
"""Test that analysis channel creates reasoning items."""
message = Message.from_role_and_content(
Expand Down
8 changes: 7 additions & 1 deletion vllm/entrypoints/openai/parser/harmony_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
HarmonyEncodingName,
Message,
ReasoningEffort,
RenderConversationConfig,
Role,
StreamableParser,
SystemContent,
Expand Down Expand Up @@ -318,8 +319,13 @@ def parse_chat_input_to_harmony_message(

def render_for_completion(messages: list[Message]) -> list[int]:
    """Render a Harmony conversation into prompt token ids for completion.

    Args:
        messages: Full Harmony message history (system/developer/user/
            assistant/tool messages) to render as the prompt.

    Returns:
        Token ids ready to be fed to the model for completion sampling.
    """
    conversation = Conversation.from_messages(messages)
    # Disable auto_drop_analysis: vLLM handles analysis filtering via
    # auto_drop_analysis_messages(). Letting the encoder also drop causes
    # double-filtering that strips reasoning the model needs between
    # tool-calling turns.
    config = RenderConversationConfig(auto_drop_analysis=False)
    token_ids = get_encoding().render_conversation_for_completion(
        conversation, Role.ASSISTANT, config=config
    )
    return token_ids

Expand Down
58 changes: 56 additions & 2 deletions vllm/entrypoints/openai/responses/harmony.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,52 @@ def _parse_mcp_call(message: Message, recipient: str) -> list[ResponseOutputItem
return output_items


def _try_extract_embedded_function_call(
    message: Message,
) -> list[ResponseOutputItem] | None:
    """Try to extract a function call embedded in a preamble message's content.

    When the model outputs a preamble (<|channel|>commentary<|message|>) and
    then immediately embeds a function-call channel sequence as content, the
    harmony parser stores the raw channel tokens in the message content. This
    helper detects that pattern and re-parses it as a function call.

    Returns a list of output items if an embedded call was detected, else None.
    """
    if not message.content:
        return None
    raw = message.content[0].text
    msg_sep = "<|message|>"
    # The embedded sequence looks like:
    #   <|channel|>(commentary|analysis) to=functions.NAME<|message|>ARGS
    candidate_prefixes = (
        "<|channel|>commentary to=functions.",
        "<|channel|>analysis to=functions.",
    )
    for prefix in candidate_prefixes:
        if not raw.startswith(prefix):
            continue
        remainder = raw[len(prefix):]
        if msg_sep not in remainder:
            continue
        name_part, arguments = remainder.split(msg_sep, 1)
        function_name = name_part.strip()
        if not function_name:
            continue
        # Drop a trailing <|end|> token if the args carried one.
        arguments = arguments.removesuffix("<|end|>")
        call_uuid = random_uuid()
        return [
            ResponseFunctionToolCall(
                arguments=arguments,
                call_id=f"call_{call_uuid}",
                type="function_call",
                name=function_name,
                id=f"fc_{call_uuid}",
            )
        ]
    return None


def _parse_message_no_recipient(
message: Message,
) -> list[ResponseOutputItem]:
Expand All @@ -398,6 +444,11 @@ def _parse_message_no_recipient(
# Per Harmony format, preambles (commentary with no recipient) and
# final channel content are both intended to be shown to end-users.
# See: https://cookbook.openai.com/articles/openai-harmony
# But first check if the content is an embedded function call
# (model output a preamble whose content contains the tool call tokens).
embedded = _try_extract_embedded_function_call(message)
if embedded is not None:
return embedded
return [_parse_final_message(message)]

raise ValueError(f"Unknown channel: {message.channel}")
Expand Down Expand Up @@ -427,8 +478,11 @@ def harmony_to_response_output(message: Message) -> list[ResponseOutputItem]:
if recipient.startswith("browser."):
output_items.append(_parse_browser_tool_call(message, recipient))

# Function calls (should only happen on commentary channel)
elif message.channel == "commentary" and recipient.startswith("functions."):
# Function calls (commentary or analysis channel — GPT-OSS models
# sometimes emit tool calls on analysis channel)
elif message.channel in ("commentary", "analysis") and recipient.startswith(
"functions."
):
output_items.extend(_parse_function_call(message, recipient))

# Built-in MCP tools (python, browser, container)
Expand Down
30 changes: 6 additions & 24 deletions vllm/entrypoints/openai/responses/serving.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
)
from vllm.entrypoints.openai.models.serving import OpenAIServingModels
from vllm.entrypoints.openai.parser.harmony_utils import (
auto_drop_analysis_messages,
get_developer_message,
get_stop_tokens_for_assistant_actions,
get_system_message,
Expand Down Expand Up @@ -1149,30 +1150,11 @@ def _construct_input_messages_with_harmony(
# instructions are ignored.
prev_msgs = self.msg_store[prev_response.id]

# FIXME(woosuk): The slice-delete-reappend cycle below is
# currently a no-op --- it removes messages then puts them all
# back unfiltered. It may be intentionally deferred (see FIXME
# above) or redundant if the Harmony encoder already strips
# analysis messages at render time. If analysis messages need
# to be dropped here, add a channel != "analysis" filter when
# re-appending, similar to auto_drop_analysis_messages in
# harmony_utils.py.
if len(prev_msgs) > 0:
last_msg = prev_msgs[-1]
assert isinstance(last_msg, OpenAIHarmonyMessage)
if last_msg.channel == "final":
prev_final_msg_idx = -1
for i in range(len(prev_msgs) - 2, -1, -1):
prev_msg_i = prev_msgs[i]
assert isinstance(prev_msg_i, OpenAIHarmonyMessage)
if prev_msg_i.channel == "final":
prev_final_msg_idx = i
break
recent_turn_msgs = prev_msgs[prev_final_msg_idx + 1 :]
del prev_msgs[prev_final_msg_idx + 1 :]
for msg in recent_turn_msgs:
assert isinstance(msg, OpenAIHarmonyMessage)
prev_msgs.append(msg)
# Drop analysis messages from completed turns so they don't
# bloat the prompt in multi-turn conversations. This mirrors
# what parse_chat_inputs_to_harmony_messages does for the
# Chat Completions path.
prev_msgs = auto_drop_analysis_messages(prev_msgs)
messages.extend(prev_msgs)
# Append the new input.
# Responses API supports simple text inputs without chat format.
Expand Down
Loading