vllm-project · RohanDisa · Jan 24, 2026 · Jan 25, 2026 · Jan 25, 2026 · chaunceyjiang
@@ -139,3 +139,84 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     assert len(tool_calls.choices[0].message.reasoning) > 0
     assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
     assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
+
+
+# test that content does not leak into final chunk when finish_reason=tool_calls
+@pytest.mark.asyncio
+async def test_no_content_leak_when_finish_reason_tool_calls(
+    client: openai.AsyncOpenAI,
+):
+    """
+    Test that when finish_reason='tool_calls', the final chunk does not
+    contain any content field. This prevents reasoning_content from leaking
+    into content, which violates OpenAI's schema contract.
+
+    This test specifically targets the bug where leftover reasoning buffers
+    (especially from speculative decoding) were incorrectly flushed into
+    the content field in the final streamed chunk.
+    """
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=True,
+        tool_choice="auto",
+        include_reasoning=True,
+    )
+
+    chunks = []
+    final_chunk = None
+    async for chunk in stream:
+        chunks.append(chunk)
+        # Track the final chunk with finish_reason
+        if chunk.choices and chunk.choices[0].finish_reason:
+            final_chunk = chunk
+
+    # Ensure we got a final chunk with tool_calls
+    assert final_chunk is not None, "Expected a final chunk with finish_reason"
+    assert final_chunk.choices[0].finish_reason == "tool_calls", (
+        "Expected finish_reason to be 'tool_calls'"
+    )
+
+    delta = final_chunk.choices[0].delta
+
+    # Per OpenAI spec, when finish_reason='tool_calls', content must be null/absent
+    # This is the core fix: prevent reasoning_content from leaking into content
+    assert delta.content is None or delta.content == "", (
+        f"Final chunk with finish_reason='tool_calls' must not have content. "
+        f"Got content='{delta.content}'. This indicates reasoning_content leaked "
+        f"into content field."
+    )
+
+    # Also ensure reasoning fields are not present in final chunk
+    # (they should only appear in earlier chunks)
+    reasoning = getattr(delta, "reasoning", None)
+    reasoning_content = getattr(delta, "reasoning_content", None)
+    assert reasoning is None or reasoning == "", (
+        "Final chunk with tool_calls should not have reasoning field"
+    )
+    assert reasoning_content is None or reasoning_content == "", (
+        "Final chunk with tool_calls should not have reasoning_content field"
+    )
+
+    # Verify tool_calls are present (the expected behavior)
+    assert delta.tool_calls is not None and len(delta.tool_calls) > 0, (
+        "Final chunk with finish_reason='tool_calls' must have tool_calls"
+    )
+
+    # Verify reasoning was streamed in earlier chunks (not in final)
+    reasoning_found_in_earlier_chunks = False
+    for chunk in chunks[:-1]:  # All chunks except the final one
+        if chunk.choices:
+            delta = chunk.choices[0].delta
+            if hasattr(delta, "reasoning") and delta.reasoning:
+                reasoning_found_in_earlier_chunks = True
+                break
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                reasoning_found_in_earlier_chunks = True
+                break
+
+    assert reasoning_found_in_earlier_chunks, (
+        "Reasoning should be streamed in earlier chunks, not in final chunk"
+    )
@@ -911,6 +911,13 @@ async def chat_completion_stream_generator(
                             )
                         )
                         harmony_tools_streamed[i] |= tools_streamed_flag
+                        # Ensure no content leaks when tool calls are present
+                        # in harmony path. Per OpenAI spec, tool call deltas
+                        # must not contain content.
+                        if tools_streamed_flag and delta_message:
+                            delta_message.content = None
+                            delta_message.reasoning = None
+                            delta_message.reasoning_content = None
                     # handle streaming deltas for tools with named tool_choice
                     elif tool_choice_function_name:
                         if (
@@ -980,6 +987,7 @@ async def chat_completion_stream_generator(
                                 ]
                             )
                             tools_streamed[i] = True
+                            self._clear_non_tool_call_fields(delta_message)
 
                     elif request.tool_choice == "required":
                         assert previous_texts is not None
@@ -1036,6 +1044,7 @@ async def chat_completion_stream_generator(
                             ):
                                 history_tool_call_cnt += 1
                                 tools_streamed[i] = True
+                                self._clear_non_tool_call_fields(delta_message)
 
                     # handle streaming deltas for tools with "auto" tool choice
                     # and reasoning parser
@@ -1111,6 +1120,7 @@ async def chat_completion_stream_generator(
                             )
                             if delta_message and delta_message.tool_calls:
                                 tools_streamed[i] = True
+                                self._clear_non_tool_call_fields(delta_message)
                     # when only tool calls
                     elif tool_choice_auto:
                         assert tool_parser is not None
@@ -1125,6 +1135,7 @@ async def chat_completion_stream_generator(
                         )
                         if delta_message and delta_message.tool_calls:
                             tools_streamed[i] = True
+                            self._clear_non_tool_call_fields(delta_message)
 
                     # when only reasoning
                     elif self.reasoning_parser:
@@ -1290,6 +1301,18 @@ async def chat_completion_stream_generator(
                             finish_reason_ = (
                                 output.finish_reason if output.finish_reason else "stop"
                             )
+
+                        # When finish_reason is "tool_calls", ensure no content
+                        # or reasoning fields leak into the final delta.
+                        # Per OpenAI spec, tool call responses must only contain
+                        # tool_calls and finish_reason, never content.
+                        if finish_reason_ == "tool_calls":
+                            if delta_message is None:
+                                # Create empty delta message if none exists
+                                delta_message = DeltaMessage()
+                            else:
+                                self._clear_non_tool_call_fields(delta_message)
+
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
@@ -1824,6 +1847,18 @@ def _create_chat_logprobs(
 
         return ChatCompletionLogProbs(content=logprobs_content)
 
+    def _clear_non_tool_call_fields(self, delta_message: DeltaMessage) -> None:
+        """
+        Clear content and reasoning fields from a delta message.
+
+        Per OpenAI spec, tool call deltas must not contain content or reasoning
+        fields. This prevents leakage from reasoning buffers or speculative
+        decoding when tool calls are present.
+        """
+        delta_message.content = None
+        delta_message.reasoning = None
+        delta_message.reasoning_content = None
+
     def _should_stream_with_auto_tool_parsing(self, request: ChatCompletionRequest):
         """
         Utility function to check if streamed tokens should go through the tool