diff --git a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
index 445fa389d000..3cdbb027b736 100644
--- a/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
+++ b/tests/entrypoints/openai/test_chat_with_tool_reasoning.py
@@ -139,3 +139,84 @@ async def test_chat_full_of_tool_and_reasoning(client: openai.AsyncOpenAI):
     assert len(tool_calls.choices[0].message.reasoning) > 0
     assert tool_calls.choices[0].message.tool_calls[0].function.name == FUNC_NAME
     assert tool_calls.choices[0].message.tool_calls[0].function.arguments == FUNC_ARGS
+
+
+# test that content does not leak into final chunk when finish_reason=tool_calls
+@pytest.mark.asyncio
+async def test_no_content_leak_when_finish_reason_tool_calls(
+    client: openai.AsyncOpenAI,
+):
+    """
+    Test that when finish_reason='tool_calls', the final chunk does not
+    contain any content field. This prevents reasoning_content from leaking
+    into content, which violates OpenAI's schema contract.
+
+    This test specifically targets the bug where leftover reasoning buffers
+    (especially from speculative decoding) were incorrectly flushed into
+    the content field in the final streamed chunk.
+    """
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=MESSAGES,
+        tools=TOOLS,
+        temperature=0.0,
+        stream=True,
+        tool_choice="auto",
+        extra_body={"include_reasoning": True},  # not a create() kwarg in the SDK
+    )
+
+    chunks = []
+    final_chunk = None
+    async for chunk in stream:
+        chunks.append(chunk)
+        # Track the final chunk with finish_reason
+        if chunk.choices and chunk.choices[0].finish_reason:
+            final_chunk = chunk
+
+    # Ensure we got a final chunk with tool_calls
+    assert final_chunk is not None, "Expected a final chunk with finish_reason"
+    assert final_chunk.choices[0].finish_reason == "tool_calls", (
+        "Expected finish_reason to be 'tool_calls'"
+    )
+
+    delta = final_chunk.choices[0].delta
+
+    # Per OpenAI spec, when finish_reason='tool_calls', content must be null/absent
+    # This is the core fix: prevent reasoning_content from leaking into content
+    assert delta.content is None or delta.content == "", (
+        f"Final chunk with finish_reason='tool_calls' must not have content. "
+        f"Got content='{delta.content}'. This indicates reasoning_content leaked "
+        f"into content field."
+    )
+
+    # Also ensure reasoning fields are not present in final chunk
+    # (they should only appear in earlier chunks)
+    reasoning = getattr(delta, "reasoning", None)
+    reasoning_content = getattr(delta, "reasoning_content", None)
+    assert reasoning is None or reasoning == "", (
+        "Final chunk with tool_calls should not have reasoning field"
+    )
+    assert reasoning_content is None or reasoning_content == "", (
+        "Final chunk with tool_calls should not have reasoning_content field"
+    )
+
+    # Verify tool_calls are present (the expected behavior)
+    assert delta.tool_calls is not None and len(delta.tool_calls) > 0, (
+        "Final chunk with finish_reason='tool_calls' must have tool_calls"
+    )
+
+    # Verify reasoning was streamed in earlier chunks (not in final)
+    reasoning_found_in_earlier_chunks = False
+    for chunk in chunks[:-1]:  # All chunks except the final one
+        if chunk.choices:
+            delta = chunk.choices[0].delta
+            if hasattr(delta, "reasoning") and delta.reasoning:
+                reasoning_found_in_earlier_chunks = True
+                break
+            if hasattr(delta, "reasoning_content") and delta.reasoning_content:
+                reasoning_found_in_earlier_chunks = True
+                break
+
+    assert reasoning_found_in_earlier_chunks, (
+        "Reasoning should be streamed in earlier chunks, not in final chunk"
+    )
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index a15c99c24c28..5bc69b0a056e 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -911,6 +911,11 @@ async def chat_completion_stream_generator(
                             )
                         )
                         harmony_tools_streamed[i] |= tools_streamed_flag
+                        # Ensure no content leaks when tool calls are present
+                        # in harmony path. Per OpenAI spec, tool call deltas
+                        # must not contain content.
+                        if tools_streamed_flag and delta_message:
+                            self._clear_non_tool_call_fields(delta_message)
                     # handle streaming deltas for tools with named tool_choice
                     elif tool_choice_function_name:
                         if (
@@ -980,6 +985,7 @@ async def chat_completion_stream_generator(
                                 ]
                             )
                             tools_streamed[i] = True
+                            self._clear_non_tool_call_fields(delta_message)
 
                     elif request.tool_choice == "required":
                         assert previous_texts is not None
@@ -1036,6 +1042,7 @@ async def chat_completion_stream_generator(
                             ):
                                 history_tool_call_cnt += 1
                                 tools_streamed[i] = True
+                                self._clear_non_tool_call_fields(delta_message)
 
                     # handle streaming deltas for tools with "auto" tool choice
                     # and reasoning parser
@@ -1111,6 +1118,7 @@ async def chat_completion_stream_generator(
                             )
                             if delta_message and delta_message.tool_calls:
                                 tools_streamed[i] = True
+                                self._clear_non_tool_call_fields(delta_message)
                     # when only tool calls
                     elif tool_choice_auto:
                         assert tool_parser is not None
@@ -1125,6 +1133,7 @@ async def chat_completion_stream_generator(
                         )
                         if delta_message and delta_message.tool_calls:
                             tools_streamed[i] = True
+                            self._clear_non_tool_call_fields(delta_message)
 
                     # when only reasoning
                     elif self.reasoning_parser:
@@ -1290,6 +1299,18 @@ async def chat_completion_stream_generator(
                             finish_reason_ = (
                                 output.finish_reason if output.finish_reason else "stop"
                             )
+
+                        # When finish_reason is "tool_calls", ensure no content
+                        # or reasoning fields leak into the final delta.
+                        # Per OpenAI spec, tool call responses must only contain
+                        # tool_calls and finish_reason, never content.
+                        if finish_reason_ == "tool_calls":
+                            if delta_message is None:
+                                # Create empty delta message if none exists
+                                delta_message = DeltaMessage()
+                            else:
+                                self._clear_non_tool_call_fields(delta_message)
+
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
@@ -1824,6 +1845,18 @@ def _create_chat_logprobs(
 
         return ChatCompletionLogProbs(content=logprobs_content)
 
+    def _clear_non_tool_call_fields(self, delta_message: DeltaMessage) -> None:
+        """
+        Clear content and reasoning fields from a delta message.
+
+        Per OpenAI spec, tool call deltas must not contain content or reasoning
+        fields. This prevents leakage from reasoning buffers or speculative
+        decoding when tool calls are present.
+        """
+        delta_message.content = None
+        delta_message.reasoning = None
+        delta_message.reasoning_content = None
+
     def _should_stream_with_auto_tool_parsing(self, request: ChatCompletionRequest):
         """
         Utility function to check if streamed tokens should go through the tool