diff --git a/tests/entrypoints/openai/responses/test_simple.py b/tests/entrypoints/openai/responses/test_simple.py index b67f0d34115a..bbf3cc80ad43 100644 --- a/tests/entrypoints/openai/responses/test_simple.py +++ b/tests/entrypoints/openai/responses/test_simple.py @@ -6,6 +6,7 @@ from openai import OpenAI from ....utils import RemoteOpenAIServer +from .conftest import validate_streaming_event_stack MODEL_NAME = "Qwen/Qwen3-8B" @@ -219,3 +220,23 @@ async def test_extra_sampling_params(client: OpenAI, model_name: str): assert response.status in ["completed", "incomplete"] assert len(response.output) > 0 assert response.output[0].content[0].text # Has text output + + +@pytest.mark.asyncio +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +async def test_streaming_types( + pairs_of_event_types: dict[str, str], client: OpenAI, model_name: str +): + stream = await client.responses.create( + model=model_name, + input="tell me a story about a cat in 20 words", + reasoning={"effort": "low"}, + tools=[], + stream=True, + background=False, + ) + events = [] + async for event in stream: + events.append(event) + + validate_streaming_event_stack(events, pairs_of_event_types) diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index b9d526e25def..3cfb6fffc3ea 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -85,6 +85,8 @@ ResponseCreatedEvent, ResponseInProgressEvent, ResponseInputOutputMessage, + ResponseReasoningPartAddedEvent, + ResponseReasoningPartDoneEvent, ResponsesRequest, ResponsesResponse, ResponseUsage, @@ -1339,6 +1341,19 @@ async def _process_simple_streaming_events( ), ) ) + yield _increment_sequence_number_and_return( + ResponseReasoningPartAddedEvent( + type="response.reasoning_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseReasoningTextContent( + text="", + type="reasoning_text", + ), + ) + ) else: yield _increment_sequence_number_and_return( ResponseOutputItemAddedEvent( @@ -1354,22 +1369,21 @@ async def _process_simple_streaming_events( ), ) ) - yield _increment_sequence_number_and_return( - ResponseContentPartAddedEvent( - type="response.content_part.added", - sequence_number=-1, - output_index=current_output_index, - item_id=current_item_id, - content_index=current_content_index, - part=ResponseOutputText( - type="output_text", - text="", - annotations=[], - logprobs=[], - ), + yield _increment_sequence_number_and_return( + ResponseContentPartAddedEvent( + type="response.content_part.added", + sequence_number=-1, + output_index=current_output_index, + item_id=current_item_id, + content_index=current_content_index, + part=ResponseOutputText( + type="output_text", + text="", + annotations=[], + logprobs=[], + ), + ) ) - ) - current_content_index += 1 first_delta_sent = True # todo(kebe7jun) tool call support @@ -1397,6 +1411,19 @@ async def _process_simple_streaming_events( text=reason_content, ) ) + yield _increment_sequence_number_and_return( + ResponseReasoningPartDoneEvent( + type="response.reasoning_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ) + ) current_content_index = 0 reasoning_item = ResponseReasoningItem( type="reasoning", @@ -1418,6 +1445,8 @@ async def _process_simple_streaming_events( item=reasoning_item, ) ) + current_output_index += 1 + current_item_id = str(uuid.uuid4()) yield _increment_sequence_number_and_return( ResponseOutputItemAddedEvent( type="response.output_item.added", @@ -1432,8 +1461,6 @@ async def _process_simple_streaming_events( ), ) ) - current_output_index += 1 - current_item_id = str(uuid.uuid4()) yield _increment_sequence_number_and_return( ResponseContentPartAddedEvent( type="response.content_part.added", @@ -1449,7 +1476,6 @@ async def _process_simple_streaming_events( ), ) ) - current_content_index += 1 # reset previous delta messages previous_delta_messages = [] @@ -1485,7 +1511,6 @@ async def _process_simple_streaming_events( ), ) ) - current_content_index += 1 previous_delta_messages.append(delta_message) if previous_delta_messages: @@ -1505,7 +1530,19 @@ async def _process_simple_streaming_events( text=reason_content, ) ) - current_content_index += 1 + yield _increment_sequence_number_and_return( + ResponseReasoningPartDoneEvent( + type="response.reasoning_part.done", + sequence_number=-1, + item_id=current_item_id, + output_index=current_output_index, + content_index=current_content_index, + part=ResponseReasoningTextContent( + text=reason_content, + type="reasoning_text", + ), + ) + ) reasoning_item = ResponseReasoningItem( type="reasoning", content=[ @@ -1543,7 +1580,6 @@ async def _process_simple_streaming_events( item_id=current_item_id, ) ) - current_content_index += 1 part = ResponseOutputText( text=final_content, type="output_text", @@ -1559,7 +1595,6 @@ async def _process_simple_streaming_events( part=part, ) ) - current_content_index += 1 item = ResponseOutputMessage( type="message", role="assistant",