diff --git a/tests/entrypoints/openai/test_response_api_with_harmony.py b/tests/entrypoints/openai/test_response_api_with_harmony.py
index 0d5836fab5a7..88b3795abe73 100644
--- a/tests/entrypoints/openai/test_response_api_with_harmony.py
+++ b/tests/entrypoints/openai/test_response_api_with_harmony.py
@@ -74,6 +74,20 @@ async def test_basic_with_reasoning_effort(client: OpenAI, model_name: str):
     assert response.status == "completed"
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+async def test_max_tokens(client: OpenAI, model_name: str):
+    response = await client.responses.create(
+        model=model_name,
+        input="What is the first paragraph of Moby Dick?",
+        reasoning={"effort": "low"},
+        max_output_tokens=30,
+    )
+    assert response is not None
+    assert response.status == "incomplete"
+    assert response.incomplete_details.reason == "max_output_tokens"
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("model_name", [MODEL_NAME])
 async def test_chat(client: OpenAI, model_name: str):
diff --git a/vllm/entrypoints/context.py b/vllm/entrypoints/context.py
index 9012639457ca..6658f91595e5 100644
--- a/vllm/entrypoints/context.py
+++ b/vllm/entrypoints/context.py
@@ -112,6 +112,7 @@ def __init__(
         available_tools: list[str],
     ):
         self._messages = messages
+        self.finish_reason: Optional[str] = None
         self.available_tools = available_tools
         self._tool_sessions: dict[str, Union[ClientSession, Tool]] = {}
         self.called_tools: set[str] = set()
@@ -135,7 +136,8 @@ def _update_num_reasoning_tokens(self):
         if self.parser.current_channel in {"analysis", "commentary"}:
             self.num_reasoning_tokens += 1
 
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
         if isinstance(output, RequestOutput):
             output_token_ids = output.outputs[0].token_ids
             self.parser = get_streamable_parser_for_assistant()
@@ -150,6 +152,8 @@ def append_output(self, output) -> None:
             # Move current turn to previous turn for next turn's calculations
             self.previous_turn = self.current_turn.copy()
             output_msgs = self.parser.messages
+            # The response's finish reason is set in the last message
+            self.finish_reason = output.outputs[0].finish_reason
         else:
             # Tool output.
             output_msgs = output
@@ -157,18 +161,18 @@ def append_output(self, output) -> None:
 
     def _update_prefill_token_usage(self, output: RequestOutput) -> None:
         """Update token usage statistics for the prefill phase of generation.
-        
+
         The prefill phase processes the input prompt tokens. This method:
         1. Counts the prompt tokens for this turn
         2. Calculates tool output tokens for multi-turn conversations
         3. Updates cached token counts
         4. Tracks state for next turn calculations
-        
+
         Tool output tokens are calculated as:
-        current_prompt_tokens - last_turn_prompt_tokens - 
+        current_prompt_tokens - last_turn_prompt_tokens -
         last_turn_output_tokens
         This represents tokens added between turns (typically tool responses).
-        
+
         Args:
             output: The RequestOutput containing prompt token information
         """
@@ -214,18 +218,18 @@ def _update_prefill_token_usage(self, output: RequestOutput) -> None:
 
     def _update_decode_token_usage(self, output: RequestOutput) -> int:
         """Update token usage statistics for the decode phase of generation.
-        
+
         The decode phase processes the generated output tokens. This method:
         1. Counts output tokens from all completion outputs
         2. Updates the total output token count
         3. Tracks tokens generated in the current turn
-        
+
         In streaming mode, this is called for each token generated.
         In non-streaming mode, this is called once with all output tokens.
-        
+
         Args:
             output: The RequestOutput containing generated token information
-        
+
         Returns:
             int: Number of output tokens processed in this call
         """
@@ -385,7 +389,8 @@ def __init__(self, *args, **kwargs):
     def messages(self) -> list:
         return self.parser.messages
 
-    def append_output(self, output) -> None:
+    def append_output(self, output: Union[RequestOutput,
+                                          list[Message]]) -> None:
         if isinstance(output, RequestOutput):
             # append_output is called for each output token in streaming case,
             # so we only want to add the prompt tokens once for each message.
diff --git a/vllm/entrypoints/harmony_utils.py b/vllm/entrypoints/harmony_utils.py
index f7528ba81dce..1364a41be950 100644
--- a/vllm/entrypoints/harmony_utils.py
+++ b/vllm/entrypoints/harmony_utils.py
@@ -387,7 +387,9 @@ def parse_remaining_state(
         id=f"msg_{random_uuid()}",
         content=[output_text],
         role="assistant",
-        status="completed",
+        # If the parser still has messages (i.e. the generation was cut off
+        # abruptly), this item should be marked incomplete.
+        status="incomplete",
         type="message",
     )
     return [text_item]
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 4dcb1f3f1c89..8ecb1a8239c3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -30,7 +30,7 @@
 from openai.types.responses import (ResponseFormatTextConfig as
                                     ResponseTextConfig)
-from openai.types.responses.response import ToolChoice
+from openai.types.responses.response import IncompleteDetails, ToolChoice
 from openai.types.responses.tool import Tool
 from openai.types.shared import Metadata, Reasoning
 from pydantic import (BaseModel, ConfigDict, Field, TypeAdapter,
@@ -1868,7 +1868,7 @@ class ResponsesResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"resp_{random_uuid()}")
     created_at: int = Field(default_factory=lambda: int(time.time()))
     # error: Optional[ResponseError] = None
-    # incomplete_details: Optional[IncompleteDetails] = None
+    incomplete_details: Optional[IncompleteDetails] = None
     instructions: Optional[str] = None
     metadata: Optional[Metadata] = None
     model: str
@@ -1904,9 +1904,18 @@ def from_request(
         status: ResponseStatus,
         usage: Optional[ResponseUsage] = None,
     ) -> "ResponsesResponse":
+
+        incomplete_details: Optional[IncompleteDetails] = None
+        if status == 'incomplete':
+            incomplete_details = IncompleteDetails(reason='max_output_tokens')
+        # TODO: implement the other reason for incomplete_details,
+        # which is content_filter
+        # incomplete_details = IncompleteDetails(reason='content_filter')
+
         return cls(
             id=request.request_id,
             created_at=created_time,
+            incomplete_details=incomplete_details,
             instructions=request.instructions,
             metadata=request.metadata,
             model=model_name,
@@ -2109,7 +2118,7 @@ class DetokenizeResponse(OpenAIBaseModel):
 
 class TokenizerInfoResponse(OpenAIBaseModel):
     """
-    Response containing tokenizer configuration 
+    Response containing tokenizer configuration
     equivalent to tokenizer_config.json
     """
@@ -2199,7 +2208,7 @@ class TranscriptionRequest(OpenAIBaseModel):
     to_language: Optional[str] = None
     """The language of the output audio we transcribe to.
 
-    Please note that this is not currently used by supported models at this 
+    Please note that this is not currently used by supported models at this
     time, but it is a placeholder for future use, matching translation api.
""" diff --git a/vllm/entrypoints/openai/serving_responses.py b/vllm/entrypoints/openai/serving_responses.py index 401ba6c53331..088051f783a8 100644 --- a/vllm/entrypoints/openai/serving_responses.py +++ b/vllm/entrypoints/openai/serving_responses.py @@ -27,7 +27,7 @@ ResponseReasoningItem, ResponseReasoningTextDeltaEvent, ResponseReasoningTextDoneEvent, - response_text_delta_event) + ResponseStatus, response_text_delta_event) from openai.types.responses.response_output_text import (Logprob, LogprobTopLogprob) # yapf: enable @@ -461,10 +461,22 @@ async def responses_full_generator( # TODO: Use a vllm-specific Validation Error return self.create_error_response(str(e)) + # NOTE: Implementation of stauts is still WIP, but for now + # we guarantee that if the status is not "completed", it is accurate. + # "completed" is implemented as the "catch-all" for now. + status: ResponseStatus = "completed" + if self.use_harmony: assert isinstance(context, HarmonyContext) output = self._make_response_output_items_with_harmony(context) num_tool_output_tokens = context.num_tool_output_tokens + if len(output) > 0: + if context.finish_reason == "length": + status = "incomplete" + elif context.finish_reason == "abort": + status = "cancelled" + else: + status = "incomplete" else: assert isinstance(context, SimpleContext) final_res = context.last_output @@ -501,7 +513,7 @@ async def responses_full_generator( model_name=model_name, created_time=created_time, output=output, - status="completed", + status=status, usage=usage, ) @@ -658,7 +670,7 @@ def _make_response_output_items_with_harmony( self, context: HarmonyContext, ) -> list[ResponseOutputItem]: - output_items = [] + output_items: list[ResponseOutputItem] = [] num_init_messages = context.num_init_messages for msg in context.messages[num_init_messages:]: output_items.extend(parse_output_message(msg)) diff --git a/vllm/v1/core/sched/utils.py b/vllm/v1/core/sched/utils.py index 42d3e5c68b4c..c431843de6ba 100644 --- a/vllm/v1/core/sched/utils.py +++ b/vllm/v1/core/sched/utils.py @@ -10,19 +10,19 @@ def remove_all(lst: list, items_to_remove: set) -> list: """Remove all items from a list that are in the items_to_remove set. - + This method optimizes for the common case of removing a single item, falling back to list comprehension for multiple items. - + Args: lst: The list to remove items from items_to_remove: Set of items to remove - + Returns: Either the modified original list (for single item removal) or a new list (for multiple item removal). Callers should use the returned value. - + Note: For single item removal, this modifies the original list in-place and returns it. For multiple items, it creates and returns a new list. diff --git a/vllm/v1/engine/output_processor.py b/vllm/v1/engine/output_processor.py index 02c8c61cb909..14ac1e3e5afa 100644 --- a/vllm/v1/engine/output_processor.py +++ b/vllm/v1/engine/output_processor.py @@ -373,17 +373,17 @@ def process_outputs( 1) Compute stats for logging 2) Detokenize 3) Create and handle RequestOutput objects: - * If there is a queue (for usage with AsyncLLM), + * If there is a queue (for usage with AsyncLLM), put the RequestOutput objects into the queue for handling by the per-request generate() tasks. - * If there is no queue (for usage with LLMEngine), + * If there is no queue (for usage with LLMEngine), return a list of RequestOutput objects. NOTE FOR DEVELOPERS vLLM V1 minimizes the number of python loops over the full - batch to ensure system overheads are minimized. 
+        batch to ensure system overheads are minimized. This is the
         only function that should loop over EngineCoreOutputs.
 
         If you need to touch every element of the batch, do it from