diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 32cce3ef4cff..1ed7a79cc876 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -63,7 +63,6 @@ from vllm.entrypoints.chat_utils import ( ChatCompletionMessageParam, ChatTemplateContentFormatOption, - make_tool_call_id, ) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.mcp.tool_server import ToolServer @@ -915,114 +914,57 @@ def _make_response_output_items( final_output: CompletionOutput, tokenizer: TokenizerLike, ) -> list[ResponseOutputItem]: - if self.parser and self.parser.reasoning_parser_cls: - try: - reasoning_parser = self.parser.reasoning_parser_cls(tokenizer) - except RuntimeError as e: - logger.exception("Error in reasoning parser creation.") - raise e - - reasoning, content = reasoning_parser.extract_reasoning( - final_output.text, request=request - ) - else: - reasoning = None - content = final_output.text - # Log complete response if output logging is enabled if self.enable_log_outputs and self.request_logger: - output_text = "" - if content: - output_text = content - elif reasoning: - output_text = f"[reasoning: {reasoning}]" - - if output_text: - self.request_logger.log_outputs( - request_id=request.request_id, - outputs=output_text, - output_token_ids=final_output.token_ids, - finish_reason=final_output.finish_reason, - is_streaming=False, - delta=False, - ) + self.request_logger.log_outputs( + request_id=request.request_id, + outputs=final_output.text, + output_token_ids=final_output.token_ids, + finish_reason=final_output.finish_reason, + is_streaming=False, + delta=False, + ) + + # Compute logprobs if requested + logprobs = None + if request.is_include_output_logprobs() and final_output.logprobs: + logprobs = self._create_response_logprobs( + token_ids=final_output.token_ids, + logprobs=final_output.logprobs, + tokenizer=tokenizer, + top_logprobs=request.top_logprobs, + ) - reasoning_item = None - message_item = None - if reasoning: - reasoning_item = ResponseReasoningItem( - id=f"rs_{random_uuid()}", - summary=[], - type="reasoning", - content=[ - ResponseReasoningTextContent(text=reasoning, type="reasoning_text") - ], - status=None, # NOTE: Only the last output item has status. + # Use parser to extract and create response output items + if self.parser: + parser = self.parser(tokenizer) + return parser.extract_response_outputs( + model_output=final_output.text, + request=request, + enable_auto_tools=self.enable_auto_tools, + tool_call_id_type=self.tool_call_id_type, + logprobs=logprobs, ) - tool_calls, content = self._parse_tool_calls_from_content( - request=request, - tokenizer=tokenizer, - content=content, - enable_auto_tools=self.enable_auto_tools, - tool_parser_cls=self.parser.tool_parser_cls if self.parser else None, - ) - if content or (self.use_harmony and tool_calls): - res_text_part = None - if content: - res_text_part = ResponseOutputText( - text=content, - annotations=[], # TODO - type="output_text", - logprobs=( - self._create_response_logprobs( - token_ids=final_output.token_ids, - logprobs=final_output.logprobs, - tokenizer=tokenizer, - top_logprobs=request.top_logprobs, - ) - if request.is_include_output_logprobs() - else None - ), - ) - message_item = ResponseOutputMessage( + # Fallback when no parser is configured + return [ + ResponseOutputMessage( id=f"msg_{random_uuid()}", - content=[res_text_part] if res_text_part else [], + content=[ + ResponseOutputText( + text=final_output.text, + annotations=[], + type="output_text", + logprobs=logprobs, + ) + ] + if final_output.text + else [], role="assistant", status="completed", type="message", ) - outputs = [] - - if reasoning_item: - outputs.append(reasoning_item) - if message_item: - outputs.append(message_item) - if tool_calls: - # We use a simple counter for history_tool_call_count because - # we don't track the history of tool calls in the Responses API yet. - # This means that the tool call index will start from 0 for each - # request. - tool_call_items = [] - for history_tool_call_cnt, tool_call in enumerate(tool_calls): - tool_call_items.append( - ResponseFunctionToolCall( - id=f"fc_{random_uuid()}", - call_id=tool_call.id - if tool_call.id - else make_tool_call_id( - id_type=self.tool_call_id_type, - func_name=tool_call.name, - idx=history_tool_call_cnt, - ), - type="function_call", - status="completed", - name=tool_call.name, - arguments=tool_call.arguments, - ) - ) - outputs.extend(tool_call_items) - return outputs + ] def _make_response_output_items_with_harmony( self, diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py index f5cd1430a18f..aa145bab2121 100644 --- a/vllm/parser/abstract_parser.py +++ b/vllm/parser/abstract_parser.py @@ -1,23 +1,46 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json from abc import abstractmethod from collections.abc import Sequence from functools import cached_property +from openai.types.responses import ( + ResponseFunctionToolCall, + ResponseOutputItem, + ResponseOutputMessage, + ResponseOutputText, + ResponseReasoningItem, + ToolChoiceFunction, +) +from openai.types.responses.response_output_text import Logprob +from openai.types.responses.response_reasoning_item import ( + Content as ResponseReasoningTextContent, +) +from pydantic import TypeAdapter + +from vllm.entrypoints.chat_utils import make_tool_call_id from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ) from vllm.entrypoints.openai.engine.protocol import ( DeltaMessage, ExtractedToolCallInformation, + FunctionCall, + FunctionDefinition, ) from vllm.entrypoints.openai.responses.protocol import ( ResponsesRequest, ) +from vllm.logger import init_logger from vllm.reasoning.abs_reasoning_parsers import ReasoningParser from vllm.tokenizers import TokenizerLike from vllm.tool_parsers.abstract_tool_parser import ToolParser +from vllm.utils import random_uuid + +logger = init_logger(__name__) class Parser: @@ -128,6 +151,33 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]: The extracted content token IDs. """ + @abstractmethod + def extract_response_outputs( + self, + model_output: str, + request: ResponsesRequest, + enable_auto_tools: bool = False, + tool_call_id_type: str = "random", + logprobs: list[Logprob] | None = None, + ) -> list[ResponseOutputItem]: + """ + Extract reasoning, content, and tool calls from a complete + model-generated string and return as ResponseOutputItem objects. + + Used for non-streaming responses where we have the entire model + response available before sending to the client. + + Args: + model_output: The complete model-generated string. + request: The request object used to generate the output. + enable_auto_tools: Whether to enable automatic tool call parsing. + tool_call_id_type: Type of tool call ID generation ("random", etc). + logprobs: Pre-computed logprobs for the output text, if any. + + Returns: + A list of ResponseOutputItem objects. + """ + @abstractmethod def extract_reasoning( self, @@ -260,6 +310,156 @@ def extract_reasoning( return None, model_output return self._reasoning_parser.extract_reasoning(model_output, request) + def extract_response_outputs( + self, + model_output: str, + request: ResponsesRequest, + enable_auto_tools: bool = False, + tool_call_id_type: str = "random", + logprobs: list[Logprob] | None = None, + ) -> list[ResponseOutputItem]: + # First extract reasoning + reasoning, content = self.extract_reasoning(model_output, request) + + # Then parse tool calls from the content + tool_calls, content = self._parse_tool_calls( + request=request, + content=content, + enable_auto_tools=enable_auto_tools, + ) + + # Build output items + outputs: list[ResponseOutputItem] = [] + + # Add reasoning item if present + if reasoning: + reasoning_item = ResponseReasoningItem( + id=f"rs_{random_uuid()}", + summary=[], + type="reasoning", + content=[ + ResponseReasoningTextContent(text=reasoning, type="reasoning_text") + ], + status=None, # NOTE: Only the last output item has status. + ) + outputs.append(reasoning_item) + + # Add message item if there's content + if content: + res_text_part = ResponseOutputText( + text=content, + annotations=[], + type="output_text", + logprobs=logprobs, + ) + message_item = ResponseOutputMessage( + id=f"msg_{random_uuid()}", + content=[res_text_part], + role="assistant", + status="completed", + type="message", + ) + outputs.append(message_item) + + if tool_calls: + # We use a simple counter for history_tool_call_count because + # we don't track the history of tool calls in the Responses API yet. + # This means that the tool call index will start from 0 for each + # request. + for history_tool_call_cnt, tool_call in enumerate(tool_calls): + tool_call_item = ResponseFunctionToolCall( + id=f"fc_{random_uuid()}", + call_id=tool_call.id + if tool_call.id + else make_tool_call_id( + id_type=tool_call_id_type, + func_name=tool_call.name, + idx=history_tool_call_cnt, + ), + type="function_call", + status="completed", + name=tool_call.name, + arguments=tool_call.arguments, + ) + outputs.append(tool_call_item) + + return outputs + + def _parse_tool_calls( + self, + request: ResponsesRequest, + content: str | None, + enable_auto_tools: bool, + ) -> tuple[list[FunctionCall], str | None]: + """ + TODO(qandrew): merge _parse_tool_calls_from_content + for ChatCompletions into this function + Parse tool calls from content based on request tool_choice settings. + + Returns: + A tuple of (function_calls, remaining_content) if tool calls + were parsed + """ + function_calls: list[FunctionCall] = [] + + if request.tool_choice and isinstance(request.tool_choice, ToolChoiceFunction): + # Forced Function Call (Responses API style) + assert content is not None + function_calls.append( + FunctionCall(name=request.tool_choice.name, arguments=content) + ) + return function_calls, None # Clear content since tool is called. + + if request.tool_choice and isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam + ): + # Forced Function Call (Chat Completion API style) + assert content is not None + function_calls.append( + FunctionCall(name=request.tool_choice.function.name, arguments=content) + ) + return function_calls, None # Clear content since tool is called. + + if request.tool_choice == "required": + # Required tool calls - parse JSON + assert content is not None + tool_calls = TypeAdapter(list[FunctionDefinition]).validate_json(content) + function_calls.extend( + FunctionCall( + name=tool_call.name, + arguments=json.dumps(tool_call.parameters, ensure_ascii=False), + ) + for tool_call in tool_calls + ) + return function_calls, None # Clear content since tool is called. + + if ( + self._tool_parser is not None + and enable_auto_tools + and (request.tool_choice == "auto" or request.tool_choice is None) + ): + # Automatic Tool Call Parsing + tool_call_info = self._tool_parser.extract_tool_calls( + content if content is not None else "", + request=request, # type: ignore + ) + if tool_call_info is not None and tool_call_info.tools_called: + function_calls.extend( + FunctionCall( + id=tool_call.id, + name=tool_call.function.name, + arguments=tool_call.function.arguments, + ) + for tool_call in tool_call_info.tool_calls + ) + remaining_content = tool_call_info.content + if remaining_content and remaining_content.strip() == "": + remaining_content = None + return function_calls, remaining_content + + # No tool calls + return [], content + def extract_reasoning_streaming( self, previous_text: str,