diff --git a/vllm/entrypoints/openai/responses/serving.py b/vllm/entrypoints/openai/responses/serving.py index 6d0041813e35..45e1dcbca30c 100644 --- a/vllm/entrypoints/openai/responses/serving.py +++ b/vllm/entrypoints/openai/responses/serving.py @@ -903,6 +903,7 @@ def _make_response_output_items( parser = self.parser(tokenizer) return parser.extract_response_outputs( model_output=final_output.text, + model_output_token_ids=final_output.token_ids, request=request, enable_auto_tools=self.enable_auto_tools, tool_call_id_type=self.tool_call_id_type, diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py index aa145bab2121..51ea5e57c9db 100644 --- a/vllm/parser/abstract_parser.py +++ b/vllm/parser/abstract_parser.py @@ -154,7 +154,9 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]: @abstractmethod def extract_response_outputs( self, + *, model_output: str, + model_output_token_ids: Sequence[int], request: ResponsesRequest, enable_auto_tools: bool = False, tool_call_id_type: str = "random", @@ -169,6 +171,7 @@ def extract_response_outputs( Args: model_output: The complete model-generated string. + model_output_token_ids: The token IDs of the model output. request: The request object used to generate the output. enable_auto_tools: Whether to enable automatic tool call parsing. tool_call_id_type: Type of tool call ID generation ("random", etc). @@ -312,7 +315,9 @@ def extract_reasoning( def extract_response_outputs( self, + *, model_output: str, + model_output_token_ids: Sequence[int], request: ResponsesRequest, enable_auto_tools: bool = False, tool_call_id_type: str = "random",