diff --git a/tests/tool_use/test_gemma4_responses_adjust_request.py b/tests/tool_use/test_gemma4_responses_adjust_request.py
new file mode 100644
index 000000000000..e08896ee3237
--- /dev/null
+++ b/tests/tool_use/test_gemma4_responses_adjust_request.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Regression tests for Responses API tool-calling request adjustment.
+
+Covers two bugs on the ``/v1/responses`` path that broke streaming tool
+calling for parsers relying on special-token delimiters (Gemma4):
+
+1. :meth:`Gemma4ToolParser.adjust_request` used an
+   ``isinstance(request, ChatCompletionRequest)`` guard, so a
+   :class:`ResponsesRequest` with tools never had
+   ``skip_special_tokens`` flipped to ``False``. The default (``True``)
+   stripped ``<|tool_call>`` / ``<|/tool_call>`` delimiters, causing
+   :meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through
+   to the content branch and leak the raw ``call:fn{...}`` body via
+   ``response.output_text.delta``.
+
+2. :meth:`ToolParser.adjust_request` built
+   :class:`ResponseTextConfig` in two steps (bare constructor then
+   ``.format = ...``). Under Pydantic v2 the later assignment is not
+   tracked in ``__fields_set__``, which can drop the nested config from
+   ``model_dump``. It also passed a ``description`` kwarg carrying the
+   wrong-purpose string ``"Response format for tool calling"``.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from openai.types.responses.tool_param import FunctionToolParam
+
+from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+from vllm.tool_parsers.abstract_tool_parser import ToolParser
+from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser
+
+
+def _get_weather_tool() -> FunctionToolParam:
+    return FunctionToolParam(
+        type="function",
+        name="get_weather",
+        description="Get current weather for a city",
+        parameters={
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+        strict=True,
+    )
+
+
+def _build_responses_request(*, tool_choice: str) -> ResponsesRequest:
+    return ResponsesRequest(
+        model="gemma4-test",
+        input=[{"role": "user", "content": "What is the weather in Hanoi?"}],
+        tools=[_get_weather_tool()],
+        tool_choice=tool_choice,
+        stream=True,
+        max_output_tokens=200,
+    )
+
+
+class _StubTokenizer:
+    """Minimal tokenizer stub to satisfy ``Gemma4ToolParser.__init__``."""
+
+    def get_vocab(self) -> dict[str, int]:
+        return {"<|tool_call>": 256_000, "<|/tool_call>": 256_001, '<|"|>': 52}
+
+
+def test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None:
+    """``Gemma4ToolParser.adjust_request`` must flip
+    ``skip_special_tokens=False`` for both ``ChatCompletionRequest`` and
+    ``ResponsesRequest`` so that ``<|tool_call>`` delimiters reach the
+    streaming extractor. The previous
+    ``isinstance(ChatCompletionRequest)`` guard omitted the Responses
+    path, causing raw ``call:fn{...}`` text to leak via
+    ``response.output_text.delta``.
+ """ + parser = Gemma4ToolParser.__new__(Gemma4ToolParser) + parser.model_tokenizer = _StubTokenizer() + + request = _build_responses_request(tool_choice="auto") + assert request.skip_special_tokens is True, ( + "Precondition: ResponsesRequest.skip_special_tokens default is True" + ) + + Gemma4ToolParser.adjust_request(parser, request) + + assert request.skip_special_tokens is False + + +def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None: + """``ToolParser.adjust_request`` must produce a ``ResponseTextConfig`` + whose dumped form contains the JSON schema under the ``schema`` alias + and does not leak the unrelated ``"Response format for tool calling"`` + description string that the previous two-step construction injected. + """ + parser = ToolParser.__new__(ToolParser) + parser.model_tokenizer = None + + request = _build_responses_request(tool_choice="required") + ToolParser.adjust_request(parser, request) + + assert request.text is not None + assert request.text.format is not None + assert request.text.format.type == "json_schema" + + dump: dict[str, Any] = request.text.model_dump(mode="json", by_alias=True) + fmt = dump.get("format") or {} + assert fmt.get("type") == "json_schema" + assert fmt.get("name") == "tool_calling_response" + assert fmt.get("strict") is True + # Nested config must be present under the alias. Two-step Pydantic v2 + # construction could drop it from __fields_set__. + assert "schema" in fmt and isinstance(fmt["schema"], dict) + # The old code passed a wrong-purpose string; valid field should now + # either be absent or None (the openai-python default). + assert fmt.get("description") in (None, "") diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py index c127bd9dd689..d1099e3e290f 100644 --- a/vllm/tool_parsers/abstract_tool_parser.py +++ b/vllm/tool_parsers/abstract_tool_parser.py @@ -92,13 +92,20 @@ def adjust_request( ) request.response_format = None if isinstance(request, ResponsesRequest): - request.text = ResponseTextConfig() - request.text.format = ResponseFormatTextJSONSchemaConfig( - name="tool_calling_response", - schema=json_schema_from_tool, - type="json_schema", - description="Response format for tool calling", - strict=True, + # Single-shot construction so Pydantic v2 tracks `format` + # in __fields_set__ — assigning to `.format` after the bare + # `ResponseTextConfig()` constructor does not, which can + # drop the nested config from `model_dump`. Also drop the + # `description` kwarg: it is not a field on + # ResponseFormatTextJSONSchemaConfig and was being silently + # passed through as extra. + request.text = ResponseTextConfig( + format=ResponseFormatTextJSONSchemaConfig( + type="json_schema", + name="tool_calling_response", + schema=json_schema_from_tool, + strict=True, + ) ) return request diff --git a/vllm/tool_parsers/gemma4_tool_parser.py b/vllm/tool_parsers/gemma4_tool_parser.py index ac48ef26cc19..a5ff2bcf8fce 100644 --- a/vllm/tool_parsers/gemma4_tool_parser.py +++ b/vllm/tool_parsers/gemma4_tool_parser.py @@ -360,12 +360,13 @@ def adjust_request( self, request: ChatCompletionRequest | ResponsesRequest ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) - if ( - isinstance(request, ChatCompletionRequest) - and request.tools - and request.tool_choice != "none" - ): - # Don't skip special tokens — <|tool_call> etc. are needed + if request.tools and request.tool_choice != "none": + # Don't skip special tokens — <|tool_call> etc. 
+            # the parser to detect tool calls. Apply to BOTH
+            # ChatCompletionRequest and ResponsesRequest (the previous
+            # isinstance(ChatCompletionRequest) guard caused tool-call
+            # delimiters to be stripped on /v1/responses, leaking raw
+            # `call:fn{...}` text via output_text.delta).
             request.skip_special_tokens = False
         return request
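
Reviewer note: below is a minimal standalone sketch of the single-shot
construction this fix relies on, runnable against openai-python alone,
outside vLLM. Assumptions: the Responses types exported by
openai.types.responses (the same names the diff imports), and a literal
schema dict standing in for the json_schema_from_tool value that
adjust_request computes.

from openai.types.responses import (
    ResponseFormatTextJSONSchemaConfig,
    ResponseTextConfig,
)

schema = {
    "type": "object",
    "properties": {"city": {"type": "string"}},
    "required": ["city"],
}

text = ResponseTextConfig(
    format=ResponseFormatTextJSONSchemaConfig(
        type="json_schema",
        name="tool_calling_response",
        schema=schema,  # populated via the `schema` alias of `schema_`
        strict=True,
    )
)

# Dump by alias, as the regression test does: the nested config must
# survive the dump and carry the schema under the "schema" key.
dump = text.model_dump(mode="json", by_alias=True)
assert dump["format"]["type"] == "json_schema"
assert dump["format"]["schema"] == schema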
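
And a toy illustration of why adjust_request must flip
skip_special_tokens on the Responses path. The decode helper below is
hypothetical (vLLM's real detokenizer is more involved), and the
<|/tool_call> closing token mirrors the stub vocabulary in the test;
the point is the failure mode: with skip_special_tokens=True the
delimiters never reach the streaming tool parser.

SPECIAL_TOKENS = {"<|tool_call>", "<|/tool_call>"}


def decode(pieces: list[str], skip_special_tokens: bool) -> str:
    # Hypothetical stand-in for detokenization: skipping special tokens
    # removes the delimiters the tool parser keys on.
    if skip_special_tokens:
        pieces = [p for p in pieces if p not in SPECIAL_TOKENS]
    return "".join(pieces)


generated = ["<|tool_call>", 'call:get_weather{"city": "Hanoi"}', "<|/tool_call>"]

# Pre-fix Responses behavior: delimiters stripped, so the raw body
# leaks as a plain content delta (response.output_text.delta).
assert decode(generated, skip_special_tokens=True) == 'call:get_weather{"city": "Hanoi"}'

# Post-fix: delimiters present, so the parser can route the body to
# tool-call deltas instead of content.
assert decode(generated, skip_special_tokens=False).startswith("<|tool_call>")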