From cac2a253c63dff98d26eb6139e45476acb723ed5 Mon Sep 17 00:00:00 2001 From: Hoang Nguyen <118159510+hnt2601@users.noreply.github.com> Date: Wed, 15 Apr 2026 09:25:54 +0000 Subject: [PATCH] [Bugfix][Responses API] Fix streaming tool calls on /v1/responses Two bugs made streaming function calling unusable on the Responses API for any tool-call parser that relies on special-token delimiters (Gemma4), and for any parser when tool_choice="required" is combined with stream=True. ## 1. Gemma4 tool calls leak as plain text via response.output_text.delta `Gemma4ToolParser.adjust_request` guarded the `skip_special_tokens = False` line with `isinstance(request, ChatCompletionRequest)`, so a `ResponsesRequest` carrying tools kept the default `skip_special_tokens = True`. The tokenizer then stripped the Gemma4 delimiters (`<|tool_call>`, `<tool_call|>`, `<|"|>`) from the detokenized text before the parser saw them, and `Gemma4ToolParser.extract_tool_calls_streaming` took the `self.tool_call_start_token not in current_text` branch and emitted the raw `call:fn{...}` body via `response.output_text.delta` instead of `response.function_call_arguments.delta`. Fix: drop the `isinstance` guard so both `ChatCompletionRequest` and `ResponsesRequest` get `skip_special_tokens = False`, matching the pattern already used by `FunctionGemmaToolParser.adjust_request`. ## 2. tool_choice="required" + stream=True crashes on /v1/responses `ToolParser.adjust_request` built `ResponseTextConfig` in two steps (bare constructor, then `request.text.format = ...`). Under Pydantic v2 the post-init field assignment is not tracked in `__fields_set__`, which can drop the nested config from `model_dump(...)` and surface downstream as `ValidationError: schema field required` when the initial `ResponseCreatedEvent` is serialized. The same call site also passed a `description="Response format for tool calling"` kwarg that is not semantically a tool schema description.
Fix: use a single-shot `ResponseTextConfig(format=...)` constructor so `format` is part of `__fields_set__`, and drop the `description` kwarg. ## Tests Added tests/tool_use/test_gemma4_responses_adjust_request.py with two unit regressions: - test_gemma4_adjust_request_sets_skip_special_tokens_on_responses: asserts Gemma4ToolParser.adjust_request flips skip_special_tokens=False for a ResponsesRequest with tools. - test_tool_parser_adjust_request_builds_valid_response_text_config: asserts the dumped ResponseTextConfig (with by_alias=True) has format.type=="json_schema", contains the nested schema key, and does not leak the old "Response format for tool calling" string. Both tests fail on main and pass after this change. End-to-end curl verification against a live Gemma4 server (--tool-call-parser gemma4 --enable-auto-tool-choice on a single H100) confirms response.function_call_arguments.delta events are now emitted and no call:get_weather{...} text leaks via response.output_text.delta. Signed-off-by: Hoang Nguyen <118159510+hnt2601@users.noreply.github.com> Co-authored-by: Claude --- .../test_gemma4_responses_adjust_request.py | 116 ++++++++++++++++++ vllm/tool_parsers/abstract_tool_parser.py | 21 ++-- vllm/tool_parsers/gemma4_tool_parser.py | 13 +- 3 files changed, 137 insertions(+), 13 deletions(-) create mode 100644 tests/tool_use/test_gemma4_responses_adjust_request.py diff --git a/tests/tool_use/test_gemma4_responses_adjust_request.py b/tests/tool_use/test_gemma4_responses_adjust_request.py new file mode 100644 index 000000000000..e08896ee3237 --- /dev/null +++ b/tests/tool_use/test_gemma4_responses_adjust_request.py @@ -0,0 +1,116 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Regression tests for Responses API tool-calling request adjustment. + +Covers two bugs on the ``/v1/responses`` path that broke streaming tool +calling for parsers relying on special-token delimiters (Gemma4): + +1. 
:class:`Gemma4ToolParser.adjust_request` used an + ``isinstance(request, ChatCompletionRequest)`` guard, so a + :class:`ResponsesRequest` with tools never had + ``skip_special_tokens`` flipped to ``False``. The default (``True``) + stripped ``<|tool_call>`` / ``<tool_call|>`` delimiters, causing + :meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through + to the content branch and leak the raw ``call:fn{...}`` body via + ``response.output_text.delta``. + +2. :meth:`ToolParser.adjust_request` built + :class:`ResponseTextConfig` in two steps (bare constructor then + ``.format = ...``). Under Pydantic v2 the later assignment is not + tracked in ``__fields_set__``, which can drop the nested config from + ``model_dump``. It also passed a ``description`` kwarg carrying the + wrong-purpose string ``"Response format for tool calling"``. +""" + +from __future__ import annotations + +from typing import Any + +from openai.types.responses.tool_param import FunctionToolParam + +from vllm.entrypoints.openai.responses.protocol import ResponsesRequest +from vllm.tool_parsers.abstract_tool_parser import ToolParser +from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser + + +def _get_weather_tool() -> FunctionToolParam: + return FunctionToolParam( + type="function", + name="get_weather", + description="Get current weather for a city", + parameters={ + "type": "object", + "properties": {"city": {"type": "string"}}, + "required": ["city"], + }, + strict=True, + ) + + +def _build_responses_request(*, tool_choice: str) -> ResponsesRequest: + return ResponsesRequest( + model="gemma4-test", + input=[{"role": "user", "content": "What is the weather in Hanoi?"}], + tools=[_get_weather_tool()], + tool_choice=tool_choice, + stream=True, + max_output_tokens=200, + ) + + +class _StubTokenizer: + """Minimal tokenizer stub to satisfy ``Gemma4ToolParser.__init__``.""" + + def get_vocab(self) -> dict[str, int]: + return {"<|tool_call>": 256_000, "<tool_call|>": 256_001, '<|"|>': 52} + + +def
test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None: + """``Gemma4ToolParser.adjust_request`` must flip + ``skip_special_tokens=False`` for both ``ChatCompletionRequest`` and + ``ResponsesRequest`` so that ``<|tool_call>`` delimiters reach the + streaming extractor. The previous + ``isinstance(ChatCompletionRequest)`` guard omitted the Responses + path, causing raw ``call:fn{...}`` text to leak via + ``response.output_text.delta``. + """ + parser = Gemma4ToolParser.__new__(Gemma4ToolParser) + parser.model_tokenizer = _StubTokenizer() + + request = _build_responses_request(tool_choice="auto") + assert request.skip_special_tokens is True, ( + "Precondition: ResponsesRequest.skip_special_tokens default is True" + ) + + Gemma4ToolParser.adjust_request(parser, request) + + assert request.skip_special_tokens is False + + +def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None: + """``ToolParser.adjust_request`` must produce a ``ResponseTextConfig`` + whose dumped form contains the JSON schema under the ``schema`` alias + and does not leak the unrelated ``"Response format for tool calling"`` + description string that the previous two-step construction injected. + """ + parser = ToolParser.__new__(ToolParser) + parser.model_tokenizer = None + + request = _build_responses_request(tool_choice="required") + ToolParser.adjust_request(parser, request) + + assert request.text is not None + assert request.text.format is not None + assert request.text.format.type == "json_schema" + + dump: dict[str, Any] = request.text.model_dump(mode="json", by_alias=True) + fmt = dump.get("format") or {} + assert fmt.get("type") == "json_schema" + assert fmt.get("name") == "tool_calling_response" + assert fmt.get("strict") is True + # Nested config must be present under the alias. Two-step Pydantic v2 + # construction could drop it from __fields_set__. 
+ assert "schema" in fmt and isinstance(fmt["schema"], dict) + # The old code passed a wrong-purpose string; valid field should now + # either be absent or None (the openai-python default). + assert fmt.get("description") in (None, "") diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py index c127bd9dd689..d1099e3e290f 100644 --- a/vllm/tool_parsers/abstract_tool_parser.py +++ b/vllm/tool_parsers/abstract_tool_parser.py @@ -92,13 +92,20 @@ def adjust_request( ) request.response_format = None if isinstance(request, ResponsesRequest): - request.text = ResponseTextConfig() - request.text.format = ResponseFormatTextJSONSchemaConfig( - name="tool_calling_response", - schema=json_schema_from_tool, - type="json_schema", - description="Response format for tool calling", - strict=True, + # Single-shot construction so Pydantic v2 tracks `format` + # in __fields_set__ — assigning to `.format` after the bare + # `ResponseTextConfig()` constructor does not, which can + # drop the nested config from `model_dump`. Also drop the + # `description` kwarg: its old value described the wrapper, + # not the schema, so the field is left at its openai-python + # default (None). + request.text = ResponseTextConfig( + format=ResponseFormatTextJSONSchemaConfig( + type="json_schema", + name="tool_calling_response", + schema=json_schema_from_tool, + strict=True, + ) ) return request diff --git a/vllm/tool_parsers/gemma4_tool_parser.py b/vllm/tool_parsers/gemma4_tool_parser.py index ac48ef26cc19..a5ff2bcf8fce 100644 --- a/vllm/tool_parsers/gemma4_tool_parser.py +++ b/vllm/tool_parsers/gemma4_tool_parser.py @@ -360,12 +360,13 @@ def adjust_request( self, request: ChatCompletionRequest | ResponsesRequest ) -> ChatCompletionRequest | ResponsesRequest: request = super().adjust_request(request) - if ( - isinstance(request, ChatCompletionRequest) - and request.tools - and request.tool_choice != "none" - ): - # Don't skip special tokens — <|tool_call> etc.
are needed + if request.tools and request.tool_choice != "none": + # Don't skip special tokens — <|tool_call> etc. are needed for + # the parser to detect tool calls. Apply to BOTH + # ChatCompletionRequest and ResponsesRequest (the previous + # isinstance(ChatCompletionRequest) guard caused tool-call + # delimiters to be stripped on /v1/responses, leaking raw + # `call:fn{...}` text via output_text.delta). request.skip_special_tokens = False return request