From cac2a253c63dff98d26eb6139e45476acb723ed5 Mon Sep 17 00:00:00 2001
From: Hoang Nguyen <118159510+hnt2601@users.noreply.github.com>
Date: Wed, 15 Apr 2026 09:25:54 +0000
Subject: [PATCH] [Bugfix][Responses API] Fix streaming tool calls on
 /v1/responses

Two bugs made streaming function calling unusable on the Responses API
for any tool-call parser that relies on special-token delimiters
(Gemma4), and for any parser when tool_choice="required" is combined
with stream=True.

## 1. Gemma4 tool calls leak as plain text via response.output_text.delta

`Gemma4ToolParser.adjust_request` guarded the `skip_special_tokens =
False` line with `isinstance(request, ChatCompletionRequest)`, so a
`ResponsesRequest` carrying tools kept the default `skip_special_tokens
= True`. The tokenizer then stripped the Gemma4 delimiters
(`<|tool_call>`, `<tool_call|>`, `<|"|>`) from the detokenized text
before the parser saw them, and
`Gemma4ToolParser.extract_tool_calls_streaming` took the
`self.tool_call_start_token not in current_text` branch and emitted the
raw `call:fn{...}` body via `response.output_text.delta` instead of
`response.function_call_arguments.delta`.

Fix: drop the `isinstance` guard so both `ChatCompletionRequest` and
`ResponsesRequest` get `skip_special_tokens = False` whenever tools are
present and `tool_choice` is not `"none"`, matching the pattern already
used by `FunctionGemmaToolParser.adjust_request`.

## 2. tool_choice="required" + stream=True crashes on /v1/responses

`ToolParser.adjust_request` built `ResponseTextConfig` in two steps
(bare constructor, then `request.text.format = ...`). Under Pydantic
v2 the post-init field assignment is not tracked in `__fields_set__`
(`model_fields_set` in v2), which can drop the nested config from
`model_dump(...)` and surface downstream as `ValidationError: schema
field required` when the initial `ResponseCreatedEvent` is serialized.
The same call site also passed `description="Response format for tool
calling"`, a string that does not describe the tool schema.

Fix: use a single-shot `ResponseTextConfig(format=...)` constructor so
`format` is part of `__fields_set__`, and drop the `description`
kwarg.

## Tests

Added tests/tool_use/test_gemma4_responses_adjust_request.py with two
regression unit tests:

- test_gemma4_adjust_request_sets_skip_special_tokens_on_responses:
  asserts Gemma4ToolParser.adjust_request flips
  skip_special_tokens=False for a ResponsesRequest with tools.
- test_tool_parser_adjust_request_builds_valid_response_text_config:
  asserts the dumped ResponseTextConfig (with by_alias=True) has
  format.type=="json_schema", contains the nested schema key, and does
  not leak the old "Response format for tool calling" string.

Both tests fail on main and pass after this change. End-to-end curl
verification against a live Gemma4 server (--tool-call-parser gemma4
--enable-auto-tool-choice on a single H100) confirms
response.function_call_arguments.delta events are now emitted and no
call:get_weather{...} text leaks via response.output_text.delta.

Signed-off-by: Hoang Nguyen <118159510+hnt2601@users.noreply.github.com>
Co-authored-by: Claude <noreply@anthropic.com>
---
 .../test_gemma4_responses_adjust_request.py   | 116 ++++++++++++++++++
 vllm/tool_parsers/abstract_tool_parser.py     |  21 ++--
 vllm/tool_parsers/gemma4_tool_parser.py       |  13 +-
 3 files changed, 137 insertions(+), 13 deletions(-)
 create mode 100644 tests/tool_use/test_gemma4_responses_adjust_request.py

diff --git a/tests/tool_use/test_gemma4_responses_adjust_request.py b/tests/tool_use/test_gemma4_responses_adjust_request.py
new file mode 100644
index 000000000000..e08896ee3237
--- /dev/null
+++ b/tests/tool_use/test_gemma4_responses_adjust_request.py
@@ -0,0 +1,116 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Regression tests for Responses API tool-calling request adjustment.
+
+Covers two bugs on the ``/v1/responses`` path that broke streaming tool
+calling for parsers relying on special-token delimiters (Gemma4):
+
+1. :class:`Gemma4ToolParser.adjust_request` used an
+   ``isinstance(request, ChatCompletionRequest)`` guard, so a
+   :class:`ResponsesRequest` with tools never had
+   ``skip_special_tokens`` flipped to ``False``. The default (``True``)
+   stripped ``<|tool_call>`` / ``<tool_call|>`` delimiters, causing
+   :meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through
+   to the content branch and leak the raw ``call:fn{...}`` body via
+   ``response.output_text.delta``.
+
+2. :meth:`ToolParser.adjust_request` built
+   :class:`ResponseTextConfig` in two steps (bare constructor then
+   ``.format = ...``). Under Pydantic v2 the later assignment is not
+   tracked in ``__fields_set__``, which can drop the nested config from
+   ``model_dump``. It also passed a ``description`` kwarg carrying the
+   wrong-purpose string ``"Response format for tool calling"``.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+from openai.types.responses.tool_param import FunctionToolParam
+
+from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
+from vllm.tool_parsers.abstract_tool_parser import ToolParser
+from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser
+
+
+def _get_weather_tool() -> FunctionToolParam:  # tool attached to every test request
+    return FunctionToolParam(
+        type="function",
+        name="get_weather",
+        description="Get current weather for a city",
+        parameters={
+            "type": "object",
+            "properties": {"city": {"type": "string"}},
+            "required": ["city"],
+        },
+        strict=True,
+    )
+
+
+def _build_responses_request(*, tool_choice: str) -> ResponsesRequest:
+    return ResponsesRequest(
+        model="gemma4-test",
+        input=[{"role": "user", "content": "What is the weather in Hanoi?"}],
+        tools=[_get_weather_tool()],
+        tool_choice=tool_choice,
+        stream=True,  # the fixed bugs were hit on streaming requests
+        max_output_tokens=200,
+    )
+
+
+class _StubTokenizer:
+    """Minimal tokenizer stub; the tests bypass ``Gemma4ToolParser.__init__``."""
+
+    def get_vocab(self) -> dict[str, int]:
+        return {"<|tool_call>": 256_000, "<tool_call|>": 256_001, '<|"|>': 52}
+
+
+def test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None:
+    """``Gemma4ToolParser.adjust_request`` must flip
+    ``skip_special_tokens=False`` for a ``ResponsesRequest`` with tools
+    (only the Responses path is exercised here) so that ``<|tool_call>``
+    delimiters reach the streaming extractor. The previous
+    ``isinstance(ChatCompletionRequest)`` guard omitted the Responses
+    path, causing raw ``call:fn{...}`` text to leak via
+    ``response.output_text.delta``.
+    """
+    parser = Gemma4ToolParser.__new__(Gemma4ToolParser)
+    parser.model_tokenizer = _StubTokenizer()
+
+    request = _build_responses_request(tool_choice="auto")
+    assert request.skip_special_tokens is True, (
+        "Precondition: ResponsesRequest.skip_special_tokens default is True"
+    )
+
+    Gemma4ToolParser.adjust_request(parser, request)
+
+    assert request.skip_special_tokens is False
+
+
+def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None:
+    """``ToolParser.adjust_request`` must produce a ``ResponseTextConfig``
+    whose dumped form contains the JSON schema under the ``schema`` alias
+    and does not leak the unrelated ``"Response format for tool calling"``
+    string that the previous code passed as the ``description`` kwarg.
+    """
+    parser = ToolParser.__new__(ToolParser)
+    parser.model_tokenizer = None
+
+    request = _build_responses_request(tool_choice="required")
+    ToolParser.adjust_request(parser, request)
+
+    assert request.text is not None
+    assert request.text.format is not None
+    assert request.text.format.type == "json_schema"
+
+    dump: dict[str, Any] = request.text.model_dump(mode="json", by_alias=True)
+    fmt = dump.get("format") or {}
+    assert fmt.get("type") == "json_schema"
+    assert fmt.get("name") == "tool_calling_response"
+    assert fmt.get("strict") is True
+    # Nested config must be present under the alias. Two-step Pydantic v2
+    # construction could drop it from __fields_set__.
+    assert "schema" in fmt and isinstance(fmt["schema"], dict)
+    # The old code passed a wrong-purpose string; the field should now be
+    # absent, None (the openai-python default), or empty.
+    assert fmt.get("description") in (None, "")
diff --git a/vllm/tool_parsers/abstract_tool_parser.py b/vllm/tool_parsers/abstract_tool_parser.py
index c127bd9dd689..d1099e3e290f 100644
--- a/vllm/tool_parsers/abstract_tool_parser.py
+++ b/vllm/tool_parsers/abstract_tool_parser.py
@@ -92,13 +92,20 @@ def adjust_request(
                 )
                 request.response_format = None
             if isinstance(request, ResponsesRequest):
-                request.text = ResponseTextConfig()
-                request.text.format = ResponseFormatTextJSONSchemaConfig(
-                    name="tool_calling_response",
-                    schema=json_schema_from_tool,
-                    type="json_schema",
-                    description="Response format for tool calling",
-                    strict=True,
+                # Single-shot construction so Pydantic v2 tracks `format`
+                # in __fields_set__ — assigning to `.format` after the bare
+                # `ResponseTextConfig()` constructor does not, which can
+                # drop the nested config from `model_dump`. Also drop the
+                # `description` kwarg: the hard-coded "Response format for
+                # tool calling" string did not describe the tool schema, so
+                # the field is left at its default instead.
+                request.text = ResponseTextConfig(
+                    format=ResponseFormatTextJSONSchemaConfig(
+                        type="json_schema",
+                        name="tool_calling_response",
+                        schema=json_schema_from_tool,
+                        strict=True,
+                    )
                 )
 
         return request
diff --git a/vllm/tool_parsers/gemma4_tool_parser.py b/vllm/tool_parsers/gemma4_tool_parser.py
index ac48ef26cc19..a5ff2bcf8fce 100644
--- a/vllm/tool_parsers/gemma4_tool_parser.py
+++ b/vllm/tool_parsers/gemma4_tool_parser.py
@@ -360,12 +360,13 @@ def adjust_request(
         self, request: ChatCompletionRequest | ResponsesRequest
     ) -> ChatCompletionRequest | ResponsesRequest:
         request = super().adjust_request(request)
-        if (
-            isinstance(request, ChatCompletionRequest)
-            and request.tools
-            and request.tool_choice != "none"
-        ):
-            # Don't skip special tokens — <|tool_call> etc. are needed
+        if request.tools and request.tool_choice != "none":
+            # Don't skip special tokens — <|tool_call> etc. are needed for
+            # the parser to detect tool calls. Apply to BOTH
+            # ChatCompletionRequest and ResponsesRequest (the previous
+            # isinstance(ChatCompletionRequest) guard caused tool-call
+            # delimiters to be stripped on /v1/responses, leaking raw
+            # `call:fn{...}` text via output_text.delta).
             request.skip_special_tokens = False
         return request