Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions tests/tool_use/test_gemma4_responses_adjust_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Regression tests for Responses API tool-calling request adjustment.

Covers two bugs on the ``/v1/responses`` path that broke streaming tool
calling for parsers relying on special-token delimiters (Gemma4):

1. :class:`Gemma4ToolParser.adjust_request` used an
``isinstance(request, ChatCompletionRequest)`` guard, so a
:class:`ResponsesRequest` with tools never had
``skip_special_tokens`` flipped to ``False``. The default (``True``)
stripped ``<|tool_call>`` / ``<tool_call|>`` delimiters, causing
:meth:`Gemma4ToolParser.extract_tool_calls_streaming` to fall through
to the content branch and leak the raw ``call:fn{...}`` body via
``response.output_text.delta``.

2. :meth:`ToolParser.adjust_request` built
:class:`ResponseTextConfig` in two steps (bare constructor then
``.format = ...``). Under Pydantic v2 the later assignment is not
tracked in ``__fields_set__``, which can drop the nested config from
``model_dump``. It also passed a ``description`` kwarg carrying the
wrong-purpose string ``"Response format for tool calling"``.
"""

from __future__ import annotations

from typing import Any

from openai.types.responses.tool_param import FunctionToolParam

from vllm.entrypoints.openai.responses.protocol import ResponsesRequest
from vllm.tool_parsers.abstract_tool_parser import ToolParser
from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser


def _get_weather_tool() -> FunctionToolParam:
    """Return a minimal weather-lookup function tool for test requests."""
    # JSON schema for the single required string argument.
    weather_schema = {
        "type": "object",
        "properties": {"city": {"type": "string"}},
        "required": ["city"],
    }
    return FunctionToolParam(
        type="function",
        name="get_weather",
        description="Get current weather for a city",
        parameters=weather_schema,
        strict=True,
    )


def _build_responses_request(*, tool_choice: str) -> ResponsesRequest:
    """Build a streaming ``ResponsesRequest`` that carries the weather tool.

    ``tool_choice`` is keyword-only so each test states its mode explicitly.
    """
    user_turn = {"role": "user", "content": "What is the weather in Hanoi?"}
    return ResponsesRequest(
        model="gemma4-test",
        input=[user_turn],
        tools=[_get_weather_tool()],
        tool_choice=tool_choice,
        stream=True,
        max_output_tokens=200,
    )


class _StubTokenizer:
"""Minimal tokenizer stub to satisfy ``Gemma4ToolParser.__init__``."""

def get_vocab(self) -> dict[str, int]:
return {"<|tool_call>": 256_000, "<tool_call|>": 256_001, '<|"|>': 52}


def test_gemma4_adjust_request_sets_skip_special_tokens_on_responses() -> None:
    """``Gemma4ToolParser.adjust_request`` must disable
    ``skip_special_tokens`` for ``ResponsesRequest`` (not only
    ``ChatCompletionRequest``) whenever tools are active, so that the
    ``<|tool_call>`` / ``<tool_call|>`` delimiters survive detokenization
    and reach the streaming extractor. The earlier
    ``isinstance(ChatCompletionRequest)`` guard skipped the Responses path
    and let raw ``call:fn{...}`` text leak via
    ``response.output_text.delta``.
    """
    # Bypass __init__ (it inspects a real vocab); only model_tokenizer
    # needs to exist on the instance.
    gemma_parser = Gemma4ToolParser.__new__(Gemma4ToolParser)
    gemma_parser.model_tokenizer = _StubTokenizer()

    responses_request = _build_responses_request(tool_choice="auto")
    # Sanity-check the default before exercising the adjustment.
    assert responses_request.skip_special_tokens is True, (
        "Precondition: ResponsesRequest.skip_special_tokens default is True"
    )

    Gemma4ToolParser.adjust_request(gemma_parser, responses_request)

    assert responses_request.skip_special_tokens is False


def test_tool_parser_adjust_request_builds_valid_response_text_config() -> None:
    """``ToolParser.adjust_request`` must emit a ``ResponseTextConfig``
    whose serialized form keeps the JSON schema under the ``schema`` alias
    and carries no stray ``"Response format for tool calling"`` description
    (an artifact of the previous two-step Pydantic construction).
    """
    # Bypass __init__; the base parser's adjust_request ignores the tokenizer.
    base_parser = ToolParser.__new__(ToolParser)
    base_parser.model_tokenizer = None

    req = _build_responses_request(tool_choice="required")
    ToolParser.adjust_request(base_parser, req)

    text_config = req.text
    assert text_config is not None
    assert text_config.format is not None
    assert text_config.format.type == "json_schema"

    dumped: dict[str, Any] = text_config.model_dump(mode="json", by_alias=True)
    fmt_section = dumped.get("format") or {}
    assert fmt_section.get("type") == "json_schema"
    assert fmt_section.get("name") == "tool_calling_response"
    assert fmt_section.get("strict") is True
    # The nested schema must survive serialization under its alias —
    # two-step Pydantic v2 construction could drop it from __fields_set__.
    assert "schema" in fmt_section and isinstance(fmt_section["schema"], dict)
    # The old code injected a wrong-purpose string here; the value should
    # now be absent or the openai-python default.
    assert fmt_section.get("description") in (None, "")
21 changes: 14 additions & 7 deletions vllm/tool_parsers/abstract_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,13 +92,20 @@ def adjust_request(
)
request.response_format = None
if isinstance(request, ResponsesRequest):
request.text = ResponseTextConfig()
request.text.format = ResponseFormatTextJSONSchemaConfig(
name="tool_calling_response",
schema=json_schema_from_tool,
type="json_schema",
description="Response format for tool calling",
strict=True,
# Single-shot construction so Pydantic v2 tracks `format`
# in __fields_set__ — assigning to `.format` after the bare
# `ResponseTextConfig()` constructor does not, which can
# drop the nested config from `model_dump`. Also drop the
# `description` kwarg: it is not a field on
# ResponseFormatTextJSONSchemaConfig and was being silently
# passed through as extra.
request.text = ResponseTextConfig(
format=ResponseFormatTextJSONSchemaConfig(
type="json_schema",
name="tool_calling_response",
schema=json_schema_from_tool,
strict=True,
)
)

return request
Expand Down
13 changes: 7 additions & 6 deletions vllm/tool_parsers/gemma4_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -360,12 +360,13 @@ def adjust_request(
self, request: ChatCompletionRequest | ResponsesRequest
) -> ChatCompletionRequest | ResponsesRequest:
request = super().adjust_request(request)
if (
isinstance(request, ChatCompletionRequest)
and request.tools
and request.tool_choice != "none"
):
# Don't skip special tokens — <|tool_call> etc. are needed
if request.tools and request.tool_choice != "none":
# Don't skip special tokens — <|tool_call> etc. are needed for
# the parser to detect tool calls. Apply to BOTH
# ChatCompletionRequest and ResponsesRequest (the previous
# isinstance(ChatCompletionRequest) guard caused tool-call
# delimiters to be stripped on /v1/responses, leaking raw
# `call:fn{...}` text via output_text.delta).
request.skip_special_tokens = False
return request

Expand Down
Loading