Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tests/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
203 changes: 203 additions & 0 deletions tests/parser/test_reasoning_tool_transition.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Regression tests for Gemma4 reasoning-to-tool-call transition.

Issue #40911: partial tool-call markers (e.g. "<|" from "<|tool_call>")
can leak into content when the reasoning end token and the tool-call
start token arrive in the same streaming delta. The fix lives in
Gemma4ReasoningParser.extract_reasoning_streaming(), which reconstructs
content from token IDs instead of relying on text-based splitting.
"""

import pytest

from vllm.entrypoints.openai.chat_completion.protocol import (
ChatCompletionRequest,
)
from vllm.parser.abstract_parser import _WrappedParser
from vllm.reasoning.gemma4_reasoning_parser import Gemma4ReasoningParser
from vllm.tokenizers.registry import get_tokenizer
from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser

TOKENIZER_NAME = "google/gemma-4-E2B-it"


@pytest.fixture(scope="module")
def tokenizer():
return get_tokenizer(TOKENIZER_NAME)


@pytest.fixture(scope="module")
def vocab(tokenizer):
return tokenizer.get_vocab()


def _make_parser(tokenizer):
"""Create a DelegatingParser with Gemma4 parsers using a local subclass
to avoid polluting _WrappedParser class attributes across tests."""

class _Gemma4TestParser(_WrappedParser):
reasoning_parser_cls = Gemma4ReasoningParser
tool_parser_cls = Gemma4ToolParser

return _Gemma4TestParser(tokenizer)


def _encode(tokenizer, text: str) -> list[int]:
enc = getattr(tokenizer, "tokenizer", tokenizer)
try:
return enc.encode(text, add_special_tokens=False)
except TypeError:
return enc.encode(text)


def _request():
return ChatCompletionRequest(messages=[], model="test-model")


class TestReasoningToToolTransition:
"""Verify that partial tool-call markers do not leak as content."""

def test_partial_marker_does_not_leak(self, tokenizer, vocab):
"""
Regression test for #40911.

Simulates the actual failure mode: the reasoning end token and
tool-call start token arrive in the same delta, but delta_text
contains only a partial prefix of the tool marker (e.g. "<|")
due to incremental detokenization. The partial prefix must not
appear as content.
"""
parser = _make_parser(tokenizer)
request = _request()

channel_end_id = vocab["<channel|>"]
channel_start_id = vocab["<|channel>"]
tool_call_start_id = vocab.get("<|tool_call>")
if tool_call_start_id is None:
pytest.skip("<|tool_call> not in vocab")

# Phase 1: feed reasoning tokens one by one
reasoning_tokens = _encode(tokenizer, "thinking about tools")
for tid in [channel_start_id] + reasoning_tokens:
delta_text = tokenizer.decode([tid], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[tid],
request=request,
)

# Phase 2: reasoning end + tool call start in same delta.
# Construct delta_text with a partial prefix of the tool token
# to simulate the actual failure mode (incremental detokenization
# can produce a prefix-diff that splits the special token text).
channel_end_text = tokenizer.decode([channel_end_id], skip_special_tokens=False)
tool_token_text = tokenizer.decode(
[tool_call_start_id], skip_special_tokens=False
)
partial_prefix = tool_token_text[:2] # e.g. "<|"
combined_ids = [channel_end_id, tool_call_start_id]
combined_text = channel_end_text + partial_prefix

msg = parser.parse_delta(
delta_text=combined_text,
delta_token_ids=combined_ids,
request=request,
)

if msg is not None and msg.content is not None:
assert partial_prefix not in msg.content, (
f"Partial tool-call marker leaked as content: {msg.content!r}"
)

def test_separate_deltas_no_leak(self, tokenizer, vocab):
"""
When reasoning end and tool-call start arrive in separate deltas,
the tool marker must not leak as content.
"""
parser = _make_parser(tokenizer)
request = _request()

channel_start_id = vocab["<|channel>"]
channel_end_id = vocab["<channel|>"]
tool_call_start_id = vocab.get("<|tool_call>")
if tool_call_start_id is None:
pytest.skip("<|tool_call> not in vocab")

# Reasoning start + content
delta_text = tokenizer.decode([channel_start_id], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[channel_start_id],
request=request,
)
for tid in _encode(tokenizer, "reasoning"):
delta_text = tokenizer.decode([tid], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[tid],
request=request,
)

# Reasoning end (separate delta)
delta_text = tokenizer.decode([channel_end_id], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[channel_end_id],
request=request,
)

# Tool call start (separate delta) — should not leak as content
delta_text = tokenizer.decode([tool_call_start_id], skip_special_tokens=False)
msg = parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[tool_call_start_id],
request=request,
)

if msg is not None and msg.content is not None:
assert "<|tool_call>" not in msg.content, (
f"Tool marker leaked as content: {msg.content!r}"
)

def test_reasoning_end_only_no_leak(self, tokenizer, vocab):
"""
When reasoning ends and no tool tokens follow in the same delta,
no content should leak.
"""
parser = _make_parser(tokenizer)
request = _request()

channel_start_id = vocab["<|channel>"]
channel_end_id = vocab["<channel|>"]

# Start reasoning
delta_text = tokenizer.decode([channel_start_id], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[channel_start_id],
request=request,
)

# Some reasoning text
for tid in _encode(tokenizer, "some thought"):
delta_text = tokenizer.decode([tid], skip_special_tokens=False)
parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[tid],
request=request,
)

# End reasoning (no tool call follows in this delta)
delta_text = tokenizer.decode([channel_end_id], skip_special_tokens=False)
msg = parser.parse_delta(
delta_text=delta_text,
delta_token_ids=[channel_end_id],
request=request,
)

if msg is not None and msg.content is not None:
assert msg.content.strip() == "", (
f"Unexpected content at reasoning end: {msg.content!r}"
)
17 changes: 17 additions & 0 deletions vllm/reasoning/gemma4_reasoning_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,23 @@ def extract_reasoning_streaming(
if result is None:
return None

# When reasoning ends in this delta, the base class splits
# delta_text on the end-token string to separate reasoning from
# content. That text split can produce partial special-token
# fragments (e.g. "<|" from "<|tool_call>") because the
# incremental detokenizer text may not align with token
# boundaries. Reconstruct content from token IDs instead,
# which are always exact. (#40911)
if self.end_token_id in delta_token_ids and result.content is not None:
content_ids = self.extract_content_ids(list(delta_token_ids))
if content_ids:
result.content = self.model_tokenizer.decode(
content_ids,
skip_special_tokens=False,
)
else:
result.content = None

if result.reasoning is None:
return result

Expand Down
Loading