From 96fec33115df04e5fc0a5117614b6f631086e3d8 Mon Sep 17 00:00:00 2001 From: rixav77 Date: Mon, 27 Apr 2026 11:18:45 +0530 Subject: [PATCH 1/3] fix(parser): prevent partial tool-call markers leaking as content during reasoning-to-tool transition When transitioning from reasoning to tool-call parsing in DelegatingParser.parse_delta(), the content text was taken from the reasoning parser's text-based split (delta_message.content). This could contain partial special-token fragments (e.g. "<|" from "<|tool_call>") when the reasoning end token and tool-call start token arrived in the same streaming delta. Fix: reconstruct the handoff text from extract_content_ids() token IDs via tokenizer.decode() instead of using the text-split content. Token IDs have exact boundaries, so partial-token text cannot leak. Closes #40911 Co-authored-by: Claude Signed-off-by: rixav77 --- tests/parser/__init__.py | 2 + .../parser/test_reasoning_tool_transition.py | 196 ++++++++++++++++++ vllm/parser/abstract_parser.py | 14 +- 3 files changed, 209 insertions(+), 3 deletions(-) create mode 100644 tests/parser/__init__.py create mode 100644 tests/parser/test_reasoning_tool_transition.py diff --git a/tests/parser/__init__.py b/tests/parser/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/tests/parser/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/tests/parser/test_reasoning_tool_transition.py b/tests/parser/test_reasoning_tool_transition.py new file mode 100644 index 000000000000..2e14118edaa5 --- /dev/null +++ b/tests/parser/test_reasoning_tool_transition.py @@ -0,0 +1,196 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Regression tests for DelegatingParser reasoning-to-tool-call transition. + +Issue #40911: partial tool-call markers (e.g. 
"<|" from "<|tool_call>") +can leak into content when the reasoning end token and the tool-call +start token arrive in the same streaming delta. +""" + +import pytest + +from vllm.entrypoints.openai.chat_completion.protocol import ( + ChatCompletionRequest, +) +from vllm.parser.abstract_parser import _WrappedParser +from vllm.reasoning.gemma4_reasoning_parser import Gemma4ReasoningParser +from vllm.tokenizers.registry import get_tokenizer +from vllm.tool_parsers.gemma4_tool_parser import Gemma4ToolParser + +TOKENIZER_NAME = "google/gemma-4-E2B-it" + + +@pytest.fixture(scope="module") +def tokenizer(): + return get_tokenizer(TOKENIZER_NAME) + + +@pytest.fixture(scope="module") +def vocab(tokenizer): + return tokenizer.get_vocab() + + +def _make_parser(tokenizer): + _WrappedParser.reasoning_parser_cls = Gemma4ReasoningParser + _WrappedParser.tool_parser_cls = Gemma4ToolParser + return _WrappedParser(tokenizer) + + +def _encode(tokenizer, text: str) -> list[int]: + enc = getattr(tokenizer, "tokenizer", tokenizer) + try: + return enc.encode(text, add_special_tokens=False) + except TypeError: + return enc.encode(text) + + +def _request(): + return ChatCompletionRequest(messages=[], model="test-model") + + +class TestReasoningToToolTransition: + """Verify that partial tool-call markers do not leak as content.""" + + def test_partial_marker_does_not_leak(self, tokenizer, vocab): + """ + Regression test for #40911. + + Simulates: reasoning ends with , then partial "<|" + arrives before the full "<|tool_call>" token. The partial "<|" + must not appear as content. 
+ """ + parser = _make_parser(tokenizer) + request = _request() + + channel_end_id = vocab["<channel|>"] + channel_start_id = vocab["<|channel>"] + tool_call_start_id = vocab.get("<|tool_call>") + if tool_call_start_id is None: + pytest.skip("<|tool_call> not in vocab") + + # Phase 1: start reasoning + reasoning_tokens = _encode(tokenizer, "thinking about tools") + previous_text = "" + previous_token_ids: list[int] = [] + + for tid in [channel_start_id] + reasoning_tokens: + delta_text = tokenizer.decode([tid], skip_special_tokens=False) + current_text = previous_text + delta_text + current_token_ids = previous_token_ids + [tid] + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[tid], + request=request, + ) + previous_text = current_text + previous_token_ids = current_token_ids + + # Phase 2: reasoning end + tool call start in same delta + combined_ids = [channel_end_id, tool_call_start_id] + combined_text = tokenizer.decode(combined_ids, skip_special_tokens=False) + msg = parser.parse_delta( + delta_text=combined_text, + delta_token_ids=combined_ids, + request=request, + ) + + # The partial marker must not appear as content + if msg is not None and msg.content is not None: + assert "<|" not in msg.content, ( + f"Partial tool-call marker leaked as content: {msg.content!r}" + ) + + def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): + """ + When reasoning ends and a complete tool-call token follows, + the tool parser should receive it (no leak, no suppression).
+ """ + parser = _make_parser(tokenizer) + request = _request() + + channel_start_id = vocab["<|channel>"] + channel_end_id = vocab["<channel|>"] + tool_call_start_id = vocab.get("<|tool_call>") + if tool_call_start_id is None: + pytest.skip("<|tool_call> not in vocab") + + # Send reasoning start + delta_text = tokenizer.decode([channel_start_id], skip_special_tokens=False) + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[channel_start_id], + request=request, + ) + + # Send reasoning content + content_ids = _encode(tokenizer, "reasoning") + for tid in content_ids: + delta_text = tokenizer.decode([tid], skip_special_tokens=False) + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[tid], + request=request, + ) + + # Send reasoning end + delta_text = tokenizer.decode([channel_end_id], skip_special_tokens=False) + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[channel_end_id], + request=request, + ) + + # Send tool call start — now in tool phase + delta_text = tokenizer.decode([tool_call_start_id], skip_special_tokens=False) + msg = parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[tool_call_start_id], + request=request, + ) + + # Should not leak tool marker as content + if msg is not None and msg.content is not None: + assert "<|tool_call>" not in msg.content + + def test_reasoning_end_only_no_leak(self, tokenizer, vocab): + """ + When reasoning ends and no tool tokens follow in the same delta, + no content should leak.
+ """ + parser = _make_parser(tokenizer) + request = _request() + + channel_start_id = vocab["<|channel>"] + channel_end_id = vocab["<channel|>"] + + # Start reasoning + delta_text = tokenizer.decode([channel_start_id], skip_special_tokens=False) + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[channel_start_id], + request=request, + ) + + # Some reasoning text + for tid in _encode(tokenizer, "some thought"): + delta_text = tokenizer.decode([tid], skip_special_tokens=False) + parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[tid], + request=request, + ) + + # End reasoning (no tool call follows in this delta) + delta_text = tokenizer.decode([channel_end_id], skip_special_tokens=False) + msg = parser.parse_delta( + delta_text=delta_text, + delta_token_ids=[channel_end_id], + request=request, + ) + + # No content should leak from the transition + if msg is not None and msg.content is not None: + assert msg.content.strip() == "", ( + f"Unexpected content at reasoning end: {msg.content!r}" + ) diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py index b87455ab6f71..27e18e83e6cd 100644 --- a/vllm/parser/abstract_parser.py +++ b/vllm/parser/abstract_parser.py @@ -614,11 +614,19 @@ def parse_delta( if self._tool_parser and self.is_reasoning_end(delta_token_ids): state.reasoning_ended = True current_token_ids = self.extract_content_ids(delta_token_ids) - if delta_message and delta_message.content: - current_text = delta_message.content - delta_message.content = None + # Reconstruct text from token IDs instead of using the + # reasoning parser's text-split content, which can contain + # partial special-token fragments (e.g. "<|" from + # "<|tool_call>") that leak into content.
(#40911) + if current_token_ids: + current_text = self.model_tokenizer.decode( + current_token_ids, + skip_special_tokens=False, + ) else: current_text = "" + if delta_message: + delta_message.content = None # Tool call extraction if self._in_tool_call_phase(state): From 2da79cf399d00e58b0061edd8eef8e68eaeeeed1 Mon Sep 17 00:00:00 2001 From: rixav77 Date: Mon, 27 Apr 2026 14:17:12 +0530 Subject: [PATCH 2/3] =?UTF-8?q?fix:=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20use=20local=20subclass,=20simulate=20actual=20parti?= =?UTF-8?q?al=20=20=20=20text,=20rename=20test?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: rixav77 --- .../parser/test_reasoning_tool_transition.py | 67 ++++++++++--------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/tests/parser/test_reasoning_tool_transition.py b/tests/parser/test_reasoning_tool_transition.py index 2e14118edaa5..57d986710e28 100644 --- a/tests/parser/test_reasoning_tool_transition.py +++ b/tests/parser/test_reasoning_tool_transition.py @@ -32,9 +32,14 @@ def vocab(tokenizer): def _make_parser(tokenizer): - _WrappedParser.reasoning_parser_cls = Gemma4ReasoningParser - _WrappedParser.tool_parser_cls = Gemma4ToolParser - return _WrappedParser(tokenizer) + """Create a DelegatingParser with Gemma4 parsers using a local subclass + to avoid polluting _WrappedParser class attributes across tests.""" + + class _Gemma4TestParser(_WrappedParser): + reasoning_parser_cls = Gemma4ReasoningParser + tool_parser_cls = Gemma4ToolParser + + return _Gemma4TestParser(tokenizer) def _encode(tokenizer, text: str) -> list[int]: @@ -56,9 +61,11 @@ def test_partial_marker_does_not_leak(self, tokenizer, vocab): """ Regression test for #40911. - Simulates: reasoning ends with , then partial "<|" - arrives before the full "<|tool_call>" token. The partial "<|" - must not appear as content. 
+ Simulates the actual failure mode: the reasoning end token and + tool-call start token arrive in the same delta, but delta_text + contains only a partial prefix of the tool marker (e.g. "<|") + due to incremental detokenization. The partial prefix must not + appear as content. """ parser = _make_parser(tokenizer) request = _request() @@ -69,42 +76,43 @@ def test_partial_marker_does_not_leak(self, tokenizer, vocab): if tool_call_start_id is None: pytest.skip("<|tool_call> not in vocab") - # Phase 1: start reasoning + # Phase 1: feed reasoning tokens one by one reasoning_tokens = _encode(tokenizer, "thinking about tools") - previous_text = "" - previous_token_ids: list[int] = [] - for tid in [channel_start_id] + reasoning_tokens: delta_text = tokenizer.decode([tid], skip_special_tokens=False) - current_text = previous_text + delta_text - current_token_ids = previous_token_ids + [tid] parser.parse_delta( delta_text=delta_text, delta_token_ids=[tid], request=request, ) - previous_text = current_text - previous_token_ids = current_token_ids - # Phase 2: reasoning end + tool call start in same delta + # Phase 2: reasoning end + tool call start in same delta. + # Construct delta_text with a partial prefix of the tool token + # to simulate the actual failure mode (incremental detokenization + # can produce a prefix-diff that splits the special token text). + channel_end_text = tokenizer.decode([channel_end_id], skip_special_tokens=False) + tool_token_text = tokenizer.decode( + [tool_call_start_id], skip_special_tokens=False + ) + partial_prefix = tool_token_text[:2] # e.g. 
"<|" combined_ids = [channel_end_id, tool_call_start_id] - combined_text = tokenizer.decode(combined_ids, skip_special_tokens=False) + combined_text = channel_end_text + partial_prefix + msg = parser.parse_delta( delta_text=combined_text, delta_token_ids=combined_ids, request=request, ) - # The partial marker must not appear as content if msg is not None and msg.content is not None: - assert "<|" not in msg.content, ( + assert partial_prefix not in msg.content, ( f"Partial tool-call marker leaked as content: {msg.content!r}" ) - def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): + def test_separate_deltas_no_leak(self, tokenizer, vocab): """ - When reasoning ends and a complete tool-call token follows, - the tool parser should receive it (no leak, no suppression). + When reasoning end and tool-call start arrive in separate deltas, + the tool marker must not leak as content. """ parser = _make_parser(tokenizer) request = _request() @@ -115,17 +123,14 @@ def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): if tool_call_start_id is None: pytest.skip("<|tool_call> not in vocab") - # Send reasoning start + # Reasoning start + content delta_text = tokenizer.decode([channel_start_id], skip_special_tokens=False) parser.parse_delta( delta_text=delta_text, delta_token_ids=[channel_start_id], request=request, ) - - # Send reasoning content - content_ids = _encode(tokenizer, "reasoning") - for tid in content_ids: + for tid in _encode(tokenizer, "reasoning"): delta_text = tokenizer.decode([tid], skip_special_tokens=False) parser.parse_delta( delta_text=delta_text, @@ -133,7 +138,7 @@ def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): request=request, ) - # Send reasoning end + # Reasoning end (separate delta) delta_text = tokenizer.decode([channel_end_id], skip_special_tokens=False) parser.parse_delta( delta_text=delta_text, @@ -141,7 +146,7 @@ def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): request=request, ) 
- # Send tool call start — now in tool phase + # Tool call start (separate delta) — should not leak as content delta_text = tokenizer.decode([tool_call_start_id], skip_special_tokens=False) msg = parser.parse_delta( delta_text=delta_text, @@ -149,9 +154,10 @@ def test_full_marker_passes_to_tool_parser(self, tokenizer, vocab): request=request, ) - # Should not leak tool marker as content if msg is not None and msg.content is not None: - assert "<|tool_call>" not in msg.content + assert "<|tool_call>" not in msg.content, ( + f"Tool marker leaked as content: {msg.content!r}" + ) def test_reasoning_end_only_no_leak(self, tokenizer, vocab): """ @@ -189,7 +195,6 @@ def test_reasoning_end_only_no_leak(self, tokenizer, vocab): request=request, ) - # No content should leak from the transition if msg is not None and msg.content is not None: assert msg.content.strip() == "", ( f"Unexpected content at reasoning end: {msg.content!r}" From 349c1ee8b5e35da7069b56c10ce1ea51ab18bf4d Mon Sep 17 00:00:00 2001 From: rixav77 Date: Wed, 29 Apr 2026 05:35:30 +0530 Subject: [PATCH 3/3] fix: move marker-leak fix from abstract_parser to Gemma4ReasoningParser Per maintainer feedback, move the token-ID-based content reconstruction from the shared DelegatingParser into the Gemma4-specific reasoning parser to avoid propagating model-specific changes to other models. abstract_parser.py is fully reverted to its original code. 
Signed-off-by: Aryaman Verma Co-authored-by: Claude Signed-off-by: rixav77 --- tests/parser/test_reasoning_tool_transition.py | 6 ++++-- vllm/parser/abstract_parser.py | 14 +++----------- vllm/reasoning/gemma4_reasoning_parser.py | 17 +++++++++++++++++ 3 files changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/parser/test_reasoning_tool_transition.py b/tests/parser/test_reasoning_tool_transition.py index 57d986710e28..c1231b312b00 100644 --- a/tests/parser/test_reasoning_tool_transition.py +++ b/tests/parser/test_reasoning_tool_transition.py @@ -1,11 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """ -Regression tests for DelegatingParser reasoning-to-tool-call transition. +Regression tests for Gemma4 reasoning-to-tool-call transition. Issue #40911: partial tool-call markers (e.g. "<|" from "<|tool_call>") can leak into content when the reasoning end token and the tool-call -start token arrive in the same streaming delta. +start token arrive in the same streaming delta. The fix lives in +Gemma4ReasoningParser.extract_reasoning_streaming(), which reconstructs +content from token IDs instead of relying on text-based splitting. """ import pytest diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py index 27e18e83e6cd..b87455ab6f71 100644 --- a/vllm/parser/abstract_parser.py +++ b/vllm/parser/abstract_parser.py @@ -614,19 +614,11 @@ def parse_delta( if self._tool_parser and self.is_reasoning_end(delta_token_ids): state.reasoning_ended = True current_token_ids = self.extract_content_ids(delta_token_ids) - # Reconstruct text from token IDs instead of using the - # reasoning parser's text-split content, which can contain - # partial special-token fragments (e.g. "<|" from - # "<|tool_call>") that leak into content. 
(#40911) - if current_token_ids: - current_text = self.model_tokenizer.decode( - current_token_ids, - skip_special_tokens=False, - ) + if delta_message and delta_message.content: + current_text = delta_message.content + delta_message.content = None else: current_text = "" - if delta_message: - delta_message.content = None # Tool call extraction if self._in_tool_call_phase(state): diff --git a/vllm/reasoning/gemma4_reasoning_parser.py b/vllm/reasoning/gemma4_reasoning_parser.py index 6f2241603f9a..2403a131c099 100644 --- a/vllm/reasoning/gemma4_reasoning_parser.py +++ b/vllm/reasoning/gemma4_reasoning_parser.py @@ -158,6 +158,23 @@ def extract_reasoning_streaming( if result is None: return None + # When reasoning ends in this delta, the base class splits + # delta_text on the end-token string to separate reasoning from + # content. That text split can produce partial special-token + # fragments (e.g. "<|" from "<|tool_call>") because the + # incremental detokenizer text may not align with token + # boundaries. Reconstruct content from token IDs instead, + # which are always exact. (#40911) + if self.end_token_id in delta_token_ids and result.content is not None: + content_ids = self.extract_content_ids(list(delta_token_ids)) + if content_ids: + result.content = self.model_tokenizer.decode( + content_ids, + skip_special_tokens=False, + ) + else: + result.content = None + if result.reasoning is None: return result