From 8db496bc4c423570366c14b3212ad17e458e2835 Mon Sep 17 00:00:00 2001
From: Mohamed Mostafa
Date: Sun, 3 May 2026 17:14:35 +0300
Subject: [PATCH] [Bugfix] fix(reasoning): route streaming deltas as content
 when prompt_is_reasoning_end and no tool parser

`DelegatingParser._in_reasoning_phase` returned `True` unconditionally
when `self._tool_parser is None`, ignoring `state.reasoning_ended`.

When `enable_thinking=False` is used with Qwen3, the chat template
injects `<think>\n\n</think>\n\n` into the prompt. The serving layer
detects this via `is_reasoning_end(prompt_token_ids)` and sets
`state.reasoning_ended=True` before any output tokens arrive. However,
because `_in_reasoning_phase` ignored `state.reasoning_ended` in the
no-tool-parser path, all generated tokens still flowed through
`extract_reasoning_streaming` and were emitted as
`DeltaMessage(reasoning=...)` instead of `DeltaMessage(content=...)`,
leaving `choices[0].delta.content` empty for the entire stream. A
client-side reproduction sketch follows the change list below.

Fixes: vllm-project/vllm#40816

Changes:
- `_in_reasoning_phase`: drop the unconditional `return True` taken when
  there is no tool parser, so the method always falls through to
  `not state.reasoning_ended` and reasoning is never re-entered once
  ended.
- `parse_delta`: add a content pass-through branch for the case where
  reasoning has ended but there is no tool parser, so deltas are not
  silently dropped (a condensed sketch of the resulting dispatch order
  follows the diff below).
- Add regression test `test_prompt_is_reasoning_end_routes_to_content`
  in `tests/reasoning/test_qwen3_reasoning_parser.py` that exercises
  `DelegatingParser.parse_delta` with a prompt containing `</think>`.
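
A minimal reproduction sketch against an OpenAI-compatible vLLM server
(illustrative only: the model name and server address are placeholders,
and `chat_template_kwargs` is vLLM's request-level pass-through for the
Qwen3 chat template's `enable_thinking` switch):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")
    stream = client.chat.completions.create(
        model="Qwen/Qwen3-8B",  # placeholder; any Qwen3 chat model
        messages=[{"role": "user", "content": "Hi"}],
        stream=True,
        extra_body={"chat_template_kwargs": {"enable_thinking": False}},
    )
    for chunk in stream:
        delta = chunk.choices[0].delta
        # Before this fix, delta.content was None for every chunk and the
        # answer arrived under the reasoning field; with the fix, content
        # carries the answer.
        print(delta.content, getattr(delta, "reasoning", None))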
Signed-off-by: Mohamed Mostafa
Co-authored-by: Claude (Anthropic)
---
 .../reasoning/test_qwen3_reasoning_parser.py | 75 +++++++++++++++++++
 vllm/parser/abstract_parser.py               |  7 +-
 2 files changed, 80 insertions(+), 2 deletions(-)

diff --git a/tests/reasoning/test_qwen3_reasoning_parser.py b/tests/reasoning/test_qwen3_reasoning_parser.py
index f42458560f9f..85857beae308 100644
--- a/tests/reasoning/test_qwen3_reasoning_parser.py
+++ b/tests/reasoning/test_qwen3_reasoning_parser.py
@@ -10,6 +10,7 @@
     run_reasoning_extraction_streaming,
 )
 from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.parser.abstract_parser import _WrappedParser
 from vllm.reasoning import ReasoningParser, ReasoningParserManager
 
 parser_name = "qwen3"
@@ -373,3 +374,77 @@ def test_reasoning_thinking_disabled(
 
     assert reasoning == expected_reasoning
     assert content == expected_content
+
+
+# --- Tests for prompt_is_reasoning_end (streaming, DelegatingParser path) ---
+
+
+@pytest.mark.parametrize(
+    "content_deltas",
+    [
+        pytest.param(
+            ["Hello", " world", "!"],
+            id="short_answer",
+        ),
+        pytest.param(
+            ["The answer is 42."],
+            id="single_delta",
+        ),
+        pytest.param(
+            ["Line one\n", "Line two\n", "Line three"],
+            id="multiline",
+        ),
+    ],
+)
+def test_prompt_is_reasoning_end_routes_to_content(
+    content_deltas: list[str],
+    qwen3_tokenizer,
+):
+    """Regression test for: Qwen3.6 streaming emits final answer in
+    delta.reasoning when enable_thinking=False (vllm-project/vllm#40816).
+
+    When </think> is present in the prompt (enable_thinking=False), the
+    serving layer sets reasoning_ended=True via prompt_is_reasoning_end
+    before any output tokens arrive. DelegatingParser.parse_delta must
+    route all subsequent deltas as content, not reasoning.
+    """
+
+    class _Qwen3Parser(_WrappedParser):
+        reasoning_parser_cls = ReasoningParserManager.get_reasoning_parser("qwen3")
+        tool_parser_cls = None
+
+    parser = _Qwen3Parser(qwen3_tokenizer)
+
+    # Build a fake prompt_token_ids that contains the end token,
+    # simulating the token sequence injected by the chat template when
+    # enable_thinking=False.
+    end_token_id = parser._reasoning_parser.end_token_id
+    prompt_token_ids: list[int] | None = [1, 2, end_token_id, 3]
+
+    request = ChatCompletionRequest(messages=[], model="test-model")
+    reconstructor = StreamingReasoningReconstructor()
+
+    for delta in content_deltas:
+        token_ids = [
+            qwen3_tokenizer.get_vocab().get(tok)
+            for tok in qwen3_tokenizer.tokenize(delta)
+            if tok in qwen3_tokenizer.get_vocab()
+        ]
+        delta_message = parser.parse_delta(
+            delta_text=delta,
+            delta_token_ids=token_ids,
+            request=request,
+            prompt_token_ids=prompt_token_ids,
+        )
+        # Only pass prompt_token_ids on the first call (as the serving layer does).
+        prompt_token_ids = None
+        if delta_message is not None:
+            reconstructor.append_delta(delta_message)
+
+    expected_content = "".join(content_deltas)
+    assert reconstructor.reasoning is None, (
+        f"Expected no reasoning, got: {reconstructor.reasoning!r}"
+    )
+    assert reconstructor.other_content == expected_content, (
+        f"Expected content {expected_content!r}, got: {reconstructor.other_content!r}"
+    )
diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py
index e7f83686dbef..a7ff5eed44d0 100644
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -635,8 +635,6 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
     def _in_reasoning_phase(self, state: StreamState) -> bool:
         if self._reasoning_parser is None:
             return False
-        if self._tool_parser is None:
-            return True
         return not state.reasoning_ended
 
     def _in_tool_call_phase(self, state: StreamState) -> bool:
@@ -711,6 +709,11 @@ def parse_delta(
         # No parsers: pass through as content
         if self._reasoning_parser is None and self._tool_parser is None:
             delta_message = DeltaMessage(content=delta_text)
+        # Reasoning ended (e.g. enable_thinking=False detected via
+        # prompt_is_reasoning_end) and no tool parser: route remaining
+        # deltas directly as content.
+        elif self._tool_parser is None and state.reasoning_ended:
+            delta_message = DeltaMessage(content=delta_text) if delta_text else None
 
         state.previous_text = current_text
         state.previous_token_ids = current_token_ids
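
For reviewers: a condensed, self-contained sketch of the streaming
dispatch order after this change. It is illustrative only — `route` and
the trimmed-down `StreamState` below are stand-ins for
`DelegatingParser.parse_delta` and the real `StreamState` in
`vllm/parser/abstract_parser.py`, not vLLM API:

    from dataclasses import dataclass


    @dataclass
    class StreamState:
        reasoning_ended: bool = False


    def route(
        delta_text: str,
        state: StreamState,
        has_reasoning_parser: bool,
        has_tool_parser: bool,
    ) -> tuple[str, str] | None:
        """Mirrors the branch order in DelegatingParser.parse_delta."""
        # No parsers at all: raw pass-through as content (pre-existing branch).
        if not has_reasoning_parser and not has_tool_parser:
            return ("content", delta_text)
        # New branch: reasoning already ended (e.g. </think> came in the
        # prompt) and there is no tool parser, so emit content directly.
        if not has_tool_parser and state.reasoning_ended:
            return ("content", delta_text) if delta_text else None
        # Otherwise the reasoning/tool parsers consume the delta
        # (abbreviated here as a reasoning emission).
        return ("reasoning", delta_text)


    # enable_thinking=False: reasoning_ended is set from the prompt before
    # the first output delta, so everything streams as content.
    assert route(
        "Hi", StreamState(reasoning_ended=True),
        has_reasoning_parser=True, has_tool_parser=False,
    ) == ("content", "Hi")
    # Default thinking mode: early deltas are still reasoning.
    assert route(
        "Hmm", StreamState(),
        has_reasoning_parser=True, has_tool_parser=False,
    ) == ("reasoning", "Hmm")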