diff --git a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
index d7a601114b21..ae2b597e13ac 100644
--- a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
+++ b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
@@ -287,3 +287,47 @@ async def plain_call():
             f"index {i}: reasoning decode token ids ({n_reason}) != "
             f"thinking_token_budget ({expected_budget})"
         )
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
+async def test_streaming_with_thinking_disabled_stays_in_content(
+    client: openai.AsyncOpenAI,
+):
+    request_kwargs = {
+        "model": MODEL_NAME,
+        "messages": [
+            {
+                "role": "user",
+                "content": "Which is larger, 4 or 12?"
+                " Output exactly one token: 4 or 12.",
+            }
+        ],
+        "max_tokens": 16,
+        "temperature": 0.0,
+        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
+    }
+
+    response = await client.chat.completions.create(**request_kwargs)
+    message = response.choices[0].message
+    assert message.content is not None and message.content.strip() != ""
+    assert getattr(message, "reasoning", None) in (None, "")
+
+    stream = await client.chat.completions.create(
+        **request_kwargs,
+        stream=True,
+    )
+
+    content_chunks = []
+    reasoning_chunks = []
+    async for chunk in stream:
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+        if getattr(delta, "content", None):
+            content_chunks.append(delta.content)
+        if getattr(delta, "reasoning", None):
+            reasoning_chunks.append(delta.reasoning)
+
+    assert "".join(content_chunks).strip() != ""
+    assert reasoning_chunks == []
diff --git a/tests/parser/__init__.py b/tests/parser/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/tests/parser/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/parser/test_streaming.py b/tests/parser/test_streaming.py
new file mode 100644
index 000000000000..d9194d48ed5a
--- /dev/null
+++ b/tests/parser/test_streaming.py
@@ -0,0 +1,165 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.parser.abstract_parser import _WrappedParser
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
+
+
+class ThinkReasoningParser(BaseThinkingReasoningParser):
+    @property
+    def start_token(self) -> str:
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        return "</think>"
+
+
+MODEL_OUTPUT = (
+    "<think>let me think about this</think>"
+    '<tool_call>\n{"name": "get_weather", '
+    '"arguments": {"city": "Dallas"}}\n</tool_call>'
+)
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    from vllm.tokenizers import get_tokenizer
+
+    return get_tokenizer("Qwen/Qwen3-32B")
+
+
+@pytest.fixture
+def request_obj():
+    return ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+
+
+def make_parser(tokenizer, reasoning=False, tool=False):
+    _WrappedParser.reasoning_parser_cls = ThinkReasoningParser if reasoning else None
+    _WrappedParser.tool_parser_cls = Hermes2ProToolParser if tool else None
+    return _WrappedParser(tokenizer)
+
+
+def stream_text(parser, tokenizer, text, request, prompt_token_ids=None):
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    results: list[DeltaMessage | None] = []
+    for tid in token_ids:
+        delta_text = tokenizer.decode([tid])
+        result = parser.parse_delta(
+            delta_text, [tid], request, prompt_token_ids=prompt_token_ids
+        )
+        prompt_token_ids = None
+        results.append(result)
+    return results
+
+
+def collect_fields(results):
+    all_reasoning = "".join(r.reasoning for r in results if r and r.reasoning)
+    all_content = "".join(r.content for r in results if r and r.content)
+    all_tool_calls = [tc for r in results if r and r.tool_calls for tc in r.tool_calls]
+    return all_reasoning, all_content, all_tool_calls
+
+
+def test_parse_delta_neither_parser(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert len(tool_calls) == 0
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "</think>" in content
+    assert "get_weather" in content
+
+
+def test_parse_delta_tool_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "</think>" in content
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert len(tool_calls) == 0
+    assert "<tool_call>" in content
+    assert "get_weather" in content
+    assert "</tool_call>" in content
+
+
+def test_parse_delta_both_parsers(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert content == ""
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_only_thinking_disabled(tokenizer, request_obj):
+    """Regression test for vllm-project/vllm#40466.
+
+    When enable_thinking=False, the chat template places <think>\\n\\n</think>
+    in the prompt. The model then generates pure content (no think tokens).
+    All streaming output must go to delta.content, not delta.reasoning.
+    """
+    parser = make_parser(tokenizer, reasoning=True, tool=False)
+
+    end_token_id = parser._reasoning_parser.end_token_id
+    prompt_token_ids = [1, 2, end_token_id, 3]
+
+    content_text = "Hello! How can I assist you today?"
+    results = stream_text(
+        parser,
+        tokenizer,
+        content_text,
+        request_obj,
+        prompt_token_ids=prompt_token_ids,
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == "", f"Expected no reasoning, got: {reasoning!r}"
+    assert "Hello" in content
+    assert "assist" in content
+    assert len(tool_calls) == 0
diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py
index e7f83686dbef..3296ec76709d 100644
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -635,15 +635,11 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
     def _in_reasoning_phase(self, state: StreamState) -> bool:
         if self._reasoning_parser is None:
             return False
-        if self._tool_parser is None:
-            return True
         return not state.reasoning_ended
 
     def _in_tool_call_phase(self, state: StreamState) -> bool:
         if self._tool_parser is None:
             return False
-        if self._reasoning_parser is None:
-            return True
         return state.reasoning_ended
 
     def parse_delta(
@@ -657,7 +653,9 @@ def parse_delta(
 
         if not state.prompt_reasoning_checked and prompt_token_ids is not None:
             state.prompt_reasoning_checked = True
-            if self.is_reasoning_end(prompt_token_ids):
+            if self._reasoning_parser is None or self.is_reasoning_end(
+                prompt_token_ids
+            ):
                 state.reasoning_ended = True
 
         current_text = state.previous_text + delta_text
@@ -708,8 +706,12 @@
                 )
             )
 
-        # No parsers: pass through as content
-        if self._reasoning_parser is None and self._tool_parser is None:
+        # No phase active: pass through as content
+        if (
+            delta_message is None
+            and not self._in_reasoning_phase(state)
+            and not self._in_tool_call_phase(state)
+        ):
             delta_message = DeltaMessage(content=delta_text)
 
         state.previous_text = current_text
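
---

Note (not part of the patch): below is a minimal, self-contained sketch of the
delta routing that parse_delta follows after this change. StreamState.reasoning_ended
and the two phase predicates mirror names from the diff above; route_delta,
prompt_ends_reasoning, and the returned dicts are hypothetical stand-ins used only
to illustrate the control flow, not vLLM's actual API, and the tool branch is
simplified to a single fragment.

from dataclasses import dataclass


@dataclass
class StreamState:
    reasoning_ended: bool = False


def route_delta(
    state: StreamState,
    delta_text: str,
    has_reasoning_parser: bool,
    has_tool_parser: bool,
    prompt_ends_reasoning: bool = False,
) -> dict:
    # Mirrors the prompt check in parse_delta: with no reasoning parser, or
    # when the prompt already contains the end-of-thinking token (the
    # enable_thinking=False case), reasoning is over before generation starts.
    if not has_reasoning_parser or prompt_ends_reasoning:
        state.reasoning_ended = True

    # The two phase predicates, with the old short-circuits removed: a
    # configured reasoning parser no longer forces the reasoning phase when
    # no tool parser is present, and vice versa.
    in_reasoning = has_reasoning_parser and not state.reasoning_ended
    in_tool_call = has_tool_parser and state.reasoning_ended

    if in_reasoning:
        return {"reasoning": delta_text}
    if in_tool_call:
        return {"tool_fragment": delta_text}
    # No phase active: pass through as content (the new fallback branch).
    return {"content": delta_text}


# Normal thinking flow: tokens before the end token stream as reasoning.
assert route_delta(
    StreamState(), "hmm", has_reasoning_parser=True, has_tool_parser=False
) == {"reasoning": "hmm"}

# enable_thinking=False: the prompt already closed the think block, so tokens
# stream as content even though a reasoning parser is configured. Before this
# patch, _in_reasoning_phase returned True here and misrouted them to
# delta.reasoning (vllm-project/vllm#40466).
assert route_delta(
    StreamState(),
    "Hello!",
    has_reasoning_parser=True,
    has_tool_parser=False,
    prompt_ends_reasoning=True,
) == {"content": "Hello!"}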