diff --git a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
index d7a601114b21..ae2b597e13ac 100644
--- a/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
+++ b/tests/entrypoints/openai/chat_completion/test_thinking_token_budget.py
@@ -287,3 +287,48 @@ async def plain_call():
f"index {i}: reasoning decode token ids ({n_reason}) != "
f"thinking_token_budget ({expected_budget})"
)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
+async def test_streaming_with_thinking_disabled_stays_in_content(
+    client: openai.AsyncOpenAI,
+):
+    request_kwargs = {
+        "model": MODEL_NAME,
+        "messages": [
+            {
+                "role": "user",
+                "content": "Which is larger, 4 or 12?"
+                " Output exactly one token: 4 or 12.",
+            }
+        ],
+        "max_tokens": 16,
+        "temperature": 0.0,
+        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
+    }
+
+    response = await client.chat.completions.create(**request_kwargs)
+    message = response.choices[0].message
+    assert message.content is not None and message.content.strip() != ""
+    assert getattr(message, "reasoning", None) in (None, "")
+
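+    # Repeat the same request as a stream; every delta must arrive as content.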
+    stream = await client.chat.completions.create(
+        **request_kwargs,
+        stream=True,
+    )
+
+    content_chunks = []
+    reasoning_chunks = []
+    async for chunk in stream:
+        if not chunk.choices:
+            continue
+        delta = chunk.choices[0].delta
+        if getattr(delta, "content", None):
+            content_chunks.append(delta.content)
+        if getattr(delta, "reasoning", None):
+            reasoning_chunks.append(delta.reasoning)
+
+    assert "".join(content_chunks).strip() != ""
+    assert reasoning_chunks == []
diff --git a/tests/parser/__init__.py b/tests/parser/__init__.py
new file mode 100644
index 000000000000..208f01a7cb5e
--- /dev/null
+++ b/tests/parser/__init__.py
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
diff --git a/tests/parser/test_streaming.py b/tests/parser/test_streaming.py
new file mode 100644
index 000000000000..d9194d48ed5a
--- /dev/null
+++ b/tests/parser/test_streaming.py
@@ -0,0 +1,171 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import json
+
+import pytest
+
+from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
+from vllm.entrypoints.openai.engine.protocol import DeltaMessage
+from vllm.parser.abstract_parser import _WrappedParser
+from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
+from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser
+
+
+class ThinkReasoningParser(BaseThinkingReasoningParser):
+    @property
+    def start_token(self) -> str:
+        return "<think>"
+
+    @property
+    def end_token(self) -> str:
+        return "</think>"
+
+
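+# A <think> block followed by a Hermes-format <tool_call> block.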
+MODEL_OUTPUT = (
+    "<think>let me think about this</think>"
+    '\n<tool_call>\n{"name": "get_weather", '
+    '"arguments": {"city": "Dallas"}}\n</tool_call>'
+)
+
+
+@pytest.fixture(scope="module")
+def tokenizer():
+    from vllm.tokenizers import get_tokenizer
+
+    return get_tokenizer("Qwen/Qwen3-32B")
+
+
+@pytest.fixture
+def request_obj():
+    return ChatCompletionRequest(
+        model="test-model",
+        messages=[{"role": "user", "content": "hi"}],
+    )
+
+
+def make_parser(tokenizer, reasoning=False, tool=False):
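+    # _WrappedParser reads its parser classes from class attributes, so each
+    # test patches them before instantiating the parser.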
+    _WrappedParser.reasoning_parser_cls = ThinkReasoningParser if reasoning else None
+    _WrappedParser.tool_parser_cls = Hermes2ProToolParser if tool else None
+    return _WrappedParser(tokenizer)
+
+
+def stream_text(parser, tokenizer, text, request, prompt_token_ids=None):
+    token_ids = tokenizer.encode(text, add_special_tokens=False)
+    results: list[DeltaMessage | None] = []
+    for tid in token_ids:
+        delta_text = tokenizer.decode([tid])
+        result = parser.parse_delta(
+            delta_text, [tid], request, prompt_token_ids=prompt_token_ids
+        )
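+        # Only the first parse_delta call receives the prompt token ids.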
+        prompt_token_ids = None
+        results.append(result)
+    return results
+
+
+def collect_fields(results):
+ all_reasoning = "".join(r.reasoning for r in results if r and r.reasoning)
+ all_content = "".join(r.content for r in results if r and r.content)
+ all_tool_calls = [tc for r in results if r and r.tool_calls for tc in r.tool_calls]
+ return all_reasoning, all_content, all_tool_calls
+
+
+def test_parse_delta_neither_parser(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert len(tool_calls) == 0
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "</think>" in content
+    assert "get_weather" in content
+
+
+def test_parse_delta_tool_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=False, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == ""
+    assert "<think>" in content
+    assert "let me think about this" in content
+    assert "</think>" in content
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_parser_only(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=False)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert len(tool_calls) == 0
+    assert "<tool_call>" in content
+    assert "get_weather" in content
+    assert "</tool_call>" in content
+
+
+def test_parse_delta_both_parsers(tokenizer, request_obj):
+    parser = make_parser(tokenizer, reasoning=True, tool=True)
+    results = stream_text(
+        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert "let me think about this" in reasoning
+    assert content == ""
+
+    assert len(tool_calls) > 0
+    assert tool_calls[0].function.name == "get_weather"
+    tool_args = "".join(
+        tc.function.arguments for tc in tool_calls if tc.function.arguments
+    )
+    assert json.loads(tool_args) == {"city": "Dallas"}
+
+
+def test_parse_delta_reasoning_only_thinking_disabled(tokenizer, request_obj):
+ """Regression test for vllm-project/vllm#40466.
+
+ When enable_thinking=False, the chat template places \\n\\n
+ in the prompt. The model then generates pure content (no think tokens).
+ All streaming output must go to delta.content, not delta.reasoning.
+ """
+ parser = make_parser(tokenizer, reasoning=True, tool=False)
+
+ end_token_id = parser._reasoning_parser.end_token_id
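+    # Prompt token ids already contain the end-of-think token, as produced by
+    # a chat template that disables thinking.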
+    prompt_token_ids = [1, 2, end_token_id, 3]
+
+    content_text = "Hello! How can I assist you today?"
+    results = stream_text(
+        parser,
+        tokenizer,
+        content_text,
+        request_obj,
+        prompt_token_ids=prompt_token_ids,
+    )
+    reasoning, content, tool_calls = collect_fields(results)
+
+    assert reasoning == "", f"Expected no reasoning, got: {reasoning!r}"
+    assert "Hello" in content
+    assert "assist" in content
+    assert len(tool_calls) == 0
diff --git a/vllm/parser/abstract_parser.py b/vllm/parser/abstract_parser.py
index e7f83686dbef..3296ec76709d 100644
--- a/vllm/parser/abstract_parser.py
+++ b/vllm/parser/abstract_parser.py
@@ -635,15 +635,11 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
     def _in_reasoning_phase(self, state: StreamState) -> bool:
         if self._reasoning_parser is None:
             return False
-        if self._tool_parser is None:
-            return True
         return not state.reasoning_ended
 
     def _in_tool_call_phase(self, state: StreamState) -> bool:
         if self._tool_parser is None:
             return False
-        if self._reasoning_parser is None:
-            return True
         return state.reasoning_ended
 
     def parse_delta(
@@ -657,7 +653,11 @@ def parse_delta(
 
         if not state.prompt_reasoning_checked and prompt_token_ids is not None:
             state.prompt_reasoning_checked = True
-            if self.is_reasoning_end(prompt_token_ids):
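+            # With no reasoning parser there is no reasoning phase to track,
+            # so treat reasoning as already finished.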
+            if self._reasoning_parser is None or self.is_reasoning_end(
+                prompt_token_ids
+            ):
                 state.reasoning_ended = True
 
         current_text = state.previous_text + delta_text
@@ -708,8 +708,14 @@ def parse_delta(
                 )
             )
 
-        # No parsers: pass through as content
-        if self._reasoning_parser is None and self._tool_parser is None:
+        # No phase active: pass through as content
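+        # Also covers a reasoning parser whose phase has ended with no tool
+        # parser to take over, e.g. prompts rendered with thinking disabled.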
+        if (
+            delta_message is None
+            and not self._in_reasoning_phase(state)
+            and not self._in_tool_call_phase(state)
+        ):
             delta_message = DeltaMessage(content=delta_text)
 
         state.previous_text = current_text