Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,47 @@ async def plain_call():
f"index {i}: reasoning decode token ids ({n_reason}) != "
f"thinking_token_budget ({expected_budget})"
)


@pytest.mark.asyncio
@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
async def test_streaming_with_thinking_disabled_stays_in_content(
    client: openai.AsyncOpenAI,
):
    """With enable_thinking=False, all output must arrive as content.

    Exercises both the non-streaming and streaming chat paths and checks
    that nothing leaks into the reasoning field in either one.
    """
    common_kwargs = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": "Which is larger, 4 or 12?"
                " Output exactly one token: 4 or 12.",
            }
        ],
        "max_tokens": 16,
        "temperature": 0.0,
        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
    }

    # Non-streaming: the full answer lands in .content, never .reasoning.
    completion = await client.chat.completions.create(**common_kwargs)
    msg = completion.choices[0].message
    assert msg.content is not None and msg.content.strip() != ""
    assert getattr(msg, "reasoning", None) in (None, "")

    # Streaming: every delta must carry content, never reasoning.
    stream = await client.chat.completions.create(**common_kwargs, stream=True)

    seen_content: list[str] = []
    seen_reasoning: list[str] = []
    async for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if getattr(delta, "content", None):
            seen_content.append(delta.content)
        if getattr(delta, "reasoning", None):
            seen_reasoning.append(delta.reasoning)

    assert "".join(seen_content).strip() != ""
    assert seen_reasoning == []
2 changes: 2 additions & 0 deletions tests/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
165 changes: 165 additions & 0 deletions tests/parser/test_streaming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import pytest

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.parser.abstract_parser import _WrappedParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser


class ThinkReasoningParser(BaseThinkingReasoningParser):
    """Minimal reasoning parser that delimits thinking with <think>...</think>."""

    @property
    def start_token(self) -> str:
        # Token that opens a reasoning span in the model output.
        return "<think>"

    @property
    def end_token(self) -> str:
        # Token that closes a reasoning span in the model output.
        return "</think>"


# Canonical model output shared by the streaming tests: a reasoning span
# followed by a single Hermes-style <tool_call> JSON payload.
MODEL_OUTPUT = (
    "<think>let me think about this</think>"
    '<tool_call>\n{"name": "get_weather", '
    '"arguments": {"city": "Dallas"}}\n</tool_call>'
)


@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped real tokenizer; loading is slow, so one copy is shared."""
    # Imported lazily so collection does not pay the tokenizer import cost.
    from vllm.tokenizers import get_tokenizer

    return get_tokenizer("Qwen/Qwen3-32B")


@pytest.fixture
def request_obj():
    """A minimal chat request object to feed parse_delta in the tests."""
    params = {
        "model": "test-model",
        "messages": [{"role": "user", "content": "hi"}],
    }
    return ChatCompletionRequest(**params)


def make_parser(tokenizer, reasoning=False, tool=False):
    """Configure _WrappedParser's class-level parser hooks and construct one.

    NOTE(review): this mutates class attributes, so the chosen parsers leak
    between tests unless every test calls make_parser first — confirm that
    all tests in this module do so.
    """
    if reasoning:
        _WrappedParser.reasoning_parser_cls = ThinkReasoningParser
    else:
        _WrappedParser.reasoning_parser_cls = None
    _WrappedParser.tool_parser_cls = Hermes2ProToolParser if tool else None
    return _WrappedParser(tokenizer)


def stream_text(parser, tokenizer, text, request, prompt_token_ids=None):
    """Feed *text* to *parser* one token at a time and collect every delta.

    prompt_token_ids is forwarded only on the very first call, mirroring how
    the serving layer primes the parser with the prompt exactly once.
    """
    deltas = []
    pending_prompt = prompt_token_ids
    for token_id in tokenizer.encode(text, add_special_tokens=False):
        piece = tokenizer.decode([token_id])
        deltas.append(
            parser.parse_delta(
                piece, [token_id], request, prompt_token_ids=pending_prompt
            )
        )
        # Subsequent calls never see the prompt again.
        pending_prompt = None
    return deltas


def collect_fields(results):
    """Fold a stream of deltas into (reasoning, content, tool_calls).

    None (and otherwise falsy) entries are skipped; string fields are
    concatenated in order and tool calls are flattened into one list.
    """
    reasoning_parts = []
    content_parts = []
    tool_calls = []
    for delta in results:
        if not delta:
            continue
        if delta.reasoning:
            reasoning_parts.append(delta.reasoning)
        if delta.content:
            content_parts.append(delta.content)
        if delta.tool_calls:
            tool_calls.extend(delta.tool_calls)
    return "".join(reasoning_parts), "".join(content_parts), tool_calls


def test_parse_delta_neither_parser(tokenizer, request_obj):
    """Without any parser configured, everything passes through as content."""
    parser = make_parser(tokenizer, reasoning=False, tool=False)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == ""
    assert tool_calls == []
    # Think tags and the raw tool-call text all stay in content verbatim.
    for fragment in (
        "<think>",
        "let me think about this",
        "<tool_call>",
        "get_weather",
    ):
        assert fragment in content


def test_parse_delta_tool_parser_only(tokenizer, request_obj):
    """Tool parser alone: think tags stay in content, tool call is extracted."""
    parser = make_parser(tokenizer, reasoning=False, tool=True)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == ""
    # No reasoning parser, so the think markup is plain content.
    for fragment in ("<think>", "let me think about this", "</think>"):
        assert fragment in content

    assert tool_calls
    assert tool_calls[0].function.name == "get_weather"
    joined_args = "".join(
        call.function.arguments for call in tool_calls if call.function.arguments
    )
    assert json.loads(joined_args) == {"city": "Dallas"}


def test_parse_delta_reasoning_parser_only(tokenizer, request_obj):
    """Reasoning parser alone: think text goes to reasoning, rest to content."""
    parser = make_parser(tokenizer, reasoning=True, tool=False)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert "let me think about this" in reasoning
    assert tool_calls == []
    # No tool parser, so the raw tool-call markup remains in content.
    for fragment in ("<tool_call>", "get_weather", "</tool_call>"):
        assert fragment in content


def test_parse_delta_both_parsers(tokenizer, request_obj):
    """Both parsers: reasoning and the tool call are split out; no content."""
    parser = make_parser(tokenizer, reasoning=True, tool=True)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert "let me think about this" in reasoning
    assert content == ""

    assert tool_calls
    assert tool_calls[0].function.name == "get_weather"
    joined_args = "".join(
        call.function.arguments for call in tool_calls if call.function.arguments
    )
    assert json.loads(joined_args) == {"city": "Dallas"}


def test_parse_delta_reasoning_only_thinking_disabled(tokenizer, request_obj):
    """Regression test for vllm-project/vllm#40466.

    When enable_thinking=False, the chat template places <think>\\n\\n</think>
    in the prompt. The model then generates pure content (no think tokens).
    All streaming output must go to delta.content, not delta.reasoning.
    """
    parser = make_parser(tokenizer, reasoning=True, tool=False)

    # Simulate a prompt whose think block is already closed.
    think_end = parser._reasoning_parser.end_token_id
    primed_prompt = [1, 2, think_end, 3]

    deltas = stream_text(
        parser,
        tokenizer,
        "Hello! How can I assist you today?",
        request_obj,
        prompt_token_ids=primed_prompt,
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == "", f"Expected no reasoning, got: {reasoning!r}"
    assert "Hello" in content
    assert "assist" in content
    assert tool_calls == []
16 changes: 9 additions & 7 deletions vllm/parser/abstract_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,15 +635,11 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
def _in_reasoning_phase(self, state: StreamState) -> bool:
if self._reasoning_parser is None:
return False
if self._tool_parser is None:
return True
return not state.reasoning_ended

def _in_tool_call_phase(self, state: StreamState) -> bool:
if self._tool_parser is None:
return False
if self._reasoning_parser is None:
return True
return state.reasoning_ended

def parse_delta(
Expand All @@ -657,7 +653,9 @@ def parse_delta(

if not state.prompt_reasoning_checked and prompt_token_ids is not None:
state.prompt_reasoning_checked = True
if self.is_reasoning_end(prompt_token_ids):
if self._reasoning_parser is None or self.is_reasoning_end(
prompt_token_ids
):
state.reasoning_ended = True

current_text = state.previous_text + delta_text
Expand Down Expand Up @@ -708,8 +706,12 @@ def parse_delta(
)
)

# No parsers: pass through as content
if self._reasoning_parser is None and self._tool_parser is None:
# No phase active: pass through as content
if (
delta_message is None
and not self._in_reasoning_phase(state)
and not self._in_tool_call_phase(state)
):
delta_message = DeltaMessage(content=delta_text)

state.previous_text = current_text
Expand Down
Loading