Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -287,3 +287,47 @@ async def plain_call():
f"index {i}: reasoning decode token ids ({n_reason}) != "
f"thinking_token_budget ({expected_budget})"
)


@pytest.mark.asyncio
@pytest.mark.parametrize("client", ["default", "auto_config"], indirect=True)
async def test_streaming_with_thinking_disabled_stays_in_content(
    client: openai.AsyncOpenAI,
):
    """With enable_thinking=False, all output must arrive as content.

    Exercises both the non-streaming and streaming chat paths and checks
    that nothing leaks into the reasoning field in either one.
    """
    common_kwargs = {
        "model": MODEL_NAME,
        "messages": [
            {
                "role": "user",
                "content": "Which is larger, 4 or 12?"
                " Output exactly one token: 4 or 12.",
            }
        ],
        "max_tokens": 16,
        "temperature": 0.0,
        "extra_body": {"chat_template_kwargs": {"enable_thinking": False}},
    }

    # Non-streaming: the full answer lands in .content, never .reasoning.
    completion = await client.chat.completions.create(**common_kwargs)
    msg = completion.choices[0].message
    assert msg.content is not None and msg.content.strip() != ""
    assert getattr(msg, "reasoning", None) in (None, "")

    # Streaming: every delta must carry content, never reasoning.
    stream = await client.chat.completions.create(**common_kwargs, stream=True)

    seen_content: list[str] = []
    seen_reasoning: list[str] = []
    async for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if getattr(delta, "content", None):
            seen_content.append(delta.content)
        if getattr(delta, "reasoning", None):
            seen_reasoning.append(delta.reasoning)

    assert "".join(seen_content).strip() != ""
    assert seen_reasoning == []
2 changes: 2 additions & 0 deletions tests/parser/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
165 changes: 165 additions & 0 deletions tests/parser/test_streaming.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import json

import pytest

from vllm.entrypoints.openai.chat_completion.protocol import ChatCompletionRequest
from vllm.entrypoints.openai.engine.protocol import DeltaMessage
from vllm.parser.abstract_parser import _WrappedParser
from vllm.reasoning.basic_parsers import BaseThinkingReasoningParser
from vllm.tool_parsers.hermes_tool_parser import Hermes2ProToolParser


class ThinkReasoningParser(BaseThinkingReasoningParser):
    """Minimal reasoning parser that delimits thinking with <think>...</think>."""

    @property
    def start_token(self) -> str:
        # Token that opens a reasoning span in the model output.
        return "<think>"

    @property
    def end_token(self) -> str:
        # Token that closes a reasoning span in the model output.
        return "</think>"


# Canonical model output shared by the streaming tests: a reasoning span
# followed by a single Hermes-style <tool_call> JSON payload.
MODEL_OUTPUT = (
    "<think>let me think about this</think>"
    '<tool_call>\n{"name": "get_weather", '
    '"arguments": {"city": "Dallas"}}\n</tool_call>'
)


@pytest.fixture(scope="module")
def tokenizer():
    """Module-scoped real tokenizer; loading is slow, so one copy is shared."""
    # Imported lazily so collection does not pay the tokenizer import cost.
    from vllm.tokenizers import get_tokenizer

    return get_tokenizer("Qwen/Qwen3-32B")


@pytest.fixture
def request_obj():
    """A minimal chat request object to feed parse_delta in the tests."""
    params = {
        "model": "test-model",
        "messages": [{"role": "user", "content": "hi"}],
    }
    return ChatCompletionRequest(**params)


def make_parser(tokenizer, reasoning=False, tool=False):
    """Configure _WrappedParser's class-level parser hooks and construct one.

    NOTE(review): this mutates class attributes, so the chosen parsers leak
    between tests unless every test calls make_parser first — confirm that
    all tests in this module do so.
    """
    if reasoning:
        _WrappedParser.reasoning_parser_cls = ThinkReasoningParser
    else:
        _WrappedParser.reasoning_parser_cls = None
    _WrappedParser.tool_parser_cls = Hermes2ProToolParser if tool else None
    return _WrappedParser(tokenizer)


def stream_text(parser, tokenizer, text, request, prompt_token_ids=None):
    """Feed *text* to *parser* one token at a time and collect every delta.

    prompt_token_ids is forwarded only on the very first call, mirroring how
    the serving layer primes the parser with the prompt exactly once.
    """
    deltas = []
    pending_prompt = prompt_token_ids
    for token_id in tokenizer.encode(text, add_special_tokens=False):
        piece = tokenizer.decode([token_id])
        deltas.append(
            parser.parse_delta(
                piece, [token_id], request, prompt_token_ids=pending_prompt
            )
        )
        # Subsequent calls never see the prompt again.
        pending_prompt = None
    return deltas


def collect_fields(results):
    """Fold a stream of deltas into (reasoning, content, tool_calls).

    None (and otherwise falsy) entries are skipped; string fields are
    concatenated in order and tool calls are flattened into one list.
    """
    reasoning_parts = []
    content_parts = []
    tool_calls = []
    for delta in results:
        if not delta:
            continue
        if delta.reasoning:
            reasoning_parts.append(delta.reasoning)
        if delta.content:
            content_parts.append(delta.content)
        if delta.tool_calls:
            tool_calls.extend(delta.tool_calls)
    return "".join(reasoning_parts), "".join(content_parts), tool_calls


def test_parse_delta_neither_parser(tokenizer, request_obj):
    """Without any parser configured, everything passes through as content."""
    parser = make_parser(tokenizer, reasoning=False, tool=False)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == ""
    assert tool_calls == []
    # Think tags and the raw tool-call text all stay in content verbatim.
    for fragment in (
        "<think>",
        "let me think about this",
        "<tool_call>",
        "get_weather",
    ):
        assert fragment in content


def test_parse_delta_tool_parser_only(tokenizer, request_obj):
    """Tool parser alone: think tags stay in content, tool call is extracted."""
    parser = make_parser(tokenizer, reasoning=False, tool=True)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == ""
    # No reasoning parser, so the think markup is plain content.
    for fragment in ("<think>", "let me think about this", "</think>"):
        assert fragment in content

    assert tool_calls
    assert tool_calls[0].function.name == "get_weather"
    joined_args = "".join(
        call.function.arguments for call in tool_calls if call.function.arguments
    )
    assert json.loads(joined_args) == {"city": "Dallas"}


def test_parse_delta_reasoning_parser_only(tokenizer, request_obj):
    """Reasoning parser alone: think text goes to reasoning, rest to content."""
    parser = make_parser(tokenizer, reasoning=True, tool=False)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert "let me think about this" in reasoning
    assert tool_calls == []
    # No tool parser, so the raw tool-call markup remains in content.
    for fragment in ("<tool_call>", "get_weather", "</tool_call>"):
        assert fragment in content


def test_parse_delta_both_parsers(tokenizer, request_obj):
    """Both parsers: reasoning and the tool call are split out; no content."""
    parser = make_parser(tokenizer, reasoning=True, tool=True)
    deltas = stream_text(
        parser, tokenizer, MODEL_OUTPUT, request_obj, prompt_token_ids=[]
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert "let me think about this" in reasoning
    assert content == ""

    assert tool_calls
    assert tool_calls[0].function.name == "get_weather"
    joined_args = "".join(
        call.function.arguments for call in tool_calls if call.function.arguments
    )
    assert json.loads(joined_args) == {"city": "Dallas"}


def test_parse_delta_reasoning_only_thinking_disabled(tokenizer, request_obj):
    """Regression test for vllm-project/vllm#40466.

    When enable_thinking=False, the chat template places <think>\\n\\n</think>
    in the prompt. The model then generates pure content (no think tokens).
    All streaming output must go to delta.content, not delta.reasoning.
    """
    parser = make_parser(tokenizer, reasoning=True, tool=False)

    # Simulate a prompt whose think block is already closed.
    think_end = parser._reasoning_parser.end_token_id
    primed_prompt = [1, 2, think_end, 3]

    deltas = stream_text(
        parser,
        tokenizer,
        "Hello! How can I assist you today?",
        request_obj,
        prompt_token_ids=primed_prompt,
    )
    reasoning, content, tool_calls = collect_fields(deltas)

    assert reasoning == "", f"Expected no reasoning, got: {reasoning!r}"
    assert "Hello" in content
    assert "assist" in content
    assert tool_calls == []
16 changes: 9 additions & 7 deletions vllm/parser/abstract_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,15 +635,11 @@ def extract_content_ids(self, input_ids: list[int]) -> list[int]:
def _in_reasoning_phase(self, state: StreamState) -> bool:
if self._reasoning_parser is None:
return False
if self._tool_parser is None:
return True
return not state.reasoning_ended

def _in_tool_call_phase(self, state: StreamState) -> bool:
if self._tool_parser is None:
return False
if self._reasoning_parser is None:
return True
return state.reasoning_ended

def parse_delta(
Expand All @@ -657,7 +653,9 @@ def parse_delta(

if not state.prompt_reasoning_checked and prompt_token_ids is not None:
state.prompt_reasoning_checked = True
if self.is_reasoning_end(prompt_token_ids):
if self._reasoning_parser is None or self.is_reasoning_end(
prompt_token_ids
):
state.reasoning_ended = True

current_text = state.previous_text + delta_text
Expand Down Expand Up @@ -708,8 +706,12 @@ def parse_delta(
)
)

# No parsers: pass through as content
if self._reasoning_parser is None and self._tool_parser is None:
# No phase active: pass through as content
if (
delta_message is None
and not self._in_reasoning_phase(state)
and not self._in_tool_call_phase(state)
):
delta_message = DeltaMessage(content=delta_text)

state.previous_text = current_text
Expand Down
Loading