Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion tests/entrypoints/test_chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
parse_chat_messages,
parse_chat_messages_async,
)
from vllm.exceptions import VLLMValidationError
from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (
encode_audio_url,
Expand Down Expand Up @@ -2086,6 +2087,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
)


@pytest.mark.skip_global_cleanup
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
messages = [
{
Expand Down Expand Up @@ -2138,15 +2140,36 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
"role": "assistant",
"content": [
{"type": "text", "text": "Let me think about it."},
{"type": "text", "text": "2+2 = 4"},
{"type": "text", "text": "The answer is 4."},
],
"reasoning": "2+2 = 4",
"reasoning_content": "2+2 = 4",
},
]

assert conversation_with_thinking == expected_conversation


@pytest.mark.skip_global_cleanup
def test_parse_chat_messages_rejects_duplicate_assistant_reasoning(
    mistral_model_config,
):
    """An assistant message may carry reasoning either as a top-level
    ``reasoning`` field or as ``thinking`` content parts — supplying both
    must raise a validation error rather than silently duplicating it."""
    conflicting_message = {
        "role": "assistant",
        "content": [{"type": "thinking", "thinking": "from content"}],
        "reasoning": "from reasoning",
    }

    with pytest.raises(VLLMValidationError, match="both top-level `reasoning`"):
        parse_chat_messages(
            [conflicting_message],
            mistral_model_config,
            content_format="string",
        )


def test_parse_chat_messages_single_empty_audio_with_uuid(
qwen2_audio_model_config,
):
Expand Down
29 changes: 29 additions & 0 deletions tests/tokenizers_/test_deepseek_v4.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,35 @@ def test_deepseek_v4_renders_parsed_history_tool_arguments():
assert 'parameter name="arguments"' not in prompt


@pytest.mark.skip_global_cleanup
def test_deepseek_v4_renders_assistant_thinking_content_as_reasoning():
    """Assistant ``thinking`` content parts must be rendered inside the
    ``<think>`` block exactly once — not emptied out and repeated as
    visible text after it."""
    assistant_turn = {
        "role": "assistant",
        "content": [{"type": "thinking", "thinking": "REASON"}],
    }
    messages = [
        {"role": "user", "content": "u"},
        assistant_turn,
        {"role": "user", "content": "v"},
    ]

    conversation, _, _ = parse_chat_messages(
        messages,
        _model_config(),
        content_format="string",
    )

    rendered = _tokenizer().apply_chat_template(
        conversation=conversation,
        messages=messages,
        tokenize=False,
        thinking=True,
        drop_thinking=False,
        reasoning_effort="high",
    )

    # Reasoning shows up inside the think tags for the assistant turn...
    assert "<|User|>u<|Assistant|><think>REASON</think>" in rendered
    # ...and is not duplicated outside an empty think block.
    assert "<think></think>REASON" not in rendered


@pytest.mark.parametrize("reasoning_effort", ["minimal", "low", "medium", "high"])
def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
prompt = _tokenizer().apply_chat_template(
Expand Down
31 changes: 31 additions & 0 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1736,6 +1736,23 @@ def _parse_chat_message_content_part(
return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None


def _extract_assistant_thinking_parts(
parts: list[ChatCompletionContentPartParam],
) -> tuple[list[ChatCompletionContentPartParam], str | None]:
visible_parts: list[ChatCompletionContentPartParam] = []
reasoning_parts: list[str] = []

for part in parts:
if isinstance(part, dict) and part.get("type") == "thinking":
thinking = part.get("thinking")
if thinking is not None:
reasoning_parts.append(cast(str, thinking))
continue
visible_parts.append(part)

return visible_parts, "".join(reasoning_parts) or None
Comment thread
mertunsall marked this conversation as resolved.


# No need to validate using Pydantic again
_AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
_ToolParser = partial(cast, ChatCompletionToolMessageParam)
Expand All @@ -1756,6 +1773,20 @@ def _parse_chat_message_content(
content = []
elif isinstance(content, str):
content = [ChatCompletionContentPartTextParam(type="text", text=content)]

if role == "assistant":
content, reasoning_from_content = _extract_assistant_thinking_parts(content)
if reasoning is not None and reasoning_from_content is not None:
raise VLLMValidationError(
"Assistant messages must not contain both top-level "
"`reasoning` and content parts of type `thinking`. Please "
"use only one representation for assistant reasoning.",
parameter="messages",
value=message,
)
if reasoning_from_content is not None:
reasoning = reasoning_from_content
Comment on lines +1787 to +1788
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

recent mistral chat templates such as this one
https://huggingface.co/mistralai/Mistral-Medium-3.5-128B/blob/main/chat_template.jinja#L80 and mistral-common
would add twice the thinking part if you do this.


result = _parse_chat_message_content_parts(
role,
content, # type: ignore
Expand Down
Loading