diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index afda75d4fc10..3a49aca1b9f1 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -16,6 +16,7 @@
parse_chat_messages,
parse_chat_messages_async,
)
+from vllm.exceptions import VLLMValidationError
from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
from vllm.multimodal.utils import (
encode_audio_url,
@@ -2086,6 +2087,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
)
+@pytest.mark.skip_global_cleanup
def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
messages = [
{
@@ -2138,15 +2140,36 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
"role": "assistant",
"content": [
{"type": "text", "text": "Let me think about it."},
- {"type": "text", "text": "2+2 = 4"},
{"type": "text", "text": "The answer is 4."},
],
+ "reasoning": "2+2 = 4",
+ "reasoning_content": "2+2 = 4",
},
]
assert conversation_with_thinking == expected_conversation
+@pytest.mark.skip_global_cleanup
+def test_parse_chat_messages_rejects_duplicate_assistant_reasoning(
+ mistral_model_config,
+):
+ messages = [
+ {
+ "role": "assistant",
+ "content": [{"type": "thinking", "thinking": "from content"}],
+ "reasoning": "from reasoning",
+ }
+ ]
+
+ with pytest.raises(VLLMValidationError, match="both top-level `reasoning`"):
+ parse_chat_messages(
+ messages,
+ mistral_model_config,
+ content_format="string",
+ )
+
+
def test_parse_chat_messages_single_empty_audio_with_uuid(
qwen2_audio_model_config,
):
diff --git a/tests/tokenizers_/test_deepseek_v4.py b/tests/tokenizers_/test_deepseek_v4.py
index 358732eabf40..016093b63732 100644
--- a/tests/tokenizers_/test_deepseek_v4.py
+++ b/tests/tokenizers_/test_deepseek_v4.py
@@ -183,6 +183,35 @@ def test_deepseek_v4_renders_parsed_history_tool_arguments():
assert 'parameter name="arguments"' not in prompt
+@pytest.mark.skip_global_cleanup
+def test_deepseek_v4_renders_assistant_thinking_content_as_reasoning():
+ messages = [
+ {"role": "user", "content": "u"},
+ {
+ "role": "assistant",
+ "content": [{"type": "thinking", "thinking": "REASON"}],
+ },
+ {"role": "user", "content": "v"},
+ ]
+ conversation, _, _ = parse_chat_messages(
+ messages,
+ _model_config(),
+ content_format="string",
+ )
+
+ prompt = _tokenizer().apply_chat_template(
+ conversation=conversation,
+ messages=messages,
+ tokenize=False,
+ thinking=True,
+ drop_thinking=False,
+ reasoning_effort="high",
+ )
+
+ assert "<|User|>u<|Assistant|>REASON" in prompt
+    assert prompt.count("REASON") == 1
+
+
@pytest.mark.parametrize("reasoning_effort", ["minimal", "low", "medium", "high"])
def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
prompt = _tokenizer().apply_chat_template(
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index cfe0857b679e..f72802613f44 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1736,6 +1736,23 @@ def _parse_chat_message_content_part(
return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
+def _extract_assistant_thinking_parts(
+ parts: list[ChatCompletionContentPartParam],
+) -> tuple[list[ChatCompletionContentPartParam], str | None]:
+ visible_parts: list[ChatCompletionContentPartParam] = []
+ reasoning_parts: list[str] = []
+
+ for part in parts:
+ if isinstance(part, dict) and part.get("type") == "thinking":
+ thinking = part.get("thinking")
+ if thinking is not None:
+ reasoning_parts.append(cast(str, thinking))
+ continue
+ visible_parts.append(part)
+
+ return visible_parts, "".join(reasoning_parts) or None
+
+
# No need to validate using Pydantic again
_AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
_ToolParser = partial(cast, ChatCompletionToolMessageParam)
@@ -1756,6 +1773,20 @@ def _parse_chat_message_content(
content = []
elif isinstance(content, str):
content = [ChatCompletionContentPartTextParam(type="text", text=content)]
+
+ if role == "assistant":
+ content, reasoning_from_content = _extract_assistant_thinking_parts(content)
+ if reasoning is not None and reasoning_from_content is not None:
+ raise VLLMValidationError(
+ "Assistant messages must not contain both top-level "
+ "`reasoning` and content parts of type `thinking`. Please "
+ "use only one representation for assistant reasoning.",
+ parameter="messages",
+ value=message,
+ )
+ if reasoning_from_content is not None:
+ reasoning = reasoning_from_content
+
result = _parse_chat_message_content_parts(
role,
content, # type: ignore