From e35531efae3e9dc903671d6e6848173548761cc6 Mon Sep 17 00:00:00 2001
From: mertunsall
Date: Tue, 5 May 2026 09:04:31 +0000
Subject: [PATCH] Fix assistant thinking block normalization

Co-authored-by: OpenAI Codex
Signed-off-by: mertunsall
---
 tests/entrypoints/test_chat_utils.py  | 25 ++++++++++++++++++++-
 tests/tokenizers_/test_deepseek_v4.py | 29 +++++++++++++++++++++++++
 vllm/entrypoints/chat_utils.py        | 31 +++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index afda75d4fc10..3a49aca1b9f1 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -16,6 +16,7 @@
     parse_chat_messages,
     parse_chat_messages_async,
 )
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
     encode_audio_url,
@@ -2086,6 +2087,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     )
 
 
+@pytest.mark.skip_global_cleanup
 def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
     messages = [
         {
@@ -2138,15 +2140,36 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
             "role": "assistant",
             "content": [
                 {"type": "text", "text": "Let me think about it."},
-                {"type": "text", "text": "2+2 = 4"},
                 {"type": "text", "text": "The answer is 4."},
             ],
+            "reasoning": "2+2 = 4",
+            "reasoning_content": "2+2 = 4",
         },
     ]
 
     assert conversation_with_thinking == expected_conversation
 
 
+@pytest.mark.skip_global_cleanup
+def test_parse_chat_messages_rejects_duplicate_assistant_reasoning(
+    mistral_model_config,
+):
+    messages = [
+        {
+            "role": "assistant",
+            "content": [{"type": "thinking", "thinking": "from content"}],
+            "reasoning": "from reasoning",
+        }
+    ]
+
+    with pytest.raises(VLLMValidationError, match="both top-level `reasoning`"):
+        parse_chat_messages(
+            messages,
+            mistral_model_config,
+            content_format="string",
+        )
+
+
 def test_parse_chat_messages_single_empty_audio_with_uuid(
     qwen2_audio_model_config,
 ):
diff --git a/tests/tokenizers_/test_deepseek_v4.py b/tests/tokenizers_/test_deepseek_v4.py
index 358732eabf40..016093b63732 100644
--- a/tests/tokenizers_/test_deepseek_v4.py
+++ b/tests/tokenizers_/test_deepseek_v4.py
@@ -183,6 +183,35 @@ def test_deepseek_v4_renders_parsed_history_tool_arguments():
     assert 'parameter name="arguments"' not in prompt
 
 
+@pytest.mark.skip_global_cleanup
+def test_deepseek_v4_renders_assistant_thinking_content_as_reasoning():
+    messages = [
+        {"role": "user", "content": "u"},
+        {
+            "role": "assistant",
+            "content": [{"type": "thinking", "thinking": "REASON"}],
+        },
+        {"role": "user", "content": "v"},
+    ]
+    conversation, _, _ = parse_chat_messages(
+        messages,
+        _model_config(),
+        content_format="string",
+    )
+
+    prompt = _tokenizer().apply_chat_template(
+        conversation=conversation,
+        messages=messages,
+        tokenize=False,
+        thinking=True,
+        drop_thinking=False,
+        reasoning_effort="high",
+    )
+
+    assert "<|User|>u<|Assistant|>REASON" in prompt
+    assert prompt.count("REASON") == 1
+
+
 @pytest.mark.parametrize("reasoning_effort", ["minimal", "low", "medium", "high"])
 def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
     prompt = _tokenizer().apply_chat_template(
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index cfe0857b679e..f72802613f44 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1736,6 +1736,23 @@ def _parse_chat_message_content_part(
     return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
 
 
+def _extract_assistant_thinking_parts(
+    parts: list[ChatCompletionContentPartParam],
+) -> tuple[list[ChatCompletionContentPartParam], str | None]:
+    visible_parts: list[ChatCompletionContentPartParam] = []
+    reasoning_parts: list[str] = []
+
+    for part in parts:
+        if isinstance(part, dict) and part.get("type") == "thinking":
+            thinking = part.get("thinking")
+            if thinking is not None:
+                reasoning_parts.append(cast(str, thinking))
+            continue
+        visible_parts.append(part)
+
+    return visible_parts, "".join(reasoning_parts) or None
+
+
 # No need to validate using Pydantic again
 _AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
 _ToolParser = partial(cast, ChatCompletionToolMessageParam)
@@ -1756,6 +1773,20 @@ def _parse_chat_message_content(
         content = []
     elif isinstance(content, str):
         content = [ChatCompletionContentPartTextParam(type="text", text=content)]
+
+    if role == "assistant":
+        content, reasoning_from_content = _extract_assistant_thinking_parts(content)
+        if reasoning is not None and reasoning_from_content is not None:
+            raise VLLMValidationError(
+                "Assistant messages must not contain both top-level "
+                "`reasoning` and content parts of type `thinking`. Please "
+                "use only one representation for assistant reasoning.",
+                parameter="messages",
+                value=message,
+            )
+        if reasoning_from_content is not None:
+            reasoning = reasoning_from_content
+
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore