From e35531efae3e9dc903671d6e6848173548761cc6 Mon Sep 17 00:00:00 2001
From: mertunsall
Date: Tue, 5 May 2026 09:04:31 +0000
Subject: [PATCH] Fix assistant thinking block normalization

Co-authored-by: OpenAI Codex
Signed-off-by: mertunsall
---
 tests/entrypoints/test_chat_utils.py  | 25 ++++++++++++++++++++-
 tests/tokenizers_/test_deepseek_v4.py | 29 +++++++++++++++++++++++++
 vllm/entrypoints/chat_utils.py        | 31 +++++++++++++++++++++++++++
 3 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index afda75d4fc10..3a49aca1b9f1 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -16,6 +16,7 @@
     parse_chat_messages,
     parse_chat_messages_async,
 )
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
     encode_audio_url,
@@ -2086,6 +2087,7 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     )
 
 
+@pytest.mark.skip_global_cleanup
 def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
     messages = [
         {
@@ -2138,15 +2140,36 @@ def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
             "role": "assistant",
             "content": [
                 {"type": "text", "text": "Let me think about it."},
-                {"type": "text", "text": "2+2 = 4"},
                 {"type": "text", "text": "The answer is 4."},
             ],
+            "reasoning": "2+2 = 4",
+            "reasoning_content": "2+2 = 4",
         },
     ]
 
     assert conversation_with_thinking == expected_conversation
 
 
+@pytest.mark.skip_global_cleanup
+def test_parse_chat_messages_rejects_duplicate_assistant_reasoning(
+    mistral_model_config,
+):
+    messages = [
+        {
+            "role": "assistant",
+            "content": [{"type": "thinking", "thinking": "from content"}],
+            "reasoning": "from reasoning",
+        }
+    ]
+
+    with pytest.raises(VLLMValidationError, match="both top-level `reasoning`"):
+        parse_chat_messages(
+            messages,
+            mistral_model_config,
+            content_format="string",
+        )
+
+
 def test_parse_chat_messages_single_empty_audio_with_uuid(
     qwen2_audio_model_config,
 ):
diff --git a/tests/tokenizers_/test_deepseek_v4.py b/tests/tokenizers_/test_deepseek_v4.py
index 358732eabf40..016093b63732 100644
--- a/tests/tokenizers_/test_deepseek_v4.py
+++ b/tests/tokenizers_/test_deepseek_v4.py
@@ -183,6 +183,35 @@ def test_deepseek_v4_renders_parsed_history_tool_arguments():
     assert 'parameter name="arguments"' not in prompt
 
 
+@pytest.mark.skip_global_cleanup
+def test_deepseek_v4_renders_assistant_thinking_content_as_reasoning():
+    messages = [
+        {"role": "user", "content": "u"},
+        {
+            "role": "assistant",
+            "content": [{"type": "thinking", "thinking": "REASON"}],
+        },
+        {"role": "user", "content": "v"},
+    ]
+    conversation, _, _ = parse_chat_messages(
+        messages,
+        _model_config(),
+        content_format="string",
+    )
+
+    prompt = _tokenizer().apply_chat_template(
+        conversation=conversation,
+        messages=messages,
+        tokenize=False,
+        thinking=True,
+        drop_thinking=False,
+        reasoning_effort="high",
+    )
+
+    assert "<|User|>u<|Assistant|>REASON" in prompt
+    assert prompt.count("REASON") == 1
+
+
 @pytest.mark.parametrize("reasoning_effort", ["minimal", "low", "medium", "high"])
 def test_deepseek_v4_accepts_openai_reasoning_effort_values(reasoning_effort):
     prompt = _tokenizer().apply_chat_template(
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index cfe0857b679e..f72802613f44 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1736,6 +1736,23 @@ def _parse_chat_message_content_part(
     return MODALITY_PLACEHOLDERS_MAP[modality] if interleave_strings else None
 
 
+def _extract_assistant_thinking_parts(
+    parts: list[ChatCompletionContentPartParam],
+) -> tuple[list[ChatCompletionContentPartParam], str | None]:
+    visible_parts: list[ChatCompletionContentPartParam] = []
+    reasoning_parts: list[str] = []
+
+    for part in parts:
+        if isinstance(part, dict) and part.get("type") == "thinking":
+            thinking = part.get("thinking")
+            if thinking is not None:
+                reasoning_parts.append(cast(str, thinking))
+            continue
+        visible_parts.append(part)
+
+    return visible_parts, "".join(reasoning_parts) or None
+
+
 # No need to validate using Pydantic again
 _AssistantParser = partial(cast, ChatCompletionAssistantMessageParam)
 _ToolParser = partial(cast, ChatCompletionToolMessageParam)
@@ -1756,6 +1773,20 @@ def _parse_chat_message_content(
         content = []
     elif isinstance(content, str):
         content = [ChatCompletionContentPartTextParam(type="text", text=content)]
+
+    if role == "assistant":
+        content, reasoning_from_content = _extract_assistant_thinking_parts(content)
+        if reasoning is not None and reasoning_from_content is not None:
+            raise VLLMValidationError(
+                "Assistant messages must not contain both top-level "
+                "`reasoning` and content parts of type `thinking`. Please "
+                "use only one representation for assistant reasoning.",
+                parameter="messages",
+                value=message,
+            )
+        if reasoning_from_content is not None:
+            reasoning = reasoning_from_content
+
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore