diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py
index afda75d4fc10..c1ab5da6e323 100644
--- a/tests/entrypoints/test_chat_utils.py
+++ b/tests/entrypoints/test_chat_utils.py
@@ -16,6 +16,7 @@
     parse_chat_messages,
     parse_chat_messages_async,
 )
+from vllm.exceptions import VLLMValidationError
 from vllm.inputs import MultiModalDataDict, MultiModalUUIDDict
 from vllm.multimodal.utils import (
     encode_audio_url,
@@ -2086,65 +2087,286 @@ def test_parse_chat_messages_multiple_images_interleave_with_placeholders(
     )
 
 
-def test_parse_chat_messages_include_thinking_chunk(mistral_model_config):
-    messages = [
-        {
-            "role": "system",
-            "content": [
-                {"type": "text", "text": "You are a helpful assistant."},
-                {
-                    "type": "thinking",
-                    "closed": True,
-                    "thinking": "Only return the answer when you are confident.",
-                },
-            ],
-        },
-        {"role": "user", "content": "What is 2+2?"},
-        {
-            "role": "assistant",
-            "content": [
-                {"type": "text", "text": "Let me think about it."},
-                {"type": "thinking", "closed": True, "thinking": "2+2 = 4"},
-                {
-                    "type": "text",
-                    "text": "The answer is 4.",
-                },
-            ],
-        },
-    ]
-
-    conversation_with_thinking, _, _ = parse_chat_messages(
-        messages,
-        mistral_model_config,
-        content_format="openai",
+class TestParseChatMessagesThinking:
+    @pytest.mark.parametrize(
+        ("config_fixture", "messages", "content_format", "expected"),
+        [
+            pytest.param(
+                "mistral_model_config",
+                [
+                    {
+                        "role": "system",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "You are a helpful assistant.",
+                            },
+                            {
+                                "type": "thinking",
+                                "closed": True,
+                                "thinking": "Only return the answer when you are confident.",  # noqa: E501
+                            },
+                        ],
+                    },
+                    {"role": "user", "content": "What is 2+2?"},
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "Let me think about it."},
+                            {
+                                "type": "thinking",
+                                "thinking": "2+2 = 4",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                "openai",
+                [
+                    {
+                        "role": "system",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "You are a helpful assistant.",
+                            },
+                            {
+                                "type": "thinking",
+                                "thinking": "Only return the answer when you are confident.",  # noqa: E501
+                            },
+                        ],
+                    },
+                    {
+                        "role": "user",
+                        "content": [{"type": "text", "text": "What is 2+2?"}],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "Let me think about it."},
+                            {
+                                "type": "thinking",
+                                "thinking": "2+2 = 4",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                id="mistral_thinking_preserved",
+            ),
+            pytest.param(
+                "phi3v_model_config",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "Let me think about it."},
+                            {
+                                "type": "thinking",
+                                "closed": True,
+                                "thinking": "2+2 = 4",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                "openai",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "Let me think about it."},
+                            {
+                                "type": "thinking",
+                                "thinking": "2+2 = 4",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                id="openai_thinking_preserved",
+            ),
+            pytest.param(
+                "phi3v_model_config",
+                [
+                    {
+                        "role": "system",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "You are a helpful assistant.",
+                            },
+                            {
+                                "type": "thinking",
+                                "thinking": "system reasoning",
+                            },
+                        ],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {
+                                "type": "thinking",
+                                "thinking": "assistant reasoning",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                "openai",
+                [
+                    {
+                        "role": "system",
+                        "content": [
+                            {
+                                "type": "text",
+                                "text": "You are a helpful assistant.",
+                            },
+                            {
+                                "type": "thinking",
+                                "thinking": "system reasoning",
+                            },
+                        ],
+                    },
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {
+                                "type": "thinking",
+                                "thinking": "assistant reasoning",
+                            },
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                    },
+                ],
+                id="openai_system_and_assistant",
+            ),
+            pytest.param(
+                "phi3v_model_config",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "thinking", "thinking": "first thought"},
+                            {"type": "text", "text": "middle text"},
+                            {"type": "thinking", "thinking": "second thought"},
+                        ],
+                    },
+                ],
+                "openai",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {
+                                "type": "thinking",
+                                "thinking": "first thought",
+                            },
+                            {"type": "text", "text": "middle text"},
+                            {
+                                "type": "thinking",
+                                "thinking": "second thought",
+                            },
+                        ],
+                    },
+                ],
+                id="openai_multiple_thinking_blocks",
+            ),
+            pytest.param(
+                "phi3v_model_config",
+                [
+                    {
+                        "role": "assistant",
+                        "content": "The answer is 4.",
+                        "reasoning": "I computed 2+2",
+                    },
+                ],
+                "openai",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "The answer is 4."},
+                        ],
+                        "reasoning": "I computed 2+2",
+                        "reasoning_content": "I computed 2+2",
+                    },
+                ],
+                id="top_level_reasoning_no_thinking",
+            ),
+            pytest.param(
+                "mistral_model_config",
+                [
+                    {
+                        "role": "assistant",
+                        "content": [
+                            {"type": "text", "text": "text content"},
+                            {
+                                "type": "thinking",
+                                "thinking": "thinking content",
+                            },
+                        ],
+                    },
+                ],
+                "string",
+                [
+                    {
+                        "role": "assistant",
+                        "content": "text content",
+                        "reasoning": "thinking content",
+                        "reasoning_content": "thinking content",
+                    },
+                ],
+                id="string_thinking_extracted",
+            ),
+        ],
     )
+    def test_thinking_handling(
+        self,
+        request: pytest.FixtureRequest,
+        config_fixture: str,
+        messages: list[dict[str, object]],
+        content_format: Literal["openai", "string"],
+        expected: list[dict[str, object]],
+    ) -> None:
+        model_config = request.getfixturevalue(config_fixture)
+        conversation, _, _ = parse_chat_messages(
+            messages,
+            model_config,
+            content_format=content_format,
+        )
+        assert conversation == expected
+
+    @pytest.mark.parametrize(
+        "model_config_fixture", ["phi3v_model_config", "mistral_model_config"]
+    )
+    def test_reasoning_and_thinking_conflict_raises(
+        self,
+        model_config_fixture: str,
+        request: pytest.FixtureRequest,
+    ) -> None:
+        model_config = request.getfixturevalue(model_config_fixture)
+        messages = [
+            {
+                "role": "assistant",
+                "content": [
+                    {"type": "thinking", "thinking": "some thinking"},
+                    {"type": "text", "text": "answer"},
+                ],
+                "reasoning": "top-level reasoning",
+            },
+        ]
 
-    expected_conversation = [
-        {
-            "role": "system",
-            "content": [
-                {"type": "text", "text": "You are a helpful assistant."},
-                {
-                    "type": "text",
-                    "text": "Only return the answer when you are confident.",
-                },
-            ],
-        },
-        {
-            "role": "user",
-            "content": [{"type": "text", "text": "What is 2+2?"}],
-        },
-        {
-            "role": "assistant",
-            "content": [
-                {"type": "text", "text": "Let me think about it."},
-                {"type": "text", "text": "2+2 = 4"},
-                {"type": "text", "text": "The answer is 4."},
-            ],
-        },
-    ]
-
-    assert conversation_with_thinking == expected_conversation
+        with pytest.raises(
+            VLLMValidationError,
+            match="Cannot specify both a top-level 'reasoning' field and "
+            "'thinking' content blocks",
+        ):
+            parse_chat_messages(
+                messages,
+                model_config,
+                content_format="openai",
+            )
 
 
 def test_parse_chat_messages_single_empty_audio_with_uuid(
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index cfe0857b679e..90493eb811f0 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -406,6 +406,26 @@ class ConversationMessage(TypedDict, total=False):
     """Model-specific task marker. Currently passed through for DeepSeek V4."""
 
 
+def flatten_content_to_text(
+    content: str | None | list[dict[str, str]],
+) -> str:
+    r"""Flatten a ConversationMessage content value to a plain string.
+
+    Content may be a list of typed dicts (e.g. thinking chunks for
+    Mistral chat templates). Only type="text" parts are kept for
+    features like *echo* that expect plain text.
+    """
+    if content is None:
+        return ""
+    if isinstance(content, str):
+        return content
+    return "\n".join(
+        part["text"]
+        for part in content
+        if isinstance(part, dict) and part.get("type") == "text"
+    )
+
+
 # Passed in by user
 ChatTemplateContentFormatOption = Literal["auto", "string", "openai"]
 
@@ -1562,6 +1582,7 @@ def _parse_chat_message_content_mm_part(
 PART_TYPES_TO_SKIP_NONE_CONTENT = (
     "text",
     "refusal",
+    "thinking",
 )
 
 
@@ -1657,7 +1678,7 @@ def _parse_chat_message_content_part(
         )
         return None
 
-    if part_type in ("text", "input_text", "output_text", "refusal", "thinking"):
+    if part_type in ("text", "input_text", "output_text", "refusal"):
         str_content = cast(str, content)
         _reject_reserved_placeholder_in_text(str_content, mm_parser.model_config)
         if wrap_dicts:
@@ -1665,6 +1686,19 @@ def _parse_chat_message_content_part(
         else:
             return str_content
 
+    # Kept for the HF renderer whose chat templates support thinking chunks.
+    if part_type == "thinking":
+        str_content = cast(str, content)
+        _reject_reserved_placeholder_in_text(str_content, mm_parser.model_config)
+        assert wrap_dicts, (
+            "thinking parts should have been extracted by "
+            "_extract_thinking for content_format='string'"
+        )
+        return {
+            "type": "thinking",
+            "thinking": str_content,
+        }
+
     # For media items, if a user has provided one, use it. Otherwise, insert
     # a placeholder empty uuid.
     uuid = part.get("uuid", None)
@@ -1741,6 +1775,50 @@ def _parse_chat_message_content_part(
 _ToolParser = partial(cast, ChatCompletionToolMessageParam)
 
 
+def _extract_thinking(
+    content: list[ChatCompletionContentPartParam],
+    reasoning: str | None,
+    content_format: ChatTemplateContentFormat,
+) -> tuple[list[ChatCompletionContentPartParam], str | None]:
+    r"""Extract thinking content blocks from `content`.
+
+    When `content_format` is `"openai"`, thinking blocks are kept in
+    the content list so that Jinja chat templates (including older Mistral
+    model templates) can handle them natively as
+    `{"type": "thinking", ...}` dicts. When `content_format` is
+    `"string"`, thinking blocks are pulled out and concatenated into a
+    single string.
+    """
+
+    thinking_parts: list[str] = []
+    non_thinking_parts: list[ChatCompletionContentPartParam] = []
+    for part in content:
+        if isinstance(part, dict) and part.get("type") == "thinking":
+            thinking_str = _ThinkParser(part).get("thinking", None)
+            if thinking_str is not None:
+                thinking_parts.append(thinking_str)
+        else:
+            non_thinking_parts.append(part)
+
+    if not thinking_parts:
+        return content, reasoning
+
+    if reasoning is not None:
+        raise VLLMValidationError(
+            "Cannot specify both a top-level 'reasoning' field and "
+            "'thinking' content blocks on the same message."
+        )
+
+    if content_format == "openai":
+        # Thinking blocks stay inline for Jinja template rendering;
+        # no separate reasoning field is needed.
+        return content, reasoning
+
+    thinking_text = "\n".join(thinking_parts)
+
+    return non_thinking_parts, thinking_text
+
+
 def _parse_chat_message_content(
     message: ChatCompletionMessageParam,
     mm_tracker: BaseMultiModalItemTracker,
@@ -1756,6 +1834,14 @@ def _parse_chat_message_content(
         content = []
     elif isinstance(content, str):
         content = [ChatCompletionContentPartTextParam(type="text", text=content)]
+
+    if isinstance(content, list):
+        content, reasoning = _extract_thinking(
+            content,
+            reasoning,
+            content_format=content_format,
+        )
+
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore
diff --git a/vllm/entrypoints/openai/chat_completion/batch_serving.py b/vllm/entrypoints/openai/chat_completion/batch_serving.py
index cc49909b8361..e6b708234df9 100644
--- a/vllm/entrypoints/openai/chat_completion/batch_serving.py
+++ b/vllm/entrypoints/openai/chat_completion/batch_serving.py
@@ -8,7 +8,7 @@
 
 from fastapi import Request
 
-from vllm.entrypoints.chat_utils import ConversationMessage
+from vllm.entrypoints.chat_utils import ConversationMessage, flatten_content_to_text
 from vllm.entrypoints.openai.chat_completion.protocol import (
     BatchChatCompletionRequest,
     ChatCompletionResponse,
@@ -279,12 +279,10 @@ async def chat_completion_full_generator_batch(
 
         if request.echo:
             conversation = all_conversations[prompt_idx]
-            last_msg_content: str | list[dict[str, str]] = ""
+            last_msg_content = ""
             if conversation and "content" in conversation[-1]:
-                last_msg_content = conversation[-1]["content"] or ""
-                if isinstance(last_msg_content, list):
-                    last_msg_content = "\n".join(
-                        msg["text"] for msg in last_msg_content
-                    )
+                last_msg_content = flatten_content_to_text(
+                    conversation[-1]["content"]
+                )
 
             message.content = last_msg_content + (message.content or "")
diff --git a/vllm/entrypoints/openai/chat_completion/serving.py b/vllm/entrypoints/openai/chat_completion/serving.py
index 694ff80047c7..f6bf3b9f8090 100644
--- a/vllm/entrypoints/openai/chat_completion/serving.py
+++ b/vllm/entrypoints/openai/chat_completion/serving.py
@@ -15,6 +15,7 @@
 from vllm.entrypoints.chat_utils import (
     ChatTemplateContentFormatOption,
     ConversationMessage,
+    flatten_content_to_text,
     get_history_tool_calls_cnt,
     get_tool_call_id_type,
     make_tool_call_id,
@@ -571,13 +572,15 @@ async def chat_completion_stream_generator(
                     # Send response to echo the input portion of the
                     # last message
                     if request.echo:
-                        last_msg_content: str | list[dict[str, str]] = ""
+                        last_msg_content = ""
                         if (
                             conversation
                             and "content" in conversation[-1]
                             and conversation[-1].get("role") == role
                         ):
-                            last_msg_content = conversation[-1]["content"] or ""
+                            last_msg_content = flatten_content_to_text(
+                                conversation[-1]["content"]
+                            )
 
                         if last_msg_content:
                             for i in range(num_choices):
@@ -1473,15 +1476,13 @@ async def chat_completion_full_generator(
             choices.append(choice_data)
 
         if request.echo:
-            last_msg_content: str | list[dict[str, str]] = ""
+            last_msg_content = ""
             if (
                 conversation
                 and "content" in conversation[-1]
                 and conversation[-1].get("role") == role
            ):
-                last_msg_content = conversation[-1]["content"] or ""
-                if isinstance(last_msg_content, list):
-                    last_msg_content = "\n".join(msg["text"] for msg in last_msg_content)
+                last_msg_content = flatten_content_to_text(conversation[-1]["content"])
 
             for choice in choices:
                 full_message = last_msg_content + (choice.message.content or "")
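
Reviewer note: a minimal sketch of what the new flatten_content_to_text helper does for the echo paths above, assuming the implementation in this diff; the input values are made up for illustration.

    from vllm.entrypoints.chat_utils import flatten_content_to_text

    # None and plain strings pass through (None becomes "").
    assert flatten_content_to_text(None) == ""
    assert flatten_content_to_text("plain text") == "plain text"

    # Non-text parts (e.g. thinking chunks) are dropped; the remaining
    # type="text" parts are joined with newlines, which is the plain
    # string that echo prepends to the model output.
    parts = [
        {"type": "text", "text": "Let me think about it."},
        {"type": "thinking", "thinking": "2+2 = 4"},
        {"type": "text", "text": "The answer is 4."},
    ]
    assert flatten_content_to_text(parts) == "Let me think about it.\nThe answer is 4."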
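
Similarly, a sketch of the two thinking-handling modes wired up by _extract_thinking, mirroring the parametrized tests; model_config here is a hypothetical stand-in for the phi3v/mistral fixtures used in the test file.

    from vllm.entrypoints.chat_utils import parse_chat_messages

    messages = [
        {
            "role": "assistant",
            "content": [
                {"type": "text", "text": "text content"},
                {"type": "thinking", "thinking": "thinking content"},
            ],
        },
    ]

    # content_format="openai": thinking blocks stay inline in the content
    # list so Jinja chat templates can render {"type": "thinking", ...}
    # dicts natively.
    conversation, _, _ = parse_chat_messages(
        messages, model_config, content_format="openai"
    )

    # content_format="string": thinking blocks are pulled out into the
    # reasoning fields and the remaining text is flattened, yielding
    # {"role": "assistant", "content": "text content",
    #  "reasoning": "thinking content",
    #  "reasoning_content": "thinking content"}.
    conversation, _, _ = parse_chat_messages(
        messages, model_config, content_format="string"
    )

    # Supplying both a top-level "reasoning" field and "thinking" blocks
    # on the same message now raises VLLMValidationError.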