diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 36e8b0c0b540..103ffa2dbcbc 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2682,3 +2682,172 @@ async def test_parse_chat_messages_video_vision_chunk_with_uuid_async( assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk") + + +# -- Tests for system/developer message content validation (issue #33925) ----- + + +@pytest.mark.parametrize("role", ["system", "developer"]) +@pytest.mark.parametrize( + ("part", "part_type_label"), + [ + ( + {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}}, + "image_url", + ), + ( + {"type": "input_audio", "input_audio": {"data": "abc", "format": "wav"}}, + "input_audio", + ), + ( + {"type": "video_url", "video_url": {"url": "https://example.com/vid.mp4"}}, + "video_url", + ), + ], + ids=["image_url", "input_audio", "video_url"], +) +def test_system_message_warns_on_non_text_content( + phi3v_model_config, role, part, part_type_label, caplog +): + """System and developer messages should warn on multimodal content. + + Per the decision in https://github.com/vllm-project/vllm/pull/34072, + sending multimodal content (e.g. ``image_url``) in a system message + should issue a warning and skip the part rather than raising an error. + See https://github.com/vllm-project/vllm/issues/33925 + """ + messages = [ + { + "role": role, + "content": [part], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + import logging + + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + + assert any( + f"'{part_type_label}' is not supported" in record.message + for record in caplog.records + ), f"Expected warning about '{part_type_label}' not found in logs" + + +@pytest.mark.parametrize("role", ["system", "developer"]) +@pytest.mark.parametrize( + ("part", "part_type_label"), + [ + ( + {"image_url": "https://example.com/img.png"}, + "image_url", + ), + ( + {"audio_url": "https://example.com/audio.mp3"}, + "audio_url", + ), + ( + {"video_url": "https://example.com/vid.mp4"}, + "video_url", + ), + ( + {"input_audio": {"data": "abc", "format": "wav"}}, + "input_audio", + ), + ], + ids=[ + "image_url_no_type", + "audio_url_no_type", + "video_url_no_type", + "input_audio_no_type", + ], +) +def test_system_message_warns_on_mm_content_without_type_key( + phi3v_model_config, role, part, part_type_label, caplog +): + """Parts without an explicit ``type`` field but with a multimodal key + (e.g. ``{"image_url": "..."}`` ) should issue a warning for text-only + roles and skip the part. + + See https://github.com/vllm-project/vllm/issues/33925 + """ + messages = [ + { + "role": role, + "content": [part], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + import logging + + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + + assert any( + f"'{part_type_label}' is not supported" in record.message + for record in caplog.records + ), f"Expected warning about '{part_type_label}' not found in logs" + + +@pytest.mark.parametrize("role", ["system", "developer"]) +def test_system_message_accepts_text_content(phi3v_model_config, role): + """System and developer messages with text-only content should work.""" + messages = [ + { + "role": role, + "content": [{"type": "text", "text": "You are helpful."}], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + # Should not raise + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + assert conversation[0]["role"] == role + assert conversation[0]["content"] == "You are helpful." + + +@pytest.mark.parametrize("role", ["system", "developer"]) +def test_system_message_accepts_string_content(phi3v_model_config, role): + """System and developer messages with plain string content should work.""" + messages = [ + { + "role": role, + "content": "You are helpful.", + }, + { + "role": "user", + "content": "Hello", + }, + ] + + # Should not raise + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + assert conversation[0]["role"] == role + assert conversation[0]["content"] == "You are helpful." diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c48d7bea983c..2a5e46b491fa 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1335,6 +1335,7 @@ def _parse_chat_message_content_parts( parse_res = _parse_chat_message_content_part( part, mm_parser, + role=role, wrap_dicts=wrap_dicts, interleave_strings=interleave_strings, ) @@ -1360,6 +1361,7 @@ def _parse_chat_message_content_part( part: ChatCompletionContentPartParam, mm_parser: BaseMultiModalContentParser, *, + role: str, wrap_dicts: bool, interleave_strings: bool, ) -> _ContentPart | None: @@ -1372,6 +1374,34 @@ def _parse_chat_message_content_part( """ if isinstance(part, str): # Handle plain text parts return part + + # Validate text-only roles (system / developer) before doing any + # multimodal parsing. This covers both parts with an explicit ``type`` + # field and parts that only carry a multimodal key (e.g. + # ``{"image_url": "..."}`` without ``"type"``). + if role in _ROLES_TEXT_ONLY: + part_type_raw = part.get("type") + has_explicit_mm_type = ( + isinstance(part_type_raw, str) and part_type_raw not in _TEXT_CONTENT_TYPES + ) + has_mm_key = bool(set(part.keys()) & _MULTIMODAL_CONTENT_KEYS) + if has_explicit_mm_type or has_mm_key: + # Build a descriptive label for the error message. + label = ( + part_type_raw + if has_explicit_mm_type + else next(k for k in part if k in _MULTIMODAL_CONTENT_KEYS) + ) + logger.warning( + "Content part type '%s' is not supported " + "in '%s' messages. Only text content is accepted " + "for '%s' role messages. Skipping this content part.", + label, + role, + role, + ) + return None + # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but @@ -1442,6 +1472,36 @@ def _parse_chat_message_content_part( _ToolParser = partial(cast, ChatCompletionToolMessageParam) +_ROLES_TEXT_ONLY = frozenset({"system", "developer"}) +"""Roles whose ``content`` field must only contain text parts, +per the OpenAI Chat Completion API specification.""" + +_TEXT_CONTENT_TYPES = frozenset( + { + "text", + "input_text", + "output_text", + "thinking", + "refusal", + } +) +"""Content part types that are considered text-only.""" + +_MULTIMODAL_CONTENT_KEYS = frozenset( + { + "image_url", + "image_pil", + "image_embeds", + "audio_url", + "input_audio", + "audio_embeds", + "video_url", + } +) +"""Keys whose presence in a content part dict indicates multimodal content, +even when an explicit ``type`` field is absent.""" + + def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, @@ -1456,6 +1516,7 @@ def _parse_chat_message_content( content = [] elif isinstance(content, str): content = [ChatCompletionContentPartTextParam(type="text", text=content)] + result = _parse_chat_message_content_parts( role, content, # type: ignore