From d1831c19a017d57009cb0ea97b48d0a117191b16 Mon Sep 17 00:00:00 2001 From: Varun Chawla Date: Mon, 16 Feb 2026 11:30:07 -0800 Subject: [PATCH 1/3] fix: reject non-text content in system/developer messages Per the OpenAI API spec, system and developer messages only accept text content. Add validation inside the existing per-part parsing loop to reject multimodal content (image_url, audio_url, video_url, input_audio, etc.) for these roles. Handles both explicit type fields and inferred types from dict keys, preventing bypasses via simplified multimodal formats. Fixes #33925 Signed-off-by: Varun Chawla --- tests/entrypoints/test_chat_utils.py | 154 +++++++++++++++++++++++++++ vllm/entrypoints/chat_utils.py | 57 ++++++++++ 2 files changed, 211 insertions(+) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 36e8b0c0b540..31a0807c04cd 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2682,3 +2682,157 @@ async def test_parse_chat_messages_video_vision_chunk_with_uuid_async( assert conversation == expected_conversation _assert_mm_data_is_vision_chunk_input(mm_data, 1) _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk") + + +# -- Tests for system/developer message content validation (issue #33925) ----- + + +@pytest.mark.parametrize("role", ["system", "developer"]) +@pytest.mark.parametrize( + ("part", "part_type_label"), + [ + ( + {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}}, + "image_url", + ), + ( + {"type": "input_audio", "input_audio": {"data": "abc", "format": "wav"}}, + "input_audio", + ), + ( + {"type": "video_url", "video_url": {"url": "https://example.com/vid.mp4"}}, + "video_url", + ), + ], + ids=["image_url", "input_audio", "video_url"], +) +def test_system_message_rejects_non_text_content( + phi3v_model_config, role, part, part_type_label +): + """System and developer messages must only accept text content. + + Per the OpenAI API specification, sending multimodal content (e.g. + ``image_url``) in a system message should be rejected with an error. + See https://github.com/vllm-project/vllm/issues/33925 + """ + messages = [ + { + "role": role, + "content": [part], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + with pytest.raises(ValueError, match=f"'{part_type_label}' is not supported"): + parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + + +@pytest.mark.parametrize("role", ["system", "developer"]) +@pytest.mark.parametrize( + ("part", "part_type_label"), + [ + ( + {"image_url": "https://example.com/img.png"}, + "image_url", + ), + ( + {"audio_url": "https://example.com/audio.mp3"}, + "audio_url", + ), + ( + {"video_url": "https://example.com/vid.mp4"}, + "video_url", + ), + ( + {"input_audio": {"data": "abc", "format": "wav"}}, + "input_audio", + ), + ], + ids=[ + "image_url_no_type", + "audio_url_no_type", + "video_url_no_type", + "input_audio_no_type", + ], +) +def test_system_message_rejects_mm_content_without_type_key( + phi3v_model_config, role, part, part_type_label +): + """Parts without an explicit ``type`` field but with a multimodal key + (e.g. ``{"image_url": "..."}`` ) must also be rejected for text-only + roles. + + See https://github.com/vllm-project/vllm/issues/33925 + """ + messages = [ + { + "role": role, + "content": [part], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + with pytest.raises(ValueError, match=f"'{part_type_label}' is not supported"): + parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + + +@pytest.mark.parametrize("role", ["system", "developer"]) +def test_system_message_accepts_text_content(phi3v_model_config, role): + """System and developer messages with text-only content should work.""" + messages = [ + { + "role": role, + "content": [{"type": "text", "text": "You are helpful."}], + }, + { + "role": "user", + "content": "Hello", + }, + ] + + # Should not raise + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + assert conversation[0]["role"] == role + assert conversation[0]["content"] == "You are helpful." + + +@pytest.mark.parametrize("role", ["system", "developer"]) +def test_system_message_accepts_string_content(phi3v_model_config, role): + """System and developer messages with plain string content should work.""" + messages = [ + { + "role": role, + "content": "You are helpful.", + }, + { + "role": "user", + "content": "Hello", + }, + ] + + # Should not raise + conversation, _, _ = parse_chat_messages( + messages, + phi3v_model_config, + content_format="string", + ) + assert conversation[0]["role"] == role + assert conversation[0]["content"] == "You are helpful." diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c48d7bea983c..e013889443b5 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1335,6 +1335,7 @@ def _parse_chat_message_content_parts( parse_res = _parse_chat_message_content_part( part, mm_parser, + role=role, wrap_dicts=wrap_dicts, interleave_strings=interleave_strings, ) @@ -1360,6 +1361,7 @@ def _parse_chat_message_content_part( part: ChatCompletionContentPartParam, mm_parser: BaseMultiModalContentParser, *, + role: str, wrap_dicts: bool, interleave_strings: bool, ) -> _ContentPart | None: @@ -1372,6 +1374,30 @@ def _parse_chat_message_content_part( """ if isinstance(part, str): # Handle plain text parts return part + + # Validate text-only roles (system / developer) before doing any + # multimodal parsing. This covers both parts with an explicit ``type`` + # field and parts that only carry a multimodal key (e.g. + # ``{"image_url": "..."}`` without ``"type"``). + if role in _ROLES_TEXT_ONLY: + part_type_raw = part.get("type") + has_explicit_mm_type = ( + isinstance(part_type_raw, str) and part_type_raw not in _TEXT_CONTENT_TYPES + ) + has_mm_key = bool(set(part.keys()) & _MULTIMODAL_CONTENT_KEYS) + if has_explicit_mm_type or has_mm_key: + # Build a descriptive label for the error message. + label = ( + part_type_raw + if has_explicit_mm_type + else next(k for k in part if k in _MULTIMODAL_CONTENT_KEYS) + ) + raise ValueError( + f"Content part type '{label}' is not supported " + f"in '{role}' messages. Only text content is accepted " + f"for '{role}' role messages." + ) + # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but @@ -1442,6 +1468,36 @@ def _parse_chat_message_content_part( _ToolParser = partial(cast, ChatCompletionToolMessageParam) +_ROLES_TEXT_ONLY = frozenset({"system", "developer"}) +"""Roles whose ``content`` field must only contain text parts, +per the OpenAI Chat Completion API specification.""" + +_TEXT_CONTENT_TYPES = frozenset( + { + "text", + "input_text", + "output_text", + "thinking", + "refusal", + } +) +"""Content part types that are considered text-only.""" + +_MULTIMODAL_CONTENT_KEYS = frozenset( + { + "image_url", + "image_pil", + "image_embeds", + "audio_url", + "input_audio", + "audio_embeds", + "video_url", + } +) +"""Keys whose presence in a content part dict indicates multimodal content, +even when an explicit ``type`` field is absent.""" + + def _parse_chat_message_content( message: ChatCompletionMessageParam, mm_tracker: BaseMultiModalItemTracker, @@ -1456,6 +1512,7 @@ def _parse_chat_message_content( content = [] elif isinstance(content, str): content = [ChatCompletionContentPartTextParam(type="text", text=content)] + result = _parse_chat_message_content_parts( role, content, # type: ignore From 5b44ab9417601d9f69230f3ba4fb53db06954294 Mon Sep 17 00:00:00 2001 From: Varun Chawla Date: Thu, 19 Feb 2026 21:58:40 -0800 Subject: [PATCH 2/3] Change non-text content validation from error to warning Instead of raising a ValueError when system/developer messages contain non-text content, issue a logger.warning and skip the part. This is consistent with the decision in #34072. Co-Authored-By: Claude Opus 4.6 --- tests/entrypoints/test_chat_utils.py | 39 ++++++++++++++++++---------- vllm/entrypoints/chat_utils.py | 10 ++++--- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 31a0807c04cd..57a3c8672a09 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2706,13 +2706,14 @@ async def test_parse_chat_messages_video_vision_chunk_with_uuid_async( ], ids=["image_url", "input_audio", "video_url"], ) -def test_system_message_rejects_non_text_content( - phi3v_model_config, role, part, part_type_label +def test_system_message_warns_on_non_text_content( + phi3v_model_config, role, part, part_type_label, caplog ): - """System and developer messages must only accept text content. + """System and developer messages should warn on multimodal content. - Per the OpenAI API specification, sending multimodal content (e.g. - ``image_url``) in a system message should be rejected with an error. + Per the decision in https://github.com/vllm-project/vllm/pull/34072, + sending multimodal content (e.g. ``image_url``) in a system message + should issue a warning and skip the part rather than raising an error. See https://github.com/vllm-project/vllm/issues/33925 """ messages = [ @@ -2726,13 +2727,19 @@ def test_system_message_rejects_non_text_content( }, ] - with pytest.raises(ValueError, match=f"'{part_type_label}' is not supported"): - parse_chat_messages( + import logging + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): + conversation, _, _ = parse_chat_messages( messages, phi3v_model_config, content_format="string", ) + assert any( + f"'{part_type_label}' is not supported" in record.message + for record in caplog.records + ), f"Expected warning about '{part_type_label}' not found in logs" + @pytest.mark.parametrize("role", ["system", "developer"]) @pytest.mark.parametrize( @@ -2762,12 +2769,12 @@ def test_system_message_rejects_non_text_content( "input_audio_no_type", ], ) -def test_system_message_rejects_mm_content_without_type_key( - phi3v_model_config, role, part, part_type_label +def test_system_message_warns_on_mm_content_without_type_key( + phi3v_model_config, role, part, part_type_label, caplog ): """Parts without an explicit ``type`` field but with a multimodal key - (e.g. ``{"image_url": "..."}`` ) must also be rejected for text-only - roles. + (e.g. ``{"image_url": "..."}`` ) should issue a warning for text-only + roles and skip the part. See https://github.com/vllm-project/vllm/issues/33925 """ @@ -2782,13 +2789,19 @@ def test_system_message_rejects_mm_content_without_type_key( }, ] - with pytest.raises(ValueError, match=f"'{part_type_label}' is not supported"): - parse_chat_messages( + import logging + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): + conversation, _, _ = parse_chat_messages( messages, phi3v_model_config, content_format="string", ) + assert any( + f"'{part_type_label}' is not supported" in record.message + for record in caplog.records + ), f"Expected warning about '{part_type_label}' not found in logs" + @pytest.mark.parametrize("role", ["system", "developer"]) def test_system_message_accepts_text_content(phi3v_model_config, role): diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index e013889443b5..f63aaaeb520c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1392,11 +1392,13 @@ def _parse_chat_message_content_part( if has_explicit_mm_type else next(k for k in part if k in _MULTIMODAL_CONTENT_KEYS) ) - raise ValueError( - f"Content part type '{label}' is not supported " - f"in '{role}' messages. Only text content is accepted " - f"for '{role}' role messages." + logger.warning( + "Content part type '%s' is not supported " + "in '%s' messages. Only text content is accepted " + "for '%s' role messages. Skipping this content part.", + label, role, role, ) + return None # Handle structured dictionary parts part_type, content = _parse_chat_message_content_mm_part(part) From 584013b8891b4872c965e439d4b40f0cd803f95a Mon Sep 17 00:00:00 2001 From: Varun Chawla Date: Thu, 19 Feb 2026 22:17:59 -0800 Subject: [PATCH 3/3] style: fix pre-commit formatting issues Add missing blank lines after import statements and split long function arguments across multiple lines per project style guide. Signed-off-by: Varun Chawla --- tests/entrypoints/test_chat_utils.py | 2 ++ vllm/entrypoints/chat_utils.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 57a3c8672a09..103ffa2dbcbc 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -2728,6 +2728,7 @@ def test_system_message_warns_on_non_text_content( ] import logging + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): conversation, _, _ = parse_chat_messages( messages, @@ -2790,6 +2791,7 @@ def test_system_message_warns_on_mm_content_without_type_key( ] import logging + with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"): conversation, _, _ = parse_chat_messages( messages, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f63aaaeb520c..2a5e46b491fa 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1396,7 +1396,9 @@ def _parse_chat_message_content_part( "Content part type '%s' is not supported " "in '%s' messages. Only text content is accepted " "for '%s' role messages. Skipping this content part.", - label, role, role, + label, + role, + role, ) return None