vllm-project · veeceey · Feb 16, 2026 · Feb 20, 2026 · Feb 20, 2026 · DarkLight1337
@@ -2682,3 +2682,172 @@ async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
     assert conversation == expected_conversation
     _assert_mm_data_is_vision_chunk_input(mm_data, 1)
     _assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")
+
+
+# -- Tests for system/developer message content validation (issue #33925) -----
+
+
+@pytest.mark.parametrize("role", ["system", "developer"])
+@pytest.mark.parametrize(
+    ("part", "part_type_label"),
+    [
+        (
+            {"type": "image_url", "image_url": {"url": "https://example.com/img.png"}},
+            "image_url",
+        ),
+        (
+            {"type": "input_audio", "input_audio": {"data": "abc", "format": "wav"}},
+            "input_audio",
+        ),
+        (
+            {"type": "video_url", "video_url": {"url": "https://example.com/vid.mp4"}},
+            "video_url",
+        ),
+    ],
+    ids=["image_url", "input_audio", "video_url"],
+)
+def test_system_message_warns_on_non_text_content(
+    phi3v_model_config, role, part, part_type_label, caplog
+):
+    """System and developer messages should warn on multimodal content.
+
+    Per the decision in https://github.com/vllm-project/vllm/pull/34072,
+    sending multimodal content (e.g. ``image_url``) in a system message
+    should issue a warning and skip the part rather than raising an error.
+    See https://github.com/vllm-project/vllm/issues/33925
+    """
+    messages = [
+        {
+            "role": role,
+            "content": [part],
+        },
+        {
+            "role": "user",
+            "content": "Hello",
+        },
+    ]
+
+    import logging
+
+    with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"):
+        conversation, _, _ = parse_chat_messages(
+            messages,
+            phi3v_model_config,
+            content_format="string",
+        )
+
+    assert any(
+        f"'{part_type_label}' is not supported" in record.message
+        for record in caplog.records
+    ), f"Expected warning about '{part_type_label}' not found in logs"
+
+
+@pytest.mark.parametrize("role", ["system", "developer"])
+@pytest.mark.parametrize(
+    ("part", "part_type_label"),
+    [
+        (
+            {"image_url": "https://example.com/img.png"},
+            "image_url",
+        ),
+        (
+            {"audio_url": "https://example.com/audio.mp3"},
+            "audio_url",
+        ),
+        (
+            {"video_url": "https://example.com/vid.mp4"},
+            "video_url",
+        ),
+        (
+            {"input_audio": {"data": "abc", "format": "wav"}},
+            "input_audio",
+        ),
+    ],
+    ids=[
+        "image_url_no_type",
+        "audio_url_no_type",
+        "video_url_no_type",
+        "input_audio_no_type",
+    ],
+)
+def test_system_message_warns_on_mm_content_without_type_key(
+    phi3v_model_config, role, part, part_type_label, caplog
+):
+    """Parts without an explicit ``type`` field but with a multimodal key
+    (e.g. ``{"image_url": "..."}`` ) should issue a warning for text-only
+    roles and skip the part.
+
+    See https://github.com/vllm-project/vllm/issues/33925
+    """
+    messages = [
+        {
+            "role": role,
+            "content": [part],
+        },
+        {
+            "role": "user",
+            "content": "Hello",
+        },
+    ]
+
+    import logging
+
+    with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"):
+        conversation, _, _ = parse_chat_messages(
+            messages,
+            phi3v_model_config,
+            content_format="string",
+        )
+
+    assert any(
+        f"'{part_type_label}' is not supported" in record.message
+        for record in caplog.records
+    ), f"Expected warning about '{part_type_label}' not found in logs"
+
+
+@pytest.mark.parametrize("role", ["system", "developer"])
+def test_system_message_accepts_text_content(phi3v_model_config, role):
+    """System and developer messages with text-only content should work."""
+    messages = [
+        {
+            "role": role,
+            "content": [{"type": "text", "text": "You are helpful."}],
+        },
+        {
+            "role": "user",
+            "content": "Hello",
+        },
+    ]
+
+    # Should not raise
+    conversation, _, _ = parse_chat_messages(
+        messages,
+        phi3v_model_config,
+        content_format="string",
+    )
+    assert conversation[0]["role"] == role
+    assert conversation[0]["content"] == "You are helpful."
+
+
+@pytest.mark.parametrize("role", ["system", "developer"])
+def test_system_message_accepts_string_content(phi3v_model_config, role):
+    """System and developer messages with plain string content should work."""
+    messages = [
+        {
+            "role": role,
+            "content": "You are helpful.",
+        },
+        {
+            "role": "user",
+            "content": "Hello",
+        },
+    ]
+
+    # Should not raise
+    conversation, _, _ = parse_chat_messages(
+        messages,
+        phi3v_model_config,
+        content_format="string",
+    )
+    assert conversation[0]["role"] == role
+    assert conversation[0]["content"] == "You are helpful."
@@ -1335,6 +1335,7 @@ def _parse_chat_message_content_parts(
         parse_res = _parse_chat_message_content_part(
             part,
             mm_parser,
+            role=role,
             wrap_dicts=wrap_dicts,
             interleave_strings=interleave_strings,
         )
@@ -1360,6 +1361,7 @@ def _parse_chat_message_content_part(
     part: ChatCompletionContentPartParam,
     mm_parser: BaseMultiModalContentParser,
     *,
+    role: str,
     wrap_dicts: bool,
     interleave_strings: bool,
 ) -> _ContentPart | None:
@@ -1372,6 +1374,34 @@ def _parse_chat_message_content_part(
     """
     if isinstance(part, str):  # Handle plain text parts
         return part
+
+    # Validate text-only roles (system / developer) before doing any
+    # multimodal parsing.  This covers both parts with an explicit ``type``
+    # field and parts that only carry a multimodal key (e.g.
+    # ``{"image_url": "..."}`` without ``"type"``).
+    if role in _ROLES_TEXT_ONLY:
+        part_type_raw = part.get("type")
+        has_explicit_mm_type = (
+            isinstance(part_type_raw, str) and part_type_raw not in _TEXT_CONTENT_TYPES
+        )
+        has_mm_key = bool(set(part.keys()) & _MULTIMODAL_CONTENT_KEYS)
+        if has_explicit_mm_type or has_mm_key:
+            # Build a descriptive label for the error message.
+            label = (
+                part_type_raw
+                if has_explicit_mm_type
+                else next(k for k in part if k in _MULTIMODAL_CONTENT_KEYS)
+            )
+            logger.warning(
+                "Content part type '%s' is not supported "
+                "in '%s' messages. Only text content is accepted "
+                "for '%s' role messages. Skipping this content part.",
+                label,
+                role,
+                role,
+            )
+            return None
+
     # Handle structured dictionary parts
     part_type, content = _parse_chat_message_content_mm_part(part)
     # if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
@@ -1442,6 +1472,36 @@ def _parse_chat_message_content_part(
 _ToolParser = partial(cast, ChatCompletionToolMessageParam)
 
 
+_ROLES_TEXT_ONLY = frozenset({"system", "developer"})
+"""Roles whose ``content`` field must only contain text parts,
+per the OpenAI Chat Completion API specification."""
+
+_TEXT_CONTENT_TYPES = frozenset(
+    {
+        "text",
+        "input_text",
+        "output_text",
+        "thinking",
+        "refusal",
+    }
+)
+"""Content part types that are considered text-only."""
+
+_MULTIMODAL_CONTENT_KEYS = frozenset(
+    {
+        "image_url",
+        "image_pil",
+        "image_embeds",
+        "audio_url",
+        "input_audio",
+        "audio_embeds",
+        "video_url",
+    }
+)
+"""Keys whose presence in a content part dict indicates multimodal content,
+even when an explicit ``type`` field is absent."""
+
+
 def _parse_chat_message_content(
     message: ChatCompletionMessageParam,
     mm_tracker: BaseMultiModalItemTracker,
@@ -1456,6 +1516,7 @@ def _parse_chat_message_content(
         content = []
     elif isinstance(content, str):
         content = [ChatCompletionContentPartTextParam(type="text", text=content)]
+
     result = _parse_chat_message_content_parts(
         role,
         content,  # type: ignore