Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
169 changes: 169 additions & 0 deletions tests/entrypoints/test_chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2682,3 +2682,172 @@ async def test_parse_chat_messages_video_vision_chunk_with_uuid_async(
assert conversation == expected_conversation
_assert_mm_data_is_vision_chunk_input(mm_data, 1)
_assert_mm_uuids(mm_uuids, 1, expected_uuids=[video_uuid], modality="vision_chunk")


# -- Tests for system/developer message content validation (issue #33925) -----


@pytest.mark.parametrize("role", ["system", "developer"])
@pytest.mark.parametrize(
("part", "part_type_label"),
[
(
{"type": "image_url", "image_url": {"url": "https://example.com/img.png"}},
"image_url",
),
(
{"type": "input_audio", "input_audio": {"data": "abc", "format": "wav"}},
"input_audio",
),
(
{"type": "video_url", "video_url": {"url": "https://example.com/vid.mp4"}},
"video_url",
),
],
ids=["image_url", "input_audio", "video_url"],
)
def test_system_message_warns_on_non_text_content(
phi3v_model_config, role, part, part_type_label, caplog
):
"""System and developer messages should warn on multimodal content.

Per the decision in https://github.com/vllm-project/vllm/pull/34072,
sending multimodal content (e.g. ``image_url``) in a system message
should issue a warning and skip the part rather than raising an error.
See https://github.com/vllm-project/vllm/issues/33925
"""
messages = [
{
"role": role,
"content": [part],
},
{
"role": "user",
"content": "Hello",
},
]

import logging

with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"):
conversation, _, _ = parse_chat_messages(
messages,
phi3v_model_config,
content_format="string",
)

assert any(
f"'{part_type_label}' is not supported" in record.message
for record in caplog.records
), f"Expected warning about '{part_type_label}' not found in logs"


@pytest.mark.parametrize("role", ["system", "developer"])
@pytest.mark.parametrize(
("part", "part_type_label"),
[
(
{"image_url": "https://example.com/img.png"},
"image_url",
),
(
{"audio_url": "https://example.com/audio.mp3"},
"audio_url",
),
(
{"video_url": "https://example.com/vid.mp4"},
"video_url",
),
(
{"input_audio": {"data": "abc", "format": "wav"}},
"input_audio",
),
],
ids=[
"image_url_no_type",
"audio_url_no_type",
"video_url_no_type",
"input_audio_no_type",
],
)
def test_system_message_warns_on_mm_content_without_type_key(
phi3v_model_config, role, part, part_type_label, caplog
):
"""Parts without an explicit ``type`` field but with a multimodal key
(e.g. ``{"image_url": "..."}`` ) should issue a warning for text-only
roles and skip the part.

See https://github.com/vllm-project/vllm/issues/33925
"""
messages = [
{
"role": role,
"content": [part],
},
{
"role": "user",
"content": "Hello",
},
]

import logging

with caplog.at_level(logging.WARNING, logger="vllm.entrypoints.chat_utils"):
conversation, _, _ = parse_chat_messages(
messages,
phi3v_model_config,
content_format="string",
)

assert any(
f"'{part_type_label}' is not supported" in record.message
for record in caplog.records
), f"Expected warning about '{part_type_label}' not found in logs"


@pytest.mark.parametrize("role", ["system", "developer"])
def test_system_message_accepts_text_content(phi3v_model_config, role):
"""System and developer messages with text-only content should work."""
messages = [
{
"role": role,
"content": [{"type": "text", "text": "You are helpful."}],
},
{
"role": "user",
"content": "Hello",
},
]

# Should not raise
conversation, _, _ = parse_chat_messages(
messages,
phi3v_model_config,
content_format="string",
)
assert conversation[0]["role"] == role
assert conversation[0]["content"] == "You are helpful."


@pytest.mark.parametrize("role", ["system", "developer"])
def test_system_message_accepts_string_content(phi3v_model_config, role):
"""System and developer messages with plain string content should work."""
messages = [
{
"role": role,
"content": "You are helpful.",
},
{
"role": "user",
"content": "Hello",
},
]

# Should not raise
conversation, _, _ = parse_chat_messages(
messages,
phi3v_model_config,
content_format="string",
)
assert conversation[0]["role"] == role
assert conversation[0]["content"] == "You are helpful."
61 changes: 61 additions & 0 deletions vllm/entrypoints/chat_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1335,6 +1335,7 @@ def _parse_chat_message_content_parts(
parse_res = _parse_chat_message_content_part(
part,
mm_parser,
role=role,
wrap_dicts=wrap_dicts,
interleave_strings=interleave_strings,
)
Expand All @@ -1360,6 +1361,7 @@ def _parse_chat_message_content_part(
part: ChatCompletionContentPartParam,
mm_parser: BaseMultiModalContentParser,
*,
role: str,
wrap_dicts: bool,
interleave_strings: bool,
) -> _ContentPart | None:
Expand All @@ -1372,6 +1374,34 @@ def _parse_chat_message_content_part(
"""
if isinstance(part, str): # Handle plain text parts
return part

# Validate text-only roles (system / developer) before doing any
# multimodal parsing. This covers both parts with an explicit ``type``
# field and parts that only carry a multimodal key (e.g.
# ``{"image_url": "..."}`` without ``"type"``).
if role in _ROLES_TEXT_ONLY:
part_type_raw = part.get("type")
has_explicit_mm_type = (
isinstance(part_type_raw, str) and part_type_raw not in _TEXT_CONTENT_TYPES
)
has_mm_key = bool(set(part.keys()) & _MULTIMODAL_CONTENT_KEYS)
if has_explicit_mm_type or has_mm_key:
# Build a descriptive label for the error message.
label = (
part_type_raw
if has_explicit_mm_type
else next(k for k in part if k in _MULTIMODAL_CONTENT_KEYS)
)
logger.warning(
"Content part type '%s' is not supported "
"in '%s' messages. Only text content is accepted "
"for '%s' role messages. Skipping this content part.",
label,
role,
role,
)
return None
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should still allow it. But we can log a warning that it is outside of OpenAI spec.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So I think this PR isn't really needed as #34072 does that already


# Handle structured dictionary parts
part_type, content = _parse_chat_message_content_mm_part(part)
# if part_type is text/refusal/image_url/audio_url/video_url/input_audio but
Expand Down Expand Up @@ -1442,6 +1472,36 @@ def _parse_chat_message_content_part(
_ToolParser = partial(cast, ChatCompletionToolMessageParam)


_ROLES_TEXT_ONLY = frozenset({"system", "developer"})
"""Roles whose ``content`` field must only contain text parts,
per the OpenAI Chat Completion API specification."""

_TEXT_CONTENT_TYPES = frozenset(
{
"text",
"input_text",
"output_text",
"thinking",
"refusal",
}
)
"""Content part types that are considered text-only."""

_MULTIMODAL_CONTENT_KEYS = frozenset(
{
"image_url",
"image_pil",
"image_embeds",
"audio_url",
"input_audio",
"audio_embeds",
"video_url",
}
)
"""Keys whose presence in a content part dict indicates multimodal content,
even when an explicit ``type`` field is absent."""


def _parse_chat_message_content(
message: ChatCompletionMessageParam,
mm_tracker: BaseMultiModalItemTracker,
Expand All @@ -1456,6 +1516,7 @@ def _parse_chat_message_content(
content = []
elif isinstance(content, str):
content = [ChatCompletionContentPartTextParam(type="text", text=content)]

result = _parse_chat_message_content_parts(
role,
content, # type: ignore
Expand Down