diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index 6e915a9f6005..94471e6ce9e1 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -393,7 +393,6 @@ formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n", # noqa: E501 ), limit_mm_per_prompt={"video": 4}, - runner_mm_key="videos", )], ), "llava_next_video": VLMTestInfo( @@ -623,6 +622,19 @@ limit_mm_per_prompt={"image": 4}, )], ), + "qwen2_5_omni-mixed-modalities": VLMTestInfo( + models=["Qwen/Qwen2.5-Omni-3B"], + test_type=VLMTestType.CUSTOM_INPUTS, + max_model_len=4096, + max_num_seqs=2, + auto_cls=AutoModelForTextToWaveform, + vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output, + patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner, + custom_test_opts=[CustomTestOptions( + inputs=custom_inputs.mixed_modality_qwen2_5_omni(), + limit_mm_per_prompt={"audio": 1, "image": 1, "video": 1}, + )], + ), # regression test for https://github.com/vllm-project/vllm/issues/15122 "qwen2_5_vl-windows-attention": VLMTestInfo( models=["Qwen/Qwen2.5-VL-3B-Instruct"], diff --git a/tests/models/multimodal/generation/vlm_utils/builders.py b/tests/models/multimodal/generation/vlm_utils/builders.py index e3ba955a96a6..298df164bd09 100644 --- a/tests/models/multimodal/generation/vlm_utils/builders.py +++ b/tests/models/multimodal/generation/vlm_utils/builders.py @@ -14,7 +14,8 @@ from .....conftest import ImageTestAssets, VideoTestAssets from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER, TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT, - ImageSizeWrapper, SizeType, VLMTestInfo) + ImageSizeWrapper, PromptWithMultiModalInput, SizeType, + VLMTestInfo) def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int], @@ -68,10 +69,11 @@ def get_model_prompts(base_prompts: Iterable[str], def build_single_image_inputs_from_test_info( - test_info: VLMTestInfo, - image_assets: ImageTestAssets, - size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None): + test_info: VLMTestInfo, + image_assets: ImageTestAssets, + size_wrapper: ImageSizeWrapper, + tmp_path: Optional[PosixPath] = None, +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError( "Prompt formatter must be set to build single image inputs") @@ -97,28 +99,32 @@ def build_single_image_inputs_from_test_info( return build_single_image_inputs(images, model_prompts, size_wrapper) -def build_single_image_inputs(images, model_prompts, - size_wrapper: ImageSizeWrapper): +def build_single_image_inputs( + images, model_prompts, + size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: # For every image / prompt pair, get a pair containing two lists of # length size_factors, where the first contains duplicates of the model # prompt [str], and the second contains copies of the image after being # scaled by one of the size factors. # # NOTE: rescaling preserves the image aspect ratio. - return [( - [prompt for _ in size_wrapper.data], - [ - apply_image_size_scaling(image, size, size_wrapper.type) - for size in size_wrapper.data - ], - ) for image, prompt in zip(images, model_prompts)] + return [ + PromptWithMultiModalInput.create( + prompts=[prompt for _ in size_wrapper.data], + image_data=[ + apply_image_size_scaling(image, size, size_wrapper.type) + for size in size_wrapper.data + ], + ) for image, prompt in zip(images, model_prompts) + ] def build_multi_image_inputs_from_test_info( - test_info: VLMTestInfo, - image_assets: ImageTestAssets, - size_wrapper: ImageSizeWrapper, - tmp_path: Optional[PosixPath] = None): + test_info: VLMTestInfo, + image_assets: ImageTestAssets, + size_wrapper: ImageSizeWrapper, + tmp_path: Optional[PosixPath] = None, +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError( "Prompt formatter must be set to build multi image inputs") @@ -146,15 +152,18 @@ def build_multi_image_inputs_from_test_info( ) -def build_multi_image_inputs(image_lists, model_prompts, - size_wrapper: ImageSizeWrapper): - return [( - [prompt for _ in size_wrapper.data], - [[ - apply_image_size_scaling(image, size, size_wrapper.type) - for image in images - ] for size in size_wrapper.data], - ) for images, prompt in zip(image_lists, model_prompts)] +def build_multi_image_inputs( + image_lists, model_prompts, + size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]: + return [ + PromptWithMultiModalInput.create( + prompts=[prompt for _ in size_wrapper.data], + image_data=[[ + apply_image_size_scaling(image, size, size_wrapper.type) + for image in images + ] for size in size_wrapper.data], + ) for images, prompt in zip(image_lists, model_prompts) + ] def build_embedding_inputs_from_test_info( @@ -195,7 +204,7 @@ def build_video_inputs_from_test_info( video_assets: VideoTestAssets, size_wrapper: ImageSizeWrapper, num_frames: int, -): +) -> list[PromptWithMultiModalInput]: if test_info.prompt_formatter is None: raise ValueError("Prompt formatter must be set to build video inputs") model_prompts = get_model_prompts( @@ -213,10 +222,14 @@ def build_video_inputs_from_test_info( video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size) - return [( - [prompt for _ in size_wrapper.data], - [video_scaler(video, size) for size in size_wrapper.data], - ) for video, prompt in zip(sampled_vids, model_prompts)] + return [ + PromptWithMultiModalInput.create( + prompts=[prompt for _ in size_wrapper.data], + video_data=[ + video_scaler(video, size) for size in size_wrapper.data + ], + ) for video, prompt in zip(sampled_vids, model_prompts) + ] def apply_image_size_scaling(image, size: Union[float, tuple[int, int]], diff --git a/tests/models/multimodal/generation/vlm_utils/core.py b/tests/models/multimodal/generation/vlm_utils/core.py index c3d20f56855f..ccd2799abd90 100644 --- a/tests/models/multimodal/generation/vlm_utils/core.py +++ b/tests/models/multimodal/generation/vlm_utils/core.py @@ -1,9 +1,8 @@ # SPDX-License-Identifier: Apache-2.0 """Core test implementation to be shared across modalities.""" -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Optional import torch -from PIL.Image import Image from transformers.models.auto.auto_factory import _BaseAutoModelClass from vllm.config import TaskOption @@ -11,14 +10,14 @@ from .....conftest import HfRunner, VllmRunner from ....registry import HF_EXAMPLE_MODELS -from .types import RunnerOutput +from .types import PromptWithMultiModalInput, RunnerOutput def run_test( *, hf_runner: type[HfRunner], vllm_runner: type[VllmRunner], - inputs: list[tuple[list[str], list[Union[list[Image], Image]]]], + inputs: list[PromptWithMultiModalInput], model: str, dtype: str, max_tokens: int, @@ -38,7 +37,6 @@ def run_test( hf_model_kwargs: Optional[dict[str, Any]], patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]], task: TaskOption = "auto", - runner_mm_key: str = "images", distributed_executor_backend: Optional[str] = None, tensor_parallel_size: int = 1, vllm_embeddings: Optional[torch.Tensor] = None, @@ -94,10 +92,16 @@ def run_test( if stop_str: vllm_kwargs["stop"] = stop_str - for prompts, media in vllm_inputs: - vllm_kwargs[runner_mm_key] = media + for prompts, image_data, video_data, audio_data in vllm_inputs: + mm_data = dict(images=image_data, + videos=video_data, + audios=audio_data) + vllm_kwargs_with_mm_data = vllm_kwargs | mm_data vllm_output = vllm_model.generate_greedy_logprobs( - prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs) + prompts, + max_tokens, + num_logprobs=num_logprobs, + **vllm_kwargs_with_mm_data) vllm_outputs_per_mm.append(vllm_output) hf_model = hf_runner(model, @@ -122,14 +126,17 @@ def run_test( if stop_str: hf_kwargs["stop_strings"] = stop_str - for prompts, media in inputs: - hf_kwargs[runner_mm_key] = media + for prompts, image_data, video_data, audio_data in inputs: + mm_data = dict(images=image_data, + videos=video_data, + audios=audio_data) + hf_kwargs_with_mm_data = hf_kwargs | mm_data hf_output = hf_model.generate_greedy_logprobs_limit( prompts, max_tokens, num_logprobs=num_logprobs, tokenizer=tokenizer, - **hf_kwargs) + **hf_kwargs_with_mm_data) hf_outputs_per_mm.append(hf_output) # Apply output processing / sanitation to the vLLM and HF runner results diff --git a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py index 235618ae547e..113660f9e561 100644 --- a/tests/models/multimodal/generation/vlm_utils/custom_inputs.py +++ b/tests/models/multimodal/generation/vlm_utils/custom_inputs.py @@ -6,13 +6,16 @@ import requests from PIL import Image +from vllm.assets.audio import AudioAsset +from vllm.assets.image import ImageAsset +from vllm.assets.video import VideoAsset from vllm.multimodal.image import rescale_image_size from vllm.multimodal.video import (rescale_video_size, resize_video, sample_frames_from_video) from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS from .builders import build_multi_image_inputs, build_single_image_inputs -from .types import ImageSizeWrapper, SizeType +from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): @@ -32,24 +35,28 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]): "\nWhat is the season?", ] formatted_prompts = [formatter(prompt) for prompt in img_prompts] - - return [( - formatted_prompts, + aspect_ratio_images = [ + [stop_sign, cherry_blossom], + # Images with different sizes and aspect-ratios + [ + rescale_image_size(stop_sign, 0.1), + stop_sign, + ], [ - [stop_sign, cherry_blossom], - # Images with different sizes and aspect-ratios - [ - rescale_image_size(stop_sign, 0.1), - stop_sign, - ], - [ - stop_sign, - rescale_image_size(stop_sign, 0.25), - cherry_blossom.resize((183, 488)), - cherry_blossom.resize((488, 183)) - ], - cherry_blossom, - ])] + stop_sign, + rescale_image_size(stop_sign, 0.25), + cherry_blossom.resize((183, 488)), + cherry_blossom.resize((488, 183)) + ], + cherry_blossom, + ] + + return [ + PromptWithMultiModalInput.create( + prompts=formatted_prompts, + image_data=aspect_ratio_images, + ) + ] def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], @@ -68,24 +75,28 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str], "