diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 1637627695e..3065439084a 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -29,6 +29,16 @@ IMAGE_KEY = ["square", "quadrate", "rectangle"] VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"] +# Heavier synthetic inputs than the default expansion cases (longer timeline / more pixels). +# Long video: 120s @ 30fps => 3600 frames (generate_synthetic_video in tests/conftest.py). +# Use 224² spatial size to bound RAM (~W*H*num_frames*3) vs. 288² at this frame count. +LONG_VIDEO_WIDTH = 224 +LONG_VIDEO_HEIGHT = 224 +LONG_VIDEO_FRAMES = 3600 +LARGE_IMAGE_WIDTH = 1920 +LARGE_IMAGE_HEIGHT = 1080 +LONG_AUDIO_DURATION_SEC = 120 + def get_chunk_config(default_path): path = modify_stage_config( @@ -37,7 +47,8 @@ def get_chunk_config(default_path): "async_chunk": True, "stage_args": { 0: { - "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk" + "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk", + "default_sampling_params.max_tokens": 2048, }, 1: { "engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk" @@ -167,88 +178,17 @@ def test_text_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text - Input Setting: stream=True - Datasets: single request - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - 
messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["text"], - "stream": True, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: audio - Input Setting: stream=False - Datasets: single request - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "modalities": ["audio"], - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_image_to_text_audio_001(omni_server, openai_client) -> None: - """ - Input Modal: image - Output Modal: text, audio - Input Setting: stream=False - Datasets: few requests - """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(1280, 720)['base64']}" - - messages = dummy_messages_from_mix_data(image_data_url=image_data_url) - - request_config = { - "model": omni_server.model, - "messages": messages, - "key_words": {"image": IMAGE_KEY}, - } - - openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) - - -@pytest.mark.advanced_model -@pytest.mark.omni -@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_001(omni_server, 
openai_client) -> None: +def test_text_video_to_text_001(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: long synthetic video (120s @ 30fps, LONG_VIDEO_FRAMES frames) Output Modal: text Input Setting: stream=False Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(LONG_VIDEO_WIDTH, LONG_VIDEO_HEIGHT, LONG_VIDEO_FRAMES)['base64']}" + messages = dummy_messages_from_mix_data( + video_data_url=video_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_video") + ) request_config = { "model": omni_server.model, @@ -257,28 +197,29 @@ def test_video_to_text_001(omni_server, openai_client) -> None: "key_words": {"video": VIDEO_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) +def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: video - Output Modal: audio + Input Modal: text, audio + Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, system_prompt=get_system_prompt(), 
content_text=get_prompt("text_audio") + ) request_config = { "model": omni_server.model, "messages": messages, - "modalities": ["audio"], - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config) @@ -287,22 +228,25 @@ def test_video_to_audio_001(omni_server, openai_client) -> None: @pytest.mark.advanced_model @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) -@pytest.mark.parametrize("omni_server", test_params, indirect=True) -def test_video_to_text_audio_001(omni_server, openai_client) -> None: +@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) +def test_text_audio_to_text_audio_002(omni_server, openai_client) -> None: """ - Input Modal: video + Input Modal: text, long-duration audio (~LONG_AUDIO_DURATION_SEC s WAV) Output Modal: text, audio Input Setting: stream=False - Datasets: few requests + Datasets: few requests (request_num=get_max_batch_size()) """ - video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}" - - messages = dummy_messages_from_mix_data(video_data_url=video_data_url) + audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(LONG_AUDIO_DURATION_SEC, 1)['base64']}" + messages = dummy_messages_from_mix_data( + audio_data_url=audio_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_audio"), + ) request_config = { "model": omni_server.model, "messages": messages, - "key_words": {"video": VIDEO_KEY}, + "key_words": {"audio": AUDIO_KEY}, } openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @@ -312,22 +256,23 @@ def test_video_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: +def
test_text_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, audio + Input Modal: text, image Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}" + image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + messages = dummy_messages_from_mix_data( - audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio") + image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") ) request_config = { "model": omni_server.model, "messages": messages, - "key_words": {"audio": AUDIO_KEY}, + "key_words": {"image": IMAGE_KEY}, } openai_client.send_omni_request(request_config) @@ -337,17 +282,21 @@ def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2) @pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True) -def test_text_image_to_text_audio_001(omni_server, openai_client) -> None: +def test_large_image_to_text_audio_001(omni_server, openai_client) -> None: """ - Input Modal: text, image + Input Modal: text, high-resolution image (1080p-class JPEG) Output Modal: text, audio Input Setting: stream=False Datasets: single request """ - image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}" + image_data_url = ( + f"data:image/jpeg;base64,{generate_synthetic_image(LARGE_IMAGE_WIDTH, LARGE_IMAGE_HEIGHT)['base64']}" + ) messages = dummy_messages_from_mix_data( - image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image") + image_data_url=image_data_url, + system_prompt=get_system_prompt(), + content_text=get_prompt("text_image"), ) request_config = { @@ -356,7 +305,7 @@ def 
test_text_image_to_text_audio_001(omni_server, openai_client) -> None: "key_words": {"image": IMAGE_KEY}, } - openai_client.send_omni_request(request_config) + openai_client.send_omni_request(request_config, request_num=get_max_batch_size()) @pytest.mark.advanced_model