Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
159 changes: 54 additions & 105 deletions tests/e2e/online_serving/test_qwen3_omni_expansion.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,16 @@
IMAGE_KEY = ["square", "quadrate", "rectangle"]
VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"]

# Heavier synthetic inputs than the default expansion cases (longer timeline / more pixels).
# Long video: 120s @ 30fps => 3600 frames (generate_synthetic_video in tests/conftest.py).
# Use 224² spatial size to bound RAM (~W*H*num_frames*3) vs. 288² at this frame count.
LONG_VIDEO_WIDTH = 224
LONG_VIDEO_HEIGHT = 224
LONG_VIDEO_FRAMES = 3600
LARGE_IMAGE_WIDTH = 1920
LARGE_IMAGE_HEIGHT = 1080
LONG_AUDIO_DURATION_SEC = 120


def get_chunk_config(default_path):
path = modify_stage_config(
Expand All @@ -37,7 +47,8 @@ def get_chunk_config(default_path):
"async_chunk": True,
"stage_args": {
0: {
"engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk"
"engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker_async_chunk",
"default_sampling_params.max_tokens": 2048,
},
1: {
"engine_args.custom_process_next_stage_input_func": "vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav_async_chunk"
Expand Down Expand Up @@ -167,88 +178,17 @@ def test_text_to_text_audio_001(omni_server, openai_client) -> None:
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_image_to_text_001(omni_server, openai_client) -> None:
"""
Input Modal: image
Output Modal: text
Input Setting: stream=True
Datasets: single request
"""
image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"
messages = dummy_messages_from_mix_data(image_data_url=image_data_url)

request_config = {
"model": omni_server.model,
"messages": messages,
"modalities": ["text"],
"stream": True,
"key_words": {"image": IMAGE_KEY},
}

openai_client.send_omni_request(request_config)


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_image_to_audio_001(omni_server, openai_client) -> None:
"""
Input Modal: image
Output Modal: audio
Input Setting: stream=False
Datasets: single request
"""
image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"
messages = dummy_messages_from_mix_data(image_data_url=image_data_url)

request_config = {
"model": omni_server.model,
"messages": messages,
"modalities": ["audio"],
"key_words": {"image": IMAGE_KEY},
}

openai_client.send_omni_request(request_config)


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_image_to_text_audio_001(omni_server, openai_client) -> None:
"""
Input Modal: image
Output Modal: text, audio
Input Setting: stream=False
Datasets: few requests
"""
image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(1280, 720)['base64']}"

messages = dummy_messages_from_mix_data(image_data_url=image_data_url)

request_config = {
"model": omni_server.model,
"messages": messages,
"key_words": {"image": IMAGE_KEY},
}

openai_client.send_omni_request(request_config, request_num=get_max_batch_size())


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_video_to_text_001(omni_server, openai_client) -> None:
def test_text_video_to_text_001(omni_server, openai_client) -> None:
"""
Input Modal: video
Input Modal: text, long synthetic video (120s @ 30fps, LONG_VIDEO_FRAMES frames)
Output Modal: text
Input Setting: stream=False
Datasets: few requests
"""
video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}"
messages = dummy_messages_from_mix_data(video_data_url=video_data_url)
video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(LONG_VIDEO_WIDTH, LONG_VIDEO_HEIGHT, LONG_VIDEO_FRAMES)['base64']}"
messages = dummy_messages_from_mix_data(
video_data_url=video_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_video")
)

request_config = {
"model": omni_server.model,
Expand All @@ -257,28 +197,29 @@ def test_video_to_text_001(omni_server, openai_client) -> None:
"key_words": {"video": VIDEO_KEY},
}

openai_client.send_omni_request(request_config)
openai_client.send_omni_request(request_config, request_num=get_max_batch_size())


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_video_to_audio_001(omni_server, openai_client) -> None:
@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None:
"""
Input Modal: video
Output Modal: audio
Input Modal: text, audio
Output Modal: text, audio
Input Setting: stream=False
Datasets: single request
"""
video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}"
messages = dummy_messages_from_mix_data(video_data_url=video_data_url)
audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}"
messages = dummy_messages_from_mix_data(
audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio")
)

request_config = {
"model": omni_server.model,
"messages": messages,
"modalities": ["audio"],
"key_words": {"video": VIDEO_KEY},
"key_words": {"audio": AUDIO_KEY},
}

openai_client.send_omni_request(request_config)
Expand All @@ -287,22 +228,25 @@ def test_video_to_audio_001(omni_server, openai_client) -> None:
@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params, indirect=True)
def test_video_to_text_audio_001(omni_server, openai_client) -> None:
@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
def test_text_audio_to_text_audio_002(omni_server, openai_client) -> None:
"""
Input Modal: video
Input Modal: text, long-duration audio (~LONG_AUDIO_DURATION_SEC s WAV)
Output Modal: text, audio
Input Setting: stream=False
Datasets: few requests
Datasets: few requests
"""
video_data_url = f"data:video/mp4;base64,{generate_synthetic_video(224, 224, 300)['base64']}"

messages = dummy_messages_from_mix_data(video_data_url=video_data_url)
audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(LONG_AUDIO_DURATION_SEC, 1)['base64']}"
messages = dummy_messages_from_mix_data(
audio_data_url=audio_data_url,
system_prompt=get_system_prompt(),
content_text=get_prompt("text_audio"),
)

request_config = {
"model": omni_server.model,
"messages": messages,
"key_words": {"video": VIDEO_KEY},
"key_words": {"audio": AUDIO_KEY},
}

openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
Expand All @@ -312,22 +256,23 @@ def test_video_to_text_audio_001(omni_server, openai_client) -> None:
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None:
def test_text_image_to_text_audio_001(omni_server, openai_client) -> None:
"""
Input Modal: text, audio
Input Modal: text, image
Output Modal: text, audio
Input Setting: stream=False
Datasets: single request
"""
audio_data_url = f"data:audio/wav;base64,{generate_synthetic_audio(5, 1)['base64']}"
image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"

messages = dummy_messages_from_mix_data(
audio_data_url=audio_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_audio")
image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image")
)

request_config = {
"model": omni_server.model,
"messages": messages,
"key_words": {"audio": AUDIO_KEY},
"key_words": {"image": IMAGE_KEY},
}

openai_client.send_omni_request(request_config)
Expand All @@ -337,17 +282,21 @@ def test_text_audio_to_text_audio_001(omni_server, openai_client) -> None:
@pytest.mark.omni
@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
@pytest.mark.parametrize("omni_server", test_params + test_token_params, indirect=True)
def test_text_image_to_text_audio_001(omni_server, openai_client) -> None:
def test_large_image_to_text_audio_001(omni_server, openai_client) -> None:
"""
Input Modal: text, image
Input Modal: text, high-resolution image (1080p-class JPEG)
Output Modal: text, audio
Input Setting: stream=False
Datasets: few requests
"""
image_data_url = f"data:image/jpeg;base64,{generate_synthetic_image(224, 224)['base64']}"
image_data_url = (
f"data:image/jpeg;base64,{generate_synthetic_image(LARGE_IMAGE_WIDTH, LARGE_IMAGE_HEIGHT)['base64']}"
)

messages = dummy_messages_from_mix_data(
image_data_url=image_data_url, system_prompt=get_system_prompt(), content_text=get_prompt("text_image")
image_data_url=image_data_url,
system_prompt=get_system_prompt(),
content_text=get_prompt("text_image"),
)

request_config = {
Expand All @@ -356,7 +305,7 @@ def test_text_image_to_text_audio_001(omni_server, openai_client) -> None:
"key_words": {"image": IMAGE_KEY},
}

openai_client.send_omni_request(request_config)
openai_client.send_omni_request(request_config, request_num=get_max_batch_size())


@pytest.mark.advanced_model
Expand Down
Loading