From ac2e967e6f1e9e78d0489e37038facac9152e10d Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Thu, 8 Jan 2026 21:22:34 +0800 Subject: [PATCH 01/11] add full test Signed-off-by: wangyu31577 --- tests/conftest.py | 298 ++++++++++++++++++ .../online_serving/test_qwen3_omni_full.py | 145 +++++++++ tests/e2e/stage_configs/qwen3_omni_ci.yaml | 95 ++++++ 3 files changed, 538 insertions(+) create mode 100644 tests/e2e/online_serving/test_qwen3_omni_full.py create mode 100644 tests/e2e/stage_configs/qwen3_omni_ci.yaml diff --git a/tests/conftest.py b/tests/conftest.py index 82c959f07ca..e2dad2b69c7 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,8 +1,19 @@ +import base64 import os +import socket +import subprocess +from pathlib import Path +from typing import Any +import psutil import pytest +import speech_recognition as sr +import sys +import time import torch +import yaml from vllm.logger import init_logger +from vllm.utils import get_open_port logger = init_logger(__name__) @@ -34,3 +45,290 @@ def clean_gpu_memory_between_tests(): if torch.cuda.is_available(): torch.cuda.empty_cache() gc.collect() + + +def dummy_messages_from_mix_data( + system_prompt: dict[str, Any] = None, + video_data_url: Any = None, + audio_data_url: Any = None, + image_data_url: Any = None, + content_text: str = None, +): + """Create messages with video、image、audio data URL for OpenAI API.""" + + if content_text is not None: + content = [{"type": "text", "text": content_text}] + else: + content = [] + + media_items = [] + if isinstance(video_data_url, list): + for video_url in video_data_url: + media_items.append((video_url, "video")) + else: + media_items.append((video_data_url, "video")) + + if isinstance(image_data_url, list): + for url in image_data_url: + media_items.append((url, "image")) + else: + media_items.append((image_data_url, "image")) + + if isinstance(audio_data_url, list): + for url in audio_data_url: + media_items.append((url, "audio")) + else: + media_items.append((audio_data_url, "audio")) + + content.extend( + {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}} + for url, media_type in media_items + if url is not None + ) + messages = [{"role": "user", "content": content}] + if system_prompt is not None: + messages = [system_prompt] + messages + return messages + + +def cosine_similarity_text(s1, s2): + """ + Calculate cosine similarity between two text strings. + Notes: + ------ + - Higher score means more similar texts + - Score of 1.0 means identical word composition (bag-of-words) + - Score of 0.0 means completely different vocabulary + """ + from sklearn.feature_extraction.text import CountVectorizer + from sklearn.metrics.pairwise import cosine_similarity + vectorizer = CountVectorizer().fit_transform([s1, s2]) + vectors = vectorizer.toarray() + return cosine_similarity([vectors[0]], [vectors[1]])[0][0] + + +def convert_audio_to_text(audio_data): + """ + Convert base64 encoded audio data to text using speech recognition. + """ + + audio_data = base64.b64decode(audio_data) + output_path = f"./test_{int(time.time())}" + with open(output_path, 'wb') as audio_file: + audio_file.write(audio_data) + + print(f"audio data is saved: {output_path}") + recognizer = sr.Recognizer() + with sr.AudioFile(output_path) as source: + audio_data = recognizer.record(source) + + print("Start voice recognition...") + + text = recognizer.recognize_sphinx(audio_data) + if text: + return text + else: + return "" + + +def modify_stage_config( + yaml_path: str, + stage_updates: dict[int, dict[str, Any]], +) -> str: + """ + Batch modify configurations for multiple stages in a YAML file. + + Args: + yaml_path: Path to the YAML configuration file. + stage_updates: Dictionary where keys are stage IDs and values are dictionaries of + modifications for that stage. Each modification dictionary uses + dot-separated paths as keys and new configuration values as values. + Example: { + 0: {'engine_args.max_model_len': 5800}, + 1: {'runtime.max_batch_size': 2} + } + + Returns: + str: Path to the newly created modified YAML file with timestamp suffix. + + Example: + >>> output_file = modify_stage_config( + ... 'config.yaml', + ... { + ... 0: {'engine_args.max_model_len': 5800}, + ... 1: {'runtime.max_batch_size': 2} + ... } + ... ) + >>> print(f"Modified configuration saved to: {output_file}") + Modified configuration saved to: config_1698765432.yaml + """ + path = Path(yaml_path) + if not path.exists(): + raise FileNotFoundError(f"yaml does not exist: {path}") + try: + with open(yaml_path, encoding="utf-8") as f: + config = yaml.safe_load(f) or {} + except Exception as e: + raise ValueError(f"Cannot parse YAML file: {e}") + + stage_args = config.get("stage_args", []) + if not stage_args: + raise ValueError("the stage_args does not exist") + + for stage_id, config_dict in stage_updates.items(): + target_stage = None + for stage in stage_args: + if stage.get("stage_id") == stage_id: + target_stage = stage + break + + if target_stage is None: + available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s] + raise KeyError(f"Stage ID {stage_id} is not exist, available IDs: {available_ids}") + + for key_path, value in config_dict.items(): + current = target_stage + keys = key_path.split(".") + for i in range(len(keys) - 1): + key = keys[i] + if key not in current: + raise KeyError(f"the {'.'.join(keys[: i + 1])} does not exist") + + elif not isinstance(current[key], dict) and i < len(keys) - 2: + raise ValueError(f"{'.'.join(keys[: i + 1])}' cannot continue deeper because it's not a dict") + current = current[key] + current[keys[-1]] = value + + output_path = f"{yaml_path.split('.')[0]}_{int(time.time())}.yaml" + with open(output_path, "w", encoding="utf-8") as f: + yaml.dump(config, f, default_flow_style=False, sort_keys=False, allow_unicode=True, indent=2) + + return output_path + + +class OmniServer: + """Omniserver for vLLM-Omni tests.""" + + def __init__( + self, + model: str, + serve_args: list[str], + *, + env_dict: dict[str, str] | None = None, + ) -> None: + self.model = model + self.serve_args = serve_args + self.env_dict = env_dict + self.proc: subprocess.Popen | None = None + self.host = "127.0.0.1" + self.port = get_open_port() + + def _start_server(self) -> None: + """Start the vLLM-Omni server subprocess.""" + env = os.environ.copy() + env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + if self.env_dict is not None: + env.update(self.env_dict) + + cmd = [ + sys.executable, + "-m", + "vllm_omni.entrypoints.cli.main", + "serve", + self.model, + "--omni", + "--host", + self.host, + "--port", + str(self.port), + ] + self.serve_args + + print(f"Launching OmniServer with: {' '.join(cmd)}") + self.proc = subprocess.Popen( + cmd, + env=env, + cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))), # Set working directory to vllm-omni root + ) + + # Wait for server to be ready + max_wait = 600 # 10 minutes + start_time = time.time() + while time.time() - start_time < max_wait: + try: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.settimeout(1) + result = sock.connect_ex((self.host, self.port)) + if result == 0: + print(f"Server ready on {self.host}:{self.port}") + return + except Exception: + pass + time.sleep(2) + + raise RuntimeError(f"Server failed to start within {max_wait} seconds") + + def _kill_process_tree(self, pid): + """kill process and its children""" + try: + parent = psutil.Process(pid) + children = parent.children(recursive=True) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + + gone, still_alive = psutil.wait_procs(children, timeout=10) + + for child in still_alive: + try: + child.kill() + except psutil.NoSuchProcess: + pass + + try: + parent.terminate() + parent.wait(timeout=10) + except (psutil.NoSuchProcess, psutil.TimeoutExpired): + try: + parent.kill() + except psutil.NoSuchProcess: + pass + + except psutil.NoSuchProcess: + pass + + def __enter__(self): + self._start_server() + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + if self.proc: + try: + parent = psutil.Process(self.proc.pid) + children = parent.children(recursive=True) + for child in children: + try: + child.terminate() + except psutil.NoSuchProcess: + pass + + gone, still_alive = psutil.wait_procs(children, timeout=10) + + for child in still_alive: + try: + child.kill() + except psutil.NoSuchProcess: + pass + + try: + parent.terminate() + parent.wait(timeout=10) + except (psutil.NoSuchProcess, psutil.TimeoutExpired): + try: + parent.kill() + except psutil.NoSuchProcess: + pass + + except psutil.NoSuchProcess: + pass diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py new file mode 100644 index 00000000000..423d00f110e --- /dev/null +++ b/tests/e2e/online_serving/test_qwen3_omni_full.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for Qwen3-Omni model. +""" + +import concurrent.futures +import os +from pathlib import Path + +import openai +import pytest +import time + +from tests.conftest import (OmniServer, dummy_messages_from_mix_data, modify_stage_config, convert_audio_to_text, + cosine_similarity_text) + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + +models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"] + +# CI stage config for 2*H100-80G GPUs +stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml")] + +# Create parameter combinations for model and stage config +test_params = [(model, stage_config) for model in models for stage_config in stage_configs] + + +def client(omni_server): + """OpenAI client for the running vLLM-Omni server.""" + return openai.OpenAI( + base_url=f"http://{omni_server.host}:{omni_server.port}/v1", + api_key="EMPTY", + ) + + +def get_system_prompt(): + return { + "role": "system", + "content": [ + { + "type": "text", + "text": ( + "You are Qwen, a virtual human developed by the Qwen Team, " + "Alibaba Group, capable of perceiving auditory and visual inputs, " + "as well as generating text and speech." + ), + } + ], + } + + +@pytest.mark.parametrize("test_config", test_params) +def test_text_to_text_001(test_config: tuple[str, str]) -> None: + """Test processing text, generating text output via OpenAI API.""" + model, stage_config_path = test_config + with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text="What is the capital of China?" + ) + + # Test single completion + api_client = client(server) + start_time = time.perf_counter() + chat_completion = api_client.chat.completions.create( + model=server.model, messages=messages, max_tokens=10, stop=None, modalities=["text"] + ) + # Verify E2E + print(f"the request e2e is: {time.perf_counter() - start_time}") + # TODO: Verify the E2E latency after confirmation baseline. + + # Verify only output text + assert len(chat_completion.choices) == 1, "The generated content includes more than just text." + + # Verify text output success + text_choice = chat_completion.choices[0] + assert text_choice.message.content is not None, "No text output is generated" + assert chat_completion.usage.completion_tokens == 10, "The output length differs from the requested max_tokens." + assert "Beijing" in text_choice.message.content, "The output do not contain keywords." + + +@pytest.mark.parametrize("test_config", test_params) +def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: + """Test processing text, generating audio output via OpenAI API.""" + + model, stage_config_path = test_config + num_concurrent_requests = 5 + stage_config_path = modify_stage_config(stage_config_path, { + 0: {"runtime.max_batch_size": num_concurrent_requests, "default_sampling_params.ignore_eos": True}, + 1: {"runtime.max_batch_size": num_concurrent_requests}}) + with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: + messages = dummy_messages_from_mix_data( + system_prompt=get_system_prompt(), + content_text="What is recited in the audio? What is in this image? Describe the video briefly." + ) + + # Test single completion + api_client = client(server) + e2e_list = list() + with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor: + # Submit multiple completion requests concurrently + futures = [ + executor.submit( + api_client.chat.completions.create, + model=server.model, + messages=messages, + max_tokens=1000, + stop=None, + ) + for _ in range(num_concurrent_requests) + ] + start_time = time.perf_counter() + # Wait for all requests to complete and collect results + chat_completions = list() + for future in concurrent.futures.as_completed(futures): + chat_completions.append(future.result()) + # Verify E2E + current_e2e = time.perf_counter() - start_time + print(f"the request e2e is: {current_e2e}") + # TODO: Verify the E2E latency after confirmation baseline. + e2e_list.append(current_e2e) + + print(f"the avg e2e is: {sum(e2e_list)/len(e2e_list)}") + # Verify all completions succeeded + assert len(chat_completions) == num_concurrent_requests, "Not all requests succeeded." + for chat_completion in chat_completions: + # Verify audio output success + audio_message = chat_completion.choices[1].message + audio_data = audio_message.audio.data + assert audio_data is not None, "No audio output is generated" + assert audio_message.audio.expires_at > time.time(), "The generated audio has expired." + + # Verify text output success + text_choice = chat_completion.choices[0] + text_content = text_choice.message.content + assert text_choice.message.content is not None, "No text output is generated" + assert chat_completion.usage.completion_tokens == 1000, "The output length differs from the requested max_tokens." + + # Verify text output same as audio output + audio_content = convert_audio_to_text(audio_data) + print(f"text content is: {text_content}") + print(f"audio content is: {audio_content}") + assert cosine_similarity_text(audio_content, + text_content) > 0.8, "The audio content is not same as the text" diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml new file mode 100644 index 00000000000..d52b51bb5eb --- /dev/null +++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml @@ -0,0 +1,95 @@ +# Stage config for running Qwen3-Omni-MoE with 3-stage architecture +# Stage 0: Thinker (multimodal understanding + text generation) +# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes) +# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform) + +# The following config has been verified on 2x H100-80G GPUs. +stage_args: +- stage_id: 0 + runtime: + devices: "0,1" + max_batch_size: 1 + engine_args: + model_stage: thinker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.6 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output hidden states for talker + distributed_executor_backend: "mp" + max_num_batched_tokens: 32768 + enable_prefix_caching: false + hf_config_name: thinker_config + tensor_parallel_size: 2 + final_output: true + final_output_type: text + is_comprehension: true + default_sampling_params: + temperature: 0.4 + top_p: 0.9 + top_k: 1 + max_tokens: 100 + seed: 42 + ignore_eos: False + detokenize: True + repetition_penalty: 1.05 + +- stage_id: 1 + runtime: + devices: "1" + max_batch_size: 1 + engine_args: + model_stage: talker + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker + scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler + gpu_memory_utilization: 0.3 + enforce_eager: true + trust_remote_code: true + engine_output_type: latent # Output codec codes for code2wav + enable_prefix_caching: false + max_num_batched_tokens: 32768 + distributed_executor_backend: "mp" + hf_config_name: talker_config + engine_input_source: [0] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker + default_sampling_params: + temperature: 0.9 + top_k: 50 + max_tokens: 100 + seed: 42 + detokenize: False + repetition_penalty: 1.05 + stop_token_ids: [2150] + +- stage_id: 2 + runtime: + devices: "0" + max_batch_size: 1 + engine_args: + model_stage: code2wav + model_arch: Qwen3OmniMoeForConditionalGeneration + worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker + scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler + enforce_eager: true + trust_remote_code: true + enable_prefix_caching: false + engine_output_type: audio # Final output: audio waveform + gpu_memory_utilization: 0.1 + distributed_executor_backend: "mp" + max_num_batched_tokens: 1000000 + hf_config_name: thinker_config + engine_input_source: [1] + custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav + final_output: true + final_output_type: audio + default_sampling_params: + temperature: 0.0 + top_p: 1.0 + top_k: -1 + max_tokens: 200 + seed: 42 + detokenize: True + repetition_penalty: 1.1 \ No newline at end of file From 2a513dd2eb4296166b16ec60b27a2cf36d10bf7f Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 14:30:36 +0800 Subject: [PATCH 02/11] add label Signed-off-by: wangyu31577 --- tests/e2e/online_serving/test_qwen3_omni_full.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py index 423d00f110e..de88eccd30a 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_full.py +++ b/tests/e2e/online_serving/test_qwen3_omni_full.py @@ -50,6 +50,10 @@ def get_system_prompt(): } +@pytest.mark.omni +@pytest.mark.gpu +@pytest.mark.H100 +@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("test_config", test_params) def test_text_to_text_001(test_config: tuple[str, str]) -> None: """Test processing text, generating text output via OpenAI API.""" From 356786a4a0b83f8e2bc9a27a3473b802d84bde01 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 15:08:57 +0800 Subject: [PATCH 03/11] add full test Signed-off-by: wangyu31577 --- tests/e2e/online_serving/test_qwen3_omni_full.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py index de88eccd30a..58533a1ceb1 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_full.py +++ b/tests/e2e/online_serving/test_qwen3_omni_full.py @@ -49,11 +49,6 @@ def get_system_prompt(): ], } - -@pytest.mark.omni -@pytest.mark.gpu -@pytest.mark.H100 -@multi_gpu_test(num_gpus=2) @pytest.mark.parametrize("test_config", test_params) def test_text_to_text_001(test_config: tuple[str, str]) -> None: """Test processing text, generating text output via OpenAI API.""" From a89948a0438e82044259d2f25276732c3a85d076 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 15:26:11 +0800 Subject: [PATCH 04/11] add full test Signed-off-by: wangyu31577 --- ..._omni_full.py => test_qwen3_omni_expansion.py} | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) rename tests/e2e/online_serving/{test_qwen3_omni_full.py => test_qwen3_omni_expansion.py} (87%) diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py similarity index 87% rename from tests/e2e/online_serving/test_qwen3_omni_full.py rename to tests/e2e/online_serving/test_qwen3_omni_expansion.py index 58533a1ceb1..d4f978641a4 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_full.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: api_client = client(server) start_time = time.perf_counter() chat_completion = api_client.chat.completions.create( - model=server.model, messages=messages, max_tokens=10, stop=None, modalities=["text"] + model=server.model, messages=messages, modalities=["text"] ) # Verify E2E print(f"the request e2e is: {time.perf_counter() - start_time}") @@ -75,8 +75,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: # Verify text output success text_choice = chat_completion.choices[0] assert text_choice.message.content is not None, "No text output is generated" - assert chat_completion.usage.completion_tokens == 10, "The output length differs from the requested max_tokens." - assert "Beijing" in text_choice.message.content, "The output do not contain keywords." + assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords." @pytest.mark.parametrize("test_config", test_params) @@ -86,12 +85,12 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: model, stage_config_path = test_config num_concurrent_requests = 5 stage_config_path = modify_stage_config(stage_config_path, { - 0: {"runtime.max_batch_size": num_concurrent_requests, "default_sampling_params.ignore_eos": True}, + 0: {"runtime.max_batch_size": num_concurrent_requests}, 1: {"runtime.max_batch_size": num_concurrent_requests}}) with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: messages = dummy_messages_from_mix_data( system_prompt=get_system_prompt(), - content_text="What is recited in the audio? What is in this image? Describe the video briefly." + content_text="What is the capital of China?" ) # Test single completion @@ -103,9 +102,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: executor.submit( api_client.chat.completions.create, model=server.model, - messages=messages, - max_tokens=1000, - stop=None, + messages=messages ) for _ in range(num_concurrent_requests) ] @@ -134,7 +131,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: text_choice = chat_completion.choices[0] text_content = text_choice.message.content assert text_choice.message.content is not None, "No text output is generated" - assert chat_completion.usage.completion_tokens == 1000, "The output length differs from the requested max_tokens." + assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords." # Verify text output same as audio output audio_content = convert_audio_to_text(audio_data) From 1c106d531a327580715d912f96d41607ab289db0 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 15:29:18 +0800 Subject: [PATCH 05/11] add full test Signed-off-by: wangyu31577 --- tests/e2e/online_serving/test_qwen3_omni_expansion.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index d4f978641a4..605b855482b 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: api_client = client(server) start_time = time.perf_counter() chat_completion = api_client.chat.completions.create( - model=server.model, messages=messages, modalities=["text"] + model=server.model, messages=messages, max_token=20, modalities=["text"] ) # Verify E2E print(f"the request e2e is: {time.perf_counter() - start_time}") @@ -75,6 +75,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: # Verify text output success text_choice = chat_completion.choices[0] assert text_choice.message.content is not None, "No text output is generated" + assert chat_completion.usage.completion_tokens <= 20, "The output length more than the requested max_tokens." assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords." From 290e5c751e2e444abaf52ccf6cee0baf28e46988 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 17:07:48 +0800 Subject: [PATCH 06/11] add full test Signed-off-by: wangyu31577 --- tests/conftest.py | 19 +++++++------------ .../test_qwen3_omni_expansion.py | 2 +- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index e2dad2b69c7..5a3947de796 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,10 +7,10 @@ import psutil import pytest -import speech_recognition as sr import sys import time import torch +import whisper import yaml from vllm.logger import init_logger from vllm.utils import get_open_port @@ -118,17 +118,12 @@ def convert_audio_to_text(audio_data): audio_file.write(audio_data) print(f"audio data is saved: {output_path}") - recognizer = sr.Recognizer() - with sr.AudioFile(output_path) as source: - audio_data = recognizer.record(source) - - print("Start voice recognition...") - - text = recognizer.recognize_sphinx(audio_data) - if text: - return text - else: - return "" + model = whisper.load_model("base") + text = model.transcribe(output_path)["text"] + if text: + return text + else: + return "" def modify_stage_config( diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 605b855482b..4d131113bd2 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: api_client = client(server) start_time = time.perf_counter() chat_completion = api_client.chat.completions.create( - model=server.model, messages=messages, max_token=20, modalities=["text"] + model=server.model, messages=messages, max_tokens=20, modalities=["text"] ) # Verify E2E print(f"the request e2e is: {time.perf_counter() - start_time}") From 3023809d853334b5ad1627e8379302f8a7631392 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 9 Jan 2026 17:46:54 +0800 Subject: [PATCH 07/11] pre-commit Signed-off-by: wangyu31577 --- tests/conftest.py | 53 ++++++++++--------- .../test_qwen3_omni_expansion.py | 43 ++++++++------- tests/e2e/stage_configs/qwen3_omni_ci.yaml | 2 +- 3 files changed, 52 insertions(+), 46 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 5a3947de796..5b21f671bdb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -2,13 +2,13 @@ import os import socket import subprocess +import sys +import time from pathlib import Path from typing import Any import psutil import pytest -import sys -import time import torch import whisper import yaml @@ -48,11 +48,11 @@ def clean_gpu_memory_between_tests(): def dummy_messages_from_mix_data( - system_prompt: dict[str, Any] = None, - video_data_url: Any = None, - audio_data_url: Any = None, - image_data_url: Any = None, - content_text: str = None, + system_prompt: dict[str, Any] = None, + video_data_url: Any = None, + audio_data_url: Any = None, + image_data_url: Any = None, + content_text: str = None, ): """Create messages with video、image、audio data URL for OpenAI API.""" @@ -102,6 +102,7 @@ def cosine_similarity_text(s1, s2): """ from sklearn.feature_extraction.text import CountVectorizer from sklearn.metrics.pairwise import cosine_similarity + vectorizer = CountVectorizer().fit_transform([s1, s2]) vectors = vectorizer.toarray() return cosine_similarity([vectors[0]], [vectors[1]])[0][0] @@ -114,7 +115,7 @@ def convert_audio_to_text(audio_data): audio_data = base64.b64decode(audio_data) output_path = f"./test_{int(time.time())}" - with open(output_path, 'wb') as audio_file: + with open(output_path, "wb") as audio_file: audio_file.write(audio_data) print(f"audio data is saved: {output_path}") @@ -127,8 +128,8 @@ def convert_audio_to_text(audio_data): def modify_stage_config( - yaml_path: str, - stage_updates: dict[int, dict[str, Any]], + yaml_path: str, + stage_updates: dict[int, dict[str, Any]], ) -> str: """ Batch modify configurations for multiple stages in a YAML file. @@ -205,11 +206,11 @@ class OmniServer: """Omniserver for vLLM-Omni tests.""" def __init__( - self, - model: str, - serve_args: list[str], - *, - env_dict: dict[str, str] | None = None, + self, + model: str, + serve_args: list[str], + *, + env_dict: dict[str, str] | None = None, ) -> None: self.model = model self.serve_args = serve_args @@ -226,17 +227,17 @@ def _start_server(self) -> None: env.update(self.env_dict) cmd = [ - sys.executable, - "-m", - "vllm_omni.entrypoints.cli.main", - "serve", - self.model, - "--omni", - "--host", - self.host, - "--port", - str(self.port), - ] + self.serve_args + sys.executable, + "-m", + "vllm_omni.entrypoints.cli.main", + "serve", + self.model, + "--omni", + "--host", + self.host, + "--port", + str(self.port), + ] + self.serve_args print(f"Launching OmniServer with: {' '.join(cmd)}") self.proc = subprocess.Popen( diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index 4d131113bd2..cb9b4a2a3e1 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -6,14 +6,19 @@ import concurrent.futures import os +import time from pathlib import Path import openai import pytest -import time -from tests.conftest import (OmniServer, dummy_messages_from_mix_data, modify_stage_config, convert_audio_to_text, - cosine_similarity_text) +from tests.conftest import ( + OmniServer, + convert_audio_to_text, + cosine_similarity_text, + dummy_messages_from_mix_data, + modify_stage_config, +) os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" @@ -49,14 +54,14 @@ def get_system_prompt(): ], } + @pytest.mark.parametrize("test_config", test_params) def test_text_to_text_001(test_config: tuple[str, str]) -> None: """Test processing text, generating text output via OpenAI API.""" model, stage_config_path = test_config with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: messages = dummy_messages_from_mix_data( - system_prompt=get_system_prompt(), - content_text="What is the capital of China?" + system_prompt=get_system_prompt(), content_text="What is the capital of China?" ) # Test single completion @@ -81,17 +86,20 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None: @pytest.mark.parametrize("test_config", test_params) def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: - """Test processing text, generating audio output via OpenAI API.""" + """Test processing text, generating text and audio output via OpenAI API.""" model, stage_config_path = test_config num_concurrent_requests = 5 - stage_config_path = modify_stage_config(stage_config_path, { - 0: {"runtime.max_batch_size": num_concurrent_requests}, - 1: {"runtime.max_batch_size": num_concurrent_requests}}) + stage_config_path = modify_stage_config( + stage_config_path, + { + 0: {"runtime.max_batch_size": num_concurrent_requests}, + 1: {"runtime.max_batch_size": num_concurrent_requests}, + }, + ) with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: messages = dummy_messages_from_mix_data( - system_prompt=get_system_prompt(), - content_text="What is the capital of China?" + system_prompt=get_system_prompt(), content_text="What is the capital of China?" ) # Test single completion @@ -100,11 +108,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor: # Submit multiple completion requests concurrently futures = [ - executor.submit( - api_client.chat.completions.create, - model=server.model, - messages=messages - ) + executor.submit(api_client.chat.completions.create, model=server.model, messages=messages) for _ in range(num_concurrent_requests) ] start_time = time.perf_counter() @@ -118,7 +122,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: # TODO: Verify the E2E latency after confirmation baseline. e2e_list.append(current_e2e) - print(f"the avg e2e is: {sum(e2e_list)/len(e2e_list)}") + print(f"the avg e2e is: {sum(e2e_list) / len(e2e_list)}") # Verify all completions succeeded assert len(chat_completions) == num_concurrent_requests, "Not all requests succeeded." for chat_completion in chat_completions: @@ -138,5 +142,6 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: audio_content = convert_audio_to_text(audio_data) print(f"text content is: {text_content}") print(f"audio content is: {audio_content}") - assert cosine_similarity_text(audio_content, - text_content) > 0.8, "The audio content is not same as the text" + assert cosine_similarity_text(audio_content.lower(), text_content.lower()) > 0.9, ( + "The audio content is not same as the text" + ) diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml index d52b51bb5eb..5106b185419 100644 --- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml +++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml @@ -92,4 +92,4 @@ stage_args: max_tokens: 200 seed: 42 detokenize: True - repetition_penalty: 1.1 \ No newline at end of file + repetition_penalty: 1.1 From b37dfcad0a27d7d70f7061bc6b7748e2e3b52987 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 16 Jan 2026 16:36:12 +0800 Subject: [PATCH 08/11] Add prompt and batch size getter functions Signed-off-by: wangyu31577 --- .../test_qwen3_omni_expansion.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py index cb9b4a2a3e1..6a47e96f866 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py +++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py @@ -55,14 +55,25 @@ def get_system_prompt(): } +def get_prompt(prompt_type="text_only"): + prompts = { + "text_only": "What is the capital of China?", + "mix": "What is recited in the audio? What is in this image? Describe the video briefly.", + } + return prompts.get(prompt_type, prompts["text_only"]) + + +def get_max_batch_size(size_type="few"): + batch_sizes = {"few": 5, "medium": 100, "large": 256} + return batch_sizes.get(size_type, 5) + + @pytest.mark.parametrize("test_config", test_params) def test_text_to_text_001(test_config: tuple[str, str]) -> None: """Test processing text, generating text output via OpenAI API.""" model, stage_config_path = test_config with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server: - messages = dummy_messages_from_mix_data( - system_prompt=get_system_prompt(), content_text="What is the capital of China?" - ) + messages = dummy_messages_from_mix_data(system_prompt=get_system_prompt(), content_text=get_prompt()) # Test single completion api_client = client(server) @@ -89,7 +100,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None: """Test processing text, generating text and audio output via OpenAI API.""" model, stage_config_path = test_config - num_concurrent_requests = 5 + num_concurrent_requests = get_max_batch_size() stage_config_path = modify_stage_config( stage_config_path, { From f6e46e6375e7bc64cf5d3b8fbded99e5137127c7 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 16 Jan 2026 18:30:21 +0800 Subject: [PATCH 09/11] add dependencies Signed-off-by: wangyu31577 --- docker/Dockerfile.ci | 6 ++++++ pyproject.toml | 1 + 2 files changed, 7 insertions(+) diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci index 5e1d00a5f88..aecc429454c 100644 --- a/docker/Dockerfile.ci +++ b/docker/Dockerfile.ci @@ -6,6 +6,12 @@ WORKDIR ${APP_DIR} COPY . . +# Install system dependencies +RUN apt-get update && \ + apt-get install -y ffmpeg && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + # Install vllm-omni into the same uv-managed Python environment used by the base image. RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]" diff --git a/pyproject.toml b/pyproject.toml index 4833b117487..b78ce0ccdfa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,6 +50,7 @@ dev = [ "pytest-cov>=4.0.0", "mypy==1.11.1", "pre-commit==4.0.1", + "openai-whisper" ] docs = [ From 35b9c1c69d11cf6b26c49598cb102a8423187bd2 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 16 Jan 2026 18:33:29 +0800 Subject: [PATCH 10/11] add dependencies Signed-off-by: wangyu31577 --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b78ce0ccdfa..c50e9820cd2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,7 +50,8 @@ dev = [ "pytest-cov>=4.0.0", "mypy==1.11.1", "pre-commit==4.0.1", - "openai-whisper" + "openai-whisper", + "psutil" ] docs = [ From eb41de2c5d0b6e682bd4d91fc2af4ff5c8e34883 Mon Sep 17 00:00:00 2001 From: wangyu31577 Date: Fri, 16 Jan 2026 19:03:21 +0800 Subject: [PATCH 11/11] add dependencies version Signed-off-by: wangyu31577 --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c50e9820cd2..2e2cddc5b7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -50,8 +50,8 @@ dev = [ "pytest-cov>=4.0.0", "mypy==1.11.1", "pre-commit==4.0.1", - "openai-whisper", - "psutil" + "openai-whisper>=20250625", + "psutil>=7.2.0" ] docs = [