From ac2e967e6f1e9e78d0489e37038facac9152e10d Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Thu, 8 Jan 2026 21:22:34 +0800
Subject: [PATCH 01/11] add full test

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/conftest.py                             | 298 ++++++++++++++++++
 .../online_serving/test_qwen3_omni_full.py    | 145 +++++++++
 tests/e2e/stage_configs/qwen3_omni_ci.yaml    |  95 ++++++
 3 files changed, 538 insertions(+)
 create mode 100644 tests/e2e/online_serving/test_qwen3_omni_full.py
 create mode 100644 tests/e2e/stage_configs/qwen3_omni_ci.yaml

diff --git a/tests/conftest.py b/tests/conftest.py
index 82c959f07ca..e2dad2b69c7 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,8 +1,19 @@
+import base64
 import os
+import socket
+import subprocess
+from pathlib import Path
+from typing import Any
 
+import psutil
 import pytest
+import speech_recognition as sr
+import sys
+import time
 import torch
+import yaml
 from vllm.logger import init_logger
+from vllm.utils import get_open_port
 
 logger = init_logger(__name__)
 
@@ -34,3 +45,290 @@ def clean_gpu_memory_between_tests():
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         gc.collect()
+
+
+def dummy_messages_from_mix_data(
+        system_prompt: dict[str, Any] = None,
+        video_data_url: Any = None,
+        audio_data_url: Any = None,
+        image_data_url: Any = None,
+        content_text: str = None,
+):
+    """Create messages with video、image、audio data URL for OpenAI API."""
+
+    if content_text is not None:
+        content = [{"type": "text", "text": content_text}]
+    else:
+        content = []
+
+    media_items = []
+    if isinstance(video_data_url, list):
+        for video_url in video_data_url:
+            media_items.append((video_url, "video"))
+    else:
+        media_items.append((video_data_url, "video"))
+
+    if isinstance(image_data_url, list):
+        for url in image_data_url:
+            media_items.append((url, "image"))
+    else:
+        media_items.append((image_data_url, "image"))
+
+    if isinstance(audio_data_url, list):
+        for url in audio_data_url:
+            media_items.append((url, "audio"))
+    else:
+        media_items.append((audio_data_url, "audio"))
+
+    content.extend(
+        {"type": f"{media_type}_url", f"{media_type}_url": {"url": url}}
+        for url, media_type in media_items
+        if url is not None
+    )
+    messages = [{"role": "user", "content": content}]
+    if system_prompt is not None:
+        messages = [system_prompt] + messages
+    return messages
+
+
+def cosine_similarity_text(s1, s2):
+    """
+        Calculate cosine similarity between two text strings.
+        Notes:
+    ------
+    - Higher score means more similar texts
+    - Score of 1.0 means identical word composition (bag-of-words)
+    - Score of 0.0 means completely different vocabulary
+    """
+    from sklearn.feature_extraction.text import CountVectorizer
+    from sklearn.metrics.pairwise import cosine_similarity
+    vectorizer = CountVectorizer().fit_transform([s1, s2])
+    vectors = vectorizer.toarray()
+    return cosine_similarity([vectors[0]], [vectors[1]])[0][0]
+
+
+def convert_audio_to_text(audio_data):
+    """
+    Convert base64 encoded audio data to text using speech recognition.
+    """
+
+    audio_data = base64.b64decode(audio_data)
+    output_path = f"./test_{int(time.time())}"
+    with open(output_path, 'wb') as audio_file:
+        audio_file.write(audio_data)
+
+    print(f"audio data is saved: {output_path}")
+    recognizer = sr.Recognizer()
+    with sr.AudioFile(output_path) as source:
+        audio_data = recognizer.record(source)
+
+        print("Start voice recognition...")
+
+        text = recognizer.recognize_sphinx(audio_data)
+        if text:
+            return text
+        else:
+            return ""
+
+
+def modify_stage_config(
+        yaml_path: str,
+        stage_updates: dict[int, dict[str, Any]],
+) -> str:
+    """
+    Batch modify configurations for multiple stages in a YAML file.
+
+    Args:
+        yaml_path: Path to the YAML configuration file.
+        stage_updates: Dictionary where keys are stage IDs and values are dictionaries of
+                      modifications for that stage. Each modification dictionary uses
+                      dot-separated paths as keys and new configuration values as values.
+                      Example: {
+                          0: {'engine_args.max_model_len': 5800},
+                          1: {'runtime.max_batch_size': 2}
+                      }
+
+    Returns:
+        str: Path to the newly created modified YAML file with timestamp suffix.
+
+    Example:
+        >>> output_file = modify_stage_config(
+        ...     'config.yaml',
+        ...     {
+        ...         0: {'engine_args.max_model_len': 5800},
+        ...         1: {'runtime.max_batch_size': 2}
+        ...     }
+        ... )
+        >>> print(f"Modified configuration saved to: {output_file}")
+        Modified configuration saved to: config_1698765432.yaml
+    """
+    path = Path(yaml_path)
+    if not path.exists():
+        raise FileNotFoundError(f"yaml does not exist: {path}")
+    try:
+        with open(yaml_path, encoding="utf-8") as f:
+            config = yaml.safe_load(f) or {}
+    except Exception as e:
+        raise ValueError(f"Cannot parse YAML file: {e}")
+
+    stage_args = config.get("stage_args", [])
+    if not stage_args:
+        raise ValueError("the stage_args does not exist")
+
+    for stage_id, config_dict in stage_updates.items():
+        target_stage = None
+        for stage in stage_args:
+            if stage.get("stage_id") == stage_id:
+                target_stage = stage
+                break
+
+        if target_stage is None:
+            available_ids = [s.get("stage_id") for s in stage_args if "stage_id" in s]
+            raise KeyError(f"Stage ID {stage_id} is not exist, available IDs: {available_ids}")
+
+        for key_path, value in config_dict.items():
+            current = target_stage
+            keys = key_path.split(".")
+            for i in range(len(keys) - 1):
+                key = keys[i]
+                if key not in current:
+                    raise KeyError(f"the {'.'.join(keys[: i + 1])} does not exist")
+
+                elif not isinstance(current[key], dict) and i < len(keys) - 2:
+                    raise ValueError(f"{'.'.join(keys[: i + 1])}' cannot continue deeper because it's not a dict")
+                current = current[key]
+            current[keys[-1]] = value
+
+    output_path = f"{yaml_path.split('.')[0]}_{int(time.time())}.yaml"
+    with open(output_path, "w", encoding="utf-8") as f:
+        yaml.dump(config, f, default_flow_style=False, sort_keys=False, allow_unicode=True, indent=2)
+
+    return output_path
+
+
+class OmniServer:
+    """Omniserver for vLLM-Omni tests."""
+
+    def __init__(
+            self,
+            model: str,
+            serve_args: list[str],
+            *,
+            env_dict: dict[str, str] | None = None,
+    ) -> None:
+        self.model = model
+        self.serve_args = serve_args
+        self.env_dict = env_dict
+        self.proc: subprocess.Popen | None = None
+        self.host = "127.0.0.1"
+        self.port = get_open_port()
+
+    def _start_server(self) -> None:
+        """Start the vLLM-Omni server subprocess."""
+        env = os.environ.copy()
+        env["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+        if self.env_dict is not None:
+            env.update(self.env_dict)
+
+        cmd = [
+                  sys.executable,
+                  "-m",
+                  "vllm_omni.entrypoints.cli.main",
+                  "serve",
+                  self.model,
+                  "--omni",
+                  "--host",
+                  self.host,
+                  "--port",
+                  str(self.port),
+              ] + self.serve_args
+
+        print(f"Launching OmniServer with: {' '.join(cmd)}")
+        self.proc = subprocess.Popen(
+            cmd,
+            env=env,
+            cwd=os.path.dirname(os.path.dirname(os.path.abspath(__file__))),  # Set working directory to vllm-omni root
+        )
+
+        # Wait for server to be ready
+        max_wait = 600  # 10 minutes
+        start_time = time.time()
+        while time.time() - start_time < max_wait:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+                    sock.settimeout(1)
+                    result = sock.connect_ex((self.host, self.port))
+                    if result == 0:
+                        print(f"Server ready on {self.host}:{self.port}")
+                        return
+            except Exception:
+                pass
+            time.sleep(2)
+
+        raise RuntimeError(f"Server failed to start within {max_wait} seconds")
+
+    def _kill_process_tree(self, pid):
+        """kill process and its children"""
+        try:
+            parent = psutil.Process(pid)
+            children = parent.children(recursive=True)
+            for child in children:
+                try:
+                    child.terminate()
+                except psutil.NoSuchProcess:
+                    pass
+
+            gone, still_alive = psutil.wait_procs(children, timeout=10)
+
+            for child in still_alive:
+                try:
+                    child.kill()
+                except psutil.NoSuchProcess:
+                    pass
+
+            try:
+                parent.terminate()
+                parent.wait(timeout=10)
+            except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                try:
+                    parent.kill()
+                except psutil.NoSuchProcess:
+                    pass
+
+        except psutil.NoSuchProcess:
+            pass
+
+    def __enter__(self):
+        self._start_server()
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        if self.proc:
+            try:
+                parent = psutil.Process(self.proc.pid)
+                children = parent.children(recursive=True)
+                for child in children:
+                    try:
+                        child.terminate()
+                    except psutil.NoSuchProcess:
+                        pass
+
+                gone, still_alive = psutil.wait_procs(children, timeout=10)
+
+                for child in still_alive:
+                    try:
+                        child.kill()
+                    except psutil.NoSuchProcess:
+                        pass
+
+                try:
+                    parent.terminate()
+                    parent.wait(timeout=10)
+                except (psutil.NoSuchProcess, psutil.TimeoutExpired):
+                    try:
+                        parent.kill()
+                    except psutil.NoSuchProcess:
+                        pass
+
+            except psutil.NoSuchProcess:
+                pass
diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py
new file mode 100644
index 00000000000..423d00f110e
--- /dev/null
+++ b/tests/e2e/online_serving/test_qwen3_omni_full.py
@@ -0,0 +1,145 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+E2E Online tests for Qwen3-Omni model.
+"""
+
+import concurrent.futures
+import os
+from pathlib import Path
+
+import openai
+import pytest
+import time
+
+from tests.conftest import (OmniServer, dummy_messages_from_mix_data, modify_stage_config, convert_audio_to_text,
+                            cosine_similarity_text)
+
+os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+models = ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]
+
+# CI stage config for 2*H100-80G GPUs
+stage_configs = [str(Path(__file__).parent.parent / "stage_configs" / "qwen3_omni_ci.yaml")]
+
+# Create parameter combinations for model and stage config
+test_params = [(model, stage_config) for model in models for stage_config in stage_configs]
+
+
+def client(omni_server):
+    """OpenAI client for the running vLLM-Omni server."""
+    return openai.OpenAI(
+        base_url=f"http://{omni_server.host}:{omni_server.port}/v1",
+        api_key="EMPTY",
+    )
+
+
+def get_system_prompt():
+    return {
+        "role": "system",
+        "content": [
+            {
+                "type": "text",
+                "text": (
+                    "You are Qwen, a virtual human developed by the Qwen Team, "
+                    "Alibaba Group, capable of perceiving auditory and visual inputs, "
+                    "as well as generating text and speech."
+                ),
+            }
+        ],
+    }
+
+
+@pytest.mark.parametrize("test_config", test_params)
+def test_text_to_text_001(test_config: tuple[str, str]) -> None:
+    """Test processing text, generating text output via OpenAI API."""
+    model, stage_config_path = test_config
+    with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
+        messages = dummy_messages_from_mix_data(
+            system_prompt=get_system_prompt(),
+            content_text="What is the capital of China?"
+        )
+
+        # Test single completion
+        api_client = client(server)
+        start_time = time.perf_counter()
+        chat_completion = api_client.chat.completions.create(
+            model=server.model, messages=messages, max_tokens=10, stop=None, modalities=["text"]
+        )
+        # Verify E2E
+        print(f"the request e2e is: {time.perf_counter() - start_time}")
+        # TODO: Verify the E2E latency after confirmation baseline.
+
+        # Verify only output text
+        assert len(chat_completion.choices) == 1, "The generated content includes more than just text."
+
+        # Verify text output success
+        text_choice = chat_completion.choices[0]
+        assert text_choice.message.content is not None, "No text output is generated"
+        assert chat_completion.usage.completion_tokens == 10, "The output length differs from the requested max_tokens."
+        assert "Beijing" in text_choice.message.content, "The output do not contain keywords."
+
+
+@pytest.mark.parametrize("test_config", test_params)
+def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
+    """Test processing text, generating audio output via OpenAI API."""
+
+    model, stage_config_path = test_config
+    num_concurrent_requests = 5
+    stage_config_path = modify_stage_config(stage_config_path, {
+        0: {"runtime.max_batch_size": num_concurrent_requests, "default_sampling_params.ignore_eos": True},
+        1: {"runtime.max_batch_size": num_concurrent_requests}})
+    with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
+        messages = dummy_messages_from_mix_data(
+            system_prompt=get_system_prompt(),
+            content_text="What is recited in the audio? What is in this image? Describe the video briefly."
+        )
+
+        # Test single completion
+        api_client = client(server)
+        e2e_list = list()
+        with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor:
+            # Submit multiple completion requests concurrently
+            futures = [
+                executor.submit(
+                    api_client.chat.completions.create,
+                    model=server.model,
+                    messages=messages,
+                    max_tokens=1000,
+                    stop=None,
+                )
+                for _ in range(num_concurrent_requests)
+            ]
+            start_time = time.perf_counter()
+            # Wait for all requests to complete and collect results
+            chat_completions = list()
+            for future in concurrent.futures.as_completed(futures):
+                chat_completions.append(future.result())
+                # Verify E2E
+                current_e2e = time.perf_counter() - start_time
+                print(f"the request e2e is: {current_e2e}")
+                # TODO: Verify the E2E latency after confirmation baseline.
+                e2e_list.append(current_e2e)
+
+        print(f"the avg e2e is: {sum(e2e_list)/len(e2e_list)}")
+        # Verify all completions succeeded
+        assert len(chat_completions) == num_concurrent_requests, "Not all requests succeeded."
+        for chat_completion in chat_completions:
+            # Verify audio output success
+            audio_message = chat_completion.choices[1].message
+            audio_data = audio_message.audio.data
+            assert audio_data is not None, "No audio output is generated"
+            assert audio_message.audio.expires_at > time.time(), "The generated audio has expired."
+
+            # Verify text output success
+            text_choice = chat_completion.choices[0]
+            text_content = text_choice.message.content
+            assert text_choice.message.content is not None, "No text output is generated"
+            assert chat_completion.usage.completion_tokens == 1000, "The output length differs from the requested max_tokens."
+
+            # Verify text output same as audio output
+            audio_content = convert_audio_to_text(audio_data)
+            print(f"text content is: {text_content}")
+            print(f"audio content is: {audio_content}")
+            assert cosine_similarity_text(audio_content,
+                                          text_content) > 0.8, "The audio content is not same as the text"
diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
new file mode 100644
index 00000000000..d52b51bb5eb
--- /dev/null
+++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
@@ -0,0 +1,95 @@
+# Stage config for running Qwen3-Omni-MoE with 3-stage architecture
+# Stage 0: Thinker (multimodal understanding + text generation)
+# Stage 1: Talker (text embeddings → 16-layer RVQ codec codes)
+# Stage 2: Code2Wav (8-layer RVQ codes → audio waveform)
+
+# The following config has been verified on 2x H100-80G GPUs.
+stage_args:
+- stage_id: 0
+  runtime:
+    devices: "0,1"
+    max_batch_size: 1
+  engine_args:
+    model_stage: thinker
+    model_arch: Qwen3OmniMoeForConditionalGeneration
+    worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+    scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+    gpu_memory_utilization: 0.6
+    enforce_eager: true
+    trust_remote_code: true
+    engine_output_type: latent  # Output hidden states for talker
+    distributed_executor_backend: "mp"
+    max_num_batched_tokens: 32768
+    enable_prefix_caching: false
+    hf_config_name: thinker_config
+    tensor_parallel_size: 2
+  final_output: true
+  final_output_type: text
+  is_comprehension: true
+  default_sampling_params:
+    temperature: 0.4
+    top_p: 0.9
+    top_k: 1
+    max_tokens: 100
+    seed: 42
+    ignore_eos: False
+    detokenize: True
+    repetition_penalty: 1.05
+
+- stage_id: 1
+  runtime:
+    devices: "1"
+    max_batch_size: 1
+  engine_args:
+    model_stage: talker
+    model_arch: Qwen3OmniMoeForConditionalGeneration
+    worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker
+    scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler
+    gpu_memory_utilization: 0.3
+    enforce_eager: true
+    trust_remote_code: true
+    engine_output_type: latent  # Output codec codes for code2wav
+    enable_prefix_caching: false
+    max_num_batched_tokens: 32768
+    distributed_executor_backend: "mp"
+    hf_config_name: talker_config
+  engine_input_source: [0]
+  custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.thinker2talker
+  default_sampling_params:
+    temperature: 0.9
+    top_k: 50
+    max_tokens: 100
+    seed: 42
+    detokenize: False
+    repetition_penalty: 1.05
+    stop_token_ids: [2150]
+
+- stage_id: 2
+  runtime:
+    devices: "0"
+    max_batch_size: 1
+  engine_args:
+    model_stage: code2wav
+    model_arch: Qwen3OmniMoeForConditionalGeneration
+    worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker
+    scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler
+    enforce_eager: true
+    trust_remote_code: true
+    enable_prefix_caching: false
+    engine_output_type: audio  # Final output: audio waveform
+    gpu_memory_utilization: 0.1
+    distributed_executor_backend: "mp"
+    max_num_batched_tokens: 1000000
+    hf_config_name: thinker_config
+  engine_input_source: [1]
+  custom_process_input_func: vllm_omni.model_executor.stage_input_processors.qwen3_omni.talker2code2wav
+  final_output: true
+  final_output_type: audio
+  default_sampling_params:
+    temperature: 0.0
+    top_p: 1.0
+    top_k: -1
+    max_tokens: 200
+    seed: 42
+    detokenize: True
+    repetition_penalty: 1.1
\ No newline at end of file

From 2a513dd2eb4296166b16ec60b27a2cf36d10bf7f Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 14:30:36 +0800
Subject: [PATCH 02/11] add label

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/e2e/online_serving/test_qwen3_omni_full.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py
index 423d00f110e..de88eccd30a 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_full.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_full.py
@@ -50,6 +50,10 @@ def get_system_prompt():
     }
 
 
+@pytest.mark.omni
+@pytest.mark.gpu
+@pytest.mark.H100
+@multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("test_config", test_params)
 def test_text_to_text_001(test_config: tuple[str, str]) -> None:
     """Test processing text, generating text output via OpenAI API."""

From 356786a4a0b83f8e2bc9a27a3473b802d84bde01 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 15:08:57 +0800
Subject: [PATCH 03/11] add full test

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/e2e/online_serving/test_qwen3_omni_full.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_full.py
index de88eccd30a..58533a1ceb1 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_full.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_full.py
@@ -49,11 +49,6 @@ def get_system_prompt():
         ],
     }
 
-
-@pytest.mark.omni
-@pytest.mark.gpu
-@pytest.mark.H100
-@multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("test_config", test_params)
 def test_text_to_text_001(test_config: tuple[str, str]) -> None:
     """Test processing text, generating text output via OpenAI API."""

From a89948a0438e82044259d2f25276732c3a85d076 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 15:26:11 +0800
Subject: [PATCH 04/11] add full test

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 ..._omni_full.py => test_qwen3_omni_expansion.py} | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)
 rename tests/e2e/online_serving/{test_qwen3_omni_full.py => test_qwen3_omni_expansion.py} (87%)

diff --git a/tests/e2e/online_serving/test_qwen3_omni_full.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
similarity index 87%
rename from tests/e2e/online_serving/test_qwen3_omni_full.py
rename to tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 58533a1ceb1..d4f978641a4 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_full.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
         api_client = client(server)
         start_time = time.perf_counter()
         chat_completion = api_client.chat.completions.create(
-            model=server.model, messages=messages, max_tokens=10, stop=None, modalities=["text"]
+            model=server.model, messages=messages, modalities=["text"]
         )
         # Verify E2E
         print(f"the request e2e is: {time.perf_counter() - start_time}")
@@ -75,8 +75,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
         # Verify text output success
         text_choice = chat_completion.choices[0]
         assert text_choice.message.content is not None, "No text output is generated"
-        assert chat_completion.usage.completion_tokens == 10, "The output length differs from the requested max_tokens."
-        assert "Beijing" in text_choice.message.content, "The output do not contain keywords."
+        assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords."
 
 
 @pytest.mark.parametrize("test_config", test_params)
@@ -86,12 +85,12 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
     model, stage_config_path = test_config
     num_concurrent_requests = 5
     stage_config_path = modify_stage_config(stage_config_path, {
-        0: {"runtime.max_batch_size": num_concurrent_requests, "default_sampling_params.ignore_eos": True},
+        0: {"runtime.max_batch_size": num_concurrent_requests},
         1: {"runtime.max_batch_size": num_concurrent_requests}})
     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
         messages = dummy_messages_from_mix_data(
             system_prompt=get_system_prompt(),
-            content_text="What is recited in the audio? What is in this image? Describe the video briefly."
+            content_text="What is the capital of China?"
         )
 
         # Test single completion
@@ -103,9 +102,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
                 executor.submit(
                     api_client.chat.completions.create,
                     model=server.model,
-                    messages=messages,
-                    max_tokens=1000,
-                    stop=None,
+                    messages=messages
                 )
                 for _ in range(num_concurrent_requests)
             ]
@@ -134,7 +131,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
             text_choice = chat_completion.choices[0]
             text_content = text_choice.message.content
             assert text_choice.message.content is not None, "No text output is generated"
-            assert chat_completion.usage.completion_tokens == 1000, "The output length differs from the requested max_tokens."
+            assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords."
 
             # Verify text output same as audio output
             audio_content = convert_audio_to_text(audio_data)

From 1c106d531a327580715d912f96d41607ab289db0 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 15:29:18 +0800
Subject: [PATCH 05/11] add full test

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/e2e/online_serving/test_qwen3_omni_expansion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index d4f978641a4..605b855482b 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
         api_client = client(server)
         start_time = time.perf_counter()
         chat_completion = api_client.chat.completions.create(
-            model=server.model, messages=messages, modalities=["text"]
+            model=server.model, messages=messages, max_token=20, modalities=["text"]
         )
         # Verify E2E
         print(f"the request e2e is: {time.perf_counter() - start_time}")
@@ -75,6 +75,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
         # Verify text output success
         text_choice = chat_completion.choices[0]
         assert text_choice.message.content is not None, "No text output is generated"
+        assert chat_completion.usage.completion_tokens <= 20, "The output length more than the requested max_tokens."
         assert "beijing" in text_choice.message.content.lower(), "The output do not contain keywords."
 
 

From 290e5c751e2e444abaf52ccf6cee0baf28e46988 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 17:07:48 +0800
Subject: [PATCH 06/11] add full test

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/conftest.py                             | 19 +++++++------------
 .../test_qwen3_omni_expansion.py              |  2 +-
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index e2dad2b69c7..5a3947de796 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -7,10 +7,10 @@
 
 import psutil
 import pytest
-import speech_recognition as sr
 import sys
 import time
 import torch
+import whisper
 import yaml
 from vllm.logger import init_logger
 from vllm.utils import get_open_port
@@ -118,17 +118,12 @@ def convert_audio_to_text(audio_data):
         audio_file.write(audio_data)
 
     print(f"audio data is saved: {output_path}")
-    recognizer = sr.Recognizer()
-    with sr.AudioFile(output_path) as source:
-        audio_data = recognizer.record(source)
-
-        print("Start voice recognition...")
-
-        text = recognizer.recognize_sphinx(audio_data)
-        if text:
-            return text
-        else:
-            return ""
+    model = whisper.load_model("base")
+    text = model.transcribe(output_path)["text"]
+    if text:
+        return text
+    else:
+        return ""
 
 
 def modify_stage_config(
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 605b855482b..4d131113bd2 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -63,7 +63,7 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
         api_client = client(server)
         start_time = time.perf_counter()
         chat_completion = api_client.chat.completions.create(
-            model=server.model, messages=messages, max_token=20, modalities=["text"]
+            model=server.model, messages=messages, max_tokens=20, modalities=["text"]
         )
         # Verify E2E
         print(f"the request e2e is: {time.perf_counter() - start_time}")

From 3023809d853334b5ad1627e8379302f8a7631392 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 9 Jan 2026 17:46:54 +0800
Subject: [PATCH 07/11] pre-commit

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 tests/conftest.py                             | 53 ++++++++++---------
 .../test_qwen3_omni_expansion.py              | 43 ++++++++-------
 tests/e2e/stage_configs/qwen3_omni_ci.yaml    |  2 +-
 3 files changed, 52 insertions(+), 46 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 5a3947de796..5b21f671bdb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,13 +2,13 @@
 import os
 import socket
 import subprocess
+import sys
+import time
 from pathlib import Path
 from typing import Any
 
 import psutil
 import pytest
-import sys
-import time
 import torch
 import whisper
 import yaml
@@ -48,11 +48,11 @@ def clean_gpu_memory_between_tests():
 
 
 def dummy_messages_from_mix_data(
-        system_prompt: dict[str, Any] = None,
-        video_data_url: Any = None,
-        audio_data_url: Any = None,
-        image_data_url: Any = None,
-        content_text: str = None,
+    system_prompt: dict[str, Any] = None,
+    video_data_url: Any = None,
+    audio_data_url: Any = None,
+    image_data_url: Any = None,
+    content_text: str = None,
 ):
     """Create messages with video、image、audio data URL for OpenAI API."""
 
@@ -102,6 +102,7 @@ def cosine_similarity_text(s1, s2):
     """
     from sklearn.feature_extraction.text import CountVectorizer
     from sklearn.metrics.pairwise import cosine_similarity
+
     vectorizer = CountVectorizer().fit_transform([s1, s2])
     vectors = vectorizer.toarray()
     return cosine_similarity([vectors[0]], [vectors[1]])[0][0]
@@ -114,7 +115,7 @@ def convert_audio_to_text(audio_data):
 
     audio_data = base64.b64decode(audio_data)
     output_path = f"./test_{int(time.time())}"
-    with open(output_path, 'wb') as audio_file:
+    with open(output_path, "wb") as audio_file:
         audio_file.write(audio_data)
 
     print(f"audio data is saved: {output_path}")
@@ -127,8 +128,8 @@ def convert_audio_to_text(audio_data):
 
 
 def modify_stage_config(
-        yaml_path: str,
-        stage_updates: dict[int, dict[str, Any]],
+    yaml_path: str,
+    stage_updates: dict[int, dict[str, Any]],
 ) -> str:
     """
     Batch modify configurations for multiple stages in a YAML file.
@@ -205,11 +206,11 @@ class OmniServer:
     """Omniserver for vLLM-Omni tests."""
 
     def __init__(
-            self,
-            model: str,
-            serve_args: list[str],
-            *,
-            env_dict: dict[str, str] | None = None,
+        self,
+        model: str,
+        serve_args: list[str],
+        *,
+        env_dict: dict[str, str] | None = None,
     ) -> None:
         self.model = model
         self.serve_args = serve_args
@@ -226,17 +227,17 @@ def _start_server(self) -> None:
             env.update(self.env_dict)
 
         cmd = [
-                  sys.executable,
-                  "-m",
-                  "vllm_omni.entrypoints.cli.main",
-                  "serve",
-                  self.model,
-                  "--omni",
-                  "--host",
-                  self.host,
-                  "--port",
-                  str(self.port),
-              ] + self.serve_args
+            sys.executable,
+            "-m",
+            "vllm_omni.entrypoints.cli.main",
+            "serve",
+            self.model,
+            "--omni",
+            "--host",
+            self.host,
+            "--port",
+            str(self.port),
+        ] + self.serve_args
 
         print(f"Launching OmniServer with: {' '.join(cmd)}")
         self.proc = subprocess.Popen(
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 4d131113bd2..cb9b4a2a3e1 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -6,14 +6,19 @@
 
 import concurrent.futures
 import os
+import time
 from pathlib import Path
 
 import openai
 import pytest
-import time
 
-from tests.conftest import (OmniServer, dummy_messages_from_mix_data, modify_stage_config, convert_audio_to_text,
-                            cosine_similarity_text)
+from tests.conftest import (
+    OmniServer,
+    convert_audio_to_text,
+    cosine_similarity_text,
+    dummy_messages_from_mix_data,
+    modify_stage_config,
+)
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
@@ -49,14 +54,14 @@ def get_system_prompt():
         ],
     }
 
+
 @pytest.mark.parametrize("test_config", test_params)
 def test_text_to_text_001(test_config: tuple[str, str]) -> None:
     """Test processing text, generating text output via OpenAI API."""
     model, stage_config_path = test_config
     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
         messages = dummy_messages_from_mix_data(
-            system_prompt=get_system_prompt(),
-            content_text="What is the capital of China?"
+            system_prompt=get_system_prompt(), content_text="What is the capital of China?"
         )
 
         # Test single completion
@@ -81,17 +86,20 @@ def test_text_to_text_001(test_config: tuple[str, str]) -> None:
 
 @pytest.mark.parametrize("test_config", test_params)
 def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
-    """Test processing text, generating audio output via OpenAI API."""
+    """Test processing text, generating text and audio output via OpenAI API."""
 
     model, stage_config_path = test_config
     num_concurrent_requests = 5
-    stage_config_path = modify_stage_config(stage_config_path, {
-        0: {"runtime.max_batch_size": num_concurrent_requests},
-        1: {"runtime.max_batch_size": num_concurrent_requests}})
+    stage_config_path = modify_stage_config(
+        stage_config_path,
+        {
+            0: {"runtime.max_batch_size": num_concurrent_requests},
+            1: {"runtime.max_batch_size": num_concurrent_requests},
+        },
+    )
     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
         messages = dummy_messages_from_mix_data(
-            system_prompt=get_system_prompt(),
-            content_text="What is the capital of China?"
+            system_prompt=get_system_prompt(), content_text="What is the capital of China?"
         )
 
         # Test single completion
@@ -100,11 +108,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
         with concurrent.futures.ThreadPoolExecutor(max_workers=num_concurrent_requests) as executor:
             # Submit multiple completion requests concurrently
             futures = [
-                executor.submit(
-                    api_client.chat.completions.create,
-                    model=server.model,
-                    messages=messages
-                )
+                executor.submit(api_client.chat.completions.create, model=server.model, messages=messages)
                 for _ in range(num_concurrent_requests)
             ]
             start_time = time.perf_counter()
@@ -118,7 +122,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
                 # TODO: Verify the E2E latency after confirmation baseline.
                 e2e_list.append(current_e2e)
 
-        print(f"the avg e2e is: {sum(e2e_list)/len(e2e_list)}")
+        print(f"the avg e2e is: {sum(e2e_list) / len(e2e_list)}")
         # Verify all completions succeeded
         assert len(chat_completions) == num_concurrent_requests, "Not all requests succeeded."
         for chat_completion in chat_completions:
@@ -138,5 +142,6 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
             audio_content = convert_audio_to_text(audio_data)
             print(f"text content is: {text_content}")
             print(f"audio content is: {audio_content}")
-            assert cosine_similarity_text(audio_content,
-                                          text_content) > 0.8, "The audio content is not same as the text"
+            assert cosine_similarity_text(audio_content.lower(), text_content.lower()) > 0.9, (
+                "The audio content is not same as the text"
+            )
diff --git a/tests/e2e/stage_configs/qwen3_omni_ci.yaml b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
index d52b51bb5eb..5106b185419 100644
--- a/tests/e2e/stage_configs/qwen3_omni_ci.yaml
+++ b/tests/e2e/stage_configs/qwen3_omni_ci.yaml
@@ -92,4 +92,4 @@ stage_args:
     max_tokens: 200
     seed: 42
     detokenize: True
-    repetition_penalty: 1.1
\ No newline at end of file
+    repetition_penalty: 1.1

From b37dfcad0a27d7d70f7061bc6b7748e2e3b52987 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 16 Jan 2026 16:36:12 +0800
Subject: [PATCH 08/11] Add prompt and batch size getter functions

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 .../test_qwen3_omni_expansion.py              | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index cb9b4a2a3e1..6a47e96f866 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -55,14 +55,25 @@ def get_system_prompt():
     }
 
 
+def get_prompt(prompt_type="text_only"):
+    prompts = {
+        "text_only": "What is the capital of China?",
+        "mix": "What is recited in the audio? What is in this image? Describe the video briefly.",
+    }
+    return prompts.get(prompt_type, prompts["text_only"])
+
+
+def get_max_batch_size(size_type="few"):
+    batch_sizes = {"few": 5, "medium": 100, "large": 256}
+    return batch_sizes.get(size_type, 5)
+
+
 @pytest.mark.parametrize("test_config", test_params)
 def test_text_to_text_001(test_config: tuple[str, str]) -> None:
     """Test processing text, generating text output via OpenAI API."""
     model, stage_config_path = test_config
     with OmniServer(model, ["--stage-configs-path", stage_config_path, "--stage-init-timeout", "90"]) as server:
-        messages = dummy_messages_from_mix_data(
-            system_prompt=get_system_prompt(), content_text="What is the capital of China?"
-        )
+        messages = dummy_messages_from_mix_data(system_prompt=get_system_prompt(), content_text=get_prompt())
 
         # Test single completion
         api_client = client(server)
@@ -89,7 +100,7 @@ def test_text_to_text_audio_001(test_config: tuple[str, str]) -> None:
     """Test processing text, generating text and audio output via OpenAI API."""
 
     model, stage_config_path = test_config
-    num_concurrent_requests = 5
+    num_concurrent_requests = get_max_batch_size()
     stage_config_path = modify_stage_config(
         stage_config_path,
         {

From f6e46e6375e7bc64cf5d3b8fbded99e5137127c7 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 16 Jan 2026 18:30:21 +0800
Subject: [PATCH 09/11] add dependencies

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 docker/Dockerfile.ci | 6 ++++++
 pyproject.toml       | 1 +
 2 files changed, 7 insertions(+)

diff --git a/docker/Dockerfile.ci b/docker/Dockerfile.ci
index 5e1d00a5f88..aecc429454c 100644
--- a/docker/Dockerfile.ci
+++ b/docker/Dockerfile.ci
@@ -6,6 +6,12 @@ WORKDIR ${APP_DIR}
 
 COPY . .
 
+# Install system dependencies
+RUN apt-get update && \
+    apt-get install -y ffmpeg && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
 # Install vllm-omni into the same uv-managed Python environment used by the base image.
 RUN uv pip install --python "$(python3 -c 'import sys; print(sys.executable)')" --no-cache-dir ".[dev]"
 
diff --git a/pyproject.toml b/pyproject.toml
index 4833b117487..b78ce0ccdfa 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,6 +50,7 @@ dev = [
     "pytest-cov>=4.0.0",
     "mypy==1.11.1",
     "pre-commit==4.0.1",
+    "openai-whisper"
 ]
 
 docs = [

From 35b9c1c69d11cf6b26c49598cb102a8423187bd2 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 16 Jan 2026 18:33:29 +0800
Subject: [PATCH 10/11] add dependencies

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 pyproject.toml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b78ce0ccdfa..c50e9820cd2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,7 +50,8 @@ dev = [
     "pytest-cov>=4.0.0",
     "mypy==1.11.1",
     "pre-commit==4.0.1",
-    "openai-whisper"
+    "openai-whisper",
+    "psutil"
 ]
 
 docs = [

From eb41de2c5d0b6e682bd4d91fc2af4ff5c8e34883 Mon Sep 17 00:00:00 2001
From: wangyu31577 <wangyu31577@hundsun.com>
Date: Fri, 16 Jan 2026 19:03:21 +0800
Subject: [PATCH 11/11] add dependencies version

Signed-off-by: wangyu31577 <wangyu31577@hundsun.com>
---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c50e9820cd2..2e2cddc5b7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -50,8 +50,8 @@ dev = [
     "pytest-cov>=4.0.0",
     "mypy==1.11.1",
     "pre-commit==4.0.1",
-    "openai-whisper",
-    "psutil"
+    "openai-whisper>=20250625",
+    "psutil>=7.2.0"
 ]
 
 docs = [