vllm-project · hsliuustc0106 · Apr 22, 2026 · Apr 21, 2026 · Apr 22, 2026 · Apr 22, 2026
@@ -275,6 +275,46 @@ steps:
                   path: /mnt/hf-cache
                   type: DirectoryOrCreate
 
+  - label: "Audio Streaming Input Test with H100"
+    timeout_in_minutes: 30
+    depends_on: upload-merge-pipeline
+    commands:
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py -m "advanced_model" --run-level "advanced_model"
+    agents:
+      queue: "mithril-h100-pool"
+    plugins:
+      - kubernetes:
+          podSpec:
+            containers:
+              - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+                resources:
+                  limits:
+                    nvidia.com/gpu: 2
+                volumeMounts:
+                  - name: devshm
+                    mountPath: /dev/shm
+                  - name: hf-cache
+                    mountPath: /root/.cache/huggingface
+                env:
+                  - name: HF_HOME
+                    value: /root/.cache/huggingface
+                  - name: HF_TOKEN
+                    valueFrom:
+                      secretKeyRef:
+                        name: hf-token-secret
+                        key: token
+            nodeSelector:
+              node.kubernetes.io/instance-type: gpu-h100-sxm
+            volumes:
+              - name: devshm
+                emptyDir:
+                  medium: Memory
+              - name: hf-cache
+                hostPath:
+                  path: /mnt/hf-cache
+                  type: DirectoryOrCreate
+
   - label: "Diffusion Image Edit Test with H100 (1 GPU)"
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline

@@ -23,33 +23,28 @@
     generate_synthetic_audio,
 )
 from tests.helpers.runtime import OmniServerParams
-from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
+from tests.helpers.stage_config import get_deploy_config_path
 
 os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
 
 MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
 
+# Phrase spoken in the realtime E2E input audio; ``generate_synthetic_audio`` caches one WAV per phrase.
+REALTIME_SYNTH_PHRASE_TEXT = "Translate into Chinese: Beijing is the Capital of China"
+
 # The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU
 # via its ``platforms:`` section, so one path serves all three.
 default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml")
 
-
-def _realtime_stage_config_path() -> str:
-    """CI omni layout without async_chunk; stage 0 thinker max_tokens=10."""
-    return modify_stage_config(
-        default_stage_config,
-        updates={"stages": {0: {"default_sampling_params.max_tokens": 10}}},
-    )
-
-
 realtime_server_params = [
     pytest.param(
         OmniServerParams(
             model=MODEL,
-            stage_config_path=_realtime_stage_config_path(),
+            stage_config_path=default_stage_config,
             use_stage_cli=True,
+            server_args=["--no-async-chunk"],
         ),
-        id="thinker_max_tokens_10",
+        id="default",
     ),
 ]
 
@@ -169,9 +164,15 @@ def test_streaming_audio_input_pcm_output(self, omni_server) -> None:
         """
         Short streamed 16 kHz mono PCM16 input; expect streamed PCM16 audio deltas and
         transcription. Verify Whisper(output audio) aligns with model text (same idea
-        as multimodal omni e2e).
+        as multimodal omni e2e). Input speech is synthesized from
+        ``REALTIME_SYNTH_PHRASE_TEXT``.
         """
-        syn = generate_synthetic_audio(10, 1, sample_rate=16000)
+        syn = generate_synthetic_audio(
+            10,
+            1,
+            sample_rate=16000,
+            phrase_text=REALTIME_SYNTH_PHRASE_TEXT,
+        )
         wav_bytes = base64.b64decode(syn["base64"])
         pcm16 = _pcm16_mono_16k_from_wav_bytes(wav_bytes)
 

@@ -3,6 +3,7 @@
 import base64
 import concurrent.futures
 import gc
+import hashlib
 import io
 import logging
 import math
@@ -66,6 +67,7 @@ def generate_synthetic_audio(
     num_channels: int,
     sample_rate: int = 48000,
     *,
+    phrase_text: str = "test",
     force_regenerate: bool = False,
     cache_dir: Path | str | None = None,
 ) -> dict[str, Any]:
@@ -74,12 +76,16 @@ def generate_synthetic_audio(
 
     Caches the WAV under ``cache_dir`` when given, else under the default temp
     subdirectory. Reuses the file when the same
-    ``duration`` / ``num_channels`` / ``sample_rate`` are requested unless
-    ``force_regenerate`` is true.
+    ``duration`` / ``num_channels`` / ``sample_rate`` / ``phrase_text`` are
+    requested unless ``force_regenerate`` is true.
+
+    The cache filename includes a SHA-256 digest of ``phrase_text`` so different
+    phrases never share a WAV cache entry.
     """
     root = _resolve_synthetic_media_cache_dir(cache_dir)
     root.mkdir(parents=True, exist_ok=True)
-    cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}.wav"
+    phrase_key = hashlib.sha256(phrase_text.encode("utf-8")).hexdigest()
+    cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}_pt{phrase_key}.wav"
 
     if not force_regenerate and cache_path.is_file():
         data, _sr = sf.read(str(cache_path), dtype="float32", always_2d=True)
@@ -204,7 +210,6 @@ def _enhance_speech(audio: np.ndarray) -> np.ndarray:
             enhanced = enhanced / peak * 0.95
         return enhanced.astype(np.float32)
 
-    phrase_text = "test"
     num_samples = int(sample_rate * max(1, duration))
     audio_data = np.zeros((num_samples, num_channels), dtype=np.float32)