From 06b2065290038ae95bceb15bbaf473e7e88ae0e5 Mon Sep 17 00:00:00 2001 From: CHEN <116010019@link.cuhk.edu.cn> Date: Tue, 21 Apr 2026 09:08:44 +0800 Subject: [PATCH 1/3] add merge ci for streaming input Signed-off-by: CHEN <116010019@link.cuhk.edu.cn> --- .buildkite/test-merge.yml | 40 +++++++++++++++++++ .../test_qwen3_omni_realtime_websocket.py | 17 ++++++-- tests/helpers/media.py | 13 ++++-- 3 files changed, 62 insertions(+), 8 deletions(-) rename tests/{e2e/online_serving => entrypoints/openai_api}/test_qwen3_omni_realtime_websocket.py (92%) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 785cc58fab9..691f3f8764d 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -275,6 +275,46 @@ steps: path: /mnt/hf-cache type: DirectoryOrCreate + - label: "Audio Streaming Input Test with H100" + timeout_in_minutes: 30 + depends_on: upload-merge-pipeline + commands: + - export VLLM_TEST_CLEAN_GPU_MEMORY="1" + - pytest -s -v tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py -m "advanced_model" --run-level "advanced_model" + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 2 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate + - label: "Diffusion Image Edit Test with H100 (1 GPU)" timeout_in_minutes: 20 depends_on: upload-merge-pipeline diff --git a/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py 
b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py similarity index 92% rename from tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py rename to tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py index f3b26108199..88e29b72ab5 100644 --- a/tests/e2e/online_serving/test_qwen3_omni_realtime_websocket.py +++ b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py @@ -29,16 +29,18 @@ MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct" +# Synthetic input for realtime E2E (``generate_synthetic_audio``); distinct cache file per phrase. +REALTIME_SYNTH_PHRASE_TEXT = "Translate into Chinese: Beijing is the Capital of China" + # The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU # via its ``platforms:`` section, so one path serves all three. default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml") def _realtime_stage_config_path() -> str: - """CI omni layout without async_chunk; stage 0 thinker max_tokens=10.""" + """CI omni layout without async_chunk""" return modify_stage_config( default_stage_config, - updates={"stages": {0: {"default_sampling_params.max_tokens": 10}}}, ) @@ -48,6 +50,7 @@ def _realtime_stage_config_path() -> str: model=MODEL, stage_config_path=_realtime_stage_config_path(), use_stage_cli=True, + server_args=["--no-async-chunk"], ), id="thinker_max_tokens_10", ), @@ -169,9 +172,15 @@ def test_streaming_audio_input_pcm_output(self, omni_server) -> None: """ Short streamed 16 kHz mono PCM16 input; expect streamed PCM16 audio deltas and transcription. Verify Whisper(output audio) aligns with model text (same idea - as multimodal omni e2e). + as multimodal omni e2e). Input speech is synthesized from + ``REALTIME_SYNTH_PHRASE_TEXT``. 
""" - syn = generate_synthetic_audio(10, 1, sample_rate=16000) + syn = generate_synthetic_audio( + 10, + 1, + sample_rate=16000, + phrase_text=REALTIME_SYNTH_PHRASE_TEXT, + ) wav_bytes = base64.b64decode(syn["base64"]) pcm16 = _pcm16_mono_16k_from_wav_bytes(wav_bytes) diff --git a/tests/helpers/media.py b/tests/helpers/media.py index 4463acbbbb6..2e99788b14c 100644 --- a/tests/helpers/media.py +++ b/tests/helpers/media.py @@ -3,6 +3,7 @@ import base64 import concurrent.futures import gc +import hashlib import io import logging import math @@ -66,6 +67,7 @@ def generate_synthetic_audio( num_channels: int, sample_rate: int = 48000, *, + phrase_text: str = "test", force_regenerate: bool = False, cache_dir: Path | str | None = None, ) -> dict[str, Any]: @@ -74,12 +76,16 @@ def generate_synthetic_audio( Caches the WAV under ``cache_dir`` when given, else under the default temp subdirectory. Reuses the file when the same - ``duration`` / ``num_channels`` / ``sample_rate`` are requested unless - ``force_regenerate`` is true. + ``duration`` / ``num_channels`` / ``sample_rate`` / ``phrase_text`` are + requested unless ``force_regenerate`` is true. + + The cache filename includes a SHA-256 digest of ``phrase_text`` so different + phrases never share a WAV cache entry. 
""" root = _resolve_synthetic_media_cache_dir(cache_dir) root.mkdir(parents=True, exist_ok=True) - cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}.wav" + phrase_key = hashlib.sha256(phrase_text.encode("utf-8")).hexdigest() + cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}_pt{phrase_key}.wav" if not force_regenerate and cache_path.is_file(): data, _sr = sf.read(str(cache_path), dtype="float32", always_2d=True) @@ -204,7 +210,6 @@ def _enhance_speech(audio: np.ndarray) -> np.ndarray: enhanced = enhanced / peak * 0.95 return enhanced.astype(np.float32) - phrase_text = "test" num_samples = int(sample_rate * max(1, duration)) audio_data = np.zeros((num_samples, num_channels), dtype=np.float32) From adf5a5788c6d8240dd69492d8fd62d514fc64549 Mon Sep 17 00:00:00 2001 From: CHEN <116010019@link.cuhk.edu.cn> Date: Wed, 22 Apr 2026 09:51:49 +0800 Subject: [PATCH 2/3] rm realtime stage config path Signed-off-by: CHEN <116010019@link.cuhk.edu.cn> --- .../openai_api/test_qwen3_omni_realtime_websocket.py | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py index 88e29b72ab5..784ffc2339d 100644 --- a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py +++ b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py @@ -36,23 +36,15 @@ # via its ``platforms:`` section, so one path serves all three. 
default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml") - -def _realtime_stage_config_path() -> str: - """CI omni layout without async_chunk""" - return modify_stage_config( - default_stage_config, - ) - - realtime_server_params = [ pytest.param( OmniServerParams( model=MODEL, - stage_config_path=_realtime_stage_config_path(), + stage_config_path=default_stage_config, use_stage_cli=True, server_args=["--no-async-chunk"], ), - id="thinker_max_tokens_10", + id="default", ), ] From d919e5222ed491ddc9daeed90605b063340c360e Mon Sep 17 00:00:00 2001 From: CHEN <116010019@link.cuhk.edu.cn> Date: Wed, 22 Apr 2026 09:58:20 +0800 Subject: [PATCH 3/3] add merge ci Signed-off-by: CHEN <116010019@link.cuhk.edu.cn> --- .../openai_api/test_qwen3_omni_realtime_websocket.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py index 784ffc2339d..90f8897c58f 100644 --- a/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py +++ b/tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py @@ -23,7 +23,7 @@ generate_synthetic_audio, ) from tests.helpers.runtime import OmniServerParams -from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config +from tests.helpers.stage_config import get_deploy_config_path os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"