Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 40 additions & 0 deletions .buildkite/test-merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,46 @@ steps:
path: /mnt/hf-cache
type: DirectoryOrCreate

# Realtime websocket audio-streaming test for Qwen3-Omni: runs the
# "advanced_model" pytest selection in a Kubernetes pod that requests 2 GPUs
# on a gpu-h100-sxm node, with a shared-memory volume and a host-backed
# Hugging Face cache mounted into the container.
- label: "Audio Streaming Input Test with H100"
  timeout_in_minutes: 30
  depends_on: upload-merge-pipeline
  commands:
    - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
    - pytest -s -v tests/entrypoints/openai_api/test_qwen3_omni_realtime_websocket.py -m "advanced_model" --run-level "advanced_model"
  agents:
    queue: "mithril-h100-pool"
  plugins:
    - kubernetes:
        podSpec:
          containers:
            - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
              resources:
                limits:
                  nvidia.com/gpu: 2
              volumeMounts:
                - name: devshm
                  mountPath: /dev/shm
                - name: hf-cache
                  mountPath: /root/.cache/huggingface
              env:
                - name: HF_HOME
                  value: /root/.cache/huggingface
                - name: HF_TOKEN
                  valueFrom:
                    secretKeyRef:
                      name: hf-token-secret
                      key: token
          nodeSelector:
            node.kubernetes.io/instance-type: gpu-h100-sxm
          volumes:
            # In-memory emptyDir backing /dev/shm inside the container.
            - name: devshm
              emptyDir:
                medium: Memory
            # Host directory reused as the Hugging Face cache (HF_HOME).
            - name: hf-cache
              hostPath:
                path: /mnt/hf-cache
                type: DirectoryOrCreate

- label: "Diffusion Image Edit Test with H100 (1 GPU)"
timeout_in_minutes: 20
depends_on: upload-merge-pipeline
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,33 +23,28 @@
generate_synthetic_audio,
)
from tests.helpers.runtime import OmniServerParams
from tests.helpers.stage_config import get_deploy_config_path, modify_stage_config
from tests.helpers.stage_config import get_deploy_config_path

# Module-level setting: select the "spawn" multiprocessing method for vLLM
# workers through the environment.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"

# Model identifier handed to ``OmniServerParams`` for the realtime server.
MODEL = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

# Synthetic input for realtime E2E (``generate_synthetic_audio``); distinct cache file per phrase.
REALTIME_SYNTH_PHRASE_TEXT = "Translate into Chinese: Beijing is the Capital of China"

# The new-schema CI overlay bakes in async_chunk: False and covers CUDA/ROCm/XPU
# via its ``platforms:`` section, so one path serves all three.
default_stage_config = get_deploy_config_path("ci/qwen3_omni_moe.yaml")


def _realtime_stage_config_path() -> str:
    """Return the stage config path used for the realtime server parameters.

    Starts from ``default_stage_config`` (the CI omni layout without
    async_chunk) and asks ``modify_stage_config`` to set stage 0's
    ``default_sampling_params.max_tokens`` to 10 for the thinker stage.
    """
    stage_overrides = {0: {"default_sampling_params.max_tokens": 10}}
    return modify_stage_config(
        default_stage_config,
        updates={"stages": stage_overrides},
    )


realtime_server_params = [
pytest.param(
OmniServerParams(
model=MODEL,
stage_config_path=_realtime_stage_config_path(),
stage_config_path=default_stage_config,
use_stage_cli=True,
server_args=["--no-async-chunk"],
),
id="thinker_max_tokens_10",
id="default",
),
]

Expand Down Expand Up @@ -169,9 +164,15 @@ def test_streaming_audio_input_pcm_output(self, omni_server) -> None:
"""
Short streamed 16 kHz mono PCM16 input; expect streamed PCM16 audio deltas and
transcription. Verify Whisper(output audio) aligns with model text (same idea
as multimodal omni e2e).
as multimodal omni e2e). Input speech is synthesized from
``REALTIME_SYNTH_PHRASE_TEXT``.
"""
syn = generate_synthetic_audio(10, 1, sample_rate=16000)
syn = generate_synthetic_audio(
10,
1,
sample_rate=16000,
phrase_text=REALTIME_SYNTH_PHRASE_TEXT,
)
wav_bytes = base64.b64decode(syn["base64"])
pcm16 = _pcm16_mono_16k_from_wav_bytes(wav_bytes)

Expand Down
13 changes: 9 additions & 4 deletions tests/helpers/media.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import concurrent.futures
import gc
import hashlib
import io
import logging
import math
Expand Down Expand Up @@ -66,6 +67,7 @@ def generate_synthetic_audio(
num_channels: int,
sample_rate: int = 48000,
*,
phrase_text: str = "test",
force_regenerate: bool = False,
cache_dir: Path | str | None = None,
) -> dict[str, Any]:
Expand All @@ -74,12 +76,16 @@ def generate_synthetic_audio(

Caches the WAV under ``cache_dir`` when given, else under the default temp
subdirectory. Reuses the file when the same
``duration`` / ``num_channels`` / ``sample_rate`` are requested unless
``force_regenerate`` is true.
``duration`` / ``num_channels`` / ``sample_rate`` / ``phrase_text`` are
requested unless ``force_regenerate`` is true.

The cache filename includes a SHA-256 digest of ``phrase_text`` so different
phrases never share a WAV cache entry.
"""
root = _resolve_synthetic_media_cache_dir(cache_dir)
root.mkdir(parents=True, exist_ok=True)
cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}.wav"
phrase_key = hashlib.sha256(phrase_text.encode("utf-8")).hexdigest()
cache_path = root / f"synth_audio_d{duration}_ch{num_channels}_sr{sample_rate}_pt{phrase_key}.wav"

if not force_regenerate and cache_path.is_file():
data, _sr = sf.read(str(cache_path), dtype="float32", always_2d=True)
Expand Down Expand Up @@ -204,7 +210,6 @@ def _enhance_speech(audio: np.ndarray) -> np.ndarray:
enhanced = enhanced / peak * 0.95
return enhanced.astype(np.float32)

phrase_text = "test"
num_samples = int(sample_rate * max(1, duration))
audio_data = np.zeros((num_samples, num_channels), dtype=np.float32)

Expand Down
Loading