Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
42387a3
add qwen3-omni tests
yenuo26 Mar 23, 2026
040100c
Enhance qwen3-omni tests by adding support for audio-video prompts an…
yenuo26 Mar 23, 2026
16e87f6
Update Omni Model Test configuration and enhance audio-video test cases
yenuo26 Mar 23, 2026
3665991
Merge branch 'vllm-project:main' into audio_in_video
yenuo26 Mar 24, 2026
95aa645
Update CI timeout and enhance Omni model test parameters for batch to…
yenuo26 Mar 24, 2026
8fe2c41
Merge branch 'audio_in_video' of https://github.com/yenuo26/vllm-omni…
yenuo26 Mar 24, 2026
f205c5d
debug
yenuo26 Mar 24, 2026
22458c8
Enhance synthetic video generation by adding `embed_audio` parameter …
yenuo26 Mar 24, 2026
95ca2b3
remove debug
yenuo26 Mar 24, 2026
a699b37
Merge branch 'main' into audio_in_video
yenuo26 Mar 25, 2026
f52ece8
Merge branch 'vllm-project:main' into audio_in_video
yenuo26 Mar 26, 2026
453f551
Merge branch 'vllm-project:main' into audio_in_video
yenuo26 Mar 26, 2026
d1fc6f3
Merge branch 'main' into audio_in_video
congw729 Mar 26, 2026
51d4fbb
Update output file naming in modify_stage_config to use nanosecond pr…
yenuo26 Mar 27, 2026
8f4eb6e
Merge branch 'audio_in_video' of https://github.com/yenuo26/vllm-omni…
yenuo26 Mar 27, 2026
2b7e886
Improve audio transcript validation in OmniResponse assertions. Adde…
yenuo26 Mar 27, 2026
c125fe7
Merge branch 'main' into audio_in_video
yenuo26 Mar 27, 2026
975f197
Updated test cases to retry on assertion failures for improved robust…
yenuo26 Mar 27, 2026
073bfee
Merge branch 'main' into audio_in_video
yenuo26 Mar 31, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 36 additions & 38 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -216,44 +216,42 @@ steps:
volumes:
- "/fsx/hf_cache:/fsx/hf_cache"

# - label: "Omni Model Test with H100"
# depends_on: upload-ready-pipeline
# commands:
# - |
# timeout 20m bash -c '
# export VLLM_WORKER_MULTIPROC_METHOD=spawn
# export VLLM_TEST_CLEAN_GPU_MEMORY="1"
# # - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
# pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
# '
# agents:
# queue: "mithril-h100-pool"
# plugins:
# - kubernetes:
# podSpec:
# containers:
# - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
# resources:
# limits:
# nvidia.com/gpu: 2
# volumeMounts:
# - name: devshm
# mountPath: /dev/shm
# - name: hf-cache
# mountPath: /root/.cache/huggingface
# env:
# - name: HF_HOME
# value: /root/.cache/huggingface
# nodeSelector:
# node.kubernetes.io/instance-type: gpu-h100-sxm
# volumes:
# - name: devshm
# emptyDir:
# medium: Memory
# - name: hf-cache
# hostPath:
# path: /mnt/hf-cache
# type: DirectoryOrCreate
- label: "Omni Model Test with H100"
depends_on: upload-ready-pipeline
commands:
- |
timeout 20m bash -c '
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "core_model" --run-level "core_model"
'
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 2
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
- name: HF_HOME
value: /root/.cache/huggingface
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate

- label: "MiMo-Audio E2E Test with H100"
depends_on: upload-ready-pipeline
Expand Down
193 changes: 159 additions & 34 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -688,8 +688,112 @@ def _enhance_speech(audio: np.ndarray) -> np.ndarray:
return result


def generate_synthetic_video(width: int, height: int, num_frames: int, save_to_file: bool = False) -> dict[str, Any]:
"""Generate synthetic video with bouncing balls and return base64 string."""
def _mux_mp4_bytes_with_synthetic_audio(
video_mp4_bytes: bytes,
*,
num_frames: int,
fps: float = 30.0,
sample_rate: int = 48000,
) -> bytes:
"""
Mux a video-only MP4 with mono TTS audio from :func:`generate_synthetic_audio` (AAC).

Audio length is at least the video duration in whole seconds (rounded up); ffmpeg
``-shortest`` trims to the video when the WAV is longer.

Uses ffmpeg from ``imageio_ffmpeg`` when available, else ``ffmpeg`` on PATH.
If TTS or mux fails, returns ``video_mp4_bytes`` unchanged.

Mux subprocess does **not** use ``capture_output=True``: ffmpeg can block writing
to a full stderr pipe while :func:`subprocess.run` waits for exit (classic deadlock).
"""
duration_sec = num_frames / fps if fps > 0 else 0.0
# generate_synthetic_audio(duration=int) uses at least 1s of buffer internally
duration_int = max(1, int(math.ceil(duration_sec)))

try:
audio_result = generate_synthetic_audio(
duration=duration_int,
num_channels=1,
sample_rate=sample_rate,
save_to_file=False,
)
audio_pcm = audio_result["np_array"]
except Exception as e:
logger.warning("Synthetic video: generate_synthetic_audio failed (%s); using video-only MP4.", e)
return video_mp4_bytes

try:
import imageio_ffmpeg

ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
except Exception:
ffmpeg_exe = "ffmpeg"

import tempfile

try:
with tempfile.TemporaryDirectory(prefix="syn_vid_mux_") as tmp:
vid_path = os.path.join(tmp, "video.mp4")
wav_path = os.path.join(tmp, "audio.wav")
out_path = os.path.join(tmp, "out.mp4")
with open(vid_path, "wb") as f:
f.write(video_mp4_bytes)
sf.write(wav_path, audio_pcm, sample_rate, format="WAV", subtype="PCM_16")
cmd = [
ffmpeg_exe,
"-y",
"-nostdin",
"-hide_banner",
"-loglevel",
"error",
"-i",
vid_path,
"-i",
wav_path,
"-c:v",
"copy",
"-c:a",
"aac",
"-b:a",
"128k",
"-shortest",
"-movflags",
"+faststart",
out_path,
]
subprocess.run(
cmd,
check=True,
stdin=subprocess.DEVNULL,
timeout=300,
)
with open(out_path, "rb") as f:
return f.read()
except (
FileNotFoundError,
subprocess.CalledProcessError,
subprocess.TimeoutExpired,
OSError,
) as e:
logger.warning("Synthetic video: audio mux failed (%s); using video-only MP4.", e)
return video_mp4_bytes


def generate_synthetic_video(
width: int,
height: int,
num_frames: int,
save_to_file: bool = False,
*,
embed_audio: bool = False,
) -> dict[str, Any]:
"""Generate synthetic video with bouncing balls and base64 MP4.

When ``embed_audio`` is True, muxes mono AAC from :func:`generate_synthetic_audio`
(TTS + ffmpeg) into the MP4; otherwise returns video-only MP4 (faster when tests do
not need an audio track).
"""

import cv2
import imageio
Expand Down Expand Up @@ -762,13 +866,13 @@ def generate_synthetic_video(width: int, height: int, num_frames: int, save_to_f
result = {
"np_array": video_array,
}
video_bytes = None
saved_file_path = None

fps = 30
buffer = io.BytesIO()
writer_kwargs = {
"format": "mp4",
"fps": 30,
"fps": fps,
"codec": "libx264",
"quality": 7,
"pixelformat": "yuv420p",
Expand All @@ -787,32 +891,31 @@ def generate_synthetic_video(width: int, height: int, num_frames: int, save_to_f
],
}

if save_to_file:
import datetime
try:
with imageio.get_writer(buffer, **writer_kwargs) as writer:
for frame in video_frames:
writer.append_data(frame)
buffer.seek(0)
video_only_bytes = buffer.read()
except Exception as e:
print(f"Warning: Failed to encode synthetic video: {e}")
raise

if embed_audio:
video_bytes = _mux_mp4_bytes_with_synthetic_audio(video_only_bytes, num_frames=num_frames, fps=float(fps))
else:
video_bytes = video_only_bytes

if save_to_file:
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
output_path = f"video_{width}x{height}_{timestamp}.mp4"
try:
with imageio.get_writer(output_path, **writer_kwargs) as writer:
for frame in video_frames:
writer.append_data(frame)

with open(output_path, "wb") as f:
f.write(video_bytes)
saved_file_path = output_path
print(f"Video saved to: {saved_file_path}")
with open(output_path, "rb") as f:
video_bytes = f.read()

except Exception as e:
print(f"Warning: Failed to save video to file {output_path}: {e}")
save_to_file = False

if not save_to_file or video_bytes is None:
with imageio.get_writer(buffer, **writer_kwargs) as writer:
for frame in video_frames:
writer.append_data(frame)

buffer.seek(0)
video_bytes = buffer.read()

base64_video = base64.b64encode(video_bytes).decode("utf-8")

Expand Down Expand Up @@ -1262,10 +1365,12 @@ def delete_by_path(config_dict: dict, path: str) -> None:
# Direct top-level key
config[key] = value

# Save to new file with timestamp
timestamp = int(time.time())
# Unique suffix: multiple modify_stage_config calls in one process often run
# within the same second (e.g. test_qwen3_omni_expansion imports both
# get_chunk_config and get_batch_token_config). int(time.time()) would collide
# and the later write would overwrite the earlier YAML on disk.
base_name = yaml_path.rsplit(".", 1)[0] if "." in yaml_path else yaml_path
output_path = f"{base_name}_{timestamp}.yaml"
output_path = f"{base_name}_{time.time_ns()}.yaml"

with open(output_path, "w", encoding="utf-8") as f:
yaml.dump(config, f, default_flow_style=None, sort_keys=False, allow_unicode=True, indent=2)
Expand Down Expand Up @@ -1733,9 +1838,11 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
"The output does not contain any of the keywords."
)

# Verify similarity
# Verify similarity (Whisper transcript vs streamed/detokenized text)
if "text" in modalities and "audio" in modalities:
assert response.similarity > 0.9, "The audio content is not same as the text"
assert response.similarity is not None and response.similarity > 0.9, (
"The audio content is not same as the text"
)
print(f"similarity is: {response.similarity}")


Expand Down Expand Up @@ -2135,7 +2242,11 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
Send OpenAI requests.

Args:
request_config: Request configuration dictionary containing parameters like model, messages, stream
request_config: Request configuration dictionary containing parameters like model, messages, stream.
Optional ``use_audio_in_video`` (bool): when true, sets
``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio
extraction (merged with any existing ``extra_body`` / ``mm_processor_kwargs``).
Optional ``extra_body`` (dict): passed through to ``chat.completions.create`` after merge.
request_num: Number of requests, defaults to 1 (single request)

Returns:
Expand All @@ -2146,14 +2257,28 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
stream = request_config.get("stream", False)
modalities = request_config.get("modalities", ["text", "audio"])

extra_body: dict[str, Any] = {}
raw_extra = request_config.get("extra_body")
if raw_extra:
extra_body.update(raw_extra)
if request_config.get("use_audio_in_video"):
mm = dict(extra_body.get("mm_processor_kwargs") or {})
mm["use_audio_in_video"] = True
extra_body["mm_processor_kwargs"] = mm
extra_body_arg: dict[str, Any] | None = extra_body if extra_body else None

create_kwargs: dict[str, Any] = {
"model": request_config.get("model"),
"messages": request_config.get("messages"),
"stream": stream,
"modalities": modalities,
}
if extra_body_arg is not None:
create_kwargs["extra_body"] = extra_body_arg

if request_num == 1:
# Send single request
chat_completion = self.client.chat.completions.create(
model=request_config.get("model"),
messages=request_config.get("messages"),
stream=stream,
modalities=modalities,
)
chat_completion = self.client.chat.completions.create(**create_kwargs)

if stream:
response = self._process_stream_omni_response(chat_completion)
Expand Down
Loading
Loading