25 changes: 10 additions & 15 deletions .buildkite/test-merge.yml
@@ -56,8 +56,8 @@ steps:
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
-      - pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+      # Single pytest session for one combined summary at end of log.
+      - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -111,8 +111,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline
    commands:
-      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
-      - pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py
+      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPUs
     plugins:
@@ -193,8 +192,7 @@ steps:
     commands:
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPUs
     plugins:
@@ -216,7 +214,7 @@ steps:
          export VLLM_LOGGING_LEVEL=DEBUG
          export VLLM_WORKER_MULTIPROC_METHOD=spawn
          export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-         pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
+         pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model"
        '
     agents:
       queue: "gpu_1_queue"
@@ -239,7 +237,7 @@ steps:
          export VLLM_LOGGING_LEVEL=DEBUG
          export VLLM_WORKER_MULTIPROC_METHOD=spawn
          export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-         pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py
+         pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model"
        '
     agents:
       queue: "gpu_1_queue"
@@ -259,9 +257,8 @@ steps:
     depends_on: upload-merge-pipeline
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
-      - pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model"
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model"
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -347,8 +344,7 @@ steps:
          export VLLM_TEST_CLEAN_GPU_MEMORY=1
          export VLLM_IMAGE_FETCH_TIMEOUT=60
          pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory"
-         pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model"
-         pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
+         pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
        '
     agents:
       queue: "mithril-h100-pool"
@@ -392,8 +388,7 @@ steps:
        timeout 20m bash -c '
          export VLLM_LOGGING_LEVEL=DEBUG
          export VLLM_WORKER_MULTIPROC_METHOD=spawn
-         pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-         pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+         pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
        '
     agents:
       queue: "mithril-h100-pool"
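The recurring change in this file collapses back-to-back pytest invocations into a single session, so the log ends with one combined summary (and, unlike the `&&` chains, a failure in the first file no longer skips the rest). A minimal sketch of the same idea through pytest's programmatic entry point; the test paths are taken from this diff and are only illustrative:

import sys

import pytest

# One pytest session over several test files: a single collection pass and
# one combined PASSED/FAILED summary at the end of the log.
exit_code = pytest.main(
    [
        "-s",
        "-v",
        "tests/e2e/offline_inference/test_diffusion_cpu_offload.py",
        "tests/e2e/offline_inference/test_diffusion_layerwise_offload.py",
    ]
)
sys.exit(exit_code)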
17 changes: 5 additions & 12 deletions .buildkite/test-nightly.yml
@@ -6,16 +6,10 @@ steps:
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - |
-        set +e
-        pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
-        EXIT1=$$?
-        pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
-        EXIT2=$$?
-        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        EXIT3=$$?
-        pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        EXIT4=$$?
-        exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4))
+        pytest -s -v \
+          tests/examples/ \
+          tests/e2e/online_serving/test_*_expansion.py \
+          -m "advanced_model and H100 and omni" --run-level "advanced_model"
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -57,8 +51,7 @@ steps:
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-      - pytest -s -v tests/examples/ -m "advanced_model and L4 and omni" --run-level "advanced_model"
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
+      - pytest -s -v tests/examples/ tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPUs
     plugins:
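The nightly consolidation leans on pytest marker expressions: `-m "advanced_model and L4 and omni"` selects only tests that carry all three markers. A hedged sketch of how such a test would be tagged; the marker names come from this PR, the test body is hypothetical, and the markers are assumed to be registered in the repo's pytest config:

import pytest


@pytest.mark.advanced_model
@pytest.mark.omni
@pytest.mark.L4  # hardware marker; matched by -m "advanced_model and L4 and omni"
def test_omni_expansion_smoke():
    # Placeholder body; the real tests drive an omni runner fixture.
    assert True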
3 changes: 1 addition & 2 deletions .buildkite/test-ready.yml
@@ -328,8 +328,7 @@ steps:
        timeout 20m bash -c '
          export VLLM_LOGGING_LEVEL=DEBUG
          export VLLM_WORKER_MULTIPROC_METHOD=spawn
-         pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-         pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+         pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "core_model" --run-level "core_model"
        '
     agents:
       queue: "mithril-h100-pool"
110 changes: 65 additions & 45 deletions tests/conftest.py
@@ -1146,18 +1146,6 @@ def convert_audio_bytes_to_text(raw_bytes: bytes) -> str:
     return text
 
 
-def merge_base64_and_convert_to_text(base64_list):
-    """
-    Merge a list of base64 encoded audio data and convert to text.
-    """
-    merged_audio = _merge_base64_audio_to_segment(base64_list)
-    output_path = f"./test_{uuid.uuid4().hex}.wav"
-    merged_audio.export(output_path, format="wav")
-    print(f"audio data is saved: {output_path}")
-    text = convert_audio_file_to_text(output_path)
-    return text
-
-
 def modify_stage_config(
     yaml_path: str,
     updates: dict[str, Any] = None,
@@ -1742,7 +1730,7 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     label = str(top.get("label", "")).lower()
     conf = float(top.get("score", 0.0))
 
-    if conf < 0.6:
+    if conf < 0.5:
         gender = "unknown"
     # Some models use non-English labels (e.g., Russian). Normalize to 'male'/'female'.
     elif ("female" in label) or ("жен" in label):
@@ -1771,6 +1759,34 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str:
     return "unknown"
 
 
+_PRESET_VOICE_GENDER_MAP: dict[str, str] = {
+    "serena": "female",
+    "uncle_fu": "male",
+    "chelsie": "female",
+    "clone": "female",
+    "ethan": "male",
+}
+
+
+def _assert_preset_voice_gender_from_audio(
+    audio_bytes: bytes | None,
+    voice_name: str | None,
+) -> None:
+    """If ``voice_name`` matches a known preset, assert the classifier's gender matches (skip when unknown)."""
+    if not voice_name or not audio_bytes:
+        return
+    key = str(voice_name).lower()
+    expected_gender = _PRESET_VOICE_GENDER_MAP.get(key)
+    if expected_gender is None:
+        return
+    estimated_gender = _estimate_voice_gender_from_audio(audio_bytes)
+    print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}")
+    if estimated_gender != "unknown":
+        assert estimated_gender == expected_gender, (
+            f"{voice_name!r} is expected to be {expected_gender}, but estimated gender is {estimated_gender!r}"
+        )
+
+
 # Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted).
 _MIN_PCM_SPEECH_HNR_DB = 1.0

Expand Down Expand Up @@ -1837,6 +1853,12 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
if "audio" in modalities:
assert response.audio_content is not None, "No audio output is generated"
print(f"audio content is: {response.audio_content}")
speaker = request_config.get("speaker")
if speaker:
_assert_preset_voice_gender_from_audio(
response.audio_bytes,
speaker,
)

if "text" in modalities:
assert response.text_content is not None, "No text output is generated"
@@ -1849,12 +1871,14 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any],
     keywords = keywords_dict.get(word_type)
     if "text" in modalities:
         if keywords:
-            assert any(keyword in response.text_content.lower() for keyword in keywords), (
+            text_lower = response.text_content.lower()
+            assert any(str(kw).lower() in text_lower for kw in keywords), (
                 "The output does not contain any of the keywords."
             )
     else:
         if keywords:
-            assert any(keyword in response.audio_content.lower() for keyword in keywords), (
+            audio_lower = response.audio_content.lower()
+            assert any(str(kw).lower() in audio_lower for kw in keywords), (
                 "The output does not contain any of the keywords."
             )

@@ -1908,24 +1932,12 @@ def assert_audio_speech_response(
         f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'"
     )
 
-    # Voice gender consistency check:
+    # Voice gender consistency check (preset names in ``_PRESET_VOICE_GENDER_MAP``).
     # When the estimator returns 'unknown', we treat it as inconclusive and do NOT fail the test.
-    voice = (request_config.get("voice") or "").lower()
-    if voice and response.audio_bytes:
-        estimated_gender = _estimate_voice_gender_from_audio(response.audio_bytes)
-        voice_gender_map = {
-            # adjust this mapping to your actual voice names
-            "serena": "female",
-            "uncle_fu": "male",
-            "clone": "female",
-        }
-        expected_gender = voice_gender_map.get(voice)
-        if expected_gender is not None:
-            print(f"Estimated voice gender from audio: {estimated_gender} (voice='{voice}')")
-            if estimated_gender != "unknown":
-                assert estimated_gender == expected_gender, (
-                    f"Voice '{voice}' is expected {expected_gender}, but estimated gender is '{estimated_gender}'"
-                )
+    _assert_preset_voice_gender_from_audio(
+        response.audio_bytes,
+        request_config.get("voice"),
+    )
 
 
 def assert_diffusion_response(response: DiffusionResponse, request_config: dict[str, Any], run_level: str = None):
@@ -2041,7 +2053,11 @@ def _process_stream_omni_response(self, chat_completion) -> OmniResponse:
 
         if audio_data or text_content:
             if audio_data:
-                audio_content = merge_base64_and_convert_to_text(audio_data)
+                merged_seg = _merge_base64_audio_to_segment(audio_data)
+                wav_buf = BytesIO()
+                merged_seg.export(wav_buf, format="wav")
+                result.audio_bytes = wav_buf.getvalue()
+                audio_content = convert_audio_bytes_to_text(result.audio_bytes)
             if audio_content and text_content:
                 similarity = cosine_similarity_text(audio_content.lower(), text_content.lower())
 
@@ -2096,7 +2112,8 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse:
 
         if audio_data or text_content:
             if audio_data:
-                audio_content = convert_audio_to_text(audio_data)
+                result.audio_bytes = base64.b64decode(audio_data)
+                audio_content = convert_audio_bytes_to_text(result.audio_bytes)
             if audio_content and text_content:
                 similarity = cosine_similarity_text(audio_content.lower(), text_content.lower())
 
@@ -2265,8 +2282,9 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
             request_config: Request configuration dictionary containing parameters like model, messages, stream.
                 Optional ``use_audio_in_video`` (bool): when true, sets
                 ``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio
-                extraction (merged with any existing ``extra_body`` / ``mm_processor_kwargs``).
-                Optional ``extra_body`` (dict): passed through to ``chat.completions.create`` after merge.
+                extraction.
+                Optional top-level ``speaker`` (str): Qwen3-Omni preset TTS speaker name; sent as
+                ``extra_body["speaker"]`` to ``chat.completions.create``.
             request_num: Number of requests, defaults to 1 (single request)
 
         Returns:
@@ -2278,9 +2296,8 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
         modalities = request_config.get("modalities", ["text", "audio"])
 
         extra_body: dict[str, Any] = {}
-        raw_extra = request_config.get("extra_body")
-        if raw_extra:
-            extra_body.update(raw_extra)
+        if "speaker" in request_config:
+            extra_body["speaker"] = request_config["speaker"]
         if request_config.get("use_audio_in_video"):
             mm = dict(extra_body.get("mm_processor_kwargs") or {})
             mm["use_audio_in_video"] = True
@@ -2312,12 +2329,15 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1
         # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip.
         def _one_omni_request():
             start = time.perf_counter()
-            chat_completion = self.client.chat.completions.create(
-                model=request_config.get("model"),
-                messages=request_config.get("messages"),
-                modalities=modalities,
-                stream=stream,
-            )
+            worker_kwargs: dict[str, Any] = {
+                "model": request_config.get("model"),
+                "messages": request_config.get("messages"),
+                "modalities": modalities,
+                "stream": stream,
+            }
+            if extra_body_arg is not None:
+                worker_kwargs["extra_body"] = extra_body_arg
+            chat_completion = self.client.chat.completions.create(**worker_kwargs)
             if stream:
                 response = self._process_stream_omni_response(chat_completion)
             else:
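Taken together, the conftest.py changes thread a top-level ``speaker`` key through to ``extra_body``, decode the returned audio into ``response.audio_bytes``, and gate a gender assertion on known preset names. A rough usage sketch under the new API; the model id and message content are placeholders:

request_config = {
    "model": "Qwen/Qwen3-Omni",  # placeholder model id
    "messages": [{"role": "user", "content": "Introduce yourself."}],
    "modalities": ["text", "audio"],
    "stream": True,
    "speaker": "ethan",  # preset name; forwarded as extra_body["speaker"]
}
# send_omni_request(request_config) fills response.audio_bytes, and
# assert_omni_response(response, request_config, ...) then calls
# _assert_preset_voice_gender_from_audio(response.audio_bytes, "ethan"),
# which asserts the classifier hears a male voice unless it returns "unknown".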
4 changes: 2 additions & 2 deletions tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -57,7 +57,7 @@ def get_question(prompt_type="mix"):
     return prompts.get(prompt_type, prompts["mix"])
 
 
-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3})
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)
@@ -88,7 +88,7 @@ def test_mix_to_audio(omni_runner, omni_runner_handler) -> None:
     omni_runner_handler.send_request(request_config)
 
 
-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3})
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)
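The core_model to advanced_model marker flips in these omni test files (including tests/e2e/offline_inference/test_qwen3_omni.py below) move the suites behind the --run-level gate. This diff does not show how --run-level is implemented; a minimal, hypothetical conftest sketch of the usual wiring for such a gate:

# Hypothetical sketch only; the repo's actual --run-level hook is not in this PR.
import pytest


def pytest_addoption(parser):
    parser.addoption("--run-level", default="core_model")


def pytest_collection_modifyitems(config, items):
    # Deselect advanced_model-marked tests unless --run-level asks for them.
    if config.getoption("--run-level") == "advanced_model":
        return
    skip = pytest.mark.skip(reason="needs --run-level advanced_model")
    for item in items:
        if "advanced_model" in item.keywords:
            item.add_marker(skip)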
2 changes: 1 addition & 1 deletion tests/e2e/offline_inference/test_qwen3_omni.py
@@ -56,7 +56,7 @@ def get_question(prompt_type="video"):
     return prompts.get(prompt_type, prompts["video"])
 
 
-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)