diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml
index 7bee193191..118af40a08 100644
--- a/.buildkite/test-merge.yml
+++ b/.buildkite/test-merge.yml
@@ -56,8 +56,8 @@ steps:
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py
-      - pytest -s -v tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
+      # Single pytest session for one combined summary at end of log.
+      - pytest -s -v tests/e2e/offline_inference/test_diffusion_cpu_offload.py tests/e2e/offline_inference/test_diffusion_layerwise_offload.py
     agents:
       queue: "gpu_1_queue" # g6.4xlarge instance on AWS, has 1 L4 GPU
     plugins:
@@ -111,8 +111,7 @@ steps:
     timeout_in_minutes: 20
     depends_on: upload-merge-pipeline
     commands:
-      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py
-      - pytest -s -v tests/diffusion/distributed/test_ulysses_uaa_perf.py
+      - pytest -s -v tests/e2e/offline_inference/test_sequence_parallel.py tests/diffusion/distributed/test_ulysses_uaa_perf.py
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
@@ -193,8 +192,7 @@ steps:
     commands:
       - export VLLM_LOGGING_LEVEL=DEBUG
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
+      - pytest -s -v tests/e2e/offline_inference/test_qwen2_5_omni.py tests/e2e/online_serving/test_qwen2_5_omni.py -m "advanced_model" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
@@ -216,7 +214,7 @@ steps:
         export VLLM_LOGGING_LEVEL=DEBUG
         export VLLM_WORKER_MULTIPROC_METHOD=spawn
         export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-        pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_customvoice.py
+        pytest -s -v tests/e2e/online_serving/test_qwen3_tts_customvoice.py tests/e2e/offline_inference/test_qwen3_tts_customvoice.py -m "advanced_model" --run-level "advanced_model"
       '
     agents:
       queue: "gpu_1_queue"
@@ -239,7 +237,7 @@ steps:
         export VLLM_LOGGING_LEVEL=DEBUG
         export VLLM_WORKER_MULTIPROC_METHOD=spawn
         export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-        pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model" && pytest -s -v tests/e2e/offline_inference/test_qwen3_tts_base.py
+        pytest -s -v tests/e2e/online_serving/test_qwen3_tts_base.py tests/e2e/offline_inference/test_qwen3_tts_base.py -m "advanced_model" --run-level "advanced_model"
       '
     agents:
       queue: "gpu_1_queue"
@@ -259,9 +257,8 @@ steps:
     depends_on: upload-merge-pipeline
     commands:
      - export VLLM_WORKER_MULTIPROC_METHOD=spawn
-      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py
-      - pytest -s -v tests/e2e/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
-      - pytest -s -v tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model"
+      - export VLLM_TEST_CLEAN_GPU_MEMORY="1"
+      - pytest -s -v tests/e2e/offline_inference/test_qwen3_omni.py tests/e2e/online_serving/test_qwen3_omni.py tests/e2e/online_serving/test_mimo_audio.py -m "advanced_model" --run-level "advanced_model"
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -347,8 +344,7 @@ steps:
         export VLLM_TEST_CLEAN_GPU_MEMORY=1
         export VLLM_IMAGE_FETCH_TIMEOUT=60
         pytest -s -v tests/e2e/offline_inference/test_bagel_text2img.py -m "advanced_model" --run-level "advanced_model" -k "shared_memory"
-        pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py -m "advanced_model" --run-level "advanced_model"
-        pytest -s -v tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
+        pytest -s -v tests/e2e/offline_inference/test_bagel_img2img.py tests/e2e/online_serving/test_bagel_online.py -m "advanced_model" --run-level "advanced_model"
       '
     agents:
       queue: "mithril-h100-pool"
@@ -392,8 +388,7 @@ steps:
       timeout 20m bash -c '
         export VLLM_LOGGING_LEVEL=DEBUG
         export VLLM_WORKER_MULTIPROC_METHOD=spawn
-        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
       '
     agents:
       queue: "mithril-h100-pool"
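
Note: the consolidations in test-merge.yml above also change failure semantics, not
just log formatting. Buildkite stops a step at the first failing commands: entry, so
with two separate invocations a failure in the first test file skipped the second
file entirely; a single pytest session always collects both files and exits with one
aggregated status, printed as one combined summary. A minimal sketch of the
equivalent programmatic call (illustrative only; paths taken from the first hunk):

    import sys

    import pytest

    if __name__ == "__main__":
        # One session, one combined summary, one exit code for both suites.
        sys.exit(
            pytest.main(
                [
                    "-s", "-v",
                    "tests/e2e/offline_inference/test_diffusion_cpu_offload.py",
                    "tests/e2e/offline_inference/test_diffusion_layerwise_offload.py",
                ]
            )
        )
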
diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml
index 5c6d6d35a6..9088c352b1 100644
--- a/.buildkite/test-nightly.yml
+++ b/.buildkite/test-nightly.yml
@@ -6,16 +6,10 @@ steps:
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - |
-        set +e
-        pytest -s -v tests/e2e/online_serving/test_qwen3_omni_expansion.py -m "advanced_model" --run-level "advanced_model"
-        EXIT1=$$?
-        pytest -s -v tests/examples/online_serving/test_qwen3_omni.py -m "advanced_model" --run-level "advanced_model"
-        EXIT2=$$?
-        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        EXIT3=$$?
-        pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        EXIT4=$$?
-        exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4))
+        pytest -s -v \
+          tests/examples/ \
+          tests/e2e/online_serving/test_*_expansion.py \
+          -m "advanced_model and H100 and omni" --run-level "advanced_model"
     agents:
       queue: "mithril-h100-pool"
     plugins:
@@ -57,8 +51,7 @@ steps:
     commands:
       - export VLLM_WORKER_MULTIPROC_METHOD=spawn
       - export VLLM_ALLOW_LONG_MAX_MODEL_LEN="1"
-      - pytest -s -v tests/examples/ -m "advanced_model and L4 and omni" --run-level "advanced_model"
-      - pytest -s -v tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
+      - pytest -s -v tests/examples/ tests/e2e/online_serving/test_*_expansion.py -m "advanced_model and L4 and omni" --run-level "advanced_model"
     agents:
       queue: "gpu_4_queue" # g6.12xlarge instance on AWS, has 4 L4 GPU
     plugins:
diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml
index 89839a2d1e..1ed9e8980c 100644
--- a/.buildkite/test-ready.yml
+++ b/.buildkite/test-ready.yml
@@ -328,8 +328,7 @@ steps:
       timeout 20m bash -c '
         export VLLM_LOGGING_LEVEL=DEBUG
         export VLLM_WORKER_MULTIPROC_METHOD=spawn
-        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
-        pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
+        pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "core_model" --run-level "core_model"
       '
     agents:
       queue: "mithril-h100-pool"
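
Note: the nightly step now leans on pytest marker algebra instead of the old
per-file exit-code bookkeeping (set +e / EXIT1..EXIT4). The expression
-m "advanced_model and H100 and omni" collects only tests carrying all three
markers; in this repo the hardware markers appear to come from the hardware_test
decorator, so the following sketch of the selection behaviour is illustrative,
not the repo's actual code:

    import pytest

    # Custom markers are assumed to be registered in pytest.ini / conftest.py.

    @pytest.mark.advanced_model
    @pytest.mark.H100
    @pytest.mark.omni
    def test_collected_by_nightly_h100_step():
        """Matches "advanced_model and H100 and omni", so it is collected."""

    @pytest.mark.advanced_model
    @pytest.mark.L4
    @pytest.mark.omni
    def test_deselected_by_nightly_h100_step():
        """No H100 marker, so the H100 expression deselects it (the L4 step runs it)."""
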
elif ("female" in label) or ("жен" in label): @@ -1771,6 +1759,34 @@ def _estimate_voice_gender_from_audio(audio_bytes: bytes) -> str: return "unknown" +_PRESET_VOICE_GENDER_MAP: dict[str, str] = { + "serena": "female", + "uncle_fu": "male", + "chelsie": "female", + "clone": "female", + "ethan": "male", +} + + +def _assert_preset_voice_gender_from_audio( + audio_bytes: bytes | None, + voice_name: str | None, +) -> None: + """If ``voice_name`` matches a known preset, assert classifier gender matches (skip when unknown).""" + if not voice_name or not audio_bytes: + return + key = str(voice_name).lower() + expected_gender = _PRESET_VOICE_GENDER_MAP.get(key) + if expected_gender is None: + return + estimated_gender = _estimate_voice_gender_from_audio(audio_bytes) + print(f"Preset voice gender check: preset={key!r}, estimated={estimated_gender!r}, expected={expected_gender!r}") + if estimated_gender != "unknown": + assert estimated_gender == expected_gender, ( + f"{voice_name!r} is expected {expected_gender}, but estimated gender is {estimated_gender!r}" + ) + + # Threshold aligned with _compute_pcm_hnr_db docstring (clean clone vs distorted). _MIN_PCM_SPEECH_HNR_DB = 1.0 @@ -1837,6 +1853,12 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], if "audio" in modalities: assert response.audio_content is not None, "No audio output is generated" print(f"audio content is: {response.audio_content}") + speaker = request_config.get("speaker") + if speaker: + _assert_preset_voice_gender_from_audio( + response.audio_bytes, + speaker, + ) if "text" in modalities: assert response.text_content is not None, "No text output is generated" @@ -1849,12 +1871,14 @@ def assert_omni_response(response: OmniResponse, request_config: dict[str, Any], keywords = keywords_dict.get(word_type) if "text" in modalities: if keywords: - assert any(keyword in response.text_content.lower() for keyword in keywords), ( + text_lower = response.text_content.lower() + assert any(str(kw).lower() in text_lower for kw in keywords), ( "The output does not contain any of the keywords." ) else: if keywords: - assert any(keyword in response.audio_content.lower() for keyword in keywords), ( + audio_lower = response.audio_content.lower() + assert any(str(kw).lower() in audio_lower for kw in keywords), ( "The output does not contain any of the keywords." ) @@ -1908,24 +1932,12 @@ def assert_audio_speech_response( f"Transcript doesn't match input: similarity={similarity:.2f}, transcript='{transcript}'" ) - # Voice gender consistency check: + # Voice gender consistency check (preset names in ``_PRESET_VOICE_GENDER_MAP``). # When the estimator returns 'unknown', we treat it as inconclusive and do NOT fail the test. 
- voice = (request_config.get("voice") or "").lower() - if voice and response.audio_bytes: - estimated_gender = _estimate_voice_gender_from_audio(response.audio_bytes) - voice_gender_map = { - # adjust this mapping to your actual voice names - "serena": "female", - "uncle_fu": "male", - "clone": "female", - } - expected_gender = voice_gender_map.get(voice) - if expected_gender is not None: - print(f"Estimated voice gender from audio: {estimated_gender} (voice='{voice}')") - if estimated_gender != "unknown": - assert estimated_gender == expected_gender, ( - f"Voice '{voice}' is expected {expected_gender}, but estimated gender is '{estimated_gender}'" - ) + _assert_preset_voice_gender_from_audio( + response.audio_bytes, + request_config.get("voice"), + ) def assert_diffusion_response(response: DiffusionResponse, request_config: dict[str, Any], run_level: str = None): @@ -2041,7 +2053,11 @@ def _process_stream_omni_response(self, chat_completion) -> OmniResponse: if audio_data or text_content: if audio_data: - audio_content = merge_base64_and_convert_to_text(audio_data) + merged_seg = _merge_base64_audio_to_segment(audio_data) + wav_buf = BytesIO() + merged_seg.export(wav_buf, format="wav") + result.audio_bytes = wav_buf.getvalue() + audio_content = convert_audio_bytes_to_text(result.audio_bytes) if audio_content and text_content: similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -2096,7 +2112,8 @@ def _process_non_stream_omni_response(self, chat_completion) -> OmniResponse: if audio_data or text_content: if audio_data: - audio_content = convert_audio_to_text(audio_data) + result.audio_bytes = base64.b64decode(audio_data) + audio_content = convert_audio_bytes_to_text(result.audio_bytes) if audio_content and text_content: similarity = cosine_similarity_text(audio_content.lower(), text_content.lower()) @@ -2265,8 +2282,9 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 request_config: Request configuration dictionary containing parameters like model, messages, stream. Optional ``use_audio_in_video`` (bool): when true, sets ``extra_body["mm_processor_kwargs"] = {"use_audio_in_video": True}`` for Qwen-Omni video+audio - extraction (merged with any existing ``extra_body`` / ``mm_processor_kwargs``). - Optional ``extra_body`` (dict): passed through to ``chat.completions.create`` after merge. + extraction. + Optional top-level ``speaker`` (str): Qwen3-Omni preset TTS speaker name; sent as + ``extra_body["speaker"]`` to ``chat.completions.create``. request_num: Number of requests, defaults to 1 (single request) Returns: @@ -2278,9 +2296,8 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 modalities = request_config.get("modalities", ["text", "audio"]) extra_body: dict[str, Any] = {} - raw_extra = request_config.get("extra_body") - if raw_extra: - extra_body.update(raw_extra) + if "speaker" in request_config: + extra_body["speaker"] = request_config["speaker"] if request_config.get("use_audio_in_video"): mm = dict(extra_body.get("mm_processor_kwargs") or {}) mm["use_audio_in_video"] = True @@ -2312,12 +2329,15 @@ def send_omni_request(self, request_config: dict[str, Any], request_num: int = 1 # Send concurrent requests: run create + process in worker so e2e_latency includes full round-trip. 
diff --git a/tests/e2e/offline_inference/test_qwen2_5_omni.py b/tests/e2e/offline_inference/test_qwen2_5_omni.py
index 6af59c1f63..4c4315aab9 100644
--- a/tests/e2e/offline_inference/test_qwen2_5_omni.py
+++ b/tests/e2e/offline_inference/test_qwen2_5_omni.py
@@ -57,7 +57,7 @@ def get_question(prompt_type="mix"):
    return prompts.get(prompt_type, prompts["mix"])


-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3})
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)
@@ -88,7 +88,7 @@ def test_mix_to_audio(omni_runner, omni_runner_handler) -> None:
    omni_runner_handler.send_request(request_config)


-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4", "rocm": "MI325", "xpu": "B60"}, num_cards={"cuda": 4, "rocm": 2, "xpu": 3})
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)
diff --git a/tests/e2e/offline_inference/test_qwen3_omni.py b/tests/e2e/offline_inference/test_qwen3_omni.py
index 01be0486fc..cc0af437ec 100644
--- a/tests/e2e/offline_inference/test_qwen3_omni.py
+++ b/tests/e2e/offline_inference/test_qwen3_omni.py
@@ -56,7 +56,7 @@ def get_question(prompt_type="video"):
    return prompts.get(prompt_type, prompts["video"])


-@pytest.mark.core_model
+@pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
 @pytest.mark.parametrize("omni_runner", test_params, indirect=True)
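
Note: the core_model -> advanced_model demotions above only take effect through
the --run-level flag the pipelines pass, so these suites now run only in steps
that request --run-level "advanced_model" (e.g., the merge steps earlier in this
patch). The repo's actual conftest wiring for --run-level is not part of this
diff; a hypothetical sketch of how such a flag is commonly implemented (all
names here are assumptions):

    import pytest

    def pytest_addoption(parser):
        parser.addoption("--run-level", default="core_model", help="highest test tier to run")

    def pytest_collection_modifyitems(config, items):
        # With the default --run-level, advanced_model-marked tests are skipped.
        if config.getoption("--run-level") == "advanced_model":
            return
        skip = pytest.mark.skip(reason="needs --run-level advanced_model")
        for item in items:
            if item.get_closest_marker("advanced_model"):
                item.add_marker(skip)
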
diff --git a/tests/e2e/online_serving/test_qwen3_omni_expansion.py b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
index 4055ad4267..0bcc86840b 100644
--- a/tests/e2e/online_serving/test_qwen3_omni_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_omni_expansion.py
@@ -26,7 +26,7 @@
 model = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

 AUDIO_KEY = ["test"]
-IMAGE_KEY = ["square", "quadrate"]
+IMAGE_KEY = ["square", "quadrate", "rectangle"]
 VIDEO_KEY = ["sphere", "globe", "circle", "round", "ball"]


@@ -103,6 +103,7 @@ def get_prompt(prompt_type="text_only"):
        "text_audio": "What is in this audio? ",
        "text_audio_video": "First, what is in this audio? Then, what is in this video? ",
        "one_word": "What is the capital of UK? Answer in one word",
+        "text_chinese": "北京,中国的首都,是一座融合了长城等历史地点与现代建筑的国际化大都市,充满了独特的文化与活力。请重复这句话。",
    }
    return prompts.get(prompt_type, prompts["text_only"])

@@ -464,20 +465,10 @@ def test_audio_in_video_002(omni_server, openai_client) -> None:
        "messages": messages,
        "stream": True,
        "use_audio_in_video": True,
-        "key_words": {"video": VIDEO_KEY, "audio": AUDIO_KEY + ["beep", "electronic"]},
+        "key_words": {"video": VIDEO_KEY},
    }

-    # Retry when assert_omni_response fails on key_words (see tests/conftest.py).
-    _keyword_assert_msg = "The output does not contain any of the keywords."
-    _max_retries = 3
-    for attempt in range(_max_retries):
-        try:
-            openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
-            break
-        except AssertionError as e:
-            if _keyword_assert_msg not in str(e) or attempt == _max_retries - 1:
-                raise
-            print(f"Keyword assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}")
+    openai_client.send_omni_request(request_config, request_num=get_max_batch_size())


 @pytest.mark.advanced_model
@@ -514,3 +505,120 @@ def test_one_word_prompt_001(omni_server, openai_client) -> None:
            if _similarity_assert_msg not in str(e) or attempt == _max_retries - 1:
                raise
            print(f"Similarity assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}")
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_speaker_001(omni_server, openai_client) -> None:
+    """
+    Input Modal: text only (one-word answer constraint).
+    Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text.
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text"),
+    )
+
+    request_config = {
+        "model": omni_server.model,
+        "messages": messages,
+        "stream": True,
+        "speaker": "Chelsie",
+        "key_words": {"text": ["beijing"]},
+    }
+
+    openai_client.send_omni_request(request_config)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_speaker_002(omni_server, openai_client) -> None:
+    """
+    Input Modal: text only (one-word answer constraint).
+    Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text.
+    Input Setting: stream=True
+    Datasets: batched requests (``get_max_batch_size()``)
+    """
+    messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text"),
+    )
+
+    request_config = {
+        "model": omni_server.model,
+        "messages": messages,
+        "stream": True,
+        "speaker": "Ethan",
+        "key_words": {"text": ["beijing"]},
+    }
+
+    # Retry only when assert_omni_response fails on preset voice gender (see tests/conftest.py).
+    _gender_assert_substr = "estimated gender"
+    _max_retries = 3
+    for attempt in range(_max_retries):
+        try:
+            openai_client.send_omni_request(request_config, request_num=get_max_batch_size())
+            break
+        except AssertionError as e:
+            if _gender_assert_substr not in str(e) or attempt == _max_retries - 1:
+                raise
+            print(f"Gender assertion failed, retrying {attempt + 2}/{_max_retries}: {e!r}")
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_speaker_003(omni_server, openai_client) -> None:
+    """
+    Input Modal: text only (one-word answer constraint).
+    Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text.
+    Input Setting: stream=True, uppercase preset speaker name
+    Datasets: single request
+    """
+    messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text"),
+    )
+
+    request_config = {
+        "model": omni_server.model,
+        "messages": messages,
+        "stream": True,
+        "speaker": "CHELSIE",
+        "key_words": {"text": ["beijing"]},
+    }
+
+    openai_client.send_omni_request(request_config)
+
+
+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "H100", "rocm": "MI325"}, num_cards=2)
+@pytest.mark.parametrize("omni_server", test_params, indirect=True)
+def test_language_001(omni_server, openai_client) -> None:
+    """
+    Input Modal: text only (Chinese prompt asking the model to repeat the sentence).
+    Output Modal: text, audio (default ``modalities``); ``key_words`` only assert on text.
+    Input Setting: stream=True
+    Datasets: single request
+    """
+    messages = dummy_messages_from_mix_data(
+        system_prompt=get_system_prompt(),
+        content_text=get_prompt("text_chinese"),
+    )
+
+    request_config = {
+        "model": omni_server.model,
+        "messages": messages,
+        "stream": True,
+        "key_words": {"text": ["北京"]},
+    }
+
+    openai_client.send_omni_request(request_config)
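
Note: "speaker" in the new test_speaker_* request configs is not a standard
OpenAI chat-completions argument. Per the conftest change above, the helper
moves it into extra_body, which the OpenAI SDK merges into the request JSON, so
the vLLM server sees a top-level "speaker" field. Roughly what goes over the
wire (illustrative; base_url and api_key are placeholders):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    stream = client.chat.completions.create(
        model="Qwen/Qwen3-Omni-30B-A3B-Instruct",
        messages=[{"role": "user", "content": "What is the capital of China?"}],
        modalities=["text", "audio"],
        stream=True,
        extra_body={"speaker": "Chelsie"},  # preset voice; conftest expects female audio
    )
    for chunk in stream:
        ...  # chunks carry text deltas and base64 audio deltas for the stream handler
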
diff --git a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
index 9921e3a4a1..03a985896e 100644
--- a/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
+++ b/tests/e2e/online_serving/test_qwen3_tts_customvoice_expansion.py
@@ -120,6 +120,31 @@ def test_voice_002(omni_server, openai_client) -> None:
    openai_client.send_audio_speech_request(request_config)


+@pytest.mark.advanced_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+def test_voice_003(omni_server, openai_client) -> None:
+    """
+    Test text input processing and audio output via OpenAI API.
+    Deploy Setting: default yaml
+    Input Modal: text
+    Output Modal: audio
+    Input Setting: stream=False, language=chinese, uppercase preset voice name
+    Datasets: single request
+    """
+    request_config = {
+        "model": omni_server.model,
+        "input": get_prompt(),
+        "stream": False,
+        "response_format": "wav",
+        "task_type": "CustomVoice",
+        "voice": "SERENA",
+    }
+
+    openai_client.send_audio_speech_request(request_config)
+
+
 @pytest.mark.advanced_model
 @pytest.mark.omni
 @hardware_test(res={"cuda": "L4"}, num_cards=1)
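
Note: test_voice_003 exercises an uppercase preset voice name ("SERENA") end to
end; the gender check in tests/conftest.py lowercases the name before the
_PRESET_VOICE_GENDER_MAP lookup, so the assertion still resolves to "female".
send_audio_speech_request's exact mapping onto the SDK lives outside this diff;
task_type is a vLLM extension, so it is assumed here to travel via extra_body
(model name and endpoint are placeholders):

    from openai import OpenAI

    client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

    with client.audio.speech.with_streaming_response.create(
        model="qwen3-tts",  # placeholder model name
        voice="SERENA",     # uppercase on purpose: preset lookup is case-insensitive
        input="text to speak",
        response_format="wav",
        extra_body={"task_type": "CustomVoice"},  # assumed transport for the vLLM extension
    ) as response:
        response.stream_to_file("out.wav")
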