Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .buildkite/test-merge.yml
Original file line number Diff line number Diff line change
Expand Up @@ -390,6 +390,16 @@ steps:
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model"
'

- label: "CosyVoice3-TTS E2E Test"
timeout_in_minutes: 20
depends_on: upload-merge-pipeline
commands:
- |
timeout 20m bash -c '
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model"
'
agents:
queue: "mithril-h100-pool"
plugins:
Expand All @@ -408,6 +418,11 @@ steps:
env:
- name: HF_HOME
value: /root/.cache/huggingface
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
volumes:
Expand Down
43 changes: 43 additions & 0 deletions .buildkite/test-ready.yml
Original file line number Diff line number Diff line change
Expand Up @@ -548,3 +548,46 @@ steps:
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate

# CosyVoice3 TTS end-to-end step: runs the online-serving CosyVoice3 test file,
# selecting tests with the "core_model" marker and run level.
- label: "CosyVoice3-TTS E2E Test"
# Step-level timeout; the shell command below is additionally wrapped in "timeout 20m".
timeout_in_minutes: 20
depends_on: upload-ready-pipeline
commands:
- |
timeout 20m bash -c '
export VLLM_WORKER_MULTIPROC_METHOD=spawn
pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model"
'
# Scheduled on the "mithril-h100-pool" queue via the kubernetes plugin.
agents:
queue: "mithril-h100-pool"
plugins:
- kubernetes:
podSpec:
containers:
# Test image tagged with the commit being built; one NVIDIA GPU is requested.
- image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
resources:
limits:
nvidia.com/gpu: 1
volumeMounts:
- name: devshm
mountPath: /dev/shm
- name: hf-cache
mountPath: /root/.cache/huggingface
env:
# HF_HOME points at the path where the hf-cache volume is mounted.
- name: HF_HOME
value: /root/.cache/huggingface
# HF_TOKEN is read from key "token" of the secret "hf-token-secret".
- name: HF_TOKEN
valueFrom:
secretKeyRef:
name: hf-token-secret
key: token
nodeSelector:
node.kubernetes.io/instance-type: gpu-h100-sxm
# devshm: memory-backed emptyDir (mounted at /dev/shm above).
# hf-cache: hostPath /mnt/hf-cache, created as a directory if it does not exist.
volumes:
- name: devshm
emptyDir:
medium: Memory
- name: hf-cache
hostPath:
path: /mnt/hf-cache
type: DirectoryOrCreate
124 changes: 124 additions & 0 deletions tests/e2e/online_serving/test_cosyvoice3_tts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To unify the code style, could we restructure this test case to follow tests/e2e/online_serving/test_qwen3_tts_base.py? If there are validation points that it cannot cover, we can add them to assert_audio_speech_response in tests/conftest.py.

# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
E2E Online tests for CosyVoice3 TTS model with voice cloning.

These tests verify the /v1/audio/speech endpoint works correctly with
the CosyVoice3 model, which requires reference audio for voice cloning.
"""

import os

# Both variables are set at import time, before the remaining imports below run.
# NOTE(review): presumably this makes worker processes start via "spawn" and turns off
# the test harness's GPU-memory cleanup — confirm against the code that reads them.
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0"

from pathlib import Path

import pytest

from tests.conftest import OmniServerParams
from tests.utils import hardware_test

# Model identifier handed to OmniServerParams for every test in this module.
MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512"

# Official CosyVoice zero-shot prompt audio and its transcript.
# The tests in this module send them as the "ref_audio" and "ref_text" request fields.
REF_AUDIO_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav"
REF_TEXT = "希望你以后能够做的比我还好呦。"


def get_stage_config(name: str = "cosyvoice3.yaml"):
    """Return the path of a stage-config file as a string.

    The path is built by walking four directory levels up from this file and
    then descending into ``vllm_omni/model_executor/stage_configs``.

    Args:
        name: File name of the stage config to locate.

    Returns:
        The joined path converted with ``str``. Existence of the file is not
        checked here.
    """
    base_dir = Path(__file__)
    # Climb four levels, one ``.parent`` hop at a time.
    for _ in range(4):
        base_dir = base_dir.parent
    return str(base_dir.joinpath("vllm_omni", "model_executor", "stage_configs", name))


def get_prompt(prompt_type="zh"):
    """Return the text to synthesize for the given language key.

    Args:
        prompt_type: ``"zh"`` or ``"en"``. Any other key falls back to the
            ``"zh"`` text.

    Returns:
        The prompt string for the selected language.
    """
    fallback_key = "zh"
    texts_by_lang = {
        "zh": "收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的感动让我热泪盈眶。",
        "en": "Hello, this is a voice cloning test with English text.",
    }
    # Unknown keys are redirected to the fallback language.
    if prompt_type not in texts_by_lang:
        prompt_type = fallback_key
    return texts_by_lang[prompt_type]


# Server configuration used to parametrize the ``omni_server`` fixture in the
# tests below: the CosyVoice3 model with its default stage config.
_cosyvoice3_server_params = OmniServerParams(
    model=MODEL,
    stage_config_path=get_stage_config(),
    server_args=["--trust-remote-code", "--disable-log-stats"],
)

tts_server_params = [pytest.param(_cosyvoice3_server_params, id="cosyvoice3")]


@pytest.mark.advanced_model
@pytest.mark.core_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
def test_voice_clone_zh_001(omni_server, openai_client) -> None:
    """
    Voice cloning TTS with the Chinese prompt, non-streaming, via the OpenAI client.
    Deploy Setting: default yaml
    Input Modal: text + ref_audio + ref_text
    Output Modal: audio
    Input Setting: stream=False
    Datasets: single request
    """
    # Request body: target text plus the reference audio/transcript to clone from.
    payload = dict(
        model=omni_server.model,
        input=get_prompt("zh"),
        stream=False,
        response_format="wav",
        ref_audio=REF_AUDIO_URL,
        ref_text=REF_TEXT,
    )
    openai_client.send_audio_speech_request(payload)


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
def test_voice_clone_zh_002(omni_server, openai_client) -> None:
    """
    Voice cloning TTS with the Chinese prompt, streaming, via the OpenAI client.
    Deploy Setting: default yaml
    Input Modal: text + ref_audio + ref_text
    Output Modal: audio
    Input Setting: stream=True
    Datasets: single request
    """
    # Same request as the non-streaming Chinese case, but with streaming enabled.
    payload = dict(
        model=omni_server.model,
        input=get_prompt("zh"),
        stream=True,
        response_format="wav",
        ref_audio=REF_AUDIO_URL,
        ref_text=REF_TEXT,
    )
    openai_client.send_audio_speech_request(payload)


@pytest.mark.advanced_model
@pytest.mark.omni
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
def test_voice_clone_en_001(omni_server, openai_client) -> None:
    """
    Voice cloning TTS with the English prompt, non-streaming, via the OpenAI client.
    Deploy Setting: default yaml
    Input Modal: text + ref_audio + ref_text
    Output Modal: audio
    Input Setting: stream=False
    Datasets: single request
    """
    # English target text; the reference audio and transcript are unchanged.
    payload = dict(
        model=omni_server.model,
        input=get_prompt("en"),
        stream=False,
        response_format="wav",
        ref_audio=REF_AUDIO_URL,
        ref_text=REF_TEXT,
    )
    openai_client.send_audio_speech_request(payload)
112 changes: 112 additions & 0 deletions tests/entrypoints/openai_api/test_serving_speech.py
Original file line number Diff line number Diff line change
Expand Up @@ -1872,3 +1872,115 @@ def test_streaming_unsupported_format_rejected(self, wav_streaming_app):
for fmt in unsupported_formats:
response = client.post("/v1/audio/speech", json={"input": "Hello", "stream": True, "response_format": fmt})
assert response.status_code == 422


# ---- CosyVoice3 Serving Tests ----


@pytest.fixture
def cosyvoice3_server(mocker: MockerFixture):
    """Build an ``OmniOpenAIServingSpeech`` backed by a mocked CosyVoice3 engine.

    The speaker and codec-frame-rate loaders are patched to return an empty
    set and ``None``. The mocked engine client exposes a single stage whose
    ``model_stage`` is ``"cosyvoice3_talker"`` and a ``generate`` mock that
    returns the sentinel string ``"generator"``.
    """
    stubbed_loaders = (
        ("_load_supported_speakers", set()),
        ("_load_codec_frame_rate", None),
    )
    for loader_name, stub_result in stubbed_loaders:
        mocker.patch.object(OmniOpenAIServingSpeech, loader_name, return_value=stub_result)

    talker_stage = SimpleNamespace(
        engine_args=SimpleNamespace(model_stage="cosyvoice3_talker"),
        tts_args={},
    )

    engine = mocker.MagicMock()
    engine.errored = False
    engine.model_config = mocker.MagicMock(model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512")
    engine.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)]
    engine.tts_batch_max_items = 32
    engine.generate = mocker.MagicMock(return_value="generator")
    engine.stage_configs = [talker_stage]

    model_registry = mocker.MagicMock()
    model_registry.is_base_model.return_value = True

    return OmniOpenAIServingSpeech(
        engine_client=engine,
        models=model_registry,
        request_logger=mocker.MagicMock(),
    )


class TestCosyVoice3Serving:
    """Unit tests for CosyVoice3 handling, run against the mocked ``cosyvoice3_server`` fixture."""

    def test_cosyvoice3_model_type_detection(self, cosyvoice3_server):
        """A server built from a ``cosyvoice3_talker`` stage reports the CosyVoice3 TTS type."""
        server = cosyvoice3_server
        assert server._tts_model_type == "cosyvoice3"
        assert server._is_tts is True
        assert server._is_cosyvoice3 is True

    def test_cosyvoice3_stage_registered(self):
        """The ``cosyvoice3_talker`` stage name appears in both stage registries."""
        from vllm_omni.entrypoints.openai.serving_speech import (
            _COSYVOICE3_TTS_MODEL_STAGES,
            _TTS_MODEL_STAGES,
        )

        for stage_registry in (_COSYVOICE3_TTS_MODEL_STAGES, _TTS_MODEL_STAGES):
            assert "cosyvoice3_talker" in stage_registry

    def test_validate_cosyvoice3_empty_input(self, cosyvoice3_server):
        """An empty ``input`` yields an error message mentioning "empty"."""
        speech_request = OpenAICreateSpeechRequest(
            input="",
            ref_audio="data:audio/wav;base64,abc",
            ref_text="hello",
        )
        message = cosyvoice3_server._validate_cosyvoice3_request(speech_request)
        assert message is not None
        assert "empty" in message.lower()

    def test_validate_cosyvoice3_missing_ref_audio(self, cosyvoice3_server):
        """Omitting ``ref_audio`` yields an error message naming that field."""
        speech_request = OpenAICreateSpeechRequest(input="Hello", ref_text="hello")
        message = cosyvoice3_server._validate_cosyvoice3_request(speech_request)
        assert message is not None
        assert "ref_audio" in message.lower()

    def test_validate_cosyvoice3_missing_ref_text(self, cosyvoice3_server):
        """Omitting ``ref_text`` yields an error message naming that field."""
        speech_request = OpenAICreateSpeechRequest(
            input="Hello",
            ref_audio="data:audio/wav;base64,abc",
        )
        message = cosyvoice3_server._validate_cosyvoice3_request(speech_request)
        assert message is not None
        assert "ref_text" in message.lower()

    def test_validate_cosyvoice3_invalid_ref_audio_format(self, cosyvoice3_server):
        """A local file path as ``ref_audio`` is rejected with a URL/format error."""
        speech_request = OpenAICreateSpeechRequest(
            input="Hello",
            ref_audio="/local/path.wav",
            ref_text="hello",
        )
        message = cosyvoice3_server._validate_cosyvoice3_request(speech_request)
        assert message is not None
        lowered = message.lower()
        assert any(keyword in lowered for keyword in ("url", "format"))

    def test_validate_cosyvoice3_valid_request(self, cosyvoice3_server):
        """A request with input, data-URL reference audio and transcript passes validation."""
        speech_request = OpenAICreateSpeechRequest(
            input="Hello world",
            ref_audio="data:audio/wav;base64,abc123",
            ref_text="Reference transcript",
        )
        assert cosyvoice3_server._validate_cosyvoice3_request(speech_request) is None

    def test_validate_cosyvoice3_max_new_tokens_range(self, cosyvoice3_server):
        """``max_new_tokens=0`` yields an error message naming ``max_new_tokens``."""
        speech_request = OpenAICreateSpeechRequest(
            input="Hello",
            ref_audio="data:audio/wav;base64,abc",
            ref_text="hello",
            max_new_tokens=0,
        )
        message = cosyvoice3_server._validate_cosyvoice3_request(speech_request)
        assert message is not None
        assert "max_new_tokens" in message

    def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server):
        """``_prepare_speech_generation`` awaits the prompt builder once and returns the engine's generator."""
        # Replace the prompt builder with an async stub so no reference audio is decoded.
        stub_prompt = {
            "prompt": "Hello",
            "multi_modal_data": {"audio": (np.zeros(24000), 24000)},
            "mm_processor_kwargs": {"prompt_text": "ref text", "sample_rate": 24000},
        }
        cosyvoice3_server._build_cosyvoice3_prompt = AsyncMock(return_value=stub_prompt)

        speech_request = OpenAICreateSpeechRequest(
            input="Hello",
            ref_audio="data:audio/wav;base64,abc",
            ref_text="Reference text",
        )
        prepared = asyncio.run(cosyvoice3_server._prepare_speech_generation(speech_request))
        request_id, generator, tts_params = prepared

        assert request_id.startswith("speech-")
        # "generator" is the sentinel returned by the mocked engine's generate().
        assert generator == "generator"
        assert tts_params == {}
        cosyvoice3_server._build_cosyvoice3_prompt.assert_awaited_once()
Loading
Loading