From 78f2d656d97e242be433985739d4e51ac82d38a9 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Wed, 1 Apr 2026 16:09:25 -0400 Subject: [PATCH 1/8] [CosyVoice3] Add online serving support, fix stage config, and add CI tests - Namespace stage names to cosyvoice3_talker/cosyvoice3_code2wav to avoid collision with other models using generic talker/code2wav names - Register CosyVoice3 in the TTS serving layer with validation and prompt building for /v1/audio/speech endpoint (voice cloning with ref_audio) - Fix cuDNN crash in code2wav by setting enforce_eager=true (Conv1d dynamic shapes are incompatible with CUDA graphs) - Add sr=22050 to code2wav multimodal output for correct audio playback - Tune gpu_memory_utilization (0.2/0.1) for the 0.5B model - Auto-inject model_type into hf_overrides so models with empty config.json (like CosyVoice3) can be loaded directly from HuggingFace - Register omni model configs in vLLM _CONFIG_REGISTRY for config resolution - Auto-detect tokenizer in subdirectories for models that don't store it at the root (CosyVoice-BlankEN/) - Auto-download mel_filters.npz asset from Whisper repo when missing - Add unit tests for CosyVoice3 serving (validation, detection, prompt) - Add e2e test with official CosyVoice zero-shot reference audio - Add CI steps in merge (core_model) and nightly (advanced_model) pipelines Signed-off-by: linyueqian --- .buildkite/test-merge.yml | 15 ++ .buildkite/test-nightly.yml | 4 +- .../e2e/online_serving/test_cosyvoice3_tts.py | 172 ++++++++++++++++++ .../openai_api/test_serving_speech.py | 112 ++++++++++++ vllm_omni/engine/arg_utils.py | 56 ++++++ .../entrypoints/openai/serving_speech.py | 72 +++++++- .../models/cosyvoice3/cosyvoice3.py | 22 +-- .../model_executor/models/cosyvoice3/utils.py | 28 ++- .../stage_configs/cosyvoice3.yaml | 12 +- 9 files changed, 465 insertions(+), 28 deletions(-) create mode 100644 tests/e2e/online_serving/test_cosyvoice3_tts.py diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index fc1f7a67969..d6082ff595d 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -396,6 +396,16 @@ steps: pytest -s -v tests/e2e/online_serving/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" ' + + - label: "CosyVoice3-TTS E2E Test" + timeout_in_minutes: 20 + depends_on: upload-merge-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" + ' agents: queue: "mithril-h100-pool" plugins: @@ -414,6 +424,11 @@ steps: env: - name: HF_HOME value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token nodeSelector: node.kubernetes.io/instance-type: gpu-h100-sxm volumes: diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 16ef4bc8e45..fa0c66d41ea 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -15,7 +15,9 @@ steps: EXIT3=$$? pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" EXIT4=$$? - exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4)) + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" + EXIT5=$$? + exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4 | EXIT5)) agents: queue: "mithril-h100-pool" plugins: diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py new file mode 100644 index 00000000000..1dfe786733e --- /dev/null +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +E2E Online tests for CosyVoice3 TTS model with voice cloning. + +These tests verify the /v1/audio/speech endpoint works correctly with +the CosyVoice3 model, which requires reference audio for voice cloning. + +The official CosyVoice zero-shot prompt audio is fetched from GitHub +and encoded as a base64 data URI for the API requests. +""" + +import os + +os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" +os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" + +import base64 +import urllib.request +from pathlib import Path + +import httpx +import pytest + +from tests.conftest import OmniServerParams +from tests.utils import hardware_test + +MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" + +STAGE_CONFIG = str( + Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "cosyvoice3.yaml" +) +EXTRA_ARGS = [ + "--trust-remote-code", + "--disable-log-stats", +] +TEST_PARAMS = [ + OmniServerParams( + model=MODEL, + stage_config_path=STAGE_CONFIG, + server_args=EXTRA_ARGS, + ) +] + +# Official CosyVoice zero-shot prompt audio and its transcript +_REF_AUDIO_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" +_REF_TEXT = "希望你以后能够做的比我还好呦。" +_ref_audio_cache: str | None = None + + +def _get_ref_audio_data_uri() -> str: + """Fetch official CosyVoice zero-shot prompt audio and return as data URI. + + The result is cached so the download only happens once per test session. + """ + global _ref_audio_cache + if _ref_audio_cache is not None: + return _ref_audio_cache + + with urllib.request.urlopen(_REF_AUDIO_URL, timeout=30) as resp: + wav_bytes = resp.read() + b64 = base64.b64encode(wav_bytes).decode() + _ref_audio_cache = f"data:audio/wav;base64,{b64}" + return _ref_audio_cache + + +def make_speech_request( + host: str, + port: int, + text: str, + ref_audio: str, + ref_text: str, + timeout: float = 180.0, +) -> httpx.Response: + """Make a request to the /v1/audio/speech endpoint for CosyVoice3.""" + url = f"http://{host}:{port}/v1/audio/speech" + payload = { + "input": text, + "ref_audio": ref_audio, + "ref_text": ref_text, + } + + with httpx.Client(timeout=timeout) as client: + return client.post(url, json=payload) + + +def verify_wav_audio(content: bytes) -> bool: + """Verify that content is valid WAV audio data.""" + if len(content) < 44: + return False + return content[:4] == b"RIFF" and content[8:12] == b"WAVE" + + +MIN_AUDIO_BYTES = 5000 + + +@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) +class TestCosyVoice3TTS: + """E2E tests for CosyVoice3 TTS model.""" + + @pytest.mark.core_model + @pytest.mark.omni + @hardware_test(res={"cuda": "H100"}, num_cards=1) + def test_speech_voice_clone_basic(self, omni_server) -> None: + """Test basic voice cloning TTS generation with official reference audio.""" + ref_audio = _get_ref_audio_data_uri() + response = make_speech_request( + host=omni_server.host, + port=omni_server.port, + text="收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的感动让我热泪盈眶。", + ref_audio=ref_audio, + ref_text=_REF_TEXT, + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert response.headers.get("content-type") == "audio/wav" + assert verify_wav_audio(response.content), "Response is not valid WAV audio" + assert len(response.content) > MIN_AUDIO_BYTES, ( + f"Audio content too small ({len(response.content)} bytes), expected at least {MIN_AUDIO_BYTES} bytes" + ) + + @pytest.mark.advanced_model + @pytest.mark.omni + @hardware_test(res={"cuda": "H100"}, num_cards=1) + def test_speech_missing_ref_audio_rejected(self, omni_server) -> None: + """Request without ref_audio should return an error.""" + url = f"http://{omni_server.host}:{omni_server.port}/v1/audio/speech" + payload = { + "input": "This should fail without reference audio.", + } + + with httpx.Client(timeout=60.0) as client: + response = client.post(url, json=payload) + + data = response.json() + assert "error" in data or "message" in data, f"Expected error response for missing ref_audio, got: {data}" + + @pytest.mark.advanced_model + @pytest.mark.omni + @hardware_test(res={"cuda": "H100"}, num_cards=1) + def test_speech_missing_ref_text_rejected(self, omni_server) -> None: + """Request with ref_audio but no ref_text should return an error.""" + ref_audio = _get_ref_audio_data_uri() + url = f"http://{omni_server.host}:{omni_server.port}/v1/audio/speech" + payload = { + "input": "This should fail without reference text.", + "ref_audio": ref_audio, + } + + with httpx.Client(timeout=60.0) as client: + response = client.post(url, json=payload) + + data = response.json() + assert "error" in data or "message" in data, f"Expected error response for missing ref_text, got: {data}" + + @pytest.mark.advanced_model + @pytest.mark.omni + @hardware_test(res={"cuda": "H100"}, num_cards=1) + def test_speech_english_text(self, omni_server) -> None: + """Test voice cloning with English synthesis text.""" + ref_audio = _get_ref_audio_data_uri() + response = make_speech_request( + host=omni_server.host, + port=omni_server.port, + text="Hello, this is a voice cloning test with English text.", + ref_audio=ref_audio, + ref_text=_REF_TEXT, + ) + + assert response.status_code == 200, f"Request failed: {response.text}" + assert verify_wav_audio(response.content), "Response is not valid WAV audio" + assert len(response.content) > MIN_AUDIO_BYTES diff --git a/tests/entrypoints/openai_api/test_serving_speech.py b/tests/entrypoints/openai_api/test_serving_speech.py index 969df5bce0d..75feb65f064 100644 --- a/tests/entrypoints/openai_api/test_serving_speech.py +++ b/tests/entrypoints/openai_api/test_serving_speech.py @@ -1820,3 +1820,115 @@ def test_streaming_unsupported_format_rejected(self, wav_streaming_app): for fmt in unsupported_formats: response = client.post("/v1/audio/speech", json={"input": "Hello", "stream": True, "response_format": fmt}) assert response.status_code == 422 + + +# ---- CosyVoice3 Serving Tests ---- + + +@pytest.fixture +def cosyvoice3_server(mocker: MockerFixture): + mocker.patch.object(OmniOpenAIServingSpeech, "_load_supported_speakers", return_value=set()) + mocker.patch.object(OmniOpenAIServingSpeech, "_load_codec_frame_rate", return_value=None) + + mock_engine_client = mocker.MagicMock() + mock_engine_client.errored = False + mock_engine_client.model_config = mocker.MagicMock(model="FunAudioLLM/Fun-CosyVoice3-0.5B-2512") + mock_engine_client.default_sampling_params_list = [SimpleNamespace(max_tokens=2048)] + mock_engine_client.tts_batch_max_items = 32 + mock_engine_client.generate = mocker.MagicMock(return_value="generator") + mock_engine_client.stage_configs = [ + SimpleNamespace( + engine_args=SimpleNamespace(model_stage="cosyvoice3_talker"), + tts_args={}, + ) + ] + + mock_models = mocker.MagicMock() + mock_models.is_base_model.return_value = True + + return OmniOpenAIServingSpeech( + engine_client=mock_engine_client, + models=mock_models, + request_logger=mocker.MagicMock(), + ) + + +class TestCosyVoice3Serving: + def test_cosyvoice3_model_type_detection(self, cosyvoice3_server): + assert cosyvoice3_server._tts_model_type == "cosyvoice3" + assert cosyvoice3_server._is_tts is True + assert cosyvoice3_server._is_cosyvoice3 is True + + def test_cosyvoice3_stage_registered(self): + from vllm_omni.entrypoints.openai.serving_speech import ( + _COSYVOICE3_TTS_MODEL_STAGES, + _TTS_MODEL_STAGES, + ) + + assert "cosyvoice3_talker" in _COSYVOICE3_TTS_MODEL_STAGES + assert "cosyvoice3_talker" in _TTS_MODEL_STAGES + + def test_validate_cosyvoice3_empty_input(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="", ref_audio="data:audio/wav;base64,abc", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "empty" in error.lower() + + def test_validate_cosyvoice3_missing_ref_audio(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "ref_audio" in error.lower() + + def test_validate_cosyvoice3_missing_ref_text(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", ref_audio="data:audio/wav;base64,abc") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "ref_text" in error.lower() + + def test_validate_cosyvoice3_invalid_ref_audio_format(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest(input="Hello", ref_audio="/local/path.wav", ref_text="hello") + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "url" in error.lower() or "format" in error.lower() + + def test_validate_cosyvoice3_valid_request(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest( + input="Hello world", + ref_audio="data:audio/wav;base64,abc123", + ref_text="Reference transcript", + ) + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is None + + def test_validate_cosyvoice3_max_new_tokens_range(self, cosyvoice3_server): + request = OpenAICreateSpeechRequest( + input="Hello", + ref_audio="data:audio/wav;base64,abc", + ref_text="hello", + max_new_tokens=0, + ) + error = cosyvoice3_server._validate_cosyvoice3_request(request) + assert error is not None + assert "max_new_tokens" in error + + def test_prepare_speech_generation_cosyvoice3(self, cosyvoice3_server): + cosyvoice3_server._build_cosyvoice3_prompt = AsyncMock( + return_value={ + "prompt": "Hello", + "multi_modal_data": {"audio": (np.zeros(24000), 24000)}, + "mm_processor_kwargs": {"prompt_text": "ref text", "sample_rate": 24000}, + } + ) + + request = OpenAICreateSpeechRequest( + input="Hello", + ref_audio="data:audio/wav;base64,abc", + ref_text="Reference text", + ) + request_id, generator, tts_params = asyncio.run(cosyvoice3_server._prepare_speech_generation(request)) + + assert request_id.startswith("speech-") + assert generator == "generator" + assert tts_params == {} + cosyvoice3_server._build_cosyvoice3_prompt.assert_awaited_once() diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index f4a082cffb1..e428a2d2d93 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -1,5 +1,6 @@ import argparse import dataclasses +import os from dataclasses import dataclass, field from typing import Any @@ -28,6 +29,14 @@ def _register_omni_hf_configs() -> None: logger.warning("Skipping omni HF config registration due to import error: %s", exc) return + # Register with both transformers AutoConfig and vLLM's config registry + # so models with empty/missing config.json (e.g. CosyVoice3) can be + # resolved when model_type is injected via hf_overrides. + try: + from vllm.transformers_utils.config import _CONFIG_REGISTRY + except ImportError: + _CONFIG_REGISTRY = None + for model_type, config_cls in [ ("qwen3_tts", Qwen3TTSConfig), ("cosyvoice3", CosyVoice3Config), @@ -38,6 +47,8 @@ def _register_omni_hf_configs() -> None: except ValueError: # Already registered elsewhere; ignore. pass + if _CONFIG_REGISTRY is not None and model_type not in _CONFIG_REGISTRY: + _CONFIG_REGISTRY[model_type] = config_cls def register_omni_models_to_vllm(): @@ -127,11 +138,56 @@ def create_model_config(self) -> OmniModelConfig: # If model_arch is specified, inject it into hf_overrides so vLLM can # resolve the architecture even when config.json lacks 'architectures'. + # Also inject model_type so AutoConfig can resolve the correct config + # class for models with empty or missing config.json (e.g. CosyVoice3). if self.model_arch: if self.hf_overrides is None: self.hf_overrides = {} if isinstance(self.hf_overrides, dict): self.hf_overrides.setdefault("architectures", [self.model_arch]) + # Derive model_type from known arch→model_type mappings. + # This must use the actual HF model_type (from config classes), + # not the registry folder name which can differ. + if "model_type" not in self.hf_overrides: + _ARCH_TO_MODEL_TYPE = { + "CosyVoice3Model": "cosyvoice3", + } + model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch) + if model_type is not None: + self.hf_overrides.setdefault("model_type", model_type) + + # Auto-detect tokenizer for models that store it in a subdirectory + # rather than the root (e.g. CosyVoice3 uses CosyVoice-BlankEN/). + if not self.tokenizer and self.model: + model_path = self.model + if os.path.isdir(model_path) and not os.path.isfile(os.path.join(model_path, "tokenizer_config.json")): + for subfolder in sorted(os.listdir(model_path)): + candidate = os.path.join(model_path, subfolder) + if os.path.isdir(candidate) and os.path.isfile(os.path.join(candidate, "tokenizer_config.json")): + self.tokenizer = candidate + logger.info("Auto-detected tokenizer at %s", candidate) + break + elif not os.path.isdir(model_path): + # For HF model IDs, check known tokenizer subfolder mappings + _TOKENIZER_SUBFOLDER_MAP = { + "CosyVoice3Model": "CosyVoice-BlankEN", + } + subfolder = _TOKENIZER_SUBFOLDER_MAP.get(self.model_arch) + if subfolder: + # Download just the tokenizer files from the subfolder + try: + from huggingface_hub import snapshot_download + + local_dir = snapshot_download( + model_path, + allow_patterns=[f"{subfolder}/*"], + ) + candidate = os.path.join(local_dir, subfolder) + if os.path.isdir(candidate): + self.tokenizer = candidate + logger.info("Downloaded tokenizer from %s/%s", model_path, subfolder) + except Exception as e: + logger.warning("Failed to download tokenizer subfolder: %s", e) # Build the vLLM config first, then use it to create the Omni config. model_config = super().create_model_config() diff --git a/vllm_omni/entrypoints/openai/serving_speech.py b/vllm_omni/entrypoints/openai/serving_speech.py index b483181fd5f..2cd9c0110bf 100644 --- a/vllm_omni/entrypoints/openai/serving_speech.py +++ b/vllm_omni/entrypoints/openai/serving_speech.py @@ -47,7 +47,10 @@ _VOXTRAL_TTS_MODEL_STAGES = {"audio_generation"} _QWEN3_TTS_MODEL_STAGES = {"qwen3_tts"} _FISH_TTS_MODEL_STAGES = {"fish_speech_slow_ar"} -_TTS_MODEL_STAGES: set[str] = _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES +_COSYVOICE3_TTS_MODEL_STAGES = {"cosyvoice3_talker"} +_TTS_MODEL_STAGES: set[str] = ( + _VOXTRAL_TTS_MODEL_STAGES | _QWEN3_TTS_MODEL_STAGES | _FISH_TTS_MODEL_STAGES | _COSYVOICE3_TTS_MODEL_STAGES +) _TTS_LANGUAGES: set[str] = { "Auto", "Chinese", @@ -165,6 +168,13 @@ def __init__(self, *args, **kwargs): ) self._fish_speech_tokenizer = None + self._is_cosyvoice3 = ( + self._tts_stage is not None + and getattr(getattr(self._tts_stage, "engine_args", None), "model_stage", None) + in _COSYVOICE3_TTS_MODEL_STAGES + ) + self._cosyvoice3_tokenizer = None + # Determine TTS model type or None self._tts_model_type = self._detect_tts_model_type() @@ -240,6 +250,8 @@ def _detect_tts_model_type(self) -> str | None: return "voxtral_tts" if model_stage in _FISH_TTS_MODEL_STAGES: return "fish_tts" + if model_stage in _COSYVOICE3_TTS_MODEL_STAGES: + return "cosyvoice3" return None def _compute_max_instructions_length(self) -> int: @@ -704,6 +716,8 @@ def _validate_tts_request(self, request: OpenAICreateSpeechRequest) -> str | Non return self._validate_voxtral_tts_request(request) if self._tts_model_type == "fish_tts": return self._validate_fish_tts_request(request) + if self._tts_model_type == "cosyvoice3": + return self._validate_cosyvoice3_request(request) return self._validate_qwen_tts_request(request) def _validate_ref_audio_format(self, ref_audio: str) -> str | None: @@ -875,6 +889,30 @@ def _validate_fish_tts_request(self, request: OpenAICreateSpeechRequest) -> str return None + def _validate_cosyvoice3_request(self, request: OpenAICreateSpeechRequest) -> str | None: + """Validate CosyVoice3 request parameters. Returns error message or None.""" + if not request.input or not request.input.strip(): + return "Input text cannot be empty" + + # CosyVoice3 requires reference audio for voice cloning + if request.ref_audio is None: + return "CosyVoice3 requires 'ref_audio' (reference audio for voice cloning)" + + fmt_err = self._validate_ref_audio_format(request.ref_audio) + if fmt_err: + return fmt_err + + if not request.ref_text or not request.ref_text.strip(): + return "CosyVoice3 requires 'ref_text' (transcript of the reference audio)" + + if request.max_new_tokens is not None: + if request.max_new_tokens < _TTS_MAX_NEW_TOKENS_MIN: + return f"max_new_tokens must be at least {_TTS_MAX_NEW_TOKENS_MIN}" + if request.max_new_tokens > _TTS_MAX_NEW_TOKENS_MAX: + return f"max_new_tokens cannot exceed {_TTS_MAX_NEW_TOKENS_MAX}" + + return None + async def _resolve_ref_audio(self, ref_audio_str: str) -> tuple[list[float], int]: """Resolve ref_audio to (wav_samples, sample_rate). @@ -1184,6 +1222,33 @@ def _build_fish_speech_prompt( "additional_information": additional_information, } + # ---- CosyVoice3 helpers ---- + + async def _build_cosyvoice3_prompt( + self, + request: OpenAICreateSpeechRequest, + ) -> dict[str, Any]: + """Build prompt for CosyVoice3. + + CosyVoice3 uses multimodal input with reference audio for voice cloning. + The prompt format matches the offline example: text prompt + audio data + + mm_processor_kwargs with prompt_text. + """ + # Resolve reference audio + wav_samples, sr = await self._resolve_ref_audio(request.ref_audio) + audio_data = (np.asarray(wav_samples, dtype=np.float32), sr) + + return { + "prompt": request.input, + "multi_modal_data": { + "audio": audio_data, + }, + "mm_processor_kwargs": { + "prompt_text": request.ref_text, + "sample_rate": sr, + }, + } + # ---- Common speech generation helpers ---- async def _prepare_speech_generation( @@ -1211,6 +1276,9 @@ async def _prepare_speech_generation( if self._tts_model_type == "voxtral_tts": prompt = await self._build_voxtral_prompt(request) tts_params = {} + elif self._tts_model_type == "cosyvoice3": + prompt = await self._build_cosyvoice3_prompt(request) + tts_params = {} else: tts_params = self._build_tts_params(request) # Resolve ref_audio (explicit or auto-set for uploaded voices) @@ -1234,6 +1302,8 @@ async def _prepare_speech_generation( model_type = "fish_speech" elif self._tts_model_type == "voxtral_tts": model_type = "voxtral_tts" + elif self._tts_model_type == "cosyvoice3": + model_type = "cosyvoice3" elif self._is_tts: model_type = tts_params.get("task_type", ["unknown"])[0] else: diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index 87c5f323a45..d2ba07c9dfa 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -268,7 +268,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.model_stage = vllm_config.model_config.model_stage self.model_dir = vllm_config.model_config.model self.model = None - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": # Initialize talker stage (text to speech tokens) from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_talker import CosyVoice3LM, VLLMQwen2Encoder @@ -286,7 +286,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): # KV cache is now managed externally by vLLM's PagedAttention # No need for self.llm_cache self.model = self.talker - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": # Initialize code2wav stage (flow matching + vocoder) from vllm_omni.model_executor.models.cosyvoice3.cosyvoice3_code2wav import CosyVoice3Code2Wav @@ -322,7 +322,7 @@ def _create_llm_vllm_config(self, parent_config: VllmConfig) -> VllmConfig: def compute_logits(self, hidden_states: torch.Tensor | OmniOutput) -> torch.Tensor | None: if isinstance(hidden_states, OmniOutput): hidden_states = hidden_states.text_hidden_states - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": logits = self.model.llm_decoder(hidden_states) vocab_size = self.config.vocab_size pad_size = vocab_size - logits.size(-1) @@ -337,7 +337,7 @@ def compute_logits(self, hidden_states: torch.Tensor | OmniOutput) -> torch.Tens raise RuntimeError(f"compute_logits is only valid for {self.model_stage}.") def embed_multimodal(self, **kwargs: object) -> torch.Tensor: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": speech_token = kwargs["speech_token"] speech_token_emb = self.model.speech_embedding(speech_token) return speech_token_emb @@ -350,7 +350,7 @@ def embed_input_ids( multimodal_embeddings=None, is_multimodal=None, ) -> torch.Tensor: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": if is_multimodal is not None and any(is_multimodal): embed_tokens = self.model.llm.model.embed_tokens(input_ids) sos = self.model.speech_embedding.weight[self.model.sos].reshape(1, -1) @@ -363,7 +363,7 @@ def embed_input_ids( else: embed_tokens = self.model.speech_embedding.weight[input_ids] return embed_tokens - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": assert input_ids.dim() == 1 hidden = int(self.config.hidden_size) return torch.zeros( @@ -381,7 +381,7 @@ def forward( additional_information: dict[str, object] | None = None, **kwargs: object, ) -> OmniOutput: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": if inputs_embeds is None: inputs_embeds = self.embed_input_ids(input_ids) @@ -399,7 +399,7 @@ def forward( } return OmniOutput(text_hidden_states=hidden_states, multimodal_outputs=multimodal_outputs) - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": runtime_info = kwargs.get("runtime_additional_information", []) if not runtime_info: length = 30 * 24000 @@ -420,13 +420,13 @@ def forward( return OmniOutput( text_hidden_states=None, - multimodal_outputs={"audio": tts_speech}, + multimodal_outputs={"audio": tts_speech, "sr": 22050}, ) else: raise ValueError(f"Unsupported model_stage: {self.model_stage}") def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - if self.model_stage == "talker": + if self.model_stage == "cosyvoice3_talker": # Load weights for text to speech LM stage using vLLM's weight loading llm_weight_path = os.path.join(self.model_dir, "llm.pt") device = next(self.parameters()).device @@ -460,7 +460,7 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: self.model.llm_decoder.load_state_dict(llm_decoder_state) self.model.to(device).eval() - elif self.model_stage == "code2wav": + elif self.model_stage == "cosyvoice3_code2wav": # Load weights for code2wav stage (flow + hift) device = next(self.parameters()).device self.code2wav.load_weights(self.model_dir, device) diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index e1310cd3b19..590b3406c8f 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging import os from functools import cache, lru_cache @@ -10,6 +11,8 @@ import torchaudio.compliance.kaldi as kaldi from librosa.filters import mel as librosa_mel_fn +logger = logging.getLogger(__name__) + IGNORE_ID = -1 @@ -134,15 +137,22 @@ def mel_filters(device, n_mels: int) -> torch.Tensor: filters_path = os.path.join(os.path.dirname(__file__), "assets", "mel_filters.npz") if not os.path.exists(filters_path): source_url = "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz" - raise FileNotFoundError( - "Missing CosyVoice3 mel filter asset:\n" - f" {filters_path}\n" - "Download it manually from:\n" - f" {source_url}\n" - "Example:\n" - f" mkdir -p {os.path.dirname(filters_path)} && " - f"curl -L {source_url} -o {filters_path}" - ) + os.makedirs(os.path.dirname(filters_path), exist_ok=True) + try: + import urllib.request + + urllib.request.urlretrieve(source_url, filters_path) + logger.info("Downloaded mel_filters.npz from %s", source_url) + except Exception as e: + raise FileNotFoundError( + "Missing CosyVoice3 mel filter asset:\n" + f" {filters_path}\n" + "Auto-download failed. Download it manually from:\n" + f" {source_url}\n" + "Example:\n" + f" mkdir -p {os.path.dirname(filters_path)} && " + f"curl -L {source_url} -o {filters_path}" + ) from e with np.load(filters_path, allow_pickle=False) as f: return torch.from_numpy(f[f"mel_{n_mels}"]).to(device) diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml index 13b6ff55bd6..e215f51428a 100644 --- a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml @@ -9,16 +9,16 @@ stage_args: runtime: devices: 0 engine_args: - model_stage: talker + model_stage: cosyvoice3_talker worker_cls: vllm_omni.worker.gpu_ar_worker.GPUARWorker scheduler_cls: vllm_omni.core.sched.omni_ar_scheduler.OmniARScheduler model_arch: CosyVoice3Model trust_remote_code: true - gpu_memory_utilization: 0.4 + gpu_memory_utilization: 0.2 engine_output_type: latent # Output speech tokens for chunk aware flow matching disable_hybrid_kv_cache_manager: true enable_prefix_caching: false - enforce_eager: false + enforce_eager: true mm_processor_cache_gb: 0 skip_mm_profiling: true dtype: "float32" @@ -27,14 +27,14 @@ stage_args: runtime: devices: 0 engine_args: - model_stage: code2wav + model_stage: cosyvoice3_code2wav model_arch: CosyVoice3Model trust_remote_code: true worker_cls: vllm_omni.worker.gpu_generation_worker.GPUGenerationWorker scheduler_cls: vllm_omni.core.sched.omni_generation_scheduler.OmniGenerationScheduler engine_output_type: latent - gpu_memory_utilization: 0.2 - enforce_eager: false # CUDA graphs don't work with dynamic runtime_info access + gpu_memory_utilization: 0.1 + enforce_eager: true # CUDA graphs don't work with dynamic conv shapes in code2wav disable_hybrid_kv_cache_manager: true enable_prefix_caching: false skip_mm_profiling: true From 6435b246da47677138c8e8cb7262d9ab8061ecaf Mon Sep 17 00:00:00 2001 From: linyueqian Date: Wed, 1 Apr 2026 20:16:45 -0400 Subject: [PATCH 2/8] [CI] Add CosyVoice3-TTS E2E test to ready pipeline for PR testing Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- .buildkite/test-ready.yml | 43 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/.buildkite/test-ready.yml b/.buildkite/test-ready.yml index 91ea92a5cec..12e7a30d032 100644 --- a/.buildkite/test-ready.yml +++ b/.buildkite/test-ready.yml @@ -526,3 +526,46 @@ steps: hostPath: path: /mnt/hf-cache type: DirectoryOrCreate + + - label: "CosyVoice3-TTS E2E Test" + timeout_in_minutes: 20 + depends_on: upload-ready-pipeline + commands: + - | + timeout 20m bash -c ' + export VLLM_WORKER_MULTIPROC_METHOD=spawn + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" + ' + agents: + queue: "mithril-h100-pool" + plugins: + - kubernetes: + podSpec: + containers: + - image: 936637512419.dkr.ecr.us-west-2.amazonaws.com/vllm-ci-pull-through-cache/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT + resources: + limits: + nvidia.com/gpu: 1 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: HF_HOME + value: /root/.cache/huggingface + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + node.kubernetes.io/instance-type: gpu-h100-sxm + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /mnt/hf-cache + type: DirectoryOrCreate From da56ff4123aa9a4b13dcdfc6e010cf9ba26d5838 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Wed, 1 Apr 2026 21:53:45 -0400 Subject: [PATCH 3/8] [CI] Address review feedback for CosyVoice3 E2E test - Use advanced_model marker in merge pipeline - Remove duplicate CosyVoice3 step from nightly pipeline - Rewrite test to follow test_qwen3_tts_base.py style with function-based tests and openai_client.send_audio_speech_request; basic zh test marked core_model+advanced_model, others advanced_model only Co-Authored-By: Claude Sonnet 4.6 Signed-off-by: linyueqian --- .buildkite/test-merge.yml | 2 +- .buildkite/test-nightly.yml | 4 +- .../e2e/online_serving/test_cosyvoice3_tts.py | 232 +++++++----------- 3 files changed, 94 insertions(+), 144 deletions(-) diff --git a/.buildkite/test-merge.yml b/.buildkite/test-merge.yml index 62f506b60e4..b336d3aeef0 100644 --- a/.buildkite/test-merge.yml +++ b/.buildkite/test-merge.yml @@ -403,7 +403,7 @@ steps: - | timeout 20m bash -c ' export VLLM_WORKER_MULTIPROC_METHOD=spawn - pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "core_model" --run-level "core_model" + pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" ' agents: queue: "mithril-h100-pool" diff --git a/.buildkite/test-nightly.yml b/.buildkite/test-nightly.yml index 5f853e13f26..5c6d6d35a65 100644 --- a/.buildkite/test-nightly.yml +++ b/.buildkite/test-nightly.yml @@ -15,9 +15,7 @@ steps: EXIT3=$$? pytest -s -v tests/e2e/offline_inference/test_voxtral_tts.py -m "advanced_model" --run-level "advanced_model" EXIT4=$$? - pytest -s -v tests/e2e/online_serving/test_cosyvoice3_tts.py -m "advanced_model" --run-level "advanced_model" - EXIT5=$$? - exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4 | EXIT5)) + exit $$((EXIT1 | EXIT2 | EXIT3 | EXIT4)) agents: queue: "mithril-h100-pool" plugins: diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py index 1dfe786733e..976be805c27 100644 --- a/tests/e2e/online_serving/test_cosyvoice3_tts.py +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -5,9 +5,6 @@ These tests verify the /v1/audio/speech endpoint works correctly with the CosyVoice3 model, which requires reference audio for voice cloning. - -The official CosyVoice zero-shot prompt audio is fetched from GitHub -and encoded as a base64 data URI for the API requests. """ import os @@ -15,11 +12,8 @@ os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" os.environ["VLLM_TEST_CLEAN_GPU_MEMORY"] = "0" -import base64 -import urllib.request from pathlib import Path -import httpx import pytest from tests.conftest import OmniServerParams @@ -27,146 +21,104 @@ MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" -STAGE_CONFIG = str( - Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "cosyvoice3.yaml" -) -EXTRA_ARGS = [ - "--trust-remote-code", - "--disable-log-stats", -] -TEST_PARAMS = [ - OmniServerParams( - model=MODEL, - stage_config_path=STAGE_CONFIG, - server_args=EXTRA_ARGS, +# Official CosyVoice zero-shot prompt audio and its transcript +REF_AUDIO_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" +REF_TEXT = "希望你以后能够做的比我还好呦。" + + +def get_stage_config(name: str = "cosyvoice3.yaml"): + """Get the stage config path from vllm_omni model_executor stage_configs.""" + return str(Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / name) + + +def get_prompt(prompt_type="zh"): + prompts = { + "zh": "收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的感动让我热泪盈眶。", + "en": "Hello, this is a voice cloning test with English text.", + } + return prompts.get(prompt_type, prompts["zh"]) + + +tts_server_params = [ + pytest.param( + OmniServerParams( + model=MODEL, + stage_config_path=get_stage_config(), + server_args=["--trust-remote-code", "--disable-log-stats"], + ), + id="cosyvoice3", ) ] -# Official CosyVoice zero-shot prompt audio and its transcript -_REF_AUDIO_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" -_REF_TEXT = "希望你以后能够做的比我还好呦。" -_ref_audio_cache: str | None = None +@pytest.mark.advanced_model +@pytest.mark.core_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_zh_001(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with Chinese text via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=False + Datasets: single request + """ + request_config = { + "model": omni_server.model, + "input": get_prompt("zh"), + "stream": False, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, + } + openai_client.send_audio_speech_request(request_config) -def _get_ref_audio_data_uri() -> str: - """Fetch official CosyVoice zero-shot prompt audio and return as data URI. - The result is cached so the download only happens once per test session. +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_zh_002(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with Chinese text via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=True + Datasets: single request """ - global _ref_audio_cache - if _ref_audio_cache is not None: - return _ref_audio_cache - - with urllib.request.urlopen(_REF_AUDIO_URL, timeout=30) as resp: - wav_bytes = resp.read() - b64 = base64.b64encode(wav_bytes).decode() - _ref_audio_cache = f"data:audio/wav;base64,{b64}" - return _ref_audio_cache - - -def make_speech_request( - host: str, - port: int, - text: str, - ref_audio: str, - ref_text: str, - timeout: float = 180.0, -) -> httpx.Response: - """Make a request to the /v1/audio/speech endpoint for CosyVoice3.""" - url = f"http://{host}:{port}/v1/audio/speech" - payload = { - "input": text, - "ref_audio": ref_audio, - "ref_text": ref_text, + request_config = { + "model": omni_server.model, + "input": get_prompt("zh"), + "stream": True, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, } + openai_client.send_audio_speech_request(request_config) - with httpx.Client(timeout=timeout) as client: - return client.post(url, json=payload) - - -def verify_wav_audio(content: bytes) -> bool: - """Verify that content is valid WAV audio data.""" - if len(content) < 44: - return False - return content[:4] == b"RIFF" and content[8:12] == b"WAVE" - - -MIN_AUDIO_BYTES = 5000 - - -@pytest.mark.parametrize("omni_server", TEST_PARAMS, indirect=True) -class TestCosyVoice3TTS: - """E2E tests for CosyVoice3 TTS model.""" - - @pytest.mark.core_model - @pytest.mark.omni - @hardware_test(res={"cuda": "H100"}, num_cards=1) - def test_speech_voice_clone_basic(self, omni_server) -> None: - """Test basic voice cloning TTS generation with official reference audio.""" - ref_audio = _get_ref_audio_data_uri() - response = make_speech_request( - host=omni_server.host, - port=omni_server.port, - text="收到好友从远方寄来的生日礼物,那份意外的惊喜与深深的感动让我热泪盈眶。", - ref_audio=ref_audio, - ref_text=_REF_TEXT, - ) - - assert response.status_code == 200, f"Request failed: {response.text}" - assert response.headers.get("content-type") == "audio/wav" - assert verify_wav_audio(response.content), "Response is not valid WAV audio" - assert len(response.content) > MIN_AUDIO_BYTES, ( - f"Audio content too small ({len(response.content)} bytes), expected at least {MIN_AUDIO_BYTES} bytes" - ) - - @pytest.mark.advanced_model - @pytest.mark.omni - @hardware_test(res={"cuda": "H100"}, num_cards=1) - def test_speech_missing_ref_audio_rejected(self, omni_server) -> None: - """Request without ref_audio should return an error.""" - url = f"http://{omni_server.host}:{omni_server.port}/v1/audio/speech" - payload = { - "input": "This should fail without reference audio.", - } - - with httpx.Client(timeout=60.0) as client: - response = client.post(url, json=payload) - - data = response.json() - assert "error" in data or "message" in data, f"Expected error response for missing ref_audio, got: {data}" - - @pytest.mark.advanced_model - @pytest.mark.omni - @hardware_test(res={"cuda": "H100"}, num_cards=1) - def test_speech_missing_ref_text_rejected(self, omni_server) -> None: - """Request with ref_audio but no ref_text should return an error.""" - ref_audio = _get_ref_audio_data_uri() - url = f"http://{omni_server.host}:{omni_server.port}/v1/audio/speech" - payload = { - "input": "This should fail without reference text.", - "ref_audio": ref_audio, - } - - with httpx.Client(timeout=60.0) as client: - response = client.post(url, json=payload) - - data = response.json() - assert "error" in data or "message" in data, f"Expected error response for missing ref_text, got: {data}" - - @pytest.mark.advanced_model - @pytest.mark.omni - @hardware_test(res={"cuda": "H100"}, num_cards=1) - def test_speech_english_text(self, omni_server) -> None: - """Test voice cloning with English synthesis text.""" - ref_audio = _get_ref_audio_data_uri() - response = make_speech_request( - host=omni_server.host, - port=omni_server.port, - text="Hello, this is a voice cloning test with English text.", - ref_audio=ref_audio, - ref_text=_REF_TEXT, - ) - - assert response.status_code == 200, f"Request failed: {response.text}" - assert verify_wav_audio(response.content), "Response is not valid WAV audio" - assert len(response.content) > MIN_AUDIO_BYTES + +@pytest.mark.advanced_model +@pytest.mark.omni +@hardware_test(res={"cuda": "H100"}, num_cards=1) +@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +def test_voice_clone_en_001(omni_server, openai_client) -> None: + """ + Test voice cloning TTS with English text via OpenAI API. + Deploy Setting: default yaml + Input Modal: text + ref_audio + ref_text + Output Modal: audio + Input Setting: stream=False + Datasets: single request + """ + request_config = { + "model": omni_server.model, + "input": get_prompt("en"), + "stream": False, + "response_format": "wav", + "ref_audio": REF_AUDIO_URL, + "ref_text": REF_TEXT, + } + openai_client.send_audio_speech_request(request_config) From 96a2cb9052cbcc90ae3e42e6298307868926f682 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Thu, 2 Apr 2026 11:25:08 -0400 Subject: [PATCH 4/8] [CosyVoice3] Address review feedback: move dicts to module level, narrow download, add timeout - Move _ARCH_TO_MODEL_TYPE and _TOKENIZER_SUBFOLDER_MAP to module level - Narrow snapshot_download allow_patterns to tokenizer files only - Replace urlretrieve with urlopen(timeout=30) to prevent CI hangs Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- vllm_omni/engine/arg_utils.py | 29 ++++++++++++------- .../model_executor/models/cosyvoice3/utils.py | 4 ++- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index e428a2d2d93..d7358720248 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -13,6 +13,17 @@ logger = init_logger(__name__) +# Maps model architecture names to their HuggingFace model_type values. +# Used when auto-injecting hf_overrides for models with missing config.json. +_ARCH_TO_MODEL_TYPE: dict[str, str] = { + "CosyVoice3Model": "cosyvoice3", +} + +# Maps model architecture names to tokenizer subfolder paths within HF repos. +_TOKENIZER_SUBFOLDER_MAP: dict[str, str] = { + "CosyVoice3Model": "CosyVoice-BlankEN", +} + def _register_omni_hf_configs() -> None: try: @@ -145,13 +156,7 @@ def create_model_config(self) -> OmniModelConfig: self.hf_overrides = {} if isinstance(self.hf_overrides, dict): self.hf_overrides.setdefault("architectures", [self.model_arch]) - # Derive model_type from known arch→model_type mappings. - # This must use the actual HF model_type (from config classes), - # not the registry folder name which can differ. if "model_type" not in self.hf_overrides: - _ARCH_TO_MODEL_TYPE = { - "CosyVoice3Model": "cosyvoice3", - } model_type = _ARCH_TO_MODEL_TYPE.get(self.model_arch) if model_type is not None: self.hf_overrides.setdefault("model_type", model_type) @@ -168,10 +173,6 @@ def create_model_config(self) -> OmniModelConfig: logger.info("Auto-detected tokenizer at %s", candidate) break elif not os.path.isdir(model_path): - # For HF model IDs, check known tokenizer subfolder mappings - _TOKENIZER_SUBFOLDER_MAP = { - "CosyVoice3Model": "CosyVoice-BlankEN", - } subfolder = _TOKENIZER_SUBFOLDER_MAP.get(self.model_arch) if subfolder: # Download just the tokenizer files from the subfolder @@ -180,7 +181,13 @@ def create_model_config(self) -> OmniModelConfig: local_dir = snapshot_download( model_path, - allow_patterns=[f"{subfolder}/*"], + allow_patterns=[ + f"{subfolder}/tokenizer*", + f"{subfolder}/special_tokens*", + f"{subfolder}/vocab*", + f"{subfolder}/merges*", + f"{subfolder}/added_tokens*", + ], ) candidate = os.path.join(local_dir, subfolder) if os.path.isdir(candidate): diff --git a/vllm_omni/model_executor/models/cosyvoice3/utils.py b/vllm_omni/model_executor/models/cosyvoice3/utils.py index 590b3406c8f..151c3627aff 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/utils.py +++ b/vllm_omni/model_executor/models/cosyvoice3/utils.py @@ -141,7 +141,9 @@ def mel_filters(device, n_mels: int) -> torch.Tensor: try: import urllib.request - urllib.request.urlretrieve(source_url, filters_path) + with urllib.request.urlopen(source_url, timeout=30) as resp: + with open(filters_path, "wb") as f_out: + f_out.write(resp.read()) logger.info("Downloaded mel_filters.npz from %s", source_url) except Exception as e: raise FileNotFoundError( From 40476a4fd83b5c73574b4e718c573389ad2b8b42 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Fri, 3 Apr 2026 16:18:29 -0400 Subject: [PATCH 5/8] fix: resolve HF repo ID to local cache path in CosyVoice3 processor When model_dir is an HF repo ID (e.g. FunAudioLLM/Fun-CosyVoice3-0.5B-2512), os.path.join with qwen_pretrain_path produces an invalid 3-part repo ID that AutoTokenizer.from_pretrained rejects. Use snapshot_download to resolve to the local cache directory first. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index d2ba07c9dfa..d35141519f4 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -66,6 +66,12 @@ def _ensure_cached_runtime_components(self, model_dir: str, config: CosyVoice3Co if cached_model_dir == model_dir: return + # If model_dir is an HF repo ID (not a local path), resolve to cache + if not os.path.isdir(model_dir): + from huggingface_hub import snapshot_download + + model_dir = snapshot_download(model_dir) + import onnxruntime from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer From 64fb8b5c1ddee6759079e276ca710b38c4fb0909 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Fri, 3 Apr 2026 16:33:41 -0400 Subject: [PATCH 6/8] fix: resolve HF repo ID to local cache path in CosyVoice3Model.__init__ The previous fix only covered the processor path. CosyVoice3Model.load_weights also uses self.model_dir with os.path.join for flow.pt, llm.pt, hift.pt etc. Resolve the HF repo ID to local cache in __init__ so all downstream code works. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index d35141519f4..784393e181e 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -272,7 +272,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.config = vllm_config.model_config.hf_config self.have_multimodal_outputs = True self.model_stage = vllm_config.model_config.model_stage - self.model_dir = vllm_config.model_config.model + model_dir = vllm_config.model_config.model + if not os.path.isdir(model_dir): + from huggingface_hub import snapshot_download + + model_dir = snapshot_download(model_dir) + self.model_dir = model_dir self.model = None if self.model_stage == "cosyvoice3_talker": # Initialize talker stage (text to speech tokens) From 9ccc05441c253a762499f7f9e642560d1e5b9dda Mon Sep 17 00:00:00 2001 From: linyueqian Date: Fri, 3 Apr 2026 17:00:51 -0400 Subject: [PATCH 7/8] fix: register omni model configs with vLLM _CONFIG_REGISTRY and patch generation_config Models like CosyVoice3 have an empty config.json ({}) without model_type, which causes AutoConfig.from_pretrained to fail. This commit: 1. Registers omni config classes with vLLM's internal _CONFIG_REGISTRY (not just transformers AutoConfig) so HFConfigParser can resolve them 2. Injects model_type into hf_overrides when model_arch is specified 3. Patches try_get_generation_config in _attach_llm_stage to avoid crashes for models without generation_config.json Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- vllm_omni/engine/arg_utils.py | 1 + vllm_omni/engine/async_omni_engine.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index d7358720248..2d10724d72e 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -17,6 +17,7 @@ # Used when auto-injecting hf_overrides for models with missing config.json. _ARCH_TO_MODEL_TYPE: dict[str, str] = { "CosyVoice3Model": "cosyvoice3", + "OmniVoiceModel": "omnivoice", } # Maps model architecture names to tokenizer subfolder paths within HF repos. diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index c987106fee1..5a209abf4a9 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -73,6 +73,16 @@ logger = init_logger(__name__) +def _patch_generation_config_if_needed(model_config: Any) -> None: + """Ensure try_get_generation_config won't crash for models whose HF + config.json lacks model_type (e.g. CosyVoice3). We probe it once; + if it raises, we monkey-patch the method to return None.""" + try: + model_config.try_get_generation_config() + except Exception: + model_config.try_get_generation_config = lambda: None + + def _inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: """Inject stage_id and engine_input_source into omni_kv_config. @@ -409,6 +419,12 @@ def _attach_llm_stage( ) input_processor = None if started.stage_id == 0: + # Some omni models (e.g. CosyVoice3) have an empty HF + # config.json without model_type, which causes + # try_get_generation_config -> AutoConfig.from_pretrained + # to raise ValueError. Patch it to return None so + # InputProcessor doesn't crash. + _patch_generation_config_if_needed(started.vllm_config.model_config) input_processor = InputProcessor(vllm_config=started.vllm_config) # Use omni preprocessor so text-only prompts with # mm_processor_kwargs (e.g. GLM-Image t2i target_h/target_w) From 307b351d1d297aadb45417f995cfbf39f33650f6 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Fri, 3 Apr 2026 17:30:55 -0400 Subject: [PATCH 8/8] fix: return empty dict from patched try_get_generation_config The previous patch returned None, but get_diff_sampling_param() calls .update() on the result, causing AttributeError on NoneType. Co-Authored-By: Claude Opus 4.6 (1M context) Signed-off-by: linyueqian --- vllm_omni/engine/async_omni_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm_omni/engine/async_omni_engine.py b/vllm_omni/engine/async_omni_engine.py index 5a209abf4a9..d9960ecbac3 100644 --- a/vllm_omni/engine/async_omni_engine.py +++ b/vllm_omni/engine/async_omni_engine.py @@ -80,7 +80,7 @@ def _patch_generation_config_if_needed(model_config: Any) -> None: try: model_config.try_get_generation_config() except Exception: - model_config.try_get_generation_config = lambda: None + model_config.try_get_generation_config = lambda: {} def _inject_kv_stage_info(stage_cfg: Any, stage_id: int) -> None: