diff --git a/docs/api/README.md b/docs/api/README.md index 0147f19e126..11401dc0208 100644 --- a/docs/api/README.md +++ b/docs/api/README.md @@ -99,7 +99,6 @@ Configuration classes. - [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][] - [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][] - [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][] -- [vllm_omni.model_executor.models.cosyvoice3.config.CosyVoice3Config][] - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechConfig][] - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechFastARConfig][] - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechSlowARConfig][] @@ -116,10 +115,14 @@ Configuration classes. - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][] - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][] - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][] +- [vllm_omni.transformers_utils.configs.cosyvoice3.CosyVoice3Config][] - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Config][] - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLConfig][] - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLTextConfig][] - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLVisionConfig][] +- [vllm_omni.transformers_utils.configs.omnivoice.OmniVoiceConfig][] +- [vllm_omni.transformers_utils.configs.voxcpm.VoxCPMConfig][] +- [vllm_omni.transformers_utils.configs.voxcpm2.VoxCPM2Config][] ## Workers diff --git a/examples/offline_inference/text_to_speech/README.md b/examples/offline_inference/text_to_speech/README.md index ddc5f11c16b..bfdee543cc7 100644 --- 
a/examples/offline_inference/text_to_speech/README.md +++ b/examples/offline_inference/text_to_speech/README.md @@ -73,13 +73,15 @@ python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \ ``` ### Voice cloning -Pass a reference audio. Note that CosyVoice3's `--prompt-text` is a system-style prompt for the GPT stage, not a reference transcript: +If `--ref-audio` is omitted, the script downloads the upstream +[`zero_shot_prompt.wav`](https://github.com/FunAudioLLM/CosyVoice/blob/main/asset/zero_shot_prompt.wav) from the CosyVoice repo into the current directory. +To use your own clip, pass `--ref-audio /path/to/reference.wav`, and modify `--prompt-text` correspondingly. ```bash python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \ --model pretrained_models/Fun-CosyVoice3-0.5B \ --tokenizer pretrained_models/Fun-CosyVoice3-0.5B/CosyVoice-BlankEN \ - --ref-audio prompt.wav \ - --prompt-text "You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?" 
+ --ref-audio /path/to/reference.wav \ + --prompt-text "You are a helpful assistant.<|endofprompt|>Transcript of your ref audio clip" ``` ### Streaming @@ -317,44 +319,39 @@ pip install voxcpm soundfile export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src ``` -If the native VoxCPM `config.json` does not contain HF metadata such as `model_type`, prepare a persistent HF-compatible config directory and point the stage configs to it via `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`: - -```bash -export VOXCPM_MODEL=/path/to/voxcpm-model -export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config -mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH" -cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json" -cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true -python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)' -``` - ### Quick start ```bash python examples/offline_inference/text_to_speech/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ + --model openbmb/VoxCPM-0.5B \ --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni." ``` ### Voice cloning ```bash python examples/offline_inference/text_to_speech/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ + --model openbmb/VoxCPM-0.5B \ --text "This sentence is synthesized with a cloned voice." \ --ref-audio /path/to/reference.wav \ --ref-text "The exact transcript spoken in reference.wav." 
``` ### Streaming -Pass the async-chunk stage config: +Pass `--streaming` together with the legacy async-chunk stage config: ```bash python examples/offline_inference/text_to_speech/voxcpm/end2end.py \ - --model "$VOXCPM_MODEL" \ + --model openbmb/VoxCPM-0.5B \ + --streaming \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \ --text "This is a split-stage VoxCPM streaming example running on vLLM Omni." ``` +### Persistent HF config (optional) +The engine auto-patches a missing `model_type` in the checkpoint's `config.json` in-memory at startup (see `vllm_omni/engine/arg_utils.py`), so no manual setup is required. + +If you'd rather maintain a persistent patched config dir (e.g. to share across runs or to add fields the auto-patcher doesn't set), export `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` to its location — both the bundled deploy and async-chunk yamls read it via `${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,}`. + ### Notes -- `voxcpm.yaml` is the default non-streaming stage config; `voxcpm_async_chunk.yaml` enables streaming. +- Non-streaming auto-loads `vllm_omni/deploy/voxcpm.yaml` from the model directory name / HF `model_type`. Streaming uses the legacy `voxcpm_async_chunk.yaml` passed via `--stage-configs-path`. - Streaming is currently single-request oriented. - `--ref-text` must be the real transcript of `--ref-audio`; mismatched text degrades quality. - For online serving, see the [VoxCPM section in the online hub](../../online_serving/text_to_speech/README.md#voxcpm). For benchmark reporting, see [`benchmarks/voxcpm`](../../../benchmarks/voxcpm/README.md). 
diff --git a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py index 0332ab38236..8a3309ca7a6 100644 --- a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py +++ b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py @@ -2,18 +2,32 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse import os +import urllib.request +from pathlib import Path import numpy as np import soundfile as sf from vllm import SamplingParams -from vllm.assets.audio import AudioAsset from vllm.multimodal.media.audio import load_audio from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults from vllm_omni.entrypoints.omni import Omni -from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token +from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config + +# Upstream zero-shot reference clip +ZERO_SHOT_PROMPT_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav" + + +def _default_ref_audio() -> str: + # Download the upstream zero_shot_prompt.wav into the current dir + dest = Path("zero_shot_prompt.wav") + if not dest.exists() or dest.stat().st_size == 0: + print(f"Downloading default reference audio to {dest}") + urllib.request.urlretrieve(ZERO_SHOT_PROMPT_URL, dest) + + return str(dest) def run_e2e(): @@ -36,9 +50,15 @@ def run_e2e(): parser.add_argument( "--prompt-text", type=str, - default="You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?", + default="You are a helpful assistant.<|endofprompt|>希望你以后,能够做的比我还好呦!", + ) + parser.add_argument( + "--ref-audio", + type=str, + default=None, + help="Path to reference audio for voice cloning. 
" + "If unset, downloads the upstream CosyVoice3 zero-shot prompt audio clip", ) - parser.add_argument("--ref-audio", type=str, default="prompt.wav") parser.add_argument( "--tokenizer", type=str, @@ -66,24 +86,22 @@ def run_e2e(): sampling_cfg = {"top_p": 0.8, "top_k": 25, "eos_token_id": 6561 + 1} print("Model initialized. Preparing inputs...") - if args.ref_audio: - if not os.path.exists(args.ref_audio): - raise FileNotFoundError(f"Audio file not found: {args.ref_audio}") - # Load at native sample rate - audio_signal, sr = load_audio(args.ref_audio, sr=None) - - # Validate sample rate before processing (similar to original CosyVoice) - min_sr = 16000 - if sr < min_sr: - raise ValueError( - f"Audio sample rate {sr} Hz is too low. " - f"Minimum required: {min_sr} Hz. " - f"Please provide audio with sample rate >= {min_sr} Hz." - ) - - audio_data = (audio_signal.astype(np.float32), sr) - else: - audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate + ref_audio_path = args.ref_audio or _default_ref_audio() + if not os.path.exists(ref_audio_path): + raise FileNotFoundError(f"Audio file not found: {ref_audio_path}") + # Load at native sample rate + audio_signal, sr = load_audio(ref_audio_path, sr=None) + + # Validate sample rate before processing (similar to original CosyVoice) + min_sr = 16000 + if sr < min_sr: + raise ValueError( + f"Audio sample rate {sr} Hz is too low. " + f"Minimum required: {min_sr} Hz. " + f"Please provide audio with sample rate >= {min_sr} Hz." 
+ ) + + audio_data = (audio_signal.astype(np.float32), sr) prompts = { "prompt": args.text, diff --git a/examples/offline_inference/text_to_speech/voxcpm/end2end.py b/examples/offline_inference/text_to_speech/voxcpm/end2end.py index b41d7365011..79efa312340 100644 --- a/examples/offline_inference/text_to_speech/voxcpm/end2end.py +++ b/examples/offline_inference/text_to_speech/voxcpm/end2end.py @@ -11,12 +11,9 @@ import torch from vllm.utils.argparse_utils import FlexibleArgumentParser -from tests.helpers.stage_config import get_deploy_config_path from vllm_omni import AsyncOmni, Omni from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults -DEFAULT_SYNC_STAGE_CONFIG = get_deploy_config_path("voxcpm.yaml") - def _build_prompt(args) -> dict[str, Any]: additional_information: dict[str, list[Any]] = { @@ -59,10 +56,6 @@ def _extract_sample_rate(mm: dict[str, Any]) -> int: return int(sr_raw) -def _is_streaming_stage_config(stage_config_path: str) -> bool: - return "async_chunk" in Path(stage_config_path).stem - - def _save_audio(audio: torch.Tensor, sample_rate: int, output_dir: Path, request_id: str) -> Path: output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / f"output_{request_id}.wav" @@ -87,6 +80,7 @@ async def _run_streaming(args) -> Path: omni = AsyncOmni( model=args.model, stage_configs_path=args.stage_configs_path, + deploy_config=args.deploy_config, log_stats=args.log_stats, stage_init_timeout=args.stage_init_timeout, ) @@ -133,6 +127,7 @@ def _run_sync(args) -> Path: omni = Omni( model=args.model, stage_configs_path=args.stage_configs_path, + deploy_config=args.deploy_config, log_stats=args.log_stats, stage_init_timeout=args.stage_init_timeout, ) @@ -164,11 +159,27 @@ def _run_sync(args) -> Path: def parse_args(): parser = FlexibleArgumentParser(description="Minimal offline VoxCPM example for vLLM Omni.") parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.") + parser.add_argument( + 
"--deploy-config", + type=str, + default=None, + help=("Override the deploy config path."), + ) parser.add_argument( "--stage-configs-path", type=str, - default=DEFAULT_SYNC_STAGE_CONFIG, - help=("Stage config path. Use voxcpm.yaml for non-streaming or voxcpm_async_chunk.yaml for streaming."), + default=None, + help=( + "Legacy stage_args yaml path. Required for streaming " + "(vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml); " + "leave unset for non-streaming to use the auto-resolved deploy config." + ), + ) + parser.add_argument( + "--streaming", + action="store_true", + default=False, + help="Stream audio chunks as they arrive via AsyncOmni.", ) parser.add_argument("--text", type=str, required=True, help="Input text for synthesis.") parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for voice cloning.") @@ -194,9 +205,10 @@ def parse_args(): def main(args) -> None: - route = "streaming" if _is_streaming_stage_config(args.stage_configs_path) else "sync" + route = "streaming" if args.streaming else "sync" print(f"Model: {args.model}") - print(f"Stage config: {args.stage_configs_path}") + print(f"Deploy config: {args.deploy_config or ''}") + print(f"Stage configs path: {args.stage_configs_path or ''}") print(f"Route: {route}") if route == "streaming": asyncio.run(_run_streaming(args)) diff --git a/tests/e2e/offline_inference/test_cosyvoice3_expansion.py b/tests/e2e/offline_inference/test_cosyvoice3_expansion.py index 09a79d94cf7..ea6ac8dd3ce 100644 --- a/tests/e2e/offline_inference/test_cosyvoice3_expansion.py +++ b/tests/e2e/offline_inference/test_cosyvoice3_expansion.py @@ -27,9 +27,9 @@ from tests.helpers.media import get_asset_path from tests.helpers.runtime import OmniRunner from tests.helpers.stage_config import get_deploy_config_path -from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer from 
vllm_omni.outputs import OmniRequestOutput +from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512" MODEL_DIR_ENV = "VLLM_OMNI_COSYVOICE3_MODEL_DIR" diff --git a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py index bd1a85a8624..51a93de2c6b 100644 --- a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py +++ b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py @@ -26,10 +26,10 @@ from vllm_omni.diffusion.distributed.utils import get_local_device from vllm_omni.diffusion.models.interface import SupportAudioOutput from vllm_omni.diffusion.request import OmniDiffusionRequest -from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig from vllm_omni.model_executor.models.omnivoice.duration import RuleDurationEstimator from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import OmniVoiceDecoder from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import OmniVoiceGenerator +from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig from vllm_omni.utils.speaker_cache import get_speaker_cache try: diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py index f01094befab..4a2f46b01bd 100644 --- a/vllm_omni/engine/arg_utils.py +++ b/vllm_omni/engine/arg_utils.py @@ -35,13 +35,9 @@ def _register_omni_hf_configs() -> None: try: from transformers import AutoConfig - from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config - from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import ( Qwen3TTSConfig, ) - from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig - from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config except Exception as exc: # pragma: no cover - best-effort optional registration 
logger.warning("Skipping omni HF config registration due to import error: %s", exc) return @@ -56,10 +52,6 @@ def _register_omni_hf_configs() -> None: for model_type, config_cls in [ ("qwen3_tts", Qwen3TTSConfig), - ("cosyvoice3", CosyVoice3Config), - ("omnivoice", OmniVoiceConfig), - ("voxcpm", VoxCPMConfig), - ("voxcpm2", VoxCPM2Config), ]: try: AutoConfig.register(model_type, config_cls) diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index b9bfff2635e..5023307ff8c 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -33,7 +33,6 @@ from vllm.v1.sample.sampler import Sampler from vllm_omni.data_entry_keys import EmbeddingsStruct, OmniPayloadStruct, to_dict, to_struct -from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config from vllm_omni.model_executor.models.cosyvoice3.utils import ( concat_text_with_prompt_ids, extract_speech_feat, @@ -42,6 +41,7 @@ extract_text_token, ) from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config from vllm_omni.utils.speaker_cache import get_speaker_cache logger = init_logger(__name__) diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py index 3ad23cdb108..186a258c809 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py @@ -29,7 +29,7 @@ CausalHiFTGenerator, ) from vllm_omni.model_executor.models.cosyvoice3.code2wav_core.layers import PreLookaheadLayer -from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config +from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config logger = init_logger(__name__) diff --git 
a/vllm_omni/model_executor/models/omnivoice/omnivoice.py b/vllm_omni/model_executor/models/omnivoice/omnivoice.py index bb09edc6ad2..c5026086e23 100644 --- a/vllm_omni/model_executor/models/omnivoice/omnivoice.py +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice.py @@ -37,8 +37,8 @@ ) from vllm.sequence import IntermediateTensors -from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig from vllm_omni.model_executor.models.output_templates import OmniOutput +from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig logger = init_logger(__name__) diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py index cf69f265870..59f23d0d060 100644 --- a/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py @@ -23,7 +23,7 @@ import torch.nn as nn from vllm.logger import init_logger -from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig +from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig logger = init_logger(__name__) diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py index 32fe4227217..eedb5828a40 100644 --- a/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py +++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py @@ -19,7 +19,7 @@ import torch.nn.functional as F from vllm.logger import init_logger -from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig +from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig logger = init_logger(__name__) diff --git a/vllm_omni/model_executor/models/voxcpm/__init__.py b/vllm_omni/model_executor/models/voxcpm/__init__.py index 3b064c0f683..ffde6dee6f9 100644 --- a/vllm_omni/model_executor/models/voxcpm/__init__.py +++ 
b/vllm_omni/model_executor/models/voxcpm/__init__.py @@ -1,7 +1,5 @@ -from .configuration_voxcpm import VoxCPMConfig from .voxcpm import VoxCPMForConditionalGeneration __all__ = [ - "VoxCPMConfig", "VoxCPMForConditionalGeneration", ] diff --git a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py deleted file mode 100644 index ce1d809bd38..00000000000 --- a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py +++ /dev/null @@ -1,3 +0,0 @@ -from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig - -__all__ = ["VoxCPMConfig"] diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py index cb9b8418a50..4f258bbc019 100644 --- a/vllm_omni/transformers_utils/configs/__init__.py +++ b/vllm_omni/transformers_utils/configs/__init__.py @@ -20,6 +20,8 @@ "VoxCPMConfig": "vllm_omni.transformers_utils.configs.voxcpm", "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2", "VoxtralTTSConfig": "vllm_omni.transformers_utils.configs.voxtral_tts", + "CosyVoice3Config": "vllm_omni.transformers_utils.configs.cosyvoice3", + "OmniVoiceConfig": "vllm_omni.transformers_utils.configs.omnivoice", "BailingMoeV2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni", "BailingMM2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni", "MingFlashOmniConfig": "vllm_omni.transformers_utils.configs.ming_flash_omni", @@ -38,6 +40,8 @@ "VoxCPMConfig", "VoxCPM2Config", "VoxtralTTSConfig", + "CosyVoice3Config", + "OmniVoiceConfig", "BailingMoeV2Config", "BailingMM2Config", "MingFlashOmniConfig", @@ -61,9 +65,11 @@ def __dir__(): # Eagerly import all config modules so their AutoConfig.register() side-effects # run as soon as `vllm_omni.transformers_utils.configs` is imported. 
+from vllm_omni.transformers_utils.configs import cosyvoice3 as _cosyvoice3 # noqa: F401, E402 from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech # noqa: F401, E402 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2 # noqa: F401, E402 from vllm_omni.transformers_utils.configs import ming_flash_omni as _ming_flash_omni # noqa: F401, E402 +from vllm_omni.transformers_utils.configs import omnivoice as _omnivoice # noqa: F401, E402 from vllm_omni.transformers_utils.configs import voxcpm as _voxcpm # noqa: F401, E402 from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2 # noqa: F401, E402 from vllm_omni.transformers_utils.configs import voxtral_tts as _voxtral_tts # noqa: F401, E402 diff --git a/vllm_omni/model_executor/models/cosyvoice3/config.py b/vllm_omni/transformers_utils/configs/cosyvoice3.py similarity index 97% rename from vllm_omni/model_executor/models/cosyvoice3/config.py rename to vllm_omni/transformers_utils/configs/cosyvoice3.py index 518fe76b78a..9accc3cc83f 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/config.py +++ b/vllm_omni/transformers_utils/configs/cosyvoice3.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig @@ -131,3 +132,8 @@ def __init__(self, **kwargs): "cond_channels": 512, }, } + + +AutoConfig.register("cosyvoice3", CosyVoice3Config) + +__all__ = ["CosyVoice3Config"] diff --git a/vllm_omni/model_executor/models/omnivoice/config.py b/vllm_omni/transformers_utils/configs/omnivoice.py similarity index 96% rename from vllm_omni/model_executor/models/omnivoice/config.py rename to vllm_omni/transformers_utils/configs/omnivoice.py index a24176bcf25..d1bd3527fee 100644 --- a/vllm_omni/model_executor/models/omnivoice/config.py +++ b/vllm_omni/transformers_utils/configs/omnivoice.py @@ -2,6 +2,7 @@ # 
SPDX-FileCopyrightText: Copyright contributors to the vLLM project """OmniVoice configuration for vLLM-Omni two-stage pipeline.""" +from transformers import AutoConfig from transformers.configuration_utils import PretrainedConfig @@ -79,3 +80,8 @@ def __init__(self, **kwargs): # Serving self.speculative_config = None + + +AutoConfig.register("omnivoice", OmniVoiceConfig) + +__all__ = ["OmniVoiceConfig"]