vllm-project · hsliuustc0106 · May 16, 2026 · May 4, 2026 · May 4, 2026 · May 4, 2026
@@ -99,7 +99,6 @@ Configuration classes.
 - [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
 - [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
 - [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
-- [vllm_omni.model_executor.models.cosyvoice3.config.CosyVoice3Config][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechConfig][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechFastARConfig][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechSlowARConfig][]
@@ -116,10 +115,14 @@ Configuration classes.
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
+- [vllm_omni.transformers_utils.configs.cosyvoice3.CosyVoice3Config][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Config][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLConfig][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLTextConfig][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLVisionConfig][]
+- [vllm_omni.transformers_utils.configs.omnivoice.OmniVoiceConfig][]
+- [vllm_omni.transformers_utils.configs.voxcpm.VoxCPMConfig][]
+- [vllm_omni.transformers_utils.configs.voxcpm2.VoxCPM2Config][]
 
 ## Workers
 

@@ -73,13 +73,15 @@ python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \
 ```
 
 ### Voice cloning
-Pass a reference audio. Note that CosyVoice3's `--prompt-text` is a system-style prompt for the GPT stage, not a reference transcript:
+If `--ref-audio` is omitted, the script downloads the upstream
+[`zero_shot_prompt.wav`](https://github.com/FunAudioLLM/CosyVoice/blob/main/asset/zero_shot_prompt.wav) from the CosyVoice repo into the current directory.
+To use your own clip, pass `--ref-audio /path/to/reference.wav` and update `--prompt-text` so that the text after `<|endofprompt|>` is the transcript of that clip.
 ```bash
 python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \
     --model pretrained_models/Fun-CosyVoice3-0.5B \
     --tokenizer pretrained_models/Fun-CosyVoice3-0.5B/CosyVoice-BlankEN \
-    --ref-audio prompt.wav \
-    --prompt-text "You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?"
+    --ref-audio /path/to/reference.wav \
+    --prompt-text "You are a helpful assistant.<|endofprompt|>Transcript of your reference audio clip"
 ```
 
 ### Streaming
@@ -317,44 +319,39 @@ pip install voxcpm soundfile
 export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src
 ```
 
-If the native VoxCPM `config.json` does not contain HF metadata such as `model_type`, prepare a persistent HF-compatible config directory and point the stage configs to it via `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`:
-
-```bash
-export VOXCPM_MODEL=/path/to/voxcpm-model
-export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config
-mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"
-cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json"
-cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true
-python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)'
-```
-
 ### Quick start
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
     --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni."
 ```
 
 ### Voice cloning
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
     --text "This sentence is synthesized with a cloned voice." \
     --ref-audio /path/to/reference.wav \
     --ref-text  "The exact transcript spoken in reference.wav."
 ```
 
 ### Streaming
-Pass the async-chunk stage config:
+Pass `--streaming` together with the legacy async-chunk stage config:
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
+    --streaming \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \
     --text "This is a split-stage VoxCPM streaming example running on vLLM Omni."
 ```
 
+### Persistent HF config (optional)
+The engine auto-patches a missing `model_type` in the checkpoint's `config.json` in-memory at startup (see `vllm_omni/engine/arg_utils.py`), so no manual setup is required.
+
+If you'd rather maintain a persistent patched config dir (e.g. to share across runs or to add fields the auto-patcher doesn't set), export `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` to its location — both the bundled deploy and async-chunk yamls read it via `${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,}`.
+
 ### Notes
-- `voxcpm.yaml` is the default non-streaming stage config; `voxcpm_async_chunk.yaml` enables streaming.
+- Non-streaming auto-resolves `vllm_omni/deploy/voxcpm.yaml` based on the model directory name or the HF `model_type`. Streaming uses the legacy `voxcpm_async_chunk.yaml` passed via `--stage-configs-path`.
 - Streaming is currently single-request oriented.
 - `--ref-text` must be the real transcript of `--ref-audio`; mismatched text degrades quality.
 - For online serving, see the [VoxCPM section in the online hub](../../online_serving/text_to_speech/README.md#voxcpm). For benchmark reporting, see [`benchmarks/voxcpm`](../../../benchmarks/voxcpm/README.md).

@@ -2,18 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os
+import urllib.request
+from pathlib import Path
 
 import numpy as np
 import soundfile as sf
 from vllm import SamplingParams
-from vllm.assets.audio import AudioAsset
 from vllm.multimodal.media.audio import load_audio
 
 from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
 from vllm_omni.entrypoints.omni import Omni
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer
 from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
+
+# Upstream zero-shot reference clip
+ZERO_SHOT_PROMPT_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav"
+
+
+def _default_ref_audio() -> str:
+    # Reuse ./zero_shot_prompt.wav when a non-empty copy exists; otherwise fetch it from ZERO_SHOT_PROMPT_URL into the current dir
+    dest = Path("zero_shot_prompt.wav")
+    if not dest.exists() or dest.stat().st_size == 0:
+        print(f"Downloading default reference audio to {dest}")
+        urllib.request.urlretrieve(ZERO_SHOT_PROMPT_URL, dest)
+    # NOTE(review): only existence and size > 0 are checked, so a truncated file left by an interrupted download would be reused as-is
+    return str(dest)
 
 
 def run_e2e():
@@ -36,9 +50,15 @@ def run_e2e():
     parser.add_argument(
         "--prompt-text",
         type=str,
-        default="You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?",
+        default="You are a helpful assistant.<|endofprompt|>希望你以后，能够做的比我还好呦!",
+    )
+    parser.add_argument(
+        "--ref-audio",
+        type=str,
+        default=None,
+        help="Path to reference audio for voice cloning. "
+        "If unset, downloads the upstream CosyVoice3 zero-shot prompt audio clip",
     )
-    parser.add_argument("--ref-audio", type=str, default="prompt.wav")
     parser.add_argument(
         "--tokenizer",
         type=str,
@@ -66,24 +86,22 @@ def run_e2e():
     sampling_cfg = {"top_p": 0.8, "top_k": 25, "eos_token_id": 6561 + 1}
 
     print("Model initialized. Preparing inputs...")
-    if args.ref_audio:
-        if not os.path.exists(args.ref_audio):
-            raise FileNotFoundError(f"Audio file not found: {args.ref_audio}")
-        # Load at native sample rate
-        audio_signal, sr = load_audio(args.ref_audio, sr=None)
-
-        # Validate sample rate before processing (similar to original CosyVoice)
-        min_sr = 16000
-        if sr < min_sr:
-            raise ValueError(
-                f"Audio sample rate {sr} Hz is too low. "
-                f"Minimum required: {min_sr} Hz. "
-                f"Please provide audio with sample rate >= {min_sr} Hz."
-            )
-
-        audio_data = (audio_signal.astype(np.float32), sr)
-    else:
-        audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
+    ref_audio_path = args.ref_audio or _default_ref_audio()
+    if not os.path.exists(ref_audio_path):
+        raise FileNotFoundError(f"Audio file not found: {ref_audio_path}")
+    # Load at native sample rate
+    audio_signal, sr = load_audio(ref_audio_path, sr=None)
+
+    # Validate sample rate before processing (similar to original CosyVoice)
+    min_sr = 16000
+    if sr < min_sr:
+        raise ValueError(
+            f"Audio sample rate {sr} Hz is too low. "
+            f"Minimum required: {min_sr} Hz. "
+            f"Please provide audio with sample rate >= {min_sr} Hz."
+        )
+
+    audio_data = (audio_signal.astype(np.float32), sr)
 
     prompts = {
         "prompt": args.text,

@@ -11,12 +11,9 @@
 import torch
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-from tests.helpers.stage_config import get_deploy_config_path
 from vllm_omni import AsyncOmni, Omni
 from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
 
-DEFAULT_SYNC_STAGE_CONFIG = get_deploy_config_path("voxcpm.yaml")
-
 
 def _build_prompt(args) -> dict[str, Any]:
     additional_information: dict[str, list[Any]] = {
@@ -59,10 +56,6 @@ def _extract_sample_rate(mm: dict[str, Any]) -> int:
     return int(sr_raw)
 
 
-def _is_streaming_stage_config(stage_config_path: str) -> bool:
-    return "async_chunk" in Path(stage_config_path).stem
-
-
 def _save_audio(audio: torch.Tensor, sample_rate: int, output_dir: Path, request_id: str) -> Path:
     output_dir.mkdir(parents=True, exist_ok=True)
     output_path = output_dir / f"output_{request_id}.wav"
@@ -87,6 +80,7 @@ async def _run_streaming(args) -> Path:
     omni = AsyncOmni(
         model=args.model,
         stage_configs_path=args.stage_configs_path,
+        deploy_config=args.deploy_config,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
@@ -133,6 +127,7 @@ def _run_sync(args) -> Path:
     omni = Omni(
         model=args.model,
         stage_configs_path=args.stage_configs_path,
+        deploy_config=args.deploy_config,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
@@ -164,11 +159,27 @@ def _run_sync(args) -> Path:
 def parse_args():
     parser = FlexibleArgumentParser(description="Minimal offline VoxCPM example for vLLM Omni.")
     parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.")
+    parser.add_argument(
+        "--deploy-config",
+        type=str,
+        default=None,
+        help=("Override the deploy config path."),
+    )
     parser.add_argument(
         "--stage-configs-path",
         type=str,
-        default=DEFAULT_SYNC_STAGE_CONFIG,
-        help=("Stage config path. Use voxcpm.yaml for non-streaming or voxcpm_async_chunk.yaml for streaming."),
+        default=None,
+        help=(
+            "Legacy stage_args yaml path. Required for streaming "
+            "(vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml); "
+            "leave unset for non-streaming to use the auto-resolved deploy config."
+        ),
+    )
+    parser.add_argument(
+        "--streaming",
+        action="store_true",
+        default=False,
+        help="Stream audio chunks as they arrive via AsyncOmni.",
     )
     parser.add_argument("--text", type=str, required=True, help="Input text for synthesis.")
     parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for voice cloning.")
@@ -194,9 +205,10 @@ def parse_args():
 
 
 def main(args) -> None:
-    route = "streaming" if _is_streaming_stage_config(args.stage_configs_path) else "sync"
+    route = "streaming" if args.streaming else "sync"
     print(f"Model: {args.model}")
-    print(f"Stage config: {args.stage_configs_path}")
+    print(f"Deploy config: {args.deploy_config or '<auto from HF model_type>'}")
+    print(f"Stage configs path: {args.stage_configs_path or '<unused>'}")
     print(f"Route: {route}")
     if route == "streaming":
         asyncio.run(_run_streaming(args))

@@ -27,9 +27,9 @@
 from tests.helpers.media import get_asset_path
 from tests.helpers.runtime import OmniRunner
 from tests.helpers.stage_config import get_deploy_config_path
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer
 from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 
 MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
 MODEL_DIR_ENV = "VLLM_OMNI_COSYVOICE3_MODEL_DIR"

@@ -26,10 +26,10 @@
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.models.interface import SupportAudioOutput
 from vllm_omni.diffusion.request import OmniDiffusionRequest
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
 from vllm_omni.model_executor.models.omnivoice.duration import RuleDurationEstimator
 from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import OmniVoiceDecoder
 from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import OmniVoiceGenerator
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 from vllm_omni.utils.speaker_cache import get_speaker_cache
 
 try:

@@ -35,13 +35,9 @@ def _register_omni_hf_configs() -> None:
     try:
         from transformers import AutoConfig
 
-        from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
-        from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
         from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import (
             Qwen3TTSConfig,
         )
-        from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig
-        from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config
     except Exception as exc:  # pragma: no cover - best-effort optional registration
         logger.warning("Skipping omni HF config registration due to import error: %s", exc)
         return
@@ -56,10 +52,6 @@ def _register_omni_hf_configs() -> None:
 
     for model_type, config_cls in [
         ("qwen3_tts", Qwen3TTSConfig),
-        ("cosyvoice3", CosyVoice3Config),
-        ("omnivoice", OmniVoiceConfig),
-        ("voxcpm", VoxCPMConfig),
-        ("voxcpm2", VoxCPM2Config),
     ]:
         try:
             AutoConfig.register(model_type, config_cls)

@@ -33,7 +33,6 @@
 from vllm.v1.sample.sampler import Sampler
 
 from vllm_omni.data_entry_keys import EmbeddingsStruct, OmniPayloadStruct, to_dict, to_struct
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.utils import (
     concat_text_with_prompt_ids,
     extract_speech_feat,
@@ -42,6 +41,7 @@
     extract_text_token,
 )
 from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 from vllm_omni.utils.speaker_cache import get_speaker_cache
 
 logger = init_logger(__name__)

@@ -29,7 +29,7 @@
     CausalHiFTGenerator,
 )
 from vllm_omni.model_executor.models.cosyvoice3.code2wav_core.layers import PreLookaheadLayer
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 
 logger = init_logger(__name__)
 

@@ -37,8 +37,8 @@
 )
 from vllm.sequence import IntermediateTensors
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
 from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 

@@ -23,7 +23,7 @@
 import torch.nn as nn
 from vllm.logger import init_logger
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 

@@ -19,7 +19,7 @@
 import torch.nn.functional as F
 from vllm.logger import init_logger
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 

@@ -1,7 +1,5 @@
-from .configuration_voxcpm import VoxCPMConfig
 from .voxcpm import VoxCPMForConditionalGeneration
 
 __all__ = [
-    "VoxCPMConfig",
     "VoxCPMForConditionalGeneration",
 ]