diff --git a/docs/api/README.md b/docs/api/README.md
index 0147f19e126..11401dc0208 100644
--- a/docs/api/README.md
+++ b/docs/api/README.md
@@ -99,7 +99,6 @@ Configuration classes.
 - [vllm_omni.diffusion.cache.teacache.config.TeaCacheConfig][]
 - [vllm_omni.distributed.omni_connectors.utils.config.ConnectorSpec][]
 - [vllm_omni.distributed.omni_connectors.utils.config.OmniTransferConfig][]
-- [vllm_omni.model_executor.models.cosyvoice3.config.CosyVoice3Config][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechConfig][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechFastARConfig][]
 - [vllm_omni.model_executor.models.fish_speech.configuration_fish_speech.FishSpeechSlowARConfig][]
@@ -116,10 +115,14 @@ Configuration classes.
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderConfig][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1DecoderDiTConfig][]
 - [vllm_omni.model_executor.models.qwen3_tts.tokenizer_25hz.configuration_qwen3_tts_tokenizer_v1.Qwen3TTSTokenizerV1EncoderConfig][]
+- [vllm_omni.transformers_utils.configs.cosyvoice3.CosyVoice3Config][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Config][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLConfig][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLTextConfig][]
 - [vllm_omni.transformers_utils.configs.mammoth_moda2.Mammothmoda2Qwen2_5_VLVisionConfig][]
+- [vllm_omni.transformers_utils.configs.omnivoice.OmniVoiceConfig][]
+- [vllm_omni.transformers_utils.configs.voxcpm.VoxCPMConfig][]
+- [vllm_omni.transformers_utils.configs.voxcpm2.VoxCPM2Config][]
 
 ## Workers
 
diff --git a/examples/offline_inference/text_to_speech/README.md b/examples/offline_inference/text_to_speech/README.md
index ddc5f11c16b..bfdee543cc7 100644
--- a/examples/offline_inference/text_to_speech/README.md
+++ b/examples/offline_inference/text_to_speech/README.md
@@ -73,13 +73,15 @@ python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \
 ```
 
 ### Voice cloning
-Pass a reference audio. Note that CosyVoice3's `--prompt-text` is a system-style prompt for the GPT stage, not a reference transcript:
+If `--ref-audio` is omitted, the script downloads the upstream
+[`zero_shot_prompt.wav`](https://github.com/FunAudioLLM/CosyVoice/blob/main/asset/zero_shot_prompt.wav) from the CosyVoice repo into the current directory.
+To use your own clip, pass `--ref-audio /path/to/reference.wav` and update the transcript part of `--prompt-text` (the text after `<|endofprompt|>`) to match what is spoken in that clip.
 ```bash
 python examples/offline_inference/text_to_speech/cosyvoice3/end2end.py \
     --model pretrained_models/Fun-CosyVoice3-0.5B \
     --tokenizer pretrained_models/Fun-CosyVoice3-0.5B/CosyVoice-BlankEN \
-    --ref-audio prompt.wav \
-    --prompt-text "You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?"
+    --ref-audio /path/to/reference.wav \
+    --prompt-text "You are a helpful assistant.<|endofprompt|>Transcript of your reference audio clip"
 ```
 
 ### Streaming
@@ -317,44 +319,39 @@ pip install voxcpm soundfile
 export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/VoxCPM/src
 ```
 
-If the native VoxCPM `config.json` does not contain HF metadata such as `model_type`, prepare a persistent HF-compatible config directory and point the stage configs to it via `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH`:
-
-```bash
-export VOXCPM_MODEL=/path/to/voxcpm-model
-export VLLM_OMNI_VOXCPM_HF_CONFIG_PATH=/tmp/voxcpm_hf_config
-mkdir -p "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"
-cp "$VOXCPM_MODEL/config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/config.json"
-cp "$VOXCPM_MODEL/generation_config.json" "$VLLM_OMNI_VOXCPM_HF_CONFIG_PATH/generation_config.json" 2>/dev/null || true
-python3 -c 'import json, os; p=os.path.join(os.environ["VLLM_OMNI_VOXCPM_HF_CONFIG_PATH"], "config.json"); cfg=json.load(open(p, "r", encoding="utf-8")); cfg["model_type"]="voxcpm"; cfg.setdefault("architectures", ["VoxCPMForConditionalGeneration"]); json.dump(cfg, open(p, "w", encoding="utf-8"), indent=2, ensure_ascii=False)'
-```
-
 ### Quick start
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
     --text "This is a split-stage VoxCPM synthesis example running on vLLM Omni."
 ```
 
 ### Voice cloning
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
     --text "This sentence is synthesized with a cloned voice." \
     --ref-audio /path/to/reference.wav \
     --ref-text  "The exact transcript spoken in reference.wav."
 ```
 
 ### Streaming
-Pass the async-chunk stage config:
+Pass `--streaming` together with the legacy async-chunk stage config:
 ```bash
 python examples/offline_inference/text_to_speech/voxcpm/end2end.py \
-    --model "$VOXCPM_MODEL" \
+    --model openbmb/VoxCPM-0.5B \
+    --streaming \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml \
     --text "This is a split-stage VoxCPM streaming example running on vLLM Omni."
 ```
 
+### Persistent HF config (optional)
+The engine auto-patches a missing `model_type` in the checkpoint's `config.json` in-memory at startup (see `vllm_omni/engine/arg_utils.py`), so no manual setup is required.
+
+If you'd rather maintain a persistent patched config dir (e.g. to share across runs or to add fields the auto-patcher doesn't set), export `VLLM_OMNI_VOXCPM_HF_CONFIG_PATH` to its location — both the bundled deploy and async-chunk yamls read it via `${oc.env:VLLM_OMNI_VOXCPM_HF_CONFIG_PATH,}`.
+
 ### Notes
-- `voxcpm.yaml` is the default non-streaming stage config; `voxcpm_async_chunk.yaml` enables streaming.
+- Non-streaming mode auto-resolves `vllm_omni/deploy/voxcpm.yaml` based on the model directory name / HF `model_type`. Streaming uses the legacy `voxcpm_async_chunk.yaml`, passed via `--stage-configs-path`.
 - Streaming is currently single-request oriented.
 - `--ref-text` must be the real transcript of `--ref-audio`; mismatched text degrades quality.
 - For online serving, see the [VoxCPM section in the online hub](../../online_serving/text_to_speech/README.md#voxcpm). For benchmark reporting, see [`benchmarks/voxcpm`](../../../benchmarks/voxcpm/README.md).
diff --git a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py
index 0332ab38236..8a3309ca7a6 100644
--- a/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py
+++ b/examples/offline_inference/text_to_speech/cosyvoice3/end2end.py
@@ -2,18 +2,32 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import os
+import urllib.request
+from pathlib import Path
 
 import numpy as np
 import soundfile as sf
 from vllm import SamplingParams
-from vllm.assets.audio import AudioAsset
 from vllm.multimodal.media.audio import load_audio
 
 from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
 from vllm_omni.entrypoints.omni import Omni
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer
 from vllm_omni.model_executor.models.cosyvoice3.utils import extract_text_token
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
+
+# Upstream zero-shot reference clip
+ZERO_SHOT_PROMPT_URL = "https://raw.githubusercontent.com/FunAudioLLM/CosyVoice/main/asset/zero_shot_prompt.wav"
+
+
+def _default_ref_audio() -> str:
+    # Download the upstream zero_shot_prompt.wav into the current dir unless a non-empty copy is already there
+    dest = Path("zero_shot_prompt.wav")
+    if not dest.exists() or dest.stat().st_size == 0:
+        print(f"Downloading default reference audio to {dest}")
+        urllib.request.urlretrieve(ZERO_SHOT_PROMPT_URL, dest)
+
+    return str(dest)
 
 
 def run_e2e():
@@ -36,9 +50,15 @@ def run_e2e():
     parser.add_argument(
         "--prompt-text",
         type=str,
-        default="You are a helpful assistant.<|endofprompt|>Testing my voices. Why should I not?",
+        default="You are a helpful assistant.<|endofprompt|>希望你以后，能够做的比我还好呦!",
+    )
+    parser.add_argument(
+        "--ref-audio",
+        type=str,
+        default=None,
+        help="Path to reference audio for voice cloning. "
+        "If unset, downloads the upstream CosyVoice3 zero-shot prompt audio clip",
     )
-    parser.add_argument("--ref-audio", type=str, default="prompt.wav")
     parser.add_argument(
         "--tokenizer",
         type=str,
@@ -66,24 +86,22 @@ def run_e2e():
     sampling_cfg = {"top_p": 0.8, "top_k": 25, "eos_token_id": 6561 + 1}
 
     print("Model initialized. Preparing inputs...")
-    if args.ref_audio:
-        if not os.path.exists(args.ref_audio):
-            raise FileNotFoundError(f"Audio file not found: {args.ref_audio}")
-        # Load at native sample rate
-        audio_signal, sr = load_audio(args.ref_audio, sr=None)
-
-        # Validate sample rate before processing (similar to original CosyVoice)
-        min_sr = 16000
-        if sr < min_sr:
-            raise ValueError(
-                f"Audio sample rate {sr} Hz is too low. "
-                f"Minimum required: {min_sr} Hz. "
-                f"Please provide audio with sample rate >= {min_sr} Hz."
-            )
-
-        audio_data = (audio_signal.astype(np.float32), sr)
-    else:
-        audio_data = AudioAsset("mary_had_lamb").audio_and_sample_rate
+    ref_audio_path = args.ref_audio or _default_ref_audio()
+    if not os.path.exists(ref_audio_path):
+        raise FileNotFoundError(f"Audio file not found: {ref_audio_path}")
+    # Load at native sample rate
+    audio_signal, sr = load_audio(ref_audio_path, sr=None)
+
+    # Validate sample rate before processing (similar to original CosyVoice)
+    min_sr = 16000
+    if sr < min_sr:
+        raise ValueError(
+            f"Audio sample rate {sr} Hz is too low. "
+            f"Minimum required: {min_sr} Hz. "
+            f"Please provide audio with sample rate >= {min_sr} Hz."
+        )
+
+    audio_data = (audio_signal.astype(np.float32), sr)
 
     prompts = {
         "prompt": args.text,
diff --git a/examples/offline_inference/text_to_speech/voxcpm/end2end.py b/examples/offline_inference/text_to_speech/voxcpm/end2end.py
index b41d7365011..79efa312340 100644
--- a/examples/offline_inference/text_to_speech/voxcpm/end2end.py
+++ b/examples/offline_inference/text_to_speech/voxcpm/end2end.py
@@ -11,12 +11,9 @@
 import torch
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 
-from tests.helpers.stage_config import get_deploy_config_path
 from vllm_omni import AsyncOmni, Omni
 from vllm_omni.engine.arg_utils import nullify_stage_engine_defaults
 
-DEFAULT_SYNC_STAGE_CONFIG = get_deploy_config_path("voxcpm.yaml")
-
 
 def _build_prompt(args) -> dict[str, Any]:
     additional_information: dict[str, list[Any]] = {
@@ -59,10 +56,6 @@ def _extract_sample_rate(mm: dict[str, Any]) -> int:
     return int(sr_raw)
 
 
-def _is_streaming_stage_config(stage_config_path: str) -> bool:
-    return "async_chunk" in Path(stage_config_path).stem
-
-
 def _save_audio(audio: torch.Tensor, sample_rate: int, output_dir: Path, request_id: str) -> Path:
     output_dir.mkdir(parents=True, exist_ok=True)
     output_path = output_dir / f"output_{request_id}.wav"
@@ -87,6 +80,7 @@ async def _run_streaming(args) -> Path:
     omni = AsyncOmni(
         model=args.model,
         stage_configs_path=args.stage_configs_path,
+        deploy_config=args.deploy_config,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
@@ -133,6 +127,7 @@ def _run_sync(args) -> Path:
     omni = Omni(
         model=args.model,
         stage_configs_path=args.stage_configs_path,
+        deploy_config=args.deploy_config,
         log_stats=args.log_stats,
         stage_init_timeout=args.stage_init_timeout,
     )
@@ -164,11 +159,27 @@ def _run_sync(args) -> Path:
 def parse_args():
     parser = FlexibleArgumentParser(description="Minimal offline VoxCPM example for vLLM Omni.")
     parser.add_argument("--model", type=str, required=True, help="Local VoxCPM model directory.")
+    parser.add_argument(
+        "--deploy-config",
+        type=str,
+        default=None,
+        help=("Override the deploy config path."),
+    )
     parser.add_argument(
         "--stage-configs-path",
         type=str,
-        default=DEFAULT_SYNC_STAGE_CONFIG,
-        help=("Stage config path. Use voxcpm.yaml for non-streaming or voxcpm_async_chunk.yaml for streaming."),
+        default=None,
+        help=(
+            "Legacy stage_args yaml path. Required for streaming "
+            "(vllm_omni/model_executor/stage_configs/voxcpm_async_chunk.yaml); "
+            "leave unset for non-streaming to use the auto-resolved deploy config."
+        ),
+    )
+    parser.add_argument(
+        "--streaming",
+        action="store_true",
+        default=False,
+        help="Stream audio chunks as they arrive via AsyncOmni.",
     )
     parser.add_argument("--text", type=str, required=True, help="Input text for synthesis.")
     parser.add_argument("--ref-audio", type=str, default=None, help="Reference audio path for voice cloning.")
@@ -194,9 +205,10 @@ def parse_args():
 
 
 def main(args) -> None:
-    route = "streaming" if _is_streaming_stage_config(args.stage_configs_path) else "sync"
+    route = "streaming" if args.streaming else "sync"
     print(f"Model: {args.model}")
-    print(f"Stage config: {args.stage_configs_path}")
+    print(f"Deploy config: {args.deploy_config or '<auto from HF model_type>'}")
+    print(f"Stage configs path: {args.stage_configs_path or '<unused>'}")
     print(f"Route: {route}")
     if route == "streaming":
         asyncio.run(_run_streaming(args))
diff --git a/tests/e2e/offline_inference/test_cosyvoice3_expansion.py b/tests/e2e/offline_inference/test_cosyvoice3_expansion.py
index 09a79d94cf7..ea6ac8dd3ce 100644
--- a/tests/e2e/offline_inference/test_cosyvoice3_expansion.py
+++ b/tests/e2e/offline_inference/test_cosyvoice3_expansion.py
@@ -27,9 +27,9 @@
 from tests.helpers.media import get_asset_path
 from tests.helpers.runtime import OmniRunner
 from tests.helpers.stage_config import get_deploy_config_path
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.tokenizer import get_qwen_tokenizer
 from vllm_omni.outputs import OmniRequestOutput
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 
 MODEL = "FunAudioLLM/Fun-CosyVoice3-0.5B-2512"
 MODEL_DIR_ENV = "VLLM_OMNI_COSYVOICE3_MODEL_DIR"
diff --git a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py
index bd1a85a8624..51a93de2c6b 100644
--- a/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py
+++ b/vllm_omni/diffusion/models/omnivoice/pipeline_omnivoice.py
@@ -26,10 +26,10 @@
 from vllm_omni.diffusion.distributed.utils import get_local_device
 from vllm_omni.diffusion.models.interface import SupportAudioOutput
 from vllm_omni.diffusion.request import OmniDiffusionRequest
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
 from vllm_omni.model_executor.models.omnivoice.duration import RuleDurationEstimator
 from vllm_omni.model_executor.models.omnivoice.omnivoice_decoder import OmniVoiceDecoder
 from vllm_omni.model_executor.models.omnivoice.omnivoice_generator import OmniVoiceGenerator
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 from vllm_omni.utils.speaker_cache import get_speaker_cache
 
 try:
diff --git a/vllm_omni/engine/arg_utils.py b/vllm_omni/engine/arg_utils.py
index f01094befab..4a2f46b01bd 100644
--- a/vllm_omni/engine/arg_utils.py
+++ b/vllm_omni/engine/arg_utils.py
@@ -35,13 +35,9 @@ def _register_omni_hf_configs() -> None:
     try:
         from transformers import AutoConfig
 
-        from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
-        from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
         from vllm_omni.model_executor.models.qwen3_tts.configuration_qwen3_tts import (
             Qwen3TTSConfig,
         )
-        from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig
-        from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config
     except Exception as exc:  # pragma: no cover - best-effort optional registration
         logger.warning("Skipping omni HF config registration due to import error: %s", exc)
         return
@@ -56,10 +52,6 @@ def _register_omni_hf_configs() -> None:
 
     for model_type, config_cls in [
         ("qwen3_tts", Qwen3TTSConfig),
-        ("cosyvoice3", CosyVoice3Config),
-        ("omnivoice", OmniVoiceConfig),
-        ("voxcpm", VoxCPMConfig),
-        ("voxcpm2", VoxCPM2Config),
     ]:
         try:
             AutoConfig.register(model_type, config_cls)
diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py
index b9bfff2635e..5023307ff8c 100644
--- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py
+++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py
@@ -33,7 +33,6 @@
 from vllm.v1.sample.sampler import Sampler
 
 from vllm_omni.data_entry_keys import EmbeddingsStruct, OmniPayloadStruct, to_dict, to_struct
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
 from vllm_omni.model_executor.models.cosyvoice3.utils import (
     concat_text_with_prompt_ids,
     extract_speech_feat,
@@ -42,6 +41,7 @@
     extract_text_token,
 )
 from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 from vllm_omni.utils.speaker_cache import get_speaker_cache
 
 logger = init_logger(__name__)
diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py
index 3ad23cdb108..186a258c809 100644
--- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py
+++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py
@@ -29,7 +29,7 @@
     CausalHiFTGenerator,
 )
 from vllm_omni.model_executor.models.cosyvoice3.code2wav_core.layers import PreLookaheadLayer
-from vllm_omni.model_executor.models.cosyvoice3.config import CosyVoice3Config
+from vllm_omni.transformers_utils.configs.cosyvoice3 import CosyVoice3Config
 
 logger = init_logger(__name__)
 
diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice.py b/vllm_omni/model_executor/models/omnivoice/omnivoice.py
index bb09edc6ad2..c5026086e23 100644
--- a/vllm_omni/model_executor/models/omnivoice/omnivoice.py
+++ b/vllm_omni/model_executor/models/omnivoice/omnivoice.py
@@ -37,8 +37,8 @@
 )
 from vllm.sequence import IntermediateTensors
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
 from vllm_omni.model_executor.models.output_templates import OmniOutput
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 
diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py
index cf69f265870..59f23d0d060 100644
--- a/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py
+++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_decoder.py
@@ -23,7 +23,7 @@
 import torch.nn as nn
 from vllm.logger import init_logger
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 
diff --git a/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py
index 32fe4227217..eedb5828a40 100644
--- a/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py
+++ b/vllm_omni/model_executor/models/omnivoice/omnivoice_generator.py
@@ -19,7 +19,7 @@
 import torch.nn.functional as F
 from vllm.logger import init_logger
 
-from vllm_omni.model_executor.models.omnivoice.config import OmniVoiceConfig
+from vllm_omni.transformers_utils.configs.omnivoice import OmniVoiceConfig
 
 logger = init_logger(__name__)
 
diff --git a/vllm_omni/model_executor/models/voxcpm/__init__.py b/vllm_omni/model_executor/models/voxcpm/__init__.py
index 3b064c0f683..ffde6dee6f9 100644
--- a/vllm_omni/model_executor/models/voxcpm/__init__.py
+++ b/vllm_omni/model_executor/models/voxcpm/__init__.py
@@ -1,7 +1,5 @@
-from .configuration_voxcpm import VoxCPMConfig
 from .voxcpm import VoxCPMForConditionalGeneration
 
 __all__ = [
-    "VoxCPMConfig",
     "VoxCPMForConditionalGeneration",
 ]
diff --git a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py b/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py
deleted file mode 100644
index ce1d809bd38..00000000000
--- a/vllm_omni/model_executor/models/voxcpm/configuration_voxcpm.py
+++ /dev/null
@@ -1,3 +0,0 @@
-from vllm_omni.transformers_utils.configs.voxcpm import VoxCPMConfig
-
-__all__ = ["VoxCPMConfig"]
diff --git a/vllm_omni/transformers_utils/configs/__init__.py b/vllm_omni/transformers_utils/configs/__init__.py
index cb9b8418a50..4f258bbc019 100644
--- a/vllm_omni/transformers_utils/configs/__init__.py
+++ b/vllm_omni/transformers_utils/configs/__init__.py
@@ -20,6 +20,8 @@
     "VoxCPMConfig": "vllm_omni.transformers_utils.configs.voxcpm",
     "VoxCPM2Config": "vllm_omni.transformers_utils.configs.voxcpm2",
     "VoxtralTTSConfig": "vllm_omni.transformers_utils.configs.voxtral_tts",
+    "CosyVoice3Config": "vllm_omni.transformers_utils.configs.cosyvoice3",
+    "OmniVoiceConfig": "vllm_omni.transformers_utils.configs.omnivoice",
     "BailingMoeV2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni",
     "BailingMM2Config": "vllm_omni.transformers_utils.configs.ming_flash_omni",
     "MingFlashOmniConfig": "vllm_omni.transformers_utils.configs.ming_flash_omni",
@@ -38,6 +40,8 @@
     "VoxCPMConfig",
     "VoxCPM2Config",
     "VoxtralTTSConfig",
+    "CosyVoice3Config",
+    "OmniVoiceConfig",
     "BailingMoeV2Config",
     "BailingMM2Config",
     "MingFlashOmniConfig",
@@ -61,9 +65,11 @@ def __dir__():
 
 # Eagerly import all config modules so their AutoConfig.register() side-effects
 # run as soon as `vllm_omni.transformers_utils.configs` is imported.
+from vllm_omni.transformers_utils.configs import cosyvoice3 as _cosyvoice3  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import fish_speech as _fish_speech  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import mammoth_moda2 as _mammoth_moda2  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import ming_flash_omni as _ming_flash_omni  # noqa: F401, E402
+from vllm_omni.transformers_utils.configs import omnivoice as _omnivoice  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import voxcpm as _voxcpm  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import voxcpm2 as _voxcpm2  # noqa: F401, E402
 from vllm_omni.transformers_utils.configs import voxtral_tts as _voxtral_tts  # noqa: F401, E402
diff --git a/vllm_omni/model_executor/models/cosyvoice3/config.py b/vllm_omni/transformers_utils/configs/cosyvoice3.py
similarity index 97%
rename from vllm_omni/model_executor/models/cosyvoice3/config.py
rename to vllm_omni/transformers_utils/configs/cosyvoice3.py
index 518fe76b78a..9accc3cc83f 100644
--- a/vllm_omni/model_executor/models/cosyvoice3/config.py
+++ b/vllm_omni/transformers_utils/configs/cosyvoice3.py
@@ -1,5 +1,6 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 
 
@@ -131,3 +132,8 @@ def __init__(self, **kwargs):
                 "cond_channels": 512,
             },
         }
+
+
+AutoConfig.register("cosyvoice3", CosyVoice3Config)
+
+__all__ = ["CosyVoice3Config"]
diff --git a/vllm_omni/model_executor/models/omnivoice/config.py b/vllm_omni/transformers_utils/configs/omnivoice.py
similarity index 96%
rename from vllm_omni/model_executor/models/omnivoice/config.py
rename to vllm_omni/transformers_utils/configs/omnivoice.py
index a24176bcf25..d1bd3527fee 100644
--- a/vllm_omni/model_executor/models/omnivoice/config.py
+++ b/vllm_omni/transformers_utils/configs/omnivoice.py
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """OmniVoice configuration for vLLM-Omni two-stage pipeline."""
 
+from transformers import AutoConfig
 from transformers.configuration_utils import PretrainedConfig
 
 
@@ -79,3 +80,8 @@ def __init__(self, **kwargs):
 
         # Serving
         self.speculative_config = None
+
+
+AutoConfig.register("omnivoice", OmniVoiceConfig)
+
+__all__ = ["OmniVoiceConfig"]