Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions tests/e2e/online_serving/test_cosyvoice3_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None:
@pytest.mark.omni
@hardware_test(res={"cuda": "H100"}, num_cards=1)
@pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
@pytest.mark.skip(reason="CosyVoice3 does not support async_chunk streaming yet")
def test_voice_clone_zh_002(omni_server, openai_client) -> None:
"""
Test voice cloning TTS with Chinese text via OpenAI API.
Expand Down
28 changes: 27 additions & 1 deletion vllm_omni/entrypoints/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,28 @@ def _convert_dataclasses_to_dict(obj: Any) -> Any:
return obj


def _try_resolve_omni_model_type(model: str) -> str | None:
"""Try to resolve model_type for omni models with empty config.json.

Checks if any registered omni stage config file name matches a substring
in the model name (e.g. 'cosyvoice3' in 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512').
When multiple configs match, the longest stem wins to avoid ambiguity
(e.g. 'bagel_single_stage' over 'bagel').
"""
stage_configs_dir = PROJECT_ROOT / "vllm_omni" / "model_executor" / "stage_configs"
if not stage_configs_dir.exists():
return None
model_lower = model.lower().replace("-", "").replace("_", "")
best_match: str | None = None
best_len = 0
for config_file in sorted(stage_configs_dir.glob("*.yaml")):
candidate = config_file.stem.replace("-", "").replace("_", "")
if candidate in model_lower and len(candidate) > best_len:
best_match = config_file.stem
best_len = len(candidate)
return best_match


def resolve_model_config_path(model: str) -> str:
"""Resolve the stage config file path from the model name.

Expand Down Expand Up @@ -220,7 +242,11 @@ def resolve_model_config_path(model: str) -> str:
if config_dict and "model_type" in config_dict:
model_type = config_dict["model_type"]
else:
raise ValueError(f"config.json found but missing 'model_type' for model: {model}")
# For models with empty config.json (e.g. CosyVoice3),
# try matching against registered omni stage configs.
model_type = _try_resolve_omni_model_type(model)
if model_type is None:
raise ValueError(f"config.json found but missing 'model_type' for model: {model}")
except Exception as e:
raise ValueError(f"Failed to read config.json for model: {model}. Error: {e}") from e
else:
Expand Down
2 changes: 2 additions & 0 deletions vllm_omni/model_executor/models/cosyvoice3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ class CosyVoice3Config(PretrainedConfig):
model_type = "cosyvoice3"

def __init__(self, **kwargs):
# Set speech EOS so vLLM stops generation at the right token
kwargs.setdefault("eos_token_id", 6562)
super().__init__(**kwargs)
self.sample_rate = 24000
self.llm_input_size = 896
Expand Down
2 changes: 1 addition & 1 deletion vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,7 @@ def forward(

return OmniOutput(
text_hidden_states=None,
multimodal_outputs={"audio": tts_speech, "sr": 22050},
multimodal_outputs={"audio": tts_speech, "sr": torch.tensor(22050)},
)
else:
raise ValueError(f"Unsupported model_stage: {self.model_stage}")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,10 @@ def forward(
# Create mask
mask = (~make_pad_mask(full_token_len)).unsqueeze(-1).to(embedding)

# Token embedding
token_emb = self.input_embedding(torch.clamp(full_token, min=0)) * mask
# Token embedding (clamp to valid codebook range; EOS/padding tokens may exceed vocab_size)
token_emb = (
self.input_embedding(torch.clamp(full_token, min=0, max=self.input_embedding.num_embeddings - 1)) * mask
)

# Pre-lookahead processing
h = self.pre_lookahead_layer(token_emb)
Expand Down
5 changes: 5 additions & 0 deletions vllm_omni/model_executor/stage_configs/cosyvoice3.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ stage_args:
mm_processor_cache_gb: 0
skip_mm_profiling: true
dtype: "float32"
default_sampling_params:
max_tokens: 2048
stop_token_ids: [6562] # speech EOS token

- stage_id: 1
runtime:
Expand All @@ -39,6 +42,8 @@ stage_args:
enable_prefix_caching: false
skip_mm_profiling: true
dtype: "float32"
default_sampling_params:
max_tokens: 2048
engine_input_source: [0]
custom_process_input_func: vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow
final_output: true
Expand Down
Loading