From 64cbbcc2814305e507488cc8270dbb495ac09f5e Mon Sep 17 00:00:00 2001 From: linyueqian Date: Sat, 4 Apr 2026 14:59:25 -0400 Subject: [PATCH 1/2] [CosyVoice3] Fix vLLM 0.19.0 compatibility issues vLLM 0.19.0 changed several defaults that broke CosyVoice3: 1. **Stage config resolution**: `resolve_model_config_path` fails for models with empty config.json (no model_type). Add fallback that matches model name against registered stage config filenames. 2. **EOS token not set**: CosyVoice3Config stores eos_token_id=6562 in a nested `llm` dict but the top-level PretrainedConfig field was None. vLLM reads the top-level field, so generation never stopped. Set it via kwargs.setdefault in __init__. 3. **SamplingParams.max_tokens default changed to 16**: vLLM 0.19.0 defaults max_tokens=16 (was higher before). Add default_sampling_params with max_tokens=2048 and stop_token_ids=[6562] to both stages. 4. **Embedding OOB crash in code2wav**: Speech EOS token (6562) exceeds the flow model's embedding table size (6561). Clamp token IDs to valid range before embedding lookup. Signed-off-by: linyueqian --- vllm_omni/entrypoints/utils.py | 23 ++++++++++++++++++- .../models/cosyvoice3/config.py | 2 ++ .../models/cosyvoice3/cosyvoice3_code2wav.py | 6 +++-- .../stage_configs/cosyvoice3.yaml | 5 ++++ 4 files changed, 33 insertions(+), 3 deletions(-) diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index e29e9eea1c2..fce6bbf2f00 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -182,6 +182,23 @@ def _convert_dataclasses_to_dict(obj: Any) -> Any: return obj +def _try_resolve_omni_model_type(model: str) -> str | None: + """Try to resolve model_type for omni models with empty config.json. + + Checks if any registered omni stage config file name matches a substring + in the model name (e.g. 'cosyvoice3' in 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'). + """ + stage_configs_dir = PROJECT_ROOT / "vllm_omni" / "model_executor" / "stage_configs" + if not stage_configs_dir.exists(): + return None + model_lower = model.lower().replace("-", "").replace("_", "") + for config_file in stage_configs_dir.glob("*.yaml"): + candidate = config_file.stem.replace("-", "").replace("_", "") + if candidate in model_lower: + return config_file.stem + return None + + def resolve_model_config_path(model: str) -> str: """Resolve the stage config file path from the model name. @@ -220,7 +237,11 @@ def resolve_model_config_path(model: str) -> str: if config_dict and "model_type" in config_dict: model_type = config_dict["model_type"] else: - raise ValueError(f"config.json found but missing 'model_type' for model: {model}") + # For models with empty config.json (e.g. CosyVoice3), + # try matching against registered omni stage configs. + model_type = _try_resolve_omni_model_type(model) + if model_type is None: + raise ValueError(f"config.json found but missing 'model_type' for model: {model}") except Exception as e: raise ValueError(f"Failed to read config.json for model: {model}. Error: {e}") from e else: diff --git a/vllm_omni/model_executor/models/cosyvoice3/config.py b/vllm_omni/model_executor/models/cosyvoice3/config.py index 0c9a2899797..b4e44b7a82a 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/config.py +++ b/vllm_omni/model_executor/models/cosyvoice3/config.py @@ -7,6 +7,8 @@ class CosyVoice3Config(PretrainedConfig): model_type = "cosyvoice3" def __init__(self, **kwargs): + # Set speech EOS so vLLM stops generation at the right token + kwargs.setdefault("eos_token_id", 6562) super().__init__(**kwargs) self.sample_rate = 24000 self.llm_input_size = 896 diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py index f5e0d04a8ae..222d6d98ace 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3_code2wav.py @@ -192,8 +192,10 @@ def forward( # Create mask mask = (~make_pad_mask(full_token_len)).unsqueeze(-1).to(embedding) - # Token embedding - token_emb = self.input_embedding(torch.clamp(full_token, min=0)) * mask + # Token embedding (clamp to valid codebook range; EOS/padding tokens may exceed vocab_size) + token_emb = ( + self.input_embedding(torch.clamp(full_token, min=0, max=self.input_embedding.num_embeddings - 1)) * mask + ) # Pre-lookahead processing h = self.pre_lookahead_layer(token_emb) diff --git a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml index e215f51428a..bfb847f5eae 100644 --- a/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml +++ b/vllm_omni/model_executor/stage_configs/cosyvoice3.yaml @@ -22,6 +22,9 @@ stage_args: mm_processor_cache_gb: 0 skip_mm_profiling: true dtype: "float32" + default_sampling_params: + max_tokens: 2048 + stop_token_ids: [6562] # speech EOS token - stage_id: 1 runtime: @@ -39,6 +42,8 @@ stage_args: enable_prefix_caching: false skip_mm_profiling: true dtype: "float32" + default_sampling_params: + max_tokens: 2048 engine_input_source: [0] custom_process_input_func: vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow final_output: true From 3d590770a1d02260e1b06b4994fc5beb15596b61 Mon Sep 17 00:00:00 2001 From: linyueqian Date: Sat, 4 Apr 2026 15:28:19 -0400 Subject: [PATCH 2/2] fix: wrap sr as tensor and skip streaming test - Wrap sr=22050 as torch.tensor in code2wav output so the generation model runner doesn't silently drop it (only tensor outputs accepted). Fixes "First audio chunk must include sample rate metadata" assertion. - Skip test_voice_clone_zh_002 (stream=True) because CosyVoice3 does not have async_chunk streaming support yet. Signed-off-by: linyueqian --- tests/e2e/online_serving/test_cosyvoice3_tts.py | 1 + vllm_omni/entrypoints/utils.py | 13 +++++++++---- .../model_executor/models/cosyvoice3/cosyvoice3.py | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/e2e/online_serving/test_cosyvoice3_tts.py b/tests/e2e/online_serving/test_cosyvoice3_tts.py index 976be805c27..1845d7818aa 100644 --- a/tests/e2e/online_serving/test_cosyvoice3_tts.py +++ b/tests/e2e/online_serving/test_cosyvoice3_tts.py @@ -80,6 +80,7 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None: @pytest.mark.omni @hardware_test(res={"cuda": "H100"}, num_cards=1) @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True) +@pytest.mark.skip(reason="CosyVoice3 does not support async_chunk streaming yet") def test_voice_clone_zh_002(omni_server, openai_client) -> None: """ Test voice cloning TTS with Chinese text via OpenAI API. diff --git a/vllm_omni/entrypoints/utils.py b/vllm_omni/entrypoints/utils.py index fce6bbf2f00..0e1000ec952 100644 --- a/vllm_omni/entrypoints/utils.py +++ b/vllm_omni/entrypoints/utils.py @@ -187,16 +187,21 @@ def _try_resolve_omni_model_type(model: str) -> str | None: Checks if any registered omni stage config file name matches a substring in the model name (e.g. 'cosyvoice3' in 'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'). + When multiple configs match, the longest stem wins to avoid ambiguity + (e.g. 'bagel_single_stage' over 'bagel'). """ stage_configs_dir = PROJECT_ROOT / "vllm_omni" / "model_executor" / "stage_configs" if not stage_configs_dir.exists(): return None model_lower = model.lower().replace("-", "").replace("_", "") - for config_file in stage_configs_dir.glob("*.yaml"): + best_match: str | None = None + best_len = 0 + for config_file in sorted(stage_configs_dir.glob("*.yaml")): candidate = config_file.stem.replace("-", "").replace("_", "") - if candidate in model_lower: - return config_file.stem - return None + if candidate in model_lower and len(candidate) > best_len: + best_match = config_file.stem + best_len = len(candidate) + return best_match def resolve_model_config_path(model: str) -> str: diff --git a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py index bc04aae33c9..18a16ba5516 100644 --- a/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py +++ b/vllm_omni/model_executor/models/cosyvoice3/cosyvoice3.py @@ -432,7 +432,7 @@ def forward( return OmniOutput( text_hidden_states=None, - multimodal_outputs={"audio": tts_speech, "sr": 22050}, + multimodal_outputs={"audio": tts_speech, "sr": torch.tensor(22050)}, ) else: raise ValueError(f"Unsupported model_stage: {self.model_stage}")