vllm-project · Gaohan123 · Apr 5, 2026 · Apr 4, 2026 · Apr 4, 2026
@@ -80,6 +80,7 @@ def test_voice_clone_zh_001(omni_server, openai_client) -> None:
 @pytest.mark.omni
 @hardware_test(res={"cuda": "H100"}, num_cards=1)
 @pytest.mark.parametrize("omni_server", tts_server_params, indirect=True)
+@pytest.mark.skip(reason="CosyVoice3 does not support async_chunk streaming yet")
 def test_voice_clone_zh_002(omni_server, openai_client) -> None:
     """
     Test voice cloning TTS with Chinese text via OpenAI API.

@@ -182,6 +182,28 @@ def _convert_dataclasses_to_dict(obj: Any) -> Any:
     return obj
 
 
+def _try_resolve_omni_model_type(model: str) -> str | None:
+    """Infer an omni model_type from the model name when config.json lacks one.
+
+    Stage config file stems are matched as substrings of the model name,
+    ignoring case, '-' and '_' (e.g. 'cosyvoice3' in
+    'FunAudioLLM/Fun-CosyVoice3-0.5B-2512'). The longest matching stem wins
+    (e.g. 'bagel_single_stage' over 'bagel'); returns None if none match.
+    """
+    stage_configs_dir = PROJECT_ROOT / "vllm_omni" / "model_executor" / "stage_configs"
+    if not stage_configs_dir.exists():
+        return None
+    model_lower = model.lower().replace("-", "").replace("_", "")
+    best_match: str | None = None
+    best_len = 0
+    for config_file in sorted(stage_configs_dir.glob("*.yaml")):
+        # Normalize the stem like the model name (case included) so both sides compare equal.
+        candidate = config_file.stem.lower().replace("-", "").replace("_", "")
+        if candidate in model_lower and len(candidate) > best_len:
+            best_match, best_len = config_file.stem, len(candidate)
+    return best_match
+
+
 def resolve_model_config_path(model: str) -> str:
     """Resolve the stage config file path from the model name.
 
@@ -220,7 +242,11 @@ def resolve_model_config_path(model: str) -> str:
                 if config_dict and "model_type" in config_dict:
                     model_type = config_dict["model_type"]
                 else:
-                    raise ValueError(f"config.json found but missing 'model_type' for model: {model}")
+                    # For models with empty config.json (e.g. CosyVoice3),
+                    # try matching against registered omni stage configs.
+                    model_type = _try_resolve_omni_model_type(model)
+                    if model_type is None:
+                        raise ValueError(f"config.json found but missing 'model_type' for model: {model}")
             except Exception as e:
                 raise ValueError(f"Failed to read config.json for model: {model}. Error: {e}") from e
         else:

@@ -7,6 +7,8 @@ class CosyVoice3Config(PretrainedConfig):
     model_type = "cosyvoice3"
 
     def __init__(self, **kwargs):
+        # Set speech EOS so vLLM stops generation at the right token
+        kwargs.setdefault("eos_token_id", 6562)
         super().__init__(**kwargs)
         self.sample_rate = 24000
         self.llm_input_size = 896

@@ -432,7 +432,7 @@ def forward(
 
             return OmniOutput(
                 text_hidden_states=None,
-                multimodal_outputs={"audio": tts_speech, "sr": 22050},
+                multimodal_outputs={"audio": tts_speech, "sr": torch.tensor(22050)},
             )
         else:
             raise ValueError(f"Unsupported model_stage: {self.model_stage}")

@@ -192,8 +192,10 @@ def forward(
         # Create mask
         mask = (~make_pad_mask(full_token_len)).unsqueeze(-1).to(embedding)
 
-        # Token embedding
-        token_emb = self.input_embedding(torch.clamp(full_token, min=0)) * mask
+        # Token embedding (clamp to valid codebook range; EOS/padding tokens may exceed vocab_size)
+        token_emb = (
+            self.input_embedding(torch.clamp(full_token, min=0, max=self.input_embedding.num_embeddings - 1)) * mask
+        )
 
         # Pre-lookahead processing
         h = self.pre_lookahead_layer(token_emb)

@@ -22,6 +22,9 @@ stage_args:
       mm_processor_cache_gb: 0
       skip_mm_profiling: true
       dtype: "float32"
+    default_sampling_params:
+      max_tokens: 2048
+      stop_token_ids: [6562]  # speech EOS token
 
   - stage_id: 1
     runtime:
@@ -39,6 +42,8 @@ stage_args:
       enable_prefix_caching: false
       skip_mm_profiling: true
       dtype: "float32"
+    default_sampling_params:
+      max_tokens: 2048
     engine_input_source: [0]
     custom_process_input_func: vllm_omni.model_executor.stage_input_processors.cosyvoice3.text2flow
     final_output: true