vllm-project · nemoramo · Mar 11, 2026 · Mar 12, 2026 · Mar 12, 2026 · Mar 12, 2026
@@ -223,6 +223,7 @@ datasets/
 *.csv
 *.json
 !apps/ComfyUI-vLLM-Omni/example_workflows/*.json
+!vllm_omni/model_executor/models/cosyvoice3/hf_config/config.json
 *.jsonl
 *.parquet
 

@@ -38,6 +38,7 @@ th {
 |`LongCatImageEditPipeline` | LongCat-Image-Edit | `meituan-longcat/LongCat-Image-Edit` |
 |`StableDiffusion3Pipeline` | Stable-Diffusion-3 | `stabilityai/stable-diffusion-3.5-medium` |
 |`CosyVoice3Model` | CosyVoice3 | `FunAudioLLM/Fun-CosyVoice3-0.5B-2512` |
+|`FunAudioChatForConditionalGeneration` | Fun-Audio-Chat-8B | `FunAudioLLM/Fun-Audio-Chat-8B` |
 |`MammothModa2ForConditionalGeneration` | MammothModa2-Preview | `bytedance-research/MammothModa2-Preview` |
 |`Flux2KleinPipeline` | FLUX.2-klein | `black-forest-labs/FLUX.2-klein-4B`, `black-forest-labs/FLUX.2-klein-9B` |
 |`FluxPipeline` | FLUX.1-dev | `black-forest-labs/FLUX.1-dev` |
@@ -54,7 +55,6 @@ th {
 |`DreamIDOmniPipeline`| DreamID-Omni | `XuGuo699/DreamID-Omni` |
 |`VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/tts-model` |
 
-
 ## List of Supported Models for NPU
 
 <style>

@@ -87,6 +87,7 @@ include = ["vllm_omni*"]
 
 [tool.setuptools.package-data]
 "vllm_omni" = ["_version.py", "py.typed"]
+"vllm_omni.model_executor.models.cosyvoice3" = ["hf_config/*.json"]
 "vllm_omni.model_executor.stage_configs" = ["*.yaml"]
 
 [tool.setuptools_scm]

@@ -0,0 +1,29 @@
+from types import SimpleNamespace
+
+from vllm_omni.engine.stage_init_utils import build_engine_args_dict
+
+
+def test_build_engine_args_dict_preserves_stage_model_override():  # a model set on the stage must win over the CLI model
+    stage_cfg = SimpleNamespace(  # lightweight stand-in for a stage config object
+        stage_id=1,
+        stage_type="llm",
+        engine_args=SimpleNamespace(model="stage-model", worker_type="ar"),  # stage declares its own model
+    )
+
+    engine_args = build_engine_args_dict(stage_cfg, model="cli-model")  # CLI model deliberately differs from the stage model
+
+    assert engine_args["model"] == "stage-model"  # the stage-level value is kept
+    assert engine_args["stage_id"] == 1  # stage_id is carried into the resulting dict
+
+
+def test_build_engine_args_dict_falls_back_to_cli_model():  # without a stage-level model the CLI model is used
+    stage_cfg = SimpleNamespace(  # lightweight stand-in for a stage config object
+        stage_id=0,
+        stage_type="llm",
+        engine_args=SimpleNamespace(worker_type="ar"),  # note: no `model` attribute on the stage engine args
+    )
+
+    engine_args = build_engine_args_dict(stage_cfg, model="cli-model")  # only the CLI supplies a model here
+
+    assert engine_args["model"] == "cli-model"  # falls back to the CLI value
+    assert engine_args["stage_id"] == 0  # stage_id is carried into the resulting dict
@@ -0,0 +1,52 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+from pathlib import Path
+from types import SimpleNamespace
+
+import pytest
+import yaml
+
+from vllm_omni.engine.arg_utils import _resolve_bundled_hf_config_path
+from vllm_omni.entrypoints import utils as entrypoint_utils
+
+pytestmark = [pytest.mark.core_model, pytest.mark.cpu]
+
+
+def test_resolve_bundled_hf_config_path_uses_cosyvoice3_bundle_by_default():  # no explicit path -> bundled cosyvoice3 hf_config dir
+    resolved = _resolve_bundled_hf_config_path("FunAudioChatCosyVoice3Code2Wav", None)  # second argument None: no explicit path given
+
+    assert resolved is not None
+    assert resolved.endswith("vllm_omni/model_executor/models/cosyvoice3/hf_config")  # suffix check assumes "/" path separators
+    assert (Path(resolved) / "config.json").is_file()  # the bundled config.json must exist on disk
+
+
+def test_resolve_bundled_hf_config_path_preserves_explicit_override():  # an explicitly passed path must be returned unchanged
+    resolved = _resolve_bundled_hf_config_path("FunAudioChatCosyVoice3Code2Wav", "/tmp/custom-hf-config")  # same model name, explicit path
+
+    assert resolved == "/tmp/custom-hf-config"  # exact string equality: no rewriting of the given path
+
+
+def test_resolve_model_config_path_detects_funaudiochat_default_yaml(monkeypatch: pytest.MonkeyPatch):  # model_type "funaudiochat" -> funaudiochat.yaml
+    monkeypatch.setattr(  # swap get_config on the entrypoint utils module for the duration of this test
+        entrypoint_utils,
+        "get_config",
+        lambda model, trust_remote_code=True: SimpleNamespace(model_type="funaudiochat"),  # stub exposing only model_type
+    )
+
+    resolved = entrypoint_utils.resolve_model_config_path("dummy-funaudiochat-model")  # model name is arbitrary; the stub ignores it
+
+    assert resolved is not None
+    assert resolved.endswith("vllm_omni/model_executor/stage_configs/funaudiochat.yaml")  # suffix check assumes "/" path separators
+
+
+def test_funaudiochat_default_stage_config_limits_audio_profile_and_keeps_audio_towers():  # checks the shipped funaudiochat.yaml stage-0 engine args
+    config_path = (  # NOTE(review): parents[2] assumes this test file sits two directory levels below the repo root - confirm
+        Path(__file__).resolve().parents[2] / "vllm_omni" / "model_executor" / "stage_configs" / "funaudiochat.yaml"
+    )
+    config = yaml.safe_load(config_path.read_text())  # read_text() without an encoding uses the platform default
+    stage0_engine_args = config["stage_args"][0]["engine_args"]  # engine args of the first entry under stage_args
+
+    assert "language_model_only" not in stage0_engine_args  # the key must be absent, not merely falsy
+    assert stage0_engine_args["hf_overrides"]["audio_config"]["max_source_positions"] == 100  # pinned override value
+    assert stage0_engine_args["limit_mm_per_prompt"]["audio"] == 1  # audio limit per prompt is pinned to 1
@@ -0,0 +1,96 @@
+from __future__ import annotations
+
+from types import SimpleNamespace
+
+import torch
+
+from vllm_omni.model_executor.models.funaudiochat.funaudiochat_code2wav import (
+    FunAudioChatCosyVoice3Code2Wav,
+)
+
+
+def test_split_tokens_like_official_keeps_short_inputs_as_single_segment():  # a 100-token input must not be split
+    token = torch.arange(100, dtype=torch.long)  # 1-D tensor holding 0..99
+
+    segments = FunAudioChatCosyVoice3Code2Wav._split_tokens_like_official(token)  # called on the class, no instance needed
+
+    assert len(segments) == 1  # exactly one segment comes back
+    assert torch.equal(segments[0], token)  # and it equals the input element-for-element
+
+
+def test_split_tokens_like_official_rebalances_tiny_tail_segment():  # 760 tokens are expected to split into two equal halves
+    token = torch.arange(760, dtype=torch.long)  # 1-D tensor holding 0..759
+
+    segments = FunAudioChatCosyVoice3Code2Wav._split_tokens_like_official(token)  # called on the class, no instance needed
+
+    assert [segment.numel() for segment in segments] == [380, 380]  # even split; per the test name, avoids a tiny tail segment
+    assert torch.equal(torch.cat(segments, dim=0), token)  # concatenating the segments reproduces the input in order
+
+
+def _build_code2wav_stub() -> FunAudioChatCosyVoice3Code2Wav:  # bare instance carrying only the attributes set below
+    model = object.__new__(FunAudioChatCosyVoice3Code2Wav)  # allocate without running __init__
+    model.vllm_config = SimpleNamespace(device_config=SimpleNamespace(device=torch.device("cpu")))  # fake config exposing a CPU device
+    model._max_codec_token_id = 6560  # NOTE(review): presumably the largest valid codec token id - confirm against the model
+    model._dummy_profile_token_len = 32  # the dummy-profile test in this file expects batches of shape (1, 32)
+    model._logged_dummy_profile_cap = False  # NOTE(review): looks like a log-once flag - confirm against the model
+    return model
+
+
+def test_build_decode_tokens_keeps_real_input_ids_without_sampling_metadata():  # non-zero input_ids are used as-is when metadata is None
+    model = _build_code2wav_stub()
+    input_ids = torch.tensor([12, 34, 56], dtype=torch.long)  # three non-zero token ids
+
+    token_batches, is_dummy_profile = model._build_decode_tokens(input_ids, sampling_metadata=None)  # returns (batches, dummy flag)
+
+    assert len(token_batches) == 1  # a single batch
+    assert token_batches[0].tolist() == [[12, 34, 56]]  # same ids, with a leading dimension of size 1
+    assert is_dummy_profile is False  # not treated as a dummy profiling run
+
+
+def test_build_decode_tokens_uses_prompt_token_ids_when_input_ids_are_empty():  # empty input_ids -> tokens come from sampling metadata
+    model = _build_code2wav_stub()
+    sampling_metadata = SimpleNamespace(prompt_token_ids=[1, 2, 3, 4])  # flat list of ids for a single request
+
+    token_batches, is_dummy_profile = model._build_decode_tokens(  # returns (batches, dummy flag)
+        torch.empty((0,), dtype=torch.long),  # zero-length input_ids
+        sampling_metadata,
+    )
+
+    assert len(token_batches) == 1  # a single batch
+    assert token_batches[0].tolist() == [[1, 2, 3, 4]]  # the prompt ids, with a leading dimension of size 1
+    assert is_dummy_profile is False  # not treated as a dummy profiling run
+
+
+def test_build_decode_tokens_treats_all_zero_missing_metadata_as_dummy_profile():  # all-zero ids with no metadata count as a dummy profile run
+    model = _build_code2wav_stub()
+    input_ids = torch.zeros((64,), dtype=torch.long)  # 64 zeros, longer than the stub's _dummy_profile_token_len of 32
+
+    token_batches, is_dummy_profile = model._build_decode_tokens(input_ids, sampling_metadata=None)  # returns (batches, dummy flag)
+
+    assert len(token_batches) == 1  # a single batch
+    assert token_batches[0].shape == (1, 32)  # length is 32 rather than the 64 ids passed in
+    assert is_dummy_profile is True  # flagged as a dummy profiling run
+
+
+def test_build_decode_tokens_no_longer_rejects_long_sequences_before_segmentation():  # long inputs must pass through whole
+    model = _build_code2wav_stub()
+    input_ids = torch.arange(10235, dtype=torch.long) % 6000  # 10235 ids, each in [0, 6000)
+
+    token_batches, is_dummy_profile = model._build_decode_tokens(input_ids, sampling_metadata=None)  # returns (batches, dummy flag)
+
+    assert len(token_batches) == 1  # a single batch
+    assert token_batches[0].shape == (1, 10235)  # full length kept, nothing dropped or capped
+    assert is_dummy_profile is False  # not treated as a dummy profiling run
+
+
+def test_build_decode_tokens_preserves_batched_prompt_token_ids_per_request():  # nested prompt ids yield one batch per request
+    model = _build_code2wav_stub()
+    sampling_metadata = SimpleNamespace(prompt_token_ids=[[1, 2, 3], [4, 5]])  # two requests of different lengths
+
+    token_batches, is_dummy_profile = model._build_decode_tokens(  # returns (batches, dummy flag)
+        torch.empty((0,), dtype=torch.long),  # zero-length input_ids
+        sampling_metadata,
+    )
+
+    assert [token.tolist() for token in token_batches] == [[[1, 2, 3]], [[4, 5]]]  # per-request batches, order and lengths kept
+    assert is_dummy_profile is False  # not treated as a dummy profiling run
-Original file line number
+Diff line change
@@ Expand Up / @@ -223,6 +223,7 @@ datasets/ @@
     *.csv
     *.json
     !apps/ComfyUI-vLLM-Omni/example_workflows/*.json
+    !vllm_omni/model_executor/models/cosyvoice3/hf_config/config.json
     *.jsonl
     *.parquet
@@ Expand Down @@