vllm-project · hsliuustc0106 · Apr 11, 2026 · Apr 9, 2026 · Apr 10, 2026 · Apr 10, 2026
@@ -317,6 +317,31 @@ steps:
           volumes:
             - "/fsx/hf_cache:/fsx/hf_cache"
 
+  # Runs the VoxCPM2 offline-inference E2E tests selected by the "core_model"
+  # marker, inside the CI image tagged with the current commit.
+  - label: "VoxCPM2 Native AR E2E Test"
+    timeout_in_minutes: 20
+    depends_on: upload-ready-pipeline
+    # The shell-level `timeout 20m` below matches timeout_in_minutes above.
+    commands:
+      - |
+        timeout 20m bash -c '
+          pip install voxcpm
+          export VLLM_LOGGING_LEVEL=DEBUG
+          export VLLM_WORKER_MULTIPROC_METHOD=spawn
+          pytest -s -v tests/e2e/offline_inference/test_voxcpm2.py -m "core_model" --run-level "core_model"
+        '
+    agents:
+      queue: "gpu_1_queue"
+    plugins:
+      - docker#v5.2.0:
+          image: public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT
+          always-pull: true
+          propagate-environment: true
+          shm-size: "8gb"
+          # HF_HOME points at the cache directory mounted under `volumes`.
+          # HF_TOKEN is listed without a value (presumably forwarded from the
+          # agent environment; confirm against the docker plugin docs).
+          environment:
+            - "HF_HOME=/fsx/hf_cache"
+            - "HF_TOKEN"
+          volumes:
+            - "/fsx/hf_cache:/fsx/hf_cache"
+
   - label: "OmniVoice E2E Test"
     timeout_in_minutes: 20
     depends_on: upload-ready-pipeline

@@ -0,0 +1,83 @@
+# VoxCPM2 Offline Inference (Native AR)
+
+VoxCPM2 is a 2B-parameter, tokenizer-free diffusion autoregressive (AR) text-to-speech (TTS) model. It produces 48 kHz audio and supports 30+ languages through a single-stage native AR pipeline backed by MiniCPM4.
+
+## Prerequisites
+
+Install the `voxcpm` package, or set the `VLLM_OMNI_VOXCPM_CODE_PATH` environment variable to point at a VoxCPM source checkout:
+
+```bash
+# Option A: install package
+pip install voxcpm
+
+# Option B: use source checkout
+export VLLM_OMNI_VOXCPM_CODE_PATH=/path/to/voxcpm
+```
+
+## Quick Start
+
+Zero-shot synthesis:
+
+```bash
+python examples/offline_inference/voxcpm2/end2end.py \
+    --model openbmb/VoxCPM2 \
+    --text "Hello, this is a VoxCPM2 demo." \
+    --output-dir output_audio
+```
+
+Voice cloning with a reference audio:
+
+```bash
+python examples/offline_inference/voxcpm2/end2end.py \
+    --text "Hello, this is a voice clone demo." \
+    --reference-audio /path/to/reference.wav \
+    --output-dir output_clone
+```
+
+Prompt continuation (matched audio + text prefix):
+
+```bash
+python examples/offline_inference/voxcpm2/end2end.py \
+    --text "Continuation target sentence." \
+    --prompt-audio /path/to/prompt.wav \
+    --prompt-text "Transcript of the prompt audio." \
+    --output-dir output_cont
+```
+
+The script accepts the following arguments:
+
+| Argument | Default | Description |
+|---|---|---|
+| `--model` | `openbmb/VoxCPM2` | HuggingFace repo ID or local path |
+| `--text` | (example sentence) | Text to synthesize |
+| `--output-dir` | `output_audio` | Directory for output WAV files |
+| `--stage-configs-path` | `voxcpm2.yaml` | Stage config YAML path |
+| `--reference-audio` | `None` | Reference audio for voice cloning (isolated) |
+| `--prompt-audio` | `None` | Prompt audio for continuation mode |
+| `--prompt-text` | `None` | Transcript matching `--prompt-audio` |
+
+## Performance
+
+Measured on a single H20 GPU (80 GB), voxcpm 0.0.0, PyTorch 2.10.0+cu128:
+
+| Input length | RTF | Sample rate |
+|---|---|---|
+| Short (~6 words) | ~0.81 | 48 kHz |
+| Long (~50 words) | ~0.72 | 48 kHz |
+
+RTF (real-time factor) is the inference time divided by the duration of the generated audio; RTF < 1.0 means faster than real time.
+
+## Architecture
+
+VoxCPM2 uses a single-stage native AR pipeline:
+
+```
+feat_encoder
+└─► MiniCPM4 (base LM)
+     └─► FSQ (finite scalar quantization)
+          └─► residual_lm (residual AR)
+               └─► LocDiT (local diffusion transformer)
+                    └─► AudioVAE → 48 kHz waveform
+```
+
+All stages are fused into one vLLM-native execution graph via `voxcpm2.yaml`, eliminating inter-stage coordination overhead and enabling true end-to-end batching.
@@ -0,0 +1,145 @@
+"""Offline VoxCPM2 inference example (native AR pipeline).
+
+Uses the single-stage native AR config (voxcpm2.yaml).
+Requires the `voxcpm` package or VLLM_OMNI_VOXCPM_CODE_PATH env var.
+"""
+
+from __future__ import annotations
+
+import os
+import time
+from pathlib import Path
+
+import soundfile as sf
+import torch
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+from vllm_omni import Omni
+
+REPO_ROOT = Path(__file__).resolve().parents[3]
+DEFAULT_STAGE_CONFIGS_PATH = str(REPO_ROOT / "vllm_omni" / "model_executor" / "stage_configs" / "voxcpm2.yaml")
+SAMPLE_RATE = 48_000
+
+
+def parse_args():
+    parser = FlexibleArgumentParser(description="Offline VoxCPM2 native AR inference")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="openbmb/VoxCPM2",
+        help="VoxCPM2 model path or HuggingFace repo ID.",
+    )
+    parser.add_argument(
+        "--text",
+        type=str,
+        default="This is a VoxCPM2 native AR synthesis example running on vLLM Omni.",
+        help="Text to synthesize.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default="output_audio",
+        help="Directory for output WAV files.",
+    )
+    parser.add_argument(
+        "--stage-configs-path",
+        type=str,
+        default=DEFAULT_STAGE_CONFIGS_PATH,
+        help="Path to the stage config YAML file.",
+    )
+    parser.add_argument(
+        "--reference-audio",
+        type=str,
+        default=None,
+        help="Path to reference audio for voice cloning (isolated ref mode).",
+    )
+    parser.add_argument(
+        "--prompt-audio",
+        type=str,
+        default=None,
+        help="Path to prompt audio for continuation mode (requires --prompt-text).",
+    )
+    parser.add_argument(
+        "--prompt-text",
+        type=str,
+        default=None,
+        help="Text matching --prompt-audio for continuation mode.",
+    )
+    return parser.parse_args()
+
+
+def extract_audio(multimodal_output: dict) -> torch.Tensor:
+    """Extract the final complete audio tensor from multimodal output.
+
+    The output processor accumulates per-step full audio under ``audio``
+    as a list. The last element is the complete waveform.
+    """
+    audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs")
+    if audio is None:
+        raise ValueError(f"No audio key in multimodal_output: {list(multimodal_output.keys())}")
+
+    if isinstance(audio, list):
+        # Take the last valid tensor (most complete audio)
+        valid = [torch.as_tensor(a).float().cpu().reshape(-1) for a in audio if a is not None]
+        if not valid:
+            raise ValueError("Audio list is empty or all elements are None.")
+        return valid[-1]
+
+    return torch.as_tensor(audio).float().cpu().reshape(-1)
+
+
+def main():
+    args = parse_args()
+
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    engine = Omni(
+        model=args.model,
+        stage_configs_path=args.stage_configs_path,
+    )
+
+    additional: dict = {}
+    if args.reference_audio:
+        additional["reference_audio"] = args.reference_audio
+    if args.prompt_audio and args.prompt_text:
+        additional["prompt_audio"] = args.prompt_audio
+        additional["prompt_text"] = args.prompt_text
+
+    prompt: dict = {"prompt": args.text}
+    if additional:
+        prompt["additional_information"] = additional
+
+    print(f"Model       : {args.model}")
+    print(f"Text        : {args.text}")
+    if args.reference_audio:
+        print(f"Ref audio   : {args.reference_audio}")
+    if args.prompt_audio:
+        print(f"Prompt audio: {args.prompt_audio}")
+        print(f"Prompt text : {args.prompt_text}")
+    print(f"Output dir  : {output_dir}")
+
+    t_start = time.perf_counter()
+    outputs = engine.generate([prompt])
+    elapsed = time.perf_counter() - t_start
+
+    # outputs[0].outputs[0].multimodal_output["audio"] is a list of tensors
+    request_output = outputs[0]
+    mm = request_output.outputs[0].multimodal_output
+    audio = extract_audio(mm)
+
+    duration = audio.numel() / SAMPLE_RATE
+    rtf = elapsed / duration if duration > 0 else float("inf")
+
+    output_path = output_dir / "output.wav"
+    sf.write(str(output_path), audio.numpy(), SAMPLE_RATE, format="WAV")
+
+    print(f"Saved       : {output_path}")
+    print(f"Duration    : {duration:.2f}s")
+    print(f"Inference   : {elapsed:.2f}s")
+    print(f"RTF         : {rtf:.3f}")
+
+
+if __name__ == "__main__":
+    os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+    main()
@@ -0,0 +1,101 @@
+"""E2E test for VoxCPM2 native AR offline inference."""
+
+import os
+
+import pytest
+import torch
+
+from tests.utils import hardware_test
+
+VOXCPM2_MODEL = "openbmb/VoxCPM2"
+STAGE_CONFIG = os.path.join(
+    os.path.dirname(__file__),
+    "..",
+    "..",
+    "..",
+    "vllm_omni",
+    "model_executor",
+    "stage_configs",
+    "voxcpm2.yaml",
+)
+SAMPLE_RATE = 48000
+
+
+@pytest.fixture(scope="module")
+def voxcpm2_engine():
+    """Create VoxCPM2 engine for testing."""
+    from vllm_omni import Omni
+
+    engine = Omni(model=VOXCPM2_MODEL, stage_configs_path=STAGE_CONFIG)
+    yield engine
+
+
+def _extract_audio(multimodal_output: dict) -> torch.Tensor:
+    """Extract the final complete audio tensor from multimodal output."""
+    assert isinstance(multimodal_output, dict), f"Expected dict, got {type(multimodal_output)}"
+
+    # Output processor accumulates per-step full audio under "audio".
+    audio = multimodal_output.get("audio") or multimodal_output.get("model_outputs")
+    assert audio is not None, f"No audio key, got {list(multimodal_output.keys())}"
+
+    if isinstance(audio, list):
+        valid = [x for x in audio if isinstance(x, torch.Tensor) and x.numel() > 100]
+        assert valid, "No valid audio tensors in output list"
+        audio = valid[-1]
+
+    assert isinstance(audio, torch.Tensor), f"Expected Tensor, got {type(audio)}"
+    return audio
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_voxcpm2_zero_shot_001(voxcpm2_engine):
+    """Test zero-shot TTS produces valid audio output."""
+    outputs = voxcpm2_engine.generate([{"prompt": "Hello, this is a test."}])
+    assert len(outputs) == 1
+
+    audio = _extract_audio(outputs[0].outputs[0].multimodal_output)
+    duration_s = audio.shape[0] / SAMPLE_RATE
+    assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s"
+
+
+@pytest.mark.core_model
+@pytest.mark.omni
+@hardware_test(res={"cuda": "L4"}, num_cards=1)
+def test_voxcpm2_voice_clone_002(voxcpm2_engine):
+    """Test voice cloning with a reference audio file.
+
+    Uses the example ``reference_speaker.wav`` bundled with the voxcpm
+    package. Skipped if the file is not present.
+    """
+    # Try to locate a reference wav from the voxcpm package / env override
+    candidates = []
+    env_path = os.environ.get("VLLM_OMNI_VOXCPM_CODE_PATH")
+    if env_path:
+        candidates.append(os.path.join(env_path, "..", "examples", "reference_speaker.wav"))
+    try:
+        import voxcpm  # noqa: F401 (only used to locate path)
+
+        vox_dir = os.path.dirname(os.path.dirname(os.path.abspath(voxcpm.__file__)))
+        candidates.append(os.path.join(vox_dir, "examples", "reference_speaker.wav"))
+    except ImportError:
+        pass
+
+    ref_path = next((p for p in candidates if p and os.path.exists(p)), None)
+    if ref_path is None:
+        pytest.skip("No reference audio available for voice clone test")
+
+    outputs = voxcpm2_engine.generate(
+        [
+            {
+                "prompt": "Hello, this is a voice clone demo.",
+                "additional_information": {"reference_audio": ref_path},
+            }
+        ]
+    )
+    assert len(outputs) == 1
+
+    audio = _extract_audio(outputs[0].outputs[0].multimodal_output)
+    duration_s = audio.shape[0] / SAMPLE_RATE
+    assert 0.5 < duration_s < 30.0, f"Audio duration out of range: {duration_s:.2f}s"
@@ -20,6 +20,7 @@
 _ARCH_TO_MODEL_TYPE: dict[str, str] = {
     "CosyVoice3Model": "cosyvoice3",
     "OmniVoiceModel": "omnivoice",
+    # "voxcpm2" is also the model type under which VoxCPM2Config is registered.
+    "VoxCPM2TalkerForConditionalGeneration": "voxcpm2",
 }
 
 # Maps model architecture names to tokenizer subfolder paths within HF repos.
@@ -40,6 +41,7 @@ def _register_omni_hf_configs() -> None:
         from vllm_omni.model_executor.models.voxtral_tts.configuration_voxtral_tts import (
             VoxtralTTSConfig,
         )
+        from vllm_omni.transformers_utils.configs.voxcpm2 import VoxCPM2Config
     except Exception as exc:  # pragma: no cover - best-effort optional registration
         logger.warning("Skipping omni HF config registration due to import error: %s", exc)
         return
@@ -57,6 +59,7 @@ def _register_omni_hf_configs() -> None:
         ("cosyvoice3", CosyVoice3Config),
         ("omnivoice", OmniVoiceConfig),
         ("voxtral_tts", VoxtralTTSConfig),
+        ("voxcpm2", VoxCPM2Config),
     ]:
         try:
             AutoConfig.register(model_type, config_cls)