vllm-project · linyueqian · Mar 21, 2026 · Mar 20, 2026 · Mar 20, 2026 · Mar 20, 2026
@@ -52,7 +52,7 @@ th {
 |`Flux2Pipeline` | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` |
 |`FishSpeechSlowARForConditionalGeneration` | Fish Speech S2 Pro | `fishaudio/s2-pro` |
 |`DreamIDOmniPipeline`| DreamID-Omni | `XuGuo699/DreamID-Omni` |
-|`VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/tts-model` |
+|`VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/Voxtral-4B-TTS-2603` |
 
 
 ## List of Supported Models for NPU

@@ -4,7 +4,7 @@ vLLM-Omni provides an OpenAI-compatible API for text-to-speech (TTS) generation.
 
 - **Qwen3-TTS** (`Qwen/Qwen3-TTS-12Hz-*`) -- Qwen3-based TTS with CustomVoice, VoiceDesign, and Base (voice cloning) task types. Output: 24 kHz.
 - **Fish Speech S2 Pro** (`fishaudio/s2-pro`) -- Dual-AR TTS with DAC codec. Supports text-to-speech and voice cloning via reference audio. Output: 44.1 kHz.
-- **Voxtral TTS** (`mistralai/tts-model`) -- AR + FlowMatching TTS with preset voices. Output: 24 kHz.
+- **Voxtral TTS** (`mistralai/Voxtral-4B-TTS-2603`) -- AR + FlowMatching TTS with preset voices. Output: 24 kHz.
 
 Each server instance runs a single model (specified at startup via `vllm serve <model> --omni`).
 
@@ -31,7 +31,7 @@ vllm-omni serve fishaudio/s2-pro \
     --gpu-memory-utilization 0.9
 
 # Voxtral TTS
-vllm serve mistralai/tts-model \
+vllm serve mistralai/Voxtral-4B-TTS-2603 \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --omni \
     --port 8091 \
@@ -360,7 +360,7 @@ Fish Speech uses `ref_audio` and `ref_text` for voice cloning (no `task_type` ne
 
 | Model | Description |
 |-------|-------------|
-| `mistralai/tts-model` | 3B AR + FlowMatching TTS. Supports text-to-speech with preset voices. |
+| `mistralai/Voxtral-4B-TTS-2603` | 3B AR + FlowMatching TTS. Supports text-to-speech with preset voices. |
 
 ## Error Responses
 

@@ -12,36 +12,36 @@ When `mistral_common` has `SpeechRequest` support, prompt token IDs are built vi
 python3 examples/offline_inference/voxtral_tts/end2end.py \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --write-audio --voice cheerful_female \
-    --model mistralai/tts-model \
+    --model mistralai/Voxtral-4B-TTS-2603 \
     --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?"
 
 # 32 replicate prompts with cheerful_female voice preset
 python3 examples/offline_inference/voxtral_tts/end2end.py \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --num-prompts 32 --write-audio --voice cheerful_female \
-    --model mistralai/tts-model \
+    --model mistralai/Voxtral-4B-TTS-2603 \
     --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?"
 
 # Streaming with neutral_female voice preset
 python3 examples/offline_inference/voxtral_tts/end2end.py \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --streaming --write-audio --voice neutral_female \
-    --model mistralai/tts-model \
+    --model mistralai/Voxtral-4B-TTS-2603 \
     --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?"
 
 # 32 prompts, 8 concurrent requests per wave, streaming with neutral_female voice
 python3 examples/offline_inference/voxtral_tts/end2end.py \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --num-prompts 32 --concurrency 8 --streaming --write-audio --voice neutral_female \
-    --model mistralai/tts-model \
+    --model mistralai/Voxtral-4B-TTS-2603 \
     --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?"
 
 # Short debug prompt with reference audio
 # Note: Reference audio capability is not yet released.
 python3 examples/offline_inference/voxtral_tts/end2end.py \
     --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \
     --write-audio \
-    --model mistralai/tts-model \
+    --model mistralai/Voxtral-4B-TTS-2603 \
     --text "This is a test message." \
     --audio-path path/to/reference_audio.wav
 ```
@@ -50,7 +50,7 @@ python3 examples/offline_inference/voxtral_tts/end2end.py \
 
 | Argument | Description |
 |---|---|
-| `--model PATH` | HuggingFace repo ID or local directory path (default: `mistralai/tts-model`) |
+| `--model PATH` | HuggingFace repo ID or local directory path (default: `mistralai/Voxtral-4B-TTS-2603`) |
 | `--text TEXT` | Text to synthesize (default: `"This is a test message."`) |
 | `--audio-path PATH` | Path to reference audio file for voice cloning |
 | `--output-dir DIR` | Directory to write output WAV files (default: `output_audio`) |

@@ -229,7 +229,7 @@ def parse_args() -> Namespace:
     parser.add_argument(
         "--model",
         type=str,
-        default="mistralai/tts-model",
+        default="mistralai/Voxtral-4B-TTS-2603",
         help="Model name or path.",
     )
     parser.add_argument(

@@ -1,3 +1,14 @@
+"""
+- Make sure to install the following for this example to function correctly:
+- `pip install -e .`
+- `pip install gradio==5.50 mistral_common==1.10.0`
+
+Example use case:
+
+python examples/online_serving/voxtral_tts/gradio_demo.py --host localhost --port 8091
+
+"""
+
 import argparse
 import io
 import json
@@ -125,11 +136,12 @@ def _load_from_share(
 
 def main(
     model: str,
+    host: str,
+    port: str,
     output_dir: str | None = None,
-    debug: bool = False,
 ) -> None:
-    base_url = "http://localhost:8091/v1"
-    print(f"Using speech API at: {base_url}/audio/speech")
+    base_url = f"http://{host}:{port}/v1"
+    logger.info(f"Using speech API at: {base_url}/audio/speech")
 
     outputs_dir: Path | None = None
     if output_dir is not None:
@@ -236,20 +248,22 @@ def _on_reset():
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Voxtral TTS Gradio Demo")
-    parser.add_argument("--model", type=str, required=True, help="Path to the model checkpoint")
+    parser.add_argument("--model", type=str, default="mistralai/Voxtral-4B-TTS-2603", help="Name of model repo on HF")
+    parser.add_argument("--host", type=str, default="localhost", help="Name of host")
+    parser.add_argument("--port", type=str, default="8091", help="port number")
     parser.add_argument(
         "--output-dir",
         type=str,
         default=None,
         help="Directory to save generated audio and share links. "
         "If not provided, save/share functionality is disabled.",
     )
-    parser.add_argument("--debug", action="store_true", default=False, help="Enable debug mode")
 
     args = parser.parse_args()
 
     main(
         model=args.model,
+        host=args.host,
+        port=args.port,
         output_dir=args.output_dir,
-        debug=args.debug,
     )
@@ -20,7 +20,7 @@
 from tests.conftest import OmniServerParams
 from tests.utils import hardware_test
 
-MODEL = "mistralai/tts-model"
+MODEL = "mistralai/Voxtral-4B-TTS-2603"
 
 STAGE_CONFIG = str(
     Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "voxtral_tts.yaml"