diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index ece7dffff80..a3b1dbe76da 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -52,7 +52,7 @@ th { |`Flux2Pipeline` | FLUX.2-dev | `black-forest-labs/FLUX.2-dev` | |`FishSpeechSlowARForConditionalGeneration` | Fish Speech S2 Pro | `fishaudio/s2-pro` | |`DreamIDOmniPipeline`| DreamID-Omni | `XuGuo699/DreamID-Omni` | -|`VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/tts-model` | +|`VoxtralTTSForConditionalGeneration` | Voxtral TTS | `mistralai/Voxtral-4B-TTS-2603` | ## List of Supported Models for NPU diff --git a/docs/serving/speech_api.md b/docs/serving/speech_api.md index 17787e682d0..7969ca4410f 100644 --- a/docs/serving/speech_api.md +++ b/docs/serving/speech_api.md @@ -4,7 +4,7 @@ vLLM-Omni provides an OpenAI-compatible API for text-to-speech (TTS) generation. - **Qwen3-TTS** (`Qwen/Qwen3-TTS-12Hz-*`) -- Qwen3-based TTS with CustomVoice, VoiceDesign, and Base (voice cloning) task types. Output: 24 kHz. - **Fish Speech S2 Pro** (`fishaudio/s2-pro`) -- Dual-AR TTS with DAC codec. Supports text-to-speech and voice cloning via reference audio. Output: 44.1 kHz. -- **Voxtral TTS** (`mistralai/tts-model`) -- AR + FlowMatching TTS with preset voices. Output: 24 kHz. +- **Voxtral TTS** (`mistralai/Voxtral-4B-TTS-2603`) -- AR + FlowMatching TTS with preset voices. Output: 24 kHz. Each server instance runs a single model (specified at startup via `vllm serve --omni`). @@ -31,7 +31,7 @@ vllm-omni serve fishaudio/s2-pro \ --gpu-memory-utilization 0.9 # Voxtral TTS -vllm serve mistralai/tts-model \ +vllm serve mistralai/Voxtral-4B-TTS-2603 \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --omni \ --port 8091 \ @@ -360,7 +360,7 @@ Fish Speech uses `ref_audio` and `ref_text` for voice cloning (no `task_type` ne | Model | Description | |-------|-------------| -| `mistralai/tts-model` | 3B AR + FlowMatching TTS. 
Supports text-to-speech with preset voices. | +| `mistralai/Voxtral-4B-TTS-2603` | 3B AR + FlowMatching TTS. Supports text-to-speech with preset voices. | ## Error Responses diff --git a/examples/offline_inference/voxtral_tts/README.md b/examples/offline_inference/voxtral_tts/README.md index f552fd165ee..d754b0b38ee 100644 --- a/examples/offline_inference/voxtral_tts/README.md +++ b/examples/offline_inference/voxtral_tts/README.md @@ -12,28 +12,28 @@ When `mistral_common` has `SpeechRequest` support, prompt token IDs are built vi python3 examples/offline_inference/voxtral_tts/end2end.py \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --write-audio --voice cheerful_female \ - --model mistralai/tts-model \ + --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" # 32 replicate prompts with cheerful_female voice preset python3 examples/offline_inference/voxtral_tts/end2end.py \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --num-prompts 32 --write-audio --voice cheerful_female \ - --model mistralai/tts-model \ + --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" # Streaming with neutral_female voice preset python3 examples/offline_inference/voxtral_tts/end2end.py \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --streaming --write-audio --voice neutral_female \ - --model mistralai/tts-model \ + --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" 
# 32 prompts, 8 concurrent requests per wave, streaming with neutral_female voice python3 examples/offline_inference/voxtral_tts/end2end.py \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --num-prompts 32 --concurrency 8 --streaming --write-audio --voice neutral_female \ - --model mistralai/tts-model \ + --model mistralai/Voxtral-4B-TTS-2603 \ --text "That eerie silence after the first storm was just the calm before another round of chaos, wasn't it?" # Short debug prompt with reference audio @@ -41,7 +41,7 @@ python3 examples/offline_inference/voxtral_tts/end2end.py \ python3 examples/offline_inference/voxtral_tts/end2end.py \ --stage-configs-path vllm_omni/model_executor/stage_configs/voxtral_tts.yaml \ --write-audio \ - --model mistralai/tts-model \ + --model mistralai/Voxtral-4B-TTS-2603 \ --text "This is a test message." \ --audio-path path/to/reference_audio.wav ``` @@ -50,7 +50,7 @@ python3 examples/offline_inference/voxtral_tts/end2end.py \ | Argument | Description | |---|---| -| `--model PATH` | HuggingFace repo ID or local directory path (default: `mistralai/tts-model`) | +| `--model PATH` | HuggingFace repo ID or local directory path (default: `mistralai/Voxtral-4B-TTS-2603`) | | `--text TEXT` | Text to synthesize (default: `"This is a test message."`) | | `--audio-path PATH` | Path to reference audio file for voice cloning | | `--output-dir DIR` | Directory to write output WAV files (default: `output_audio`) | diff --git a/examples/offline_inference/voxtral_tts/end2end.py b/examples/offline_inference/voxtral_tts/end2end.py index d3052e5ca68..f2f2a551b1d 100644 --- a/examples/offline_inference/voxtral_tts/end2end.py +++ b/examples/offline_inference/voxtral_tts/end2end.py @@ -229,7 +229,7 @@ def parse_args() -> Namespace: parser.add_argument( "--model", type=str, - default="mistralai/tts-model", + default="mistralai/Voxtral-4B-TTS-2603", help="Model name or path.", ) parser.add_argument( diff --git 
a/examples/online_serving/voxtral_tts/gradio_demo.py b/examples/online_serving/voxtral_tts/gradio_demo.py index 293f1c39119..465167ba15a 100644 --- a/examples/online_serving/voxtral_tts/gradio_demo.py +++ b/examples/online_serving/voxtral_tts/gradio_demo.py @@ -1,3 +1,14 @@ +""" +- Make sure to install the following for this example to function correctly: +- `pip install -e .` +- `pip install gradio==5.50 mistral_common==1.10.0` + +Example use case: + +python examples/online_serving/voxtral_tts/gradio_demo.py --host slurm-199-077 --port 8000 + +""" + import argparse import io import json @@ -125,11 +136,12 @@ def _load_from_share( def main( model: str, + host: str, + port: str, output_dir: str | None = None, - debug: bool = False, ) -> None: - base_url = "http://localhost:8091/v1" - print(f"Using speech API at: {base_url}/audio/speech") + base_url = f"http://{host}:{port}/v1" + logger.info(f"Using speech API at: {base_url}/audio/speech") outputs_dir: Path | None = None if output_dir is not None: @@ -236,7 +248,9 @@ def _on_reset(): if __name__ == "__main__": parser = argparse.ArgumentParser(description="Voxtral TTS Gradio Demo") - parser.add_argument("--model", type=str, required=True, help="Path to the model checkpoint") + parser.add_argument("--model", type=str, default="mistralai/Voxtral-4B-TTS-2603", help="Name of model repo on HF") + parser.add_argument("--host", type=str, default="localhost", help="Name of host") + parser.add_argument("--port", type=str, default="8091", help="port number") parser.add_argument( "--output-dir", type=str, @@ -244,12 +258,12 @@ def _on_reset(): help="Directory to save generated audio and share links. 
" "If not provided, save/share functionality is disabled.", ) - parser.add_argument("--debug", action="store_true", default=False, help="Enable debug mode") args = parser.parse_args() main( model=args.model, + host=args.host, + port=args.port, output_dir=args.output_dir, - debug=args.debug, ) diff --git a/tests/e2e/online_serving/test_voxtral_tts.py b/tests/e2e/online_serving/test_voxtral_tts.py index d4de073f82a..f795288f375 100644 --- a/tests/e2e/online_serving/test_voxtral_tts.py +++ b/tests/e2e/online_serving/test_voxtral_tts.py @@ -20,7 +20,7 @@ from tests.conftest import OmniServerParams from tests.utils import hardware_test -MODEL = "mistralai/tts-model" +MODEL = "mistralai/Voxtral-4B-TTS-2603" STAGE_CONFIG = str( Path(__file__).parent.parent.parent.parent / "vllm_omni" / "model_executor" / "stage_configs" / "voxtral_tts.yaml"