diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index e47ca8be1ed5..d56cabde3662 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -105,6 +105,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "12" + cuda-minor-version: "9" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-12-pocket-tts' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "12" cuda-minor-version: "0" @@ -340,6 +353,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'cublas' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-nvidia-cuda-13-pocket-tts' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'cublas' cuda-major-version: "13" cuda-minor-version: "0" @@ -405,6 +431,19 @@ jobs: backend: "vibevoice" dockerfile: "./backend/Dockerfile.python" context: "./" + - build-type: 'l4t' + cuda-major-version: "13" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-cuda-13-arm64-pocket-tts' + runs-on: 'ubuntu-24.04-arm' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + ubuntu-version: '2404' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" - build-type: 'l4t' cuda-major-version: "13" cuda-minor-version: "0" @@ -641,6 +680,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'hipblas' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-rocm-hipblas-pocket-tts' + runs-on: 'arc-runner-set' + base-image: "rocm/dev-ubuntu-24.04:6.4.4" + skip-drivers: 'false' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'hipblas' cuda-major-version: "" cuda-minor-version: "" @@ -772,6 +824,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2204' + - build-type: 'l4t' + cuda-major-version: "12" + cuda-minor-version: "0" + platforms: 'linux/arm64' + tag-latest: 'auto' + tag-suffix: '-nvidia-l4t-pocket-tts' + runs-on: 'ubuntu-24.04-arm' + base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0" + skip-drivers: 'true' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2204' - build-type: 'l4t' cuda-major-version: "12" cuda-minor-version: "0" @@ -825,6 +890,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" ubuntu-version: '2404' + - build-type: 'intel' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64' + tag-latest: 'auto' + tag-suffix: '-gpu-intel-pocket-tts' + runs-on: 'arc-runner-set' + base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04" + skip-drivers: 'false' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' - build-type: 'intel' cuda-major-version: "" cuda-minor-version: "" @@ -1278,6 +1356,19 @@ jobs: dockerfile: "./backend/Dockerfile.python" context: "./" 
ubuntu-version: '2404' + - build-type: '' + cuda-major-version: "" + cuda-minor-version: "" + platforms: 'linux/amd64,linux/arm64' + tag-latest: 'auto' + tag-suffix: '-cpu-pocket-tts' + runs-on: 'ubuntu-latest' + base-image: "ubuntu:24.04" + skip-drivers: 'false' + backend: "pocket-tts" + dockerfile: "./backend/Dockerfile.python" + context: "./" + ubuntu-version: '2404' backend-jobs-darwin: uses: ./.github/workflows/backend_build_darwin.yml strategy: diff --git a/.github/workflows/test-extra.yml b/.github/workflows/test-extra.yml index bbefad7e1911..0d01cde73e37 100644 --- a/.github/workflows/test-extra.yml +++ b/.github/workflows/test-extra.yml @@ -265,4 +265,23 @@ jobs: - name: Test moonshine run: | make --jobs=5 --output-sync=target -C backend/python/moonshine - make --jobs=5 --output-sync=target -C backend/python/moonshine test \ No newline at end of file + make --jobs=5 --output-sync=target -C backend/python/moonshine test + tests-pocket-tts: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v6 + with: + submodules: true + - name: Dependencies + run: | + sudo apt-get update + sudo apt-get install build-essential ffmpeg + sudo apt-get install -y ca-certificates cmake curl patch python3-pip + # Install UV + curl -LsSf https://astral.sh/uv/install.sh | sh + pip install --user --no-cache-dir grpcio-tools==1.64.1 + - name: Test pocket-tts + run: | + make --jobs=5 --output-sync=target -C backend/python/pocket-tts + make --jobs=5 --output-sync=target -C backend/python/pocket-tts test \ No newline at end of file diff --git a/Makefile b/Makefile index d703b1e00d0d..9bc95063e4d9 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,5 @@ # Disable parallel execution for backend builds -.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine +.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/moonshine backends/pocket-tts GOCMD=go GOTEST=$(GOCMD) test @@ -9,7 +9,7 @@ LAUNCHER_BINARY_NAME=local-ai-launcher CUDA_MAJOR_VERSION?=13 CUDA_MINOR_VERSION?=0 -UBUNTU_VERSION?=2204 +UBUNTU_VERSION?=2404 UBUNTU_CODENAME?=noble GORELEASER?= @@ -316,6 +316,7 @@ prepare-test-extra: protogen-python $(MAKE) -C backend/python/vllm $(MAKE) -C backend/python/vibevoice $(MAKE) -C backend/python/moonshine + $(MAKE) -C backend/python/pocket-tts test-extra: prepare-test-extra $(MAKE) -C backend/python/transformers test @@ -324,6 +325,7 @@ test-extra: prepare-test-extra $(MAKE) -C backend/python/vllm test $(MAKE) -C backend/python/vibevoice test $(MAKE) -C backend/python/moonshine test + $(MAKE) -C backend/python/pocket-tts test DOCKER_IMAGE?=local-ai DOCKER_AIO_IMAGE?=local-ai-aio @@ -447,17 +449,16 @@ 
BACKEND_FASTER_WHISPER = faster-whisper|python|.|false|true BACKEND_COQUI = coqui|python|.|false|true BACKEND_BARK = bark|python|.|false|true BACKEND_EXLLAMA2 = exllama2|python|.|false|true - -# Python backends with ./backend context -BACKEND_RFDETR = rfdetr|python|./backend|false|true -BACKEND_KITTEN_TTS = kitten-tts|python|./backend|false|true -BACKEND_NEUTTS = neutts|python|./backend|false|true -BACKEND_KOKORO = kokoro|python|./backend|false|true -BACKEND_VLLM = vllm|python|./backend|false|true -BACKEND_DIFFUSERS = diffusers|python|./backend|--progress=plain|true -BACKEND_CHATTERBOX = chatterbox|python|./backend|false|true -BACKEND_VIBEVOICE = vibevoice|python|./backend|--progress=plain|true -BACKEND_MOONSHINE = moonshine|python|./backend|false|true +BACKEND_RFDETR = rfdetr|python|.|false|true +BACKEND_KITTEN_TTS = kitten-tts|python|.|false|true +BACKEND_NEUTTS = neutts|python|.|false|true +BACKEND_KOKORO = kokoro|python|.|false|true +BACKEND_VLLM = vllm|python|.|false|true +BACKEND_DIFFUSERS = diffusers|python|.|--progress=plain|true +BACKEND_CHATTERBOX = chatterbox|python|.|false|true +BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true +BACKEND_MOONSHINE = moonshine|python|.|false|true +BACKEND_POCKET_TTS = pocket-tts|python|.|false|true # Helper function to build docker image for a backend # Usage: $(call docker-build-backend,BACKEND_NAME,DOCKERFILE_TYPE,BUILD_CONTEXT,PROGRESS_FLAG,NEEDS_BACKEND_ARG) @@ -503,12 +504,13 @@ $(eval $(call generate-docker-build-target,$(BACKEND_DIFFUSERS))) $(eval $(call generate-docker-build-target,$(BACKEND_CHATTERBOX))) $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE))) $(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE))) +$(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS))) # Pattern rule for docker-save targets docker-save-%: backend-images docker save local-ai-backend:$* -o backend-images/$*.tar -docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine +docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-transformers docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-bark docker-build-chatterbox docker-build-vibevoice docker-build-exllama2 docker-build-moonshine docker-build-pocket-tts ######################################################## ### END Backends diff --git a/README.md b/README.md index 092432241837..f19e6b68b818 100644 --- a/README.md +++ b/README.md @@ -295,6 +295,7 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration | **silero-vad** | Voice Activity Detection | CPU | | **neutts** | Text-to-speech with voice cloning | CUDA 12/13, ROCm, CPU | | **vibevoice** | Real-time TTS with voice cloning | CUDA 12/13, ROCm, Intel, CPU | +| **pocket-tts** | Lightweight CPU-based TTS | CUDA 12/13, ROCm, Intel, CPU | ### Image & Video Generation | Backend | Description | Acceleration Support | @@ -316,8 +317,8 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration |-------------------|-------------------|------------------| | **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware | | **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware | -| **AMD ROCm** | 
llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice | AMD Graphics | -| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice | Intel Arc, Intel iGPUs | +| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice, pocket-tts | AMD Graphics | +| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice, pocket-tts | Intel Arc, Intel iGPUs | | **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ | | **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs | | **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) | diff --git a/backend/index.yaml b/backend/index.yaml index 41befc625a44..45c5bb713b62 100644 --- a/backend/index.yaml +++ b/backend/index.yaml @@ -428,6 +428,28 @@ nvidia-l4t-cuda-12: "nvidia-l4t-vibevoice" nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vibevoice" icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 +- &pocket-tts + urls: + - https://github.com/kyutai-labs/pocket-tts + description: | + Pocket TTS is a lightweight text-to-speech model designed to run efficiently on CPUs. + tags: + - text-to-speech + - TTS + license: mit + name: "pocket-tts" + alias: "pocket-tts" + capabilities: + nvidia: "cuda12-pocket-tts" + intel: "intel-pocket-tts" + amd: "rocm-pocket-tts" + nvidia-l4t: "nvidia-l4t-pocket-tts" + default: "cpu-pocket-tts" + nvidia-cuda-13: "cuda13-pocket-tts" + nvidia-cuda-12: "cuda12-pocket-tts" + nvidia-l4t-cuda-12: "nvidia-l4t-pocket-tts" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-pocket-tts" + icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 - &piper name: "piper" uri: "quay.io/go-skynet/local-ai-backends:latest-piper" @@ -1605,3 +1627,86 @@ uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice" mirrors: - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-vibevoice +## pocket-tts +- !!merge <<: *pocket-tts + name: "pocket-tts-development" + capabilities: + nvidia: "cuda12-pocket-tts-development" + intel: "intel-pocket-tts-development" + amd: "rocm-pocket-tts-development" + nvidia-l4t: "nvidia-l4t-pocket-tts-development" + default: "cpu-pocket-tts-development" + nvidia-cuda-13: "cuda13-pocket-tts-development" + nvidia-cuda-12: "cuda12-pocket-tts-development" + nvidia-l4t-cuda-12: "nvidia-l4t-pocket-tts-development" + nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-pocket-tts-development" +- !!merge <<: *pocket-tts + name: "cpu-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-cpu-pocket-tts" + mirrors: + - localai/localai-backends:latest-cpu-pocket-tts +- !!merge <<: *pocket-tts + name: "cpu-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-cpu-pocket-tts" + mirrors: + - localai/localai-backends:master-cpu-pocket-tts +- !!merge <<: *pocket-tts + name: "cuda12-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-pocket-tts" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-12-pocket-tts +- !!merge <<: *pocket-tts + name: "cuda12-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-pocket-tts" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-12-pocket-tts +- !!merge 
<<: *pocket-tts + name: "cuda13-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-pocket-tts" + mirrors: + - localai/localai-backends:latest-gpu-nvidia-cuda-13-pocket-tts +- !!merge <<: *pocket-tts + name: "cuda13-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-pocket-tts" + mirrors: + - localai/localai-backends:master-gpu-nvidia-cuda-13-pocket-tts +- !!merge <<: *pocket-tts + name: "intel-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-intel-pocket-tts" + mirrors: + - localai/localai-backends:latest-gpu-intel-pocket-tts +- !!merge <<: *pocket-tts + name: "intel-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-intel-pocket-tts" + mirrors: + - localai/localai-backends:master-gpu-intel-pocket-tts +- !!merge <<: *pocket-tts + name: "rocm-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-rocm-hipblas-pocket-tts" + mirrors: + - localai/localai-backends:latest-gpu-rocm-hipblas-pocket-tts +- !!merge <<: *pocket-tts + name: "rocm-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-gpu-rocm-hipblas-pocket-tts" + mirrors: + - localai/localai-backends:master-gpu-rocm-hipblas-pocket-tts +- !!merge <<: *pocket-tts + name: "nvidia-l4t-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-pocket-tts" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-pocket-tts +- !!merge <<: *pocket-tts + name: "nvidia-l4t-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-pocket-tts" + mirrors: + - localai/localai-backends:master-nvidia-l4t-pocket-tts +- !!merge <<: *pocket-tts + name: "cuda13-nvidia-l4t-arm64-pocket-tts" + uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-pocket-tts" + mirrors: + - localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-pocket-tts +- !!merge <<: *pocket-tts + name: "cuda13-nvidia-l4t-arm64-pocket-tts-development" + uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts" + mirrors: + - localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-pocket-tts diff --git a/backend/python/pocket-tts/Makefile b/backend/python/pocket-tts/Makefile new file mode 100644 index 000000000000..3366bb4874ce --- /dev/null +++ b/backend/python/pocket-tts/Makefile @@ -0,0 +1,23 @@ +.PHONY: pocket-tts +pocket-tts: + bash install.sh + +.PHONY: run +run: pocket-tts + @echo "Running pocket-tts..." + bash run.sh + @echo "pocket-tts run." + +.PHONY: test +test: pocket-tts + @echo "Testing pocket-tts..." + bash test.sh + @echo "pocket-tts tested." 
+ +.PHONY: protogen-clean +protogen-clean: + $(RM) backend_pb2_grpc.py backend_pb2.py + +.PHONY: clean +clean: protogen-clean + rm -rf venv __pycache__ diff --git a/backend/python/pocket-tts/backend.py b/backend/python/pocket-tts/backend.py new file mode 100644 index 000000000000..b02cf481a55c --- /dev/null +++ b/backend/python/pocket-tts/backend.py @@ -0,0 +1,255 @@ +#!/usr/bin/env python3 +""" +This is an extra gRPC server of LocalAI for Pocket TTS +""" +from concurrent import futures +import time +import argparse +import signal +import sys +import os +import traceback +import scipy.io.wavfile +import backend_pb2 +import backend_pb2_grpc +import torch +from pocket_tts import TTSModel + +import grpc + +def is_float(s): + """Check if a string can be converted to float.""" + try: + float(s) + return True + except ValueError: + return False + +def is_int(s): + """Check if a string can be converted to int.""" + try: + int(s) + return True + except ValueError: + return False + +_ONE_DAY_IN_SECONDS = 60 * 60 * 24 + +# If MAX_WORKERS are specified in the environment use it, otherwise default to 1 +MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1')) + +# Implement the BackendServicer class with the service methods +class BackendServicer(backend_pb2_grpc.BackendServicer): + """ + BackendServicer is the class that implements the gRPC service + """ + def Health(self, request, context): + return backend_pb2.Reply(message=bytes("OK", 'utf-8')) + + def LoadModel(self, request, context): + # Get device + if torch.cuda.is_available(): + print("CUDA is available", file=sys.stderr) + device = "cuda" + else: + print("CUDA is not available", file=sys.stderr) + device = "cpu" + mps_available = hasattr(torch.backends, "mps") and torch.backends.mps.is_available() + if mps_available: + device = "mps" + if not torch.cuda.is_available() and request.CUDA: + return backend_pb2.Result(success=False, message="CUDA is not available") + + # Normalize potential 'mpx' typo to 'mps' + if device == "mpx": + print("Note: device 'mpx' detected, treating it as 'mps'.", file=sys.stderr) + device = "mps" + + # Validate mps availability if requested + if device == "mps" and not torch.backends.mps.is_available(): + print("Warning: MPS not available. Falling back to CPU.", file=sys.stderr) + device = "cpu" + + self.device = device + + options = request.Options + + # empty dict + self.options = {} + + # The options are a list of strings in this form optname:optvalue + # We are storing all the options in a dict so we can use it later when + # generating the audio + for opt in options: + if ":" not in opt: + continue + key, value = opt.split(":", 1) # Split only on first colon + # if value is a number, convert it to the appropriate type + if is_float(value): + value = float(value) + elif is_int(value): + value = int(value) + elif value.lower() in ["true", "false"]: + value = value.lower() == "true" + self.options[key] = value + + # Default voice for caching + self.default_voice_url = self.options.get("default_voice", None) + self._voice_cache = {} + + try: + print("Loading Pocket TTS model", file=sys.stderr) + self.tts_model = TTSModel.load_model() + print(f"Model loaded successfully. 
Sample rate: {self.tts_model.sample_rate}", file=sys.stderr) + + # Pre-load default voice if specified + if self.default_voice_url: + try: + print(f"Pre-loading default voice: {self.default_voice_url}", file=sys.stderr) + voice_state = self.tts_model.get_state_for_audio_prompt(self.default_voice_url) + self._voice_cache[self.default_voice_url] = voice_state + print("Default voice loaded successfully", file=sys.stderr) + except Exception as e: + print(f"Warning: Failed to pre-load default voice: {e}", file=sys.stderr) + + except Exception as err: + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(message="Model loaded successfully", success=True) + + def _get_voice_state(self, voice_input): + """ + Get voice state from cache or load it. + voice_input can be: + - HuggingFace URL (e.g., hf://kyutai/tts-voices/alba-mackenna/casual.wav) + - Local file path + - None (use default) + """ + # Use default if no voice specified + if not voice_input: + voice_input = self.default_voice_url + + if not voice_input: + return None + + # Check cache first + if voice_input in self._voice_cache: + return self._voice_cache[voice_input] + + # Load voice state + try: + print(f"Loading voice from: {voice_input}", file=sys.stderr) + voice_state = self.tts_model.get_state_for_audio_prompt(voice_input) + self._voice_cache[voice_input] = voice_state + return voice_state + except Exception as e: + print(f"Error loading voice from {voice_input}: {e}", file=sys.stderr) + return None + + def TTS(self, request, context): + try: + # Determine voice input + # Priority: request.voice > AudioPath (from ModelOptions) > default + voice_input = None + + if request.voice: + voice_input = request.voice + elif hasattr(request, 'AudioPath') and request.AudioPath: + # Use AudioPath as voice file + if os.path.isabs(request.AudioPath): + voice_input = request.AudioPath + elif hasattr(request, 'ModelFile') and request.ModelFile: + model_file_base = os.path.dirname(request.ModelFile) + voice_input = os.path.join(model_file_base, request.AudioPath) + elif hasattr(request, 'ModelPath') and request.ModelPath: + voice_input = os.path.join(request.ModelPath, request.AudioPath) + else: + voice_input = request.AudioPath + + # Get voice state + voice_state = self._get_voice_state(voice_input) + if voice_state is None: + return backend_pb2.Result( + success=False, + message=f"Voice not found or failed to load: {voice_input}. Please provide a valid voice URL or file path." 
+ ) + + # Prepare text + text = request.text.strip() + + if not text: + return backend_pb2.Result( + success=False, + message="Text is empty" + ) + + print(f"Generating audio for text: {text[:50]}...", file=sys.stderr) + + # Generate audio + audio = self.tts_model.generate_audio(voice_state, text) + + # Audio is a 1D torch tensor containing PCM data + if audio is None or audio.numel() == 0: + return backend_pb2.Result( + success=False, + message="No audio generated" + ) + + # Save audio to file + output_path = request.dst + if not output_path: + output_path = "/tmp/pocket-tts-output.wav" + + # Ensure output directory exists + output_dir = os.path.dirname(output_path) + if output_dir and not os.path.exists(output_dir): + os.makedirs(output_dir, exist_ok=True) + + # Convert torch tensor to numpy and save + audio_numpy = audio.numpy() + scipy.io.wavfile.write(output_path, self.tts_model.sample_rate, audio_numpy) + print(f"Saved audio to {output_path}", file=sys.stderr) + + except Exception as err: + print(f"Error in TTS: {err}", file=sys.stderr) + print(traceback.format_exc(), file=sys.stderr) + return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}") + + return backend_pb2.Result(success=True) + +def serve(address): + server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS), + options=[ + ('grpc.max_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB + ('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB + ]) + backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server) + server.add_insecure_port(address) + server.start() + print("Server started. Listening on: " + address, file=sys.stderr) + + # Define the signal handler function + def signal_handler(sig, frame): + print("Received termination signal. Shutting down...") + server.stop(0) + sys.exit(0) + + # Set the signal handlers for SIGINT and SIGTERM + signal.signal(signal.SIGINT, signal_handler) + signal.signal(signal.SIGTERM, signal_handler) + + try: + while True: + time.sleep(_ONE_DAY_IN_SECONDS) + except KeyboardInterrupt: + server.stop(0) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Run the gRPC server.") + parser.add_argument( + "--addr", default="localhost:50051", help="The address to bind the server to." + ) + args = parser.parse_args() + + serve(args.addr) diff --git a/backend/python/pocket-tts/install.sh b/backend/python/pocket-tts/install.sh new file mode 100755 index 000000000000..6058b3d545ad --- /dev/null +++ b/backend/python/pocket-tts/install.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links. +# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match. 
+# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index +# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index +if [ "x${BUILD_PROFILE}" == "xintel" ]; then + EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match" +fi + +# Use python 3.12 for l4t +if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then + PYTHON_VERSION="3.12" + PYTHON_PATCH="12" + PY_STANDALONE_TAG="20251120" +fi + +if [ "x${BUILD_PROFILE}" == "xl4t12" ]; then + USE_PIP=true +fi + +installRequirements diff --git a/backend/python/pocket-tts/protogen.sh b/backend/python/pocket-tts/protogen.sh new file mode 100755 index 000000000000..1ad37dee164b --- /dev/null +++ b/backend/python/pocket-tts/protogen.sh @@ -0,0 +1,11 @@ +#!/bin/bash +set -e + +backend_dir=$(dirname $0) +if [ -d $backend_dir/common ]; then + source $backend_dir/common/libbackend.sh +else + source $backend_dir/../common/libbackend.sh +fi + +python3 -m grpc_tools.protoc -I../.. -I./ --python_out=. --grpc_python_out=. backend.proto diff --git a/backend/python/pocket-tts/requirements-cpu.txt b/backend/python/pocket-tts/requirements-cpu.txt new file mode 100644 index 000000000000..d14153bc5aaf --- /dev/null +++ b/backend/python/pocket-tts/requirements-cpu.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cpu +pocket-tts +scipy +torch diff --git a/backend/python/pocket-tts/requirements-cublas12.txt b/backend/python/pocket-tts/requirements-cublas12.txt new file mode 100644 index 000000000000..f43f5094b9f4 --- /dev/null +++ b/backend/python/pocket-tts/requirements-cublas12.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cu121 +pocket-tts +scipy +torch diff --git a/backend/python/pocket-tts/requirements-cublas13.txt b/backend/python/pocket-tts/requirements-cublas13.txt new file mode 100644 index 000000000000..26e07545fdc7 --- /dev/null +++ b/backend/python/pocket-tts/requirements-cublas13.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +pocket-tts +scipy +torch diff --git a/backend/python/pocket-tts/requirements-hipblas.txt b/backend/python/pocket-tts/requirements-hipblas.txt new file mode 100644 index 000000000000..b6f9d2fb6a0a --- /dev/null +++ b/backend/python/pocket-tts/requirements-hipblas.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/rocm6.3 +pocket-tts +scipy +torch==2.7.1+rocm6.3 diff --git a/backend/python/pocket-tts/requirements-intel.txt b/backend/python/pocket-tts/requirements-intel.txt new file mode 100644 index 000000000000..3bb61cb7311d --- /dev/null +++ b/backend/python/pocket-tts/requirements-intel.txt @@ -0,0 +1,4 @@ +--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ +pocket-tts +scipy +torch==2.5.1+cxx11.abi diff --git a/backend/python/pocket-tts/requirements-l4t12.txt b/backend/python/pocket-tts/requirements-l4t12.txt new file mode 100644 index 000000000000..39131ac17b36 --- /dev/null +++ b/backend/python/pocket-tts/requirements-l4t12.txt @@ -0,0 +1,4 @@ +--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/ +pocket-tts +scipy +torch diff --git a/backend/python/pocket-tts/requirements-l4t13.txt b/backend/python/pocket-tts/requirements-l4t13.txt new file mode 100644 index 000000000000..d6503f7c118d --- /dev/null +++ b/backend/python/pocket-tts/requirements-l4t13.txt @@ -0,0 +1,4 @@ +--extra-index-url https://download.pytorch.org/whl/cu130 +pocket-tts +scipy +torch \ No newline at end of file diff --git 
a/backend/python/pocket-tts/requirements-mps.txt b/backend/python/pocket-tts/requirements-mps.txt
new file mode 100644
index 000000000000..235eaffd54fb
--- /dev/null
+++ b/backend/python/pocket-tts/requirements-mps.txt
@@ -0,0 +1,4 @@
+pocket-tts
+scipy
+torch==2.7.1
+torchvision==0.22.1
diff --git a/backend/python/pocket-tts/requirements.txt b/backend/python/pocket-tts/requirements.txt
new file mode 100644
index 000000000000..9e532186b2c8
--- /dev/null
+++ b/backend/python/pocket-tts/requirements.txt
@@ -0,0 +1,4 @@
+grpcio==1.71.0
+protobuf
+certifi
+packaging==24.1
diff --git a/backend/python/pocket-tts/run.sh b/backend/python/pocket-tts/run.sh
new file mode 100755
index 000000000000..eae121f37b0b
--- /dev/null
+++ b/backend/python/pocket-tts/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+    source $backend_dir/common/libbackend.sh
+else
+    source $backend_dir/../common/libbackend.sh
+fi
+
+startBackend $@
diff --git a/backend/python/pocket-tts/test.py b/backend/python/pocket-tts/test.py
new file mode 100644
index 000000000000..34efa1080d00
--- /dev/null
+++ b/backend/python/pocket-tts/test.py
@@ -0,0 +1,141 @@
+"""
+A test script to test the gRPC service
+"""
+import unittest
+import subprocess
+import time
+import os
+import tempfile
+import backend_pb2
+import backend_pb2_grpc
+
+import grpc
+
+
+class TestBackendServicer(unittest.TestCase):
+    """
+    TestBackendServicer is the class that tests the gRPC service
+    """
+    def setUp(self):
+        """
+        This method sets up the gRPC service by starting the server
+        """
+        self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
+        time.sleep(30)
+
+    def tearDown(self) -> None:
+        """
+        This method tears down the gRPC service by terminating the server
+        """
+        self.service.terminate()
+        self.service.wait()
+
+    def test_server_startup(self):
+        """
+        This method tests if the server starts up successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.Health(backend_pb2.HealthMessage())
+                self.assertEqual(response.message, b'OK')
+        except Exception as err:
+            print(err)
+            self.fail("Server failed to start")
+        finally:
+            self.tearDown()
+
+    def test_load_model(self):
+        """
+        This method tests if the model is loaded successfully
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                response = stub.LoadModel(backend_pb2.ModelOptions())
+                print(response)
+                self.assertTrue(response.success)
+                self.assertEqual(response.message, "Model loaded successfully")
+        except Exception as err:
+            print(err)
+            self.fail("LoadModel service failed")
+        finally:
+            self.tearDown()
+
+    def test_tts_with_builtin_voice(self):
+        """
+        This method tests TTS generation with a built-in voice name
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                # Load model
+                response = stub.LoadModel(backend_pb2.ModelOptions())
+                self.assertTrue(response.success)
+
+                # Create temporary output file
+                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+                    output_path = tmp_file.name
+
+                # Test TTS with a built-in voice name
+                tts_request = backend_pb2.TTSRequest(
+                    text="Hello world, this is a test.",
+                    dst=output_path,
+                    voice="azelma"
+                )
+                tts_response = stub.TTS(tts_request)
+                self.assertTrue(tts_response.success)
+
+                # Verify output file exists and is not empty
+                self.assertTrue(os.path.exists(output_path))
+                self.assertGreater(os.path.getsize(output_path), 0)
+
+                # Cleanup
+                os.unlink(output_path)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service failed")
+        finally:
+            self.tearDown()
+
+    def test_tts_with_default_voice(self):
+        """
+        This method tests TTS generation with the default voice (set via the default_voice option in LoadModel)
+        """
+        try:
+            self.setUp()
+            with grpc.insecure_channel("localhost:50051") as channel:
+                stub = backend_pb2_grpc.BackendStub(channel)
+                # Load model with default voice
+                load_request = backend_pb2.ModelOptions(
+                    Options=["default_voice:azelma"]
+                )
+                response = stub.LoadModel(load_request)
+                self.assertTrue(response.success)
+
+                # Create temporary output file
+                with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
+                    output_path = tmp_file.name
+
+                # Test TTS without specifying voice (should use default)
+                tts_request = backend_pb2.TTSRequest(
+                    text="Hello world, this is a test.",
+                    dst=output_path
+                )
+                tts_response = stub.TTS(tts_request)
+                self.assertTrue(tts_response.success)
+
+                # Verify output file exists and is not empty
+                self.assertTrue(os.path.exists(output_path))
+                self.assertGreater(os.path.getsize(output_path), 0)
+
+                # Cleanup
+                os.unlink(output_path)
+        except Exception as err:
+            print(err)
+            self.fail("TTS service with default voice failed")
+        finally:
+            self.tearDown()
diff --git a/backend/python/pocket-tts/test.sh b/backend/python/pocket-tts/test.sh
new file mode 100755
index 000000000000..eb59f2aaf3f3
--- /dev/null
+++ b/backend/python/pocket-tts/test.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -e
+
+backend_dir=$(dirname $0)
+if [ -d $backend_dir/common ]; then
+    source $backend_dir/common/libbackend.sh
+else
+    source $backend_dir/../common/libbackend.sh
+fi
+
+runUnittests
diff --git a/docs/content/features/text-to-audio.md b/docs/content/features/text-to-audio.md
index c6ff4d001b0d..3d179430704c 100644
--- a/docs/content/features/text-to-audio.md
+++ b/docs/content/features/text-to-audio.md
@@ -164,6 +164,57 @@ curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
   }' | aplay
 ```
+
+### Pocket TTS
+
+[Pocket TTS](https://github.com/kyutai-labs/pocket-tts) is a lightweight text-to-speech model designed to run efficiently on CPUs. It supports voice cloning via built-in voice names, HuggingFace voice URLs, or local audio files.
+
+#### Setup
+
+Install the `pocket-tts` model from the model gallery, or run `local-ai models install pocket-tts`.
+
+#### Usage
+
+Use the `tts` endpoint and specify the `pocket-tts` model:
+
+```
+curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+     "model": "pocket-tts",
+     "input":"Hello world, this is a test."
+  }' | aplay
+```
+
+#### Voice cloning
+
+Pocket TTS supports voice cloning through built-in voice names, HuggingFace URLs, or local audio files.
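+A voice can also be chosen per request: the backend gives the request's `voice` field priority over any configured default. As a sketch (reusing the sample voice URL from the backend's comments; a built-in name or a local file path works the same way):
+
+```
+curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
+     "model": "pocket-tts",
+     "input":"Hello world, this is a test.",
+     "voice": "hf://kyutai/tts-voices/alba-mackenna/casual.wav"
+  }' | aplay
+```
+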
You can configure a model with a specific voice: + +```yaml +name: pocket-tts +backend: pocket-tts +tts: + voice: "azelma" # Built-in voice name + # Or use HuggingFace URL: "hf://kyutai/tts-voices/alba-mackenna/casual.wav" + # Or use local file path: "path/to/voice.wav" + # Available built-in voices: alba, marius, javert, jean, fantine, cosette, eponine, azelma +``` + +You can also pre-load a default voice for faster first generation: + +```yaml +name: pocket-tts +backend: pocket-tts +options: + - "default_voice:azelma" # Pre-load this voice when model loads +``` + +Then you can use the model: + +``` +curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{ + "model": "pocket-tts", + "input":"Hello world, this is a test." + }' | aplay +``` + ### Vall-E-X [VALL-E-X](https://github.com/Plachtaa/VALL-E-X) is an open source implementation of Microsoft's VALL-E X zero-shot TTS model. diff --git a/docs/content/reference/compatibility-table.md b/docs/content/reference/compatibility-table.md index 97bc61313dc0..2461c95e1f1d 100644 --- a/docs/content/reference/compatibility-table.md +++ b/docs/content/reference/compatibility-table.md @@ -42,6 +42,7 @@ LocalAI will attempt to automatically load models which are not explicitly confi | [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU | | [neutts](https://github.com/neuphonic/neuttsair) | NeuTTSAir | no | Text-to-speech with voice cloning | no | no | CUDA 12/13, ROCm, CPU | | [vibevoice](https://github.com/microsoft/VibeVoice) | VibeVoice-Realtime | no | Real-time text-to-speech with voice cloning | no | no | CUDA 12/13, ROCm, Intel, CPU | +| [pocket-tts](https://github.com/kyutai-labs/pocket-tts) | Pocket TTS | no | Lightweight CPU-based text-to-speech with voice cloning | no | no | CUDA 12/13, ROCm, Intel, CPU | | [mlx-audio](https://github.com/Blaizzy/mlx-audio) | MLX | no | Text-tospeech | no | no | Metal (Apple Silicon) | ## Image & Video Generation diff --git a/gallery/index.yaml b/gallery/index.yaml index 1ccb2d659ac6..da8c0cd0f42e 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -478,6 +478,16 @@ - filename: voices/streaming_model/en-Davis_man.pt uri: https://raw.githubusercontent.com/microsoft/VibeVoice/main/demo/voices/streaming_model/en-Davis_man.pt sha256: 67561d63bfa2153616e4c02fd967007c182593fc53738a6ad94bf5f84e8832ac +- &pocket-tts + url: "github:mudler/LocalAI/gallery/pocket-tts.yaml@master" + icon: https://avatars.githubusercontent.com/u/6154722?s=200&v=4 + license: mit + tags: + - text-to-speech + - TTS + name: "pocket-tts" + urls: + - https://github.com/kyutai-labs/pocket-tts - &qwen3vl url: "github:mudler/LocalAI/gallery/qwen3.yaml@master" icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png diff --git a/gallery/pocket-tts.yaml b/gallery/pocket-tts.yaml new file mode 100644 index 000000000000..87bbec7beb05 --- /dev/null +++ b/gallery/pocket-tts.yaml @@ -0,0 +1,34 @@ +--- +name: localai + +config_file: |- + name: pocket-tts + backend: pocket-tts + description: | + Pocket TTS is a lightweight text-to-speech model designed to run efficiently on CPUs. + This model supports voice cloning through HuggingFace voice URLs or local audio files. + + parameters: + model: "" + + # TTS configuration + tts: + # Voice selection - can be: + # 1. 
Built-in voice name (e.g., "alba", "marius", "javert", "jean", "fantine", "cosette", "eponine", "azelma") + # 2. HuggingFace URL (e.g., "hf://kyutai/tts-voices/alba-mackenna/casual.wav") + # 3. Local file path (relative to model directory or absolute) + # voice: "azelma" + # Alternative: use audio_path to specify a voice file directly + # audio_path: "hf://kyutai/tts-voices/alba-mackenna/casual.wav" + + known_usecases: + - tts + + # Backend-specific options + # These are passed as "key:value" strings to the backend + options: + # Default voice to pre-load (optional) + # Can be a voice name or HuggingFace URL + # If set, this voice will be loaded when the model loads for faster first generation + - "default_voice:azelma" + # - "default_voice:hf://kyutai/tts-voices/alba-mackenna/casual.wav"