diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md
index 88b8d151e..478639829 100644
--- a/.agents/AGENTS.md
+++ b/.agents/AGENTS.md
@@ -225,6 +225,7 @@ See `subagent-index.toon` for complete listing of agents, subagents, workflows,
 | WordPress | `tools/wordpress/wp-dev.md`, `tools/wordpress/mainwp.md` |
 | SEO | `seo/dataforseo.md`, `seo/google-search-console.md` |
 | Video | `tools/video/video-prompt-design.md`, `tools/video/remotion.md`, `tools/video/higgsfield.md` |
+| Voice | `tools/voice/speech-to-speech.md` |
 | Parallel agents | `tools/ai-assistants/headless-dispatch.md`, `tools/ai-assistants/runners/` |
 | MCP dev | `tools/build-mcp/build-mcp.md` |
 | Agent design | `tools/build-agent/build-agent.md` |
diff --git a/.agents/scripts/speech-to-speech-helper.sh b/.agents/scripts/speech-to-speech-helper.sh
new file mode 100755
index 000000000..176f34c1c
--- /dev/null
+++ b/.agents/scripts/speech-to-speech-helper.sh
@@ -0,0 +1,526 @@
+#!/bin/bash
+# shellcheck disable=SC2034,SC2155
+
+# Speech-to-Speech Helper Script
+# Manages HuggingFace speech-to-speech pipeline
+# Supports local GPU (CUDA/MPS), Docker, and remote server deployment
+
+set -euo pipefail
+
+# Colors for output
+readonly GREEN='\033[0;32m'
+readonly BLUE='\033[0;34m'
+readonly YELLOW='\033[1;33m'
+readonly RED='\033[0;31m'
+readonly NC='\033[0m'
+
+# Defaults
+readonly S2S_REPO="https://github.com/huggingface/speech-to-speech.git"
+readonly S2S_DIR="${HOME}/.aidevops/.agent-workspace/work/speech-to-speech"
+readonly S2S_PID_FILE="${S2S_DIR}/.s2s.pid"
+readonly S2S_LOG_FILE="${S2S_DIR}/.s2s.log"
+readonly DEFAULT_RECV_PORT=12345
+readonly DEFAULT_SEND_PORT=12346
+
+print_info() {
+  local msg="$1"
+  echo -e "${BLUE}[INFO]${NC} $msg"
+  return 0
+}
+
+print_success() {
+  local msg="$1"
+  echo -e "${GREEN}[OK]${NC} $msg"
+  return 0
+}
+
+print_warning() {
+  local msg="$1"
+  echo -e "${YELLOW}[WARN]${NC} $msg"
+  return 0
+}
+
+print_error() {
+  local msg="$1"
+  echo -e "${RED}[ERROR]${NC} $msg" >&2
+  return 0
+}
+
+# ─── Dependency checks ──────────────────────────────────────────────────
+
+check_python() {
+  if ! command -v python3 &>/dev/null; then
+    print_error "python3 is required but not installed"
+    return 1
+  fi
+  local py_version
+  py_version=$(python3 -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')
+  local major="${py_version%%.*}"
+  local minor="${py_version##*.}"
+  if [[ "$major" -lt 3 ]] || { [[ "$major" -eq 3 ]] && [[ "$minor" -lt 10 ]]; }; then
+    print_error "Python 3.10+ required, found $py_version"
+    return 1
+  fi
+  print_info "Python $py_version"
+  return 0
+}
+
+check_uv() {
+  if ! command -v uv &>/dev/null; then
+    print_warning "uv not found. Install: curl -LsSf https://astral.sh/uv/install.sh | sh"
+    print_info "Falling back to pip"
+    return 1
+  fi
+  return 0
+}
+
+detect_platform() {
+  local platform
+  platform=$(uname -s)
+  case "$platform" in
+    Darwin)
+      if [[ "$(uname -m)" == "arm64" ]]; then
+        echo "mac-arm64"
+      else
+        echo "mac-x86"
+      fi
+      ;;
+    Linux)
+      if command -v nvidia-smi &>/dev/null; then
+        echo "linux-cuda"
+      else
+        echo "linux-cpu"
+      fi
+      ;;
+    *)
+      echo "unknown"
+      ;;
+  esac
+  return 0
+}
+
+detect_gpu() {
+  # Callers capture stdout (gpu=$(detect_gpu)), so status messages must go
+  # to stderr; only the device string is echoed to stdout
+  local platform
+  platform=$(detect_platform)
+  case "$platform" in
+    mac-arm64)
+      print_info "Apple Silicon detected (MPS acceleration)" >&2
+      echo "mps"
+      ;;
+    linux-cuda)
+      local gpu_info
+      gpu_info=$(nvidia-smi --query-gpu=name,memory.total --format=csv,noheader 2>/dev/null || echo "unknown")
+      print_info "NVIDIA GPU: $gpu_info" >&2
+      echo "cuda"
+      ;;
+    *)
+      print_warning "No GPU acceleration detected, using CPU" >&2
+      echo "cpu"
+      ;;
+  esac
+  return 0
+}
+
+# ─── Setup ──────────────────────────────────────────────────────────────
+
+cmd_setup() {
+  print_info "Setting up speech-to-speech pipeline..."
+
+  check_python || return 1
+
+  # Clone or update repo
+  if [[ -d "$S2S_DIR/.git" ]]; then
+    print_info "Updating existing installation..."
+    git -C "$S2S_DIR" pull --ff-only 2>/dev/null || {
+      print_warning "Could not fast-forward, repo may have local changes"
+    }
+  else
+    print_info "Cloning speech-to-speech..."
+    mkdir -p "$(dirname "$S2S_DIR")"
+    git clone "$S2S_REPO" "$S2S_DIR"
+  fi
+
+  # Install dependencies based on platform
+  local platform
+  platform=$(detect_platform)
+  local req_file="requirements.txt"
+  if [[ "$platform" == "mac-arm64" ]] || [[ "$platform" == "mac-x86" ]]; then
+    req_file="requirements_mac.txt"
+  fi
+
+  print_info "Installing dependencies from $req_file..."
+  if check_uv; then
+    uv pip install -r "${S2S_DIR}/${req_file}"
+  else
+    pip install -r "${S2S_DIR}/${req_file}"
+  fi
+
+  # Download NLTK data
+  print_info "Downloading NLTK data..."
+  python3 -c "import nltk; nltk.download('punkt_tab'); nltk.download('averaged_perceptron_tagger_eng')" >/dev/null
+
+  print_success "Setup complete. Run: speech-to-speech-helper.sh start"
+  return 0
+}
+
+# ─── Start pipeline ─────────────────────────────────────────────────────
+
+cmd_start() {
+  local mode=""
+  local language="en"
+  local extra_args=()
+  local background=false
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --local-mac) mode="local-mac"; shift ;;
+      --cuda) mode="cuda"; shift ;;
+      --server) mode="server"; shift ;;
+      --docker) mode="docker"; shift ;;
+      --language) language="$2"; shift 2 ;;
+      --background) background=true; shift ;;
+      *) extra_args+=("$1"); shift ;;
+    esac
+  done
+
+  if [[ -z "$mode" ]]; then
+    local gpu
+    gpu=$(detect_gpu)
+    case "$gpu" in
+      mps) mode="local-mac" ;;
+      cuda) mode="cuda" ;;
+      *) mode="cuda" ;;
+    esac
+    print_info "Auto-detected mode: $mode"
+  fi
+
+  # Check if already running
+  if [[ -f "$S2S_PID_FILE" ]]; then
+    local pid
+    pid=$(cat "$S2S_PID_FILE")
+    if kill -0 "$pid" 2>/dev/null; then
+      print_warning "Pipeline already running (PID $pid). Use 'stop' first."
+      return 1
+    fi
+    rm -f "$S2S_PID_FILE"
+  fi
+
+  if [[ ! -d "$S2S_DIR/.git" ]]; then
+    print_error "Not installed. Run: speech-to-speech-helper.sh setup"
+    return 1
+  fi
+
+  local cmd_args=()
+
+  case "$mode" in
+    local-mac)
+      cmd_args=(
+        python3 s2s_pipeline.py
+        --local_mac_optimal_settings
+        --device mps
+        --language "$language"
+      )
+      if [[ "$language" == "auto" ]]; then
+        cmd_args+=(--stt_model_name large-v3)
+        cmd_args+=(--mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit)
+      fi
+      ;;
+    cuda)
+      cmd_args=(
+        python3 s2s_pipeline.py
+        --recv_host 0.0.0.0
+        --send_host 0.0.0.0
+        --lm_model_name microsoft/Phi-3-mini-4k-instruct
+        --stt_compile_mode reduce-overhead
+        --tts_compile_mode default
+        --language "$language"
+      )
+      ;;
+    server)
+      cmd_args=(
+        python3 s2s_pipeline.py
+        --recv_host 0.0.0.0
+        --send_host 0.0.0.0
+        --language "$language"
+      )
+      ;;
+    docker)
+      cmd_docker_start ${extra_args[@]+"${extra_args[@]}"}
+      return $?
+      ;;
+  esac
+
+  # Append any extra args (guarded expansion: an empty array trips `set -u` on bash < 4.4)
+  cmd_args+=(${extra_args[@]+"${extra_args[@]}"})
+
+  print_info "Starting pipeline (mode: $mode, language: $language)..."
+  print_info "Command: ${cmd_args[*]}"
+
+  if [[ "$background" == true ]]; then
+    (cd "$S2S_DIR" && "${cmd_args[@]}" > "$S2S_LOG_FILE" 2>&1 &
+      echo $! > "$S2S_PID_FILE")
+    local pid
+    pid=$(cat "$S2S_PID_FILE")
+    print_success "Pipeline started in background (PID $pid)"
+    print_info "Logs: tail -f $S2S_LOG_FILE"
+  else
+    (cd "$S2S_DIR" && exec "${cmd_args[@]}")
+  fi
+
+  return 0
+}
+
+cmd_docker_start() {
+  if ! command -v docker &>/dev/null; then
+    print_error "Docker is not installed"
+    return 1
+  fi
+
+  if [[ ! -f "${S2S_DIR}/docker-compose.yml" ]]; then
+    print_error "docker-compose.yml not found. Run setup first."
+    return 1
+  fi
+
+  print_info "Starting with Docker..."
+  (cd "$S2S_DIR" && docker compose up -d)
+  print_success "Docker containers started"
+  print_info "Ports: ${DEFAULT_RECV_PORT} (recv), ${DEFAULT_SEND_PORT} (send)"
+  return 0
+}
+
+# ─── Client ─────────────────────────────────────────────────────────────
+
+cmd_client() {
+  local host=""
+  local extra_args=()
+
+  while [[ $# -gt 0 ]]; do
+    case "$1" in
+      --host) host="$2"; shift 2 ;;
+      *) extra_args+=("$1"); shift ;;
+    esac
+  done
+
+  if [[ -z "$host" ]]; then
+    print_error "Server host required: --host <ip>"
+    return 1
+  fi
+
+  if [[ ! -f "${S2S_DIR}/listen_and_play.py" ]]; then
+    print_error "Not installed. Run: speech-to-speech-helper.sh setup"
+    return 1
+  fi
+
+  print_info "Connecting to server at $host..."
+  (cd "$S2S_DIR" && python3 listen_and_play.py --host "$host" ${extra_args[@]+"${extra_args[@]}"})
+  return 0
+}
+
+# ─── Stop ───────────────────────────────────────────────────────────────
+
+cmd_stop() {
+  # Stop background process
+  if [[ -f "$S2S_PID_FILE" ]]; then
+    local pid
+    pid=$(cat "$S2S_PID_FILE")
+    if kill -0 "$pid" 2>/dev/null; then
+      print_info "Stopping pipeline (PID $pid)..."
+      kill "$pid"
+      sleep 2
+      if kill -0 "$pid" 2>/dev/null; then
+        print_warning "Force killing..."
+        kill -9 "$pid" 2>/dev/null || true
+      fi
+      print_success "Pipeline stopped"
+    else
+      print_info "Process not running"
+    fi
+    rm -f "$S2S_PID_FILE"
+  else
+    print_info "No PID file found"
+  fi
+
+  # Stop Docker if running
+  if [[ -f "${S2S_DIR}/docker-compose.yml" ]]; then
+    if docker compose -f "${S2S_DIR}/docker-compose.yml" ps --quiet 2>/dev/null | grep -q .; then
+      print_info "Stopping Docker containers..."
+ (cd "$S2S_DIR" && docker compose down) + print_success "Docker containers stopped" + fi + fi + + return 0 +} + +# ─── Status ─────────────────────────────────────────────────────────── + +cmd_status() { + echo "=== Speech-to-Speech Status ===" + echo "" + + # Installation + if [[ -d "$S2S_DIR/.git" ]]; then + local commit + commit=$(git -C "$S2S_DIR" log -1 --format='%h %s' 2>/dev/null || echo "unknown") + print_success "Installed: $S2S_DIR" + print_info "Commit: $commit" + else + print_warning "Not installed. Run: speech-to-speech-helper.sh setup" + return 0 + fi + + # Platform + local platform + platform=$(detect_platform) + local gpu + gpu=$(detect_gpu) + print_info "Platform: $platform (accelerator: $gpu)" + + # Process + if [[ -f "$S2S_PID_FILE" ]]; then + local pid + pid=$(cat "$S2S_PID_FILE") + if kill -0 "$pid" 2>/dev/null; then + print_success "Running (PID $pid)" + else + print_warning "Stale PID file (process not running)" + rm -f "$S2S_PID_FILE" + fi + else + print_info "Not running" + fi + + # Docker + if command -v docker &>/dev/null && [[ -f "${S2S_DIR}/docker-compose.yml" ]]; then + local docker_status + docker_status=$(docker compose -f "${S2S_DIR}/docker-compose.yml" ps --format "table {{.Name}}\t{{.Status}}" 2>/dev/null || echo "not running") + if echo "$docker_status" | grep -qi "up"; then + print_success "Docker: running" + echo "$docker_status" + else + print_info "Docker: not running" + fi + fi + + echo "" + return 0 +} + +# ─── Config presets ─────────────────────────────────────────────────── + +cmd_config() { + local preset="${1:-}" + + case "$preset" in + low-latency) + echo "--stt faster-whisper --llm open_api --tts parler --stt_compile_mode reduce-overhead --tts_compile_mode default" + ;; + low-vram) + echo "--stt moonshine --llm open_api --tts pocket" + ;; + quality) + echo "--stt whisper --stt_model_name openai/whisper-large-v3 --llm transformers --lm_model_name microsoft/Phi-3-mini-4k-instruct --tts parler" + ;; + mac) + echo "--local_mac_optimal_settings --device mps --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit" + ;; + multilingual) + echo "--stt_model_name large-v3 --language auto --tts melo" + ;; + *) + echo "Available presets:" + echo " low-latency - Fastest response (CUDA + OpenAI API)" + echo " low-vram - Minimal GPU memory (~4GB)" + echo " quality - Best quality (24GB+ VRAM)" + echo " mac - Optimal macOS Apple Silicon" + echo " multilingual - Auto language detection (6 languages)" + echo "" + echo "Usage: speech-to-speech-helper.sh start \$(speech-to-speech-helper.sh config low-latency)" + ;; + esac + return 0 +} + +# ─── Benchmark ──────────────────────────────────────────────────────── + +cmd_benchmark() { + if [[ ! -d "$S2S_DIR/.git" ]]; then + print_error "Not installed. Run: speech-to-speech-helper.sh setup" + return 1 + fi + + if [[ ! -f "${S2S_DIR}/benchmark_stt.py" ]]; then + print_error "Benchmark script not found in repo" + return 1 + fi + + print_info "Running STT benchmark..." 
+ (cd "$S2S_DIR" && python3 benchmark_stt.py "$@") + return 0 +} + +# ─── Help ───────────────────────────────────────────────────────────── + +cmd_help() { + echo "Speech-to-Speech Helper" + echo "Manages HuggingFace speech-to-speech pipeline" + echo "" + echo "Usage: $0 [options]" + echo "" + echo "Commands:" + echo " setup Install/update the pipeline" + echo " start [options] Start the pipeline" + echo " stop Stop running pipeline" + echo " status Show installation and runtime status" + echo " client --host Connect to remote server" + echo " config [preset] Show configuration presets" + echo " benchmark Run STT benchmark" + echo " help Show this help" + echo "" + echo "Start options:" + echo " --local-mac macOS Apple Silicon (auto-detected)" + echo " --cuda NVIDIA GPU with torch compile" + echo " --server Server mode (remote clients connect)" + echo " --docker Docker with NVIDIA GPU" + echo " --language Language: en, fr, es, zh, ja, ko, auto" + echo " --background Run in background" + echo "" + echo "Examples:" + echo " $0 setup" + echo " $0 start --local-mac" + echo " $0 start --cuda --language auto --background" + echo " $0 start --server" + echo " $0 client --host 192.168.1.100" + echo " $0 start \$($0 config low-latency)" + echo " $0 stop" + echo "" + echo "Install dir: $S2S_DIR" + return 0 +} + +# ─── Main ───────────────────────────────────────────────────────────── + +main() { + local command="${1:-help}" + if [[ $# -gt 0 ]]; then + shift + fi + + case "$command" in + setup) cmd_setup "$@" ;; + start) cmd_start "$@" ;; + stop) cmd_stop "$@" ;; + status) cmd_status "$@" ;; + client) cmd_client "$@" ;; + config) cmd_config "$@" ;; + benchmark) cmd_benchmark "$@" ;; + help|--help|-h) cmd_help ;; + *) + print_error "Unknown command: $command" + cmd_help + return 1 + ;; + esac +} + +main "$@" diff --git a/.agents/subagent-index.toon b/.agents/subagent-index.toon index 60fef3354..f2279175d 100644 --- a/.agents/subagent-index.toon +++ b/.agents/subagent-index.toon @@ -22,7 +22,7 @@ flash,gemini-2.5-flash,Fast cheap large context pro,gemini-2.5-pro,Capable large context --> - - diff --git a/.agents/tools/voice/speech-to-speech.md b/.agents/tools/voice/speech-to-speech.md new file mode 100644 index 000000000..2f0cb384c --- /dev/null +++ b/.agents/tools/voice/speech-to-speech.md @@ -0,0 +1,293 @@ +--- +description: "HuggingFace Speech-to-Speech - modular voice pipeline (VAD, STT, LLM, TTS) for local GPU and cloud GPU deployment" +mode: subagent +upstream_url: https://github.com/huggingface/speech-to-speech +tools: + read: true + write: false + edit: false + bash: true + glob: true + grep: true + webfetch: true + task: true +--- + +# Speech-to-Speech Pipeline + + + +## Quick Reference + +- **Source**: [huggingface/speech-to-speech](https://github.com/huggingface/speech-to-speech) (Apache-2.0) +- **Purpose**: Modular, open-source GPT-4o-style voice assistant pipeline +- **Pipeline**: VAD -> STT -> LLM -> TTS (each component swappable) +- **Helper**: `speech-to-speech-helper.sh [setup|start|stop|status|config|benchmark] [options]` +- **Install dir**: `~/.aidevops/.agent-workspace/work/speech-to-speech/` +- **Languages**: English, French, Spanish, Chinese, Japanese, Korean (auto-detect or fixed) + +**When to Use**: Read this when setting up voice interfaces, transcription pipelines, voice-driven DevOps, or phone-based AI assistants (pairs with Twilio). 
+
+## Architecture
+
+Four-stage cascaded pipeline connected via thread-safe queues:
+
+```text
+Microphone/Socket -> [VAD] -> [STT] -> [LLM] -> [TTS] -> Speaker/Socket
+                       |        |         |        |
+                    Silero   Whisper   Any HF   Parler
+                    VAD v5   variants  instruct Melo
+                             Parafor.  OpenAI   ChatTTS
+                             Faster-W  MLX-LM   Kokoro
+                             Parakeet           FacebookMMS
+                             Moonshine          Pocket
+```
+
+Each stage runs in its own thread. Audio streams via socket (server/client) or local audio device.
+
+## Component Options
+
+### VAD (Voice Activity Detection)
+
+| Implementation | Notes |
+|---------------|-------|
+| Silero VAD v5 | Default, production-grade |
+
+Key parameters: `--thresh` (trigger sensitivity), `--min_speech_ms`, `--min_silence_ms`
+
+### STT (Speech to Text)
+
+| Implementation | Flag | Best For |
+|---------------|------|----------|
+| Whisper (Transformers) | `--stt whisper` | CUDA, general purpose |
+| Faster Whisper | `--stt faster-whisper` | CUDA, lower latency |
+| Lightning Whisper MLX | `--stt whisper-mlx` | macOS Apple Silicon |
+| MLX Audio Whisper | `--stt mlx-audio-whisper` | macOS, newer models |
+| Paraformer (FunASR) | `--stt paraformer` | Chinese, low latency |
+| Parakeet TDT | `--stt parakeet-tdt` | CUDA, NVIDIA NeMo |
+| Moonshine | `--stt moonshine` | Lightweight |
+
+Model selection: `--stt_model_name <model>` (any Whisper checkpoint on HF Hub)
+
+### LLM (Language Model)
+
+| Implementation | Flag | Best For |
+|---------------|------|----------|
+| Transformers | `--llm transformers` | CUDA, any HF model |
+| MLX-LM | `--llm mlx-lm` | macOS Apple Silicon |
+| OpenAI API | `--llm open_api` | Cloud, lowest latency |
+
+Model selection: `--lm_model_name <model>` or `--mlx_lm_model_name <model>`
+
+### TTS (Text to Speech)
+
+| Implementation | Flag | Best For |
+|---------------|------|----------|
+| Parler-TTS | `--tts parler` | CUDA, streaming output |
+| MeloTTS | `--tts melo` | Multi-language (6 langs) |
+| ChatTTS | `--tts chatTTS` | Natural conversational |
+| Kokoro | `--tts kokoro` | macOS default, quality |
+| FacebookMMS | `--tts facebookMMS` | 1000+ languages |
+| Pocket TTS | `--tts pocket` | Lightweight |
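+
+Because each stage is selected independently, the flags compose freely. A sketch of a custom mix (the model name is illustrative; any checkpoint the chosen backend supports should work):
+
+```bash
+# Faster-Whisper STT + local Transformers LLM + MeloTTS, fixed to English
+# (run from the repo checkout)
+python s2s_pipeline.py \
+  --stt faster-whisper \
+  --llm transformers --lm_model_name microsoft/Phi-3-mini-4k-instruct \
+  --tts melo \
+  --language en
+```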
+
+## Deployment Modes
+
+### Local (macOS with Apple Silicon)
+
+Optimal for development and personal use. Uses MPS acceleration:
+
+```bash
+# One-liner with optimal Mac settings
+speech-to-speech-helper.sh start --local-mac
+
+# Equivalent to:
+python s2s_pipeline.py \
+  --local_mac_optimal_settings \
+  --device mps \
+  --stt parakeet-tdt \
+  --llm mlx-lm \
+  --tts kokoro \
+  --mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit
+```
+
+### Local (CUDA GPU)
+
+For workstations with NVIDIA GPU:
+
+```bash
+# Start with torch compile optimizations
+speech-to-speech-helper.sh start --cuda
+
+# Equivalent to:
+python s2s_pipeline.py \
+  --recv_host 0.0.0.0 --send_host 0.0.0.0 \
+  --lm_model_name microsoft/Phi-3-mini-4k-instruct \
+  --stt_compile_mode reduce-overhead \
+  --tts_compile_mode default
+```
+
+### Server/Client (Remote GPU)
+
+For cloud GPU instances (NVIDIA Cloud, Vast.ai, RunPod, Lambda):
+
+```bash
+# On GPU server
+speech-to-speech-helper.sh start --server
+
+# On local machine (audio I/O)
+speech-to-speech-helper.sh client --host <server-ip>
+
+# Or directly:
+python listen_and_play.py --host <server-ip>
+```
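+
+Sketched end to end, a remote session might look like this (assumptions: SSH access to the instance, `user@gpu-host` is a placeholder, and the recv/send ports 12345/12346 are reached via an SSH tunnel rather than open firewall ports):
+
+```bash
+# On the GPU instance: install, then run the server detached
+# (assumes the aidevops helper is on the instance; otherwise clone the repo manually)
+speech-to-speech-helper.sh setup
+speech-to-speech-helper.sh start --server --background
+
+# On the local machine: forward both audio ports, then attach the client
+ssh -N -L 12345:localhost:12345 -L 12346:localhost:12346 user@gpu-host &
+speech-to-speech-helper.sh client --host localhost
+```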
+
+### Docker (CUDA)
+
+```bash
+# Start with docker compose
+speech-to-speech-helper.sh start --docker
+
+# Uses: pytorch/pytorch:2.4.0-cuda12.1-cudnn9-devel
+# Ports: 12345 (recv), 12346 (send)
+# GPU: nvidia device 0
+```
+
+## Setup
+
+```bash
+# Install via helper (clones repo, installs deps)
+speech-to-speech-helper.sh setup
+
+# Or manually:
+git clone https://github.com/huggingface/speech-to-speech.git
+cd speech-to-speech
+
+# CUDA/Linux
+uv pip install -r requirements.txt
+
+# macOS
+uv pip install -r requirements_mac.txt
+
+# For MeloTTS (optional)
+python -m unidic download
+```
+
+### Requirements
+
+- Python 3.10+
+- PyTorch 2.4+ (CUDA) or 2.10+ (macOS)
+- `uv` package manager (recommended)
+- CUDA 12.1+ (for GPU) or Apple Silicon (for MPS)
+- `sounddevice` for local audio I/O
+- ~4GB VRAM minimum (varies by model selection)
+
+## Multi-Language
+
+```bash
+# Auto-detect language per utterance
+speech-to-speech-helper.sh start --local-mac --language auto
+
+# Fixed language (e.g., Chinese)
+speech-to-speech-helper.sh start --local-mac --language zh
+```
+
+Requires a compatible STT model (e.g., `--stt_model_name large-v3`) and a multilingual TTS (MeloTTS or ChatTTS; Parler-TTS is currently English-only).
+
+## CLI Parameters
+
+All parameters follow a prefix convention: `--stt_*`, `--lm_*`, `--tts_*`, `--melo_*`, etc.
+
+Generation parameters use a `_gen_` infix: `--stt_gen_max_new_tokens 128`
+
+Full reference: `python s2s_pipeline.py -h` or see [arguments_classes/](https://github.com/huggingface/speech-to-speech/tree/main/arguments_classes)
+
+## Integration with aidevops
+
+### Voice-Driven DevOps
+
+Pair with the LLM stage to create voice-controlled DevOps:
+
+1. STT captures voice command
+2. LLM interprets as DevOps action (via system prompt)
+3. TTS confirms action and reports result
+
+### Transcription Pipeline
+
+Use the STT stage standalone for meeting notes or podcast transcription. Run the pipeline with `--llm open_api` and a system prompt that outputs transcription only, or use the STT components directly via Python.
+
+### Phone Integration (Twilio)
+
+Combine with `services/communications/twilio.md` for phone-based AI:
+
+1. Twilio receives call, streams audio via WebSocket
+2. S2S pipeline processes speech in real-time
+3. TTS response streamed back to caller
+
+### Video Narration
+
+Pair with `tools/video/remotion.md` for generated voiceover:
+
+1. Generate script with LLM
+2. TTS produces audio track
+3. Remotion composites with video
+
+## Cloud GPU Providers
+
+For server/client deployment when local GPU is insufficient:
+
+| Provider | GPU Options | Pricing Model | Notes |
+|----------|------------|---------------|-------|
+| NVIDIA Cloud | A100, H100 | Per-hour | Official NVIDIA, best for production |
+| Vast.ai | Consumer + datacenter | Auction/fixed | Cheapest, variable availability |
+| RunPod | A100, 4090, H100 | Per-hour | Good balance of price/reliability |
+| Lambda | A100, H100 | Per-hour | Research-focused |
+
+Typical setup: SSH into instance, clone repo, install deps, run server mode. Connect from local machine with `listen_and_play.py`.
+
+## Recommended Configurations
+
+### Low Latency (CUDA)
+
+```bash
+--stt faster-whisper --llm open_api --tts parler \
+--stt_compile_mode reduce-overhead --tts_compile_mode default
+```
+
+### Low VRAM (~4GB)
+
+```bash
+--stt moonshine --llm open_api --tts pocket
+```
+
+### Best Quality (CUDA, 24GB+)
+
+```bash
+--stt whisper --stt_model_name openai/whisper-large-v3 \
+--llm transformers --lm_model_name microsoft/Phi-3-mini-4k-instruct \
+--tts parler
+```
+
+### macOS Optimal
+
+```bash
+--local_mac_optimal_settings --device mps \
+--mlx_lm_model_name mlx-community/Meta-Llama-3.1-8B-Instruct-4bit
+```
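+
+These presets are also exposed by the helper's `config` command, so a launch can be a one-liner (a sketch; the unquoted `$(...)` is intentional so the preset string splits into separate flags):
+
+```bash
+# Launch the "quality" preset on a CUDA box, detached, then follow the logs
+speech-to-speech-helper.sh start --cuda $(speech-to-speech-helper.sh config quality) --background
+tail -f ~/.aidevops/.agent-workspace/work/speech-to-speech/.s2s.log
+```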
+
+## Troubleshooting
+
+| Issue | Solution |
+|-------|----------|
+| `Cannot use CUDA on macOS` | Use `--device mps` or `--local_mac_optimal_settings` |
+| MeloTTS import error | Run `python -m unidic download` |
+| High latency | Enable torch compile: `--stt_compile_mode reduce-overhead` |
+| Audio crackling | Increase `--min_silence_ms` or check sample rate |
+| OOM on GPU | Use smaller models or `--llm open_api` to offload LLM |
+
+## See Also
+
+- `services/communications/twilio.md` - Phone integration
+- `tools/video/remotion.md` - Video narration
+- `tools/video/heygen-skill/rules/voices.md` - AI voice cloning
diff --git a/README.md b/README.md
index 36bcfa9bd..7bb9d6459 100644
--- a/README.md
+++ b/README.md
@@ -664,7 +664,52 @@ Test suites are JSON files with prompts and validation rules (`expect_contains`,
 ### Voice Integration
 
-Speech-to-speech AI conversations:
+Open-source speech-to-speech pipeline based on [huggingface/speech-to-speech](https://github.com/huggingface/speech-to-speech) (Apache-2.0). Modular four-stage architecture with swappable components:
+
+```text
+Microphone → [VAD] → [STT] → [LLM] → [TTS] → Speaker
+             Silero   Whisper    Any HF     Parler/Melo/
+             VAD v5   variants   instruct   Kokoro/ChatTTS
+```
+
+**Quick start:**
+
+```bash
+# Install the pipeline
+speech-to-speech-helper.sh setup
+
+# Run locally on Mac (auto-configures MPS acceleration)
+speech-to-speech-helper.sh start --local-mac
+
+# Run on NVIDIA GPU with torch compile
+speech-to-speech-helper.sh start --cuda
+
+# Server mode (clients connect remotely)
+speech-to-speech-helper.sh start --server
+speech-to-speech-helper.sh client --host 192.168.1.100
+
+# Configuration presets
+speech-to-speech-helper.sh config low-latency   # Fastest (CUDA + OpenAI API)
+speech-to-speech-helper.sh config low-vram      # Minimal GPU (~4GB)
+speech-to-speech-helper.sh config quality       # Best quality (24GB+ VRAM)
+speech-to-speech-helper.sh config mac           # Apple Silicon optimal
+speech-to-speech-helper.sh config multilingual  # Auto language detection (6 langs)
+```
+
+**Recommended hardware for voice:**
+
+| Setup | CPU | RAM | GPU | Use Case |
+|-------|-----|-----|-----|----------|
+| Mac (local) | Apple M1+ | 16GB+ | MPS (unified) | Development, personal use |
+| Workstation (CUDA) | Any modern | 16GB+ | NVIDIA 8GB+ VRAM | Low-latency local voice |
+| Quality (CUDA) | Any modern | 32GB+ | NVIDIA 24GB+ VRAM | Full Whisper large-v3 + Parler |
+| Cloud GPU | - | - | A100/H100 | Production server, multi-user |
+
+**Cloud GPU providers** for server/client deployment: NVIDIA Cloud, Vast.ai, RunPod, Lambda.
+
+**Supported languages:** English, French, Spanish, Chinese, Japanese, Korean (auto-detect or fixed).
+
+**Additional voice methods:**
 
 | Method | Description |
 |--------|-------------|
 | **iPhone Shortcut** | iOS: dictate → HTTP → speak response |
 | **Pipecat STS** | Full voice pipeline: Soniox STT → AI → Cartesia TTS |
 
+**See:** [speech-to-speech.md](.agents/tools/voice/speech-to-speech.md) for full component options, CLI parameters, and integration patterns (Twilio phone, video narration, voice-driven DevOps).
+
 ### Scheduled Agent Tasks
 
 Cron-based agent dispatch for automated workflows:
 
@@ -685,6 +732,23 @@
 ## **Requirements**
 
+### **Recommended Hardware**
+
+aidevops itself is lightweight (shell scripts + markdown), but AI model workloads benefit from capable hardware:
+
+| Tier | Machine | CPU | RAM | GPU | Best For |
+|------|---------|-----|-----|-----|----------|
+| **Minimum** | Any modern laptop | 4+ cores | 8GB | None | Framework only, cloud AI APIs |
+| **Recommended** | Mac Studio / desktop | Apple M1+ or 8+ cores | 16GB+ | MPS (Apple) or NVIDIA 8GB+ | Local voice, browser automation, dev servers |
+| **Power User** | Workstation | 8+ cores | 32GB+ | NVIDIA 24GB+ VRAM | Full voice pipeline, local LLMs, parallel agents |
+| **Server** | Cloud GPU | Any | 16GB+ | A100 / H100 | Production voice, multi-user, batch processing |
+
+**Cloud GPU providers** for on-demand GPU access: [NVIDIA Cloud](https://www.nvidia.com/en-us/gpu-cloud/), [Vast.ai](https://vast.ai/), [RunPod](https://www.runpod.io/), [Lambda](https://lambdalabs.com/).
+
+**Note:** Most aidevops features (infrastructure management, SEO, code quality, Git workflows) require no GPU. GPU is only needed for local AI model inference (voice pipeline, vision models, local LLMs).
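+
+To see which tier a machine falls into, the helper's detection logic can be approximated by hand (a sketch of equivalent shell checks, not an official installer step):
+
+```bash
+# Apple Silicon -> MPS tier; NVIDIA driver present -> CUDA tier; otherwise CPU-only
+if [[ "$(uname -s)" == "Darwin" && "$(uname -m)" == "arm64" ]]; then
+  echo "Apple Silicon: MPS acceleration available"
+elif command -v nvidia-smi &>/dev/null; then
+  nvidia-smi --query-gpu=name,memory.total --format=csv,noheader
+else
+  echo "No GPU detected: framework features and cloud AI APIs only"
+fi
+```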
+
+### **Software Dependencies**
+
 ```bash
 # Install dependencies (auto-detected by setup.sh)
 brew install sshpass jq curl mkcert dnsmasq fd ripgrep  # macOS
@@ -829,6 +893,11 @@ See `.agents/tools/ocr/glm-ocr.md` for batch processing, PDF workflows, and Peek
 - **[Remotion](https://remotion.dev/)**: Programmatic video creation with React - create videos using code with 29 specialized rule files
 - **[Video Prompt Design](https://github.com/snubroot/Veo-3-Meta-Framework)**: Structured prompt engineering for AI video generation (Veo 3, 7-component framework, character consistency, audio design)
 
+### **Voice AI**
+
+- **[Speech-to-Speech](https://github.com/huggingface/speech-to-speech)**: Open-source modular voice pipeline (VAD → STT → LLM → TTS) with local GPU and cloud GPU deployment
+- **[Pipecat](https://github.com/pipecat-ai/pipecat)**: Real-time voice agent framework with Soniox STT, Cartesia TTS, and multi-LLM support
+
 ### **Performance & Monitoring**
 
 - **[PageSpeed Insights](https://pagespeed.web.dev/)**: Website performance auditing
diff --git a/TODO.md b/TODO.md
index 1fbceb266..d5f0c5053 100644
--- a/TODO.md
+++ b/TODO.md
@@ -339,8 +339,14 @@ Tasks with no open blockers - ready to work on. Use `/ready` to refresh this lis
 - Notes: Implemented complete system for importing skills from external GitHub repos. Created add-skill-helper.sh (~630 lines) for fetching, format detection (SKILL.md, AGENTS.md, .cursorrules, raw markdown), conversion, and registration. Created skill-update-helper.sh (~280 lines) for upstream update checking. Added skill-sources.json registry, /add-skill command, add-skill.md subagent. Updated setup.sh with create_skill_symlinks() for cross-tool compatibility. PR #135.
 - [x] t067 Optimise OpenCode MCP loading with on-demand activation #opencode #performance #mcp ~4h (ai:2h test:1h read:1h) logged:2026-01-21 blocked-by:t056 started:2026-01-21T06:15Z completed:2026-01-21 actual:30m
 - Notes: Implemented on-demand MCP loading pattern. Updated generate-opencode-agents.sh to sync MCP index on agent generation. Added MCP On-Demand Loading section to AGENTS.md. Pattern: MCPs disabled globally, enabled per-subagent via frontmatter, discoverable via mcp-index-helper.sh search.
-
-