From 847064331578ef339fcf3243aa78b7ebd1133f72 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 12:04:37 -0800 Subject: [PATCH 01/18] feat(desktop): add voice commands with wake word detection and settings toggle Adds a Python-based voice sidecar that listens for a wake word ("Hey Jarvis") and captures the speech that follows. The captured audio is transcribed by the new /api/voice endpoint, which streams the response back to the desktop app. The feature is gated behind a "Voice Commands" toggle in Settings > Features (default: off). The sidecar process auto-starts/stops based on subscriber count, so it only runs when the setting is enabled. --- apps/api/package.json | 1 + apps/api/src/app/api/voice/route.ts | 93 ++ apps/api/src/app/api/voice/tool-adapter.ts | 257 ++++ apps/api/src/app/api/voice/voice-service.ts | 142 +++ apps/api/src/env.ts | 2 + apps/desktop/electron-builder.ts | 9 + apps/desktop/scripts/build-voice-sidecar.sh | 50 + apps/desktop/src/lib/trpc/routers/index.ts | 2 + .../src/lib/trpc/routers/settings/index.ts | 20 + .../src/lib/trpc/routers/voice/index.ts | 54 + apps/desktop/src/main/index.ts | 3 +- .../src/main/lib/voice/python/.gitignore | 3 + .../src/main/lib/voice/python/audio.py | 60 + .../src/main/lib/voice/python/config.py | 26 + .../desktop/src/main/lib/voice/python/main.py | 146 +++ .../src/main/lib/voice/python/pyproject.toml | 11 + .../main/lib/voice/python/speech_capture.py | 76 ++ .../src/main/lib/voice/python/wake_word.py | 46 + .../src/main/lib/voice/voice-process-paths.ts | 86 ++ .../src/main/lib/voice/voice-process.ts | 140 +++ .../components/Voice/VoiceListener.tsx | 95 ++ .../RecordingIndicator/RecordingIndicator.tsx | 26 + .../components/RecordingIndicator/index.ts | 1 + .../ResponsePanel/ResponsePanel.tsx | 122 ++ .../hooks/useVoicePipeline/index.ts | 1 + .../useVoicePipeline/useVoicePipeline.ts | 214 ++++ .../Voice/components/ResponsePanel/index.ts | 1 + .../src/renderer/components/Voice/index.ts | 1 + .../renderer/routes/_authenticated/layout.tsx | 2 + 
.../BehaviorSettings/BehaviorSettings.tsx | 50 + .../utils/settings-search/settings-search.ts | 18 + apps/desktop/src/shared/voice.ts | 29 + bun.lock | 3 + .../0016_add_voice_commands_enabled.sql | 1 + .../local-db/drizzle/meta/0016_snapshot.json | 1057 +++++++++++++++++ packages/local-db/drizzle/meta/_journal.json | 7 + packages/local-db/src/schema/schema.ts | 3 + 37 files changed, 2857 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/app/api/voice/route.ts create mode 100644 apps/api/src/app/api/voice/tool-adapter.ts create mode 100644 apps/api/src/app/api/voice/voice-service.ts create mode 100755 apps/desktop/scripts/build-voice-sidecar.sh create mode 100644 apps/desktop/src/lib/trpc/routers/voice/index.ts create mode 100644 apps/desktop/src/main/lib/voice/python/.gitignore create mode 100644 apps/desktop/src/main/lib/voice/python/audio.py create mode 100644 apps/desktop/src/main/lib/voice/python/config.py create mode 100644 apps/desktop/src/main/lib/voice/python/main.py create mode 100644 apps/desktop/src/main/lib/voice/python/pyproject.toml create mode 100644 apps/desktop/src/main/lib/voice/python/speech_capture.py create mode 100644 apps/desktop/src/main/lib/voice/python/wake_word.py create mode 100644 apps/desktop/src/main/lib/voice/voice-process-paths.ts create mode 100644 apps/desktop/src/main/lib/voice/voice-process.ts create mode 100644 apps/desktop/src/renderer/components/Voice/VoiceListener.tsx create mode 100644 apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx create mode 100644 apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/index.ts create mode 100644 apps/desktop/src/renderer/components/Voice/components/ResponsePanel/ResponsePanel.tsx create mode 100644 apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/index.ts create mode 100644 
apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts create mode 100644 apps/desktop/src/renderer/components/Voice/components/ResponsePanel/index.ts create mode 100644 apps/desktop/src/renderer/components/Voice/index.ts create mode 100644 apps/desktop/src/shared/voice.ts create mode 100644 packages/local-db/drizzle/0016_add_voice_commands_enabled.sql create mode 100644 packages/local-db/drizzle/meta/0016_snapshot.json diff --git a/apps/api/package.json b/apps/api/package.json index 9ac03eee334..593f85990a9 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -38,6 +38,7 @@ "lodash.chunk": "^4.2.0", "mcp-handler": "^1.0.7", "next": "^16.0.10", + "openai": "^6.17.0", "react": "19.1.0", "react-dom": "19.1.0", "require-in-the-middle": "8.0.1", diff --git a/apps/api/src/app/api/voice/route.ts b/apps/api/src/app/api/voice/route.ts new file mode 100644 index 00000000000..8464966517d --- /dev/null +++ b/apps/api/src/app/api/voice/route.ts @@ -0,0 +1,93 @@ +import { auth } from "@superset/auth/server"; +import type { McpContext } from "@/lib/mcp/auth"; +import { runVoicePipeline } from "./voice-service"; + +async function authenticate(request: Request): Promise { + // Try session auth + const session = await auth.api.getSession({ headers: request.headers }); + if (session?.session) { + const extendedSession = session.session as { + activeOrganizationId?: string; + }; + if (!extendedSession.activeOrganizationId) { + return null; + } + return { + userId: session.user.id, + organizationId: extendedSession.activeOrganizationId, + }; + } + + return null; +} + +export async function POST(request: Request) { + // 1. Authenticate + const ctx = await authenticate(request); + if (!ctx) { + return Response.json({ error: "Unauthorized" }, { status: 401 }); + } + + // 2. 
Parse multipart form data + let formData: FormData; + try { + formData = await request.formData(); + } catch { + return Response.json( + { error: "Expected multipart form data with audio file" }, + { status: 400 }, + ); + } + + const audioFile = formData.get("audio"); + if (!audioFile || !(audioFile instanceof File)) { + return Response.json( + { error: "Missing 'audio' file in form data" }, + { status: 400 }, + ); + } + + const MAX_AUDIO_SIZE = 5 * 1024 * 1024; // 5 MB + if (audioFile.size > MAX_AUDIO_SIZE) { + return Response.json( + { error: "Audio file too large (max 5 MB)" }, + { status: 413 }, + ); + } + + const audioBuffer = new Uint8Array(await audioFile.arrayBuffer()); + + // 3. Stream SSE response + const encoder = new TextEncoder(); + + const stream = new ReadableStream({ + async start(controller) { + const sse = { + write(event: string, data: unknown) { + const payload = `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`; + controller.enqueue(encoder.encode(payload)); + }, + }; + + try { + await runVoicePipeline({ audioBuffer, ctx, sse }); + } catch (error) { + console.error("[voice/route] Pipeline error:", error); + sse.write("error", { + message: + error instanceof Error ? 
error.message : "Voice pipeline failed", + }); + } finally { + controller.close(); + } + }, + }); + + return new Response(stream, { + headers: { + "Content-Type": "text/event-stream", + "Cache-Control": "no-cache", + Connection: "keep-alive", + }, + }); +} diff --git a/apps/api/src/app/api/voice/tool-adapter.ts b/apps/api/src/app/api/voice/tool-adapter.ts new file mode 100644 index 00000000000..d52e4e7f3c3 --- /dev/null +++ b/apps/api/src/app/api/voice/tool-adapter.ts @@ -0,0 +1,257 @@ +import type Anthropic from "@anthropic-ai/sdk"; +import { z } from "zod"; +import type { McpContext } from "@/lib/mcp/auth"; + +type ToolHandler = ( + params: Record, + ctx: McpContext, +) => Promise<{ + content: Array<{ type: "text"; text: string }>; + isError?: boolean; +}>; + +interface ToolDefinition { + name: string; + description: string; + input_schema: Anthropic.Tool["input_schema"]; + handler: ToolHandler; +} + +let cachedTools: ToolDefinition[] | null = null; + +/** + * Builds tool definitions by intercepting MCP tool registration. + * Converts Zod input schemas to JSON Schema for the Anthropic SDK. + * Results are cached since tool definitions are static. 
+ */ +export async function getToolDefinitions(): Promise { + if (cachedTools) return cachedTools; + + const tools: ToolDefinition[] = []; + + const interceptServer = { + tool( + name: string, + description: string, + inputSchema: Record, + handler: ( + params: Record, + extra: { + authInfo?: { extra?: { mcpContext?: McpContext } }; + }, + ) => Promise<{ + content: Array<{ type: "text"; text: string }>; + isError?: boolean; + }>, + ) { + // Convert Zod schemas to JSON Schema properties + const properties: Record = {}; + const required: string[] = []; + + for (const [key, schema] of Object.entries(inputSchema)) { + try { + properties[key] = zodToJsonSchema(schema); + if (!isOptional(schema)) { + required.push(key); + } + } catch { + // Fallback for schemas that can't be converted + properties[key] = { type: "string" }; + } + } + + tools.push({ + name, + description, + input_schema: { + type: "object" as const, + properties, + ...(required.length > 0 ? { required } : {}), + }, + handler: async (params, ctx) => { + return handler(params, { + authInfo: { extra: { mcpContext: ctx } }, + }); + }, + }); + }, + }; + + const { registerTools } = await import("@/lib/mcp/tools"); + registerTools(interceptServer as never); + + cachedTools = tools; + return tools; +} + +/** + * Convert a Zod schema to a basic JSON Schema representation. + */ +function zodToJsonSchema(schema: z.ZodType): Record { + const def = ( + schema as unknown as { + _zod?: { def?: { type?: string; typeName?: string } }; + } + )._zod?.def; + const description = schema.description; + + // Unwrap optional/default wrappers + const innerSchema = unwrapZod(schema); + const innerDef = (innerSchema as { _zod?: { def?: Record } }) + ._zod?.def; + const typeName = (innerDef?.typeName ?? + def?.type ?? + def?.typeName ?? 
+ "") as string; + + const result: Record = {}; + + switch (typeName) { + case "ZodString": + case "string": + result.type = "string"; + break; + case "ZodNumber": + case "number": + result.type = "number"; + break; + case "ZodBoolean": + case "boolean": + result.type = "boolean"; + break; + case "ZodArray": + case "array": { + result.type = "array"; + const itemSchema = + (innerDef as Record)?.innerType ?? + (innerDef as Record)?.type; + if (itemSchema && itemSchema instanceof z.ZodType) { + result.items = zodToJsonSchema(itemSchema); + } + break; + } + case "ZodEnum": + case "enum": { + result.type = "string"; + const values = + (innerDef as Record)?.entries ?? + (innerDef as Record)?.values; + if (Array.isArray(values)) { + result.enum = values; + } else if (values && typeof values === "object") { + result.enum = Object.keys(values); + } + break; + } + case "ZodObject": + case "object": { + result.type = "object"; + const shape = (innerDef as Record)?.shape; + if (shape && typeof shape === "object") { + const props: Record = {}; + for (const [k, v] of Object.entries( + shape as Record, + )) { + props[k] = zodToJsonSchema(v); + } + result.properties = props; + } + break; + } + default: + result.type = "string"; + break; + } + + if (description) { + result.description = description; + } + + return result; +} + +/** + * Unwrap optional/default/nullable wrappers to get the inner type. + */ +function unwrapZod(schema: z.ZodType): z.ZodType { + const def = ( + schema as unknown as { _zod?: { def?: Record } } + )._zod?.def; + const typeName = (def?.typeName ?? "") as string; + + if ( + typeName === "ZodOptional" || + typeName === "ZodDefault" || + typeName === "ZodNullable" + ) { + const inner = def?.innerType; + if (inner && inner instanceof z.ZodType) { + return unwrapZod(inner); + } + } + + return schema; +} + +/** + * Check if a Zod schema is optional. 
+ *
+ * A schema counts as optional when it is an optional or default wrapper, or
+ * when some other wrapper exposing `innerType` (for example nullable)
+ * contains one.
+ *
+ * @param schema - Zod schema for a single tool input field.
+ * @returns `true` if callers may omit the field, `false` otherwise.
+ */
+function isOptional(schema: z.ZodType): boolean {
+  const def = (
+    schema as unknown as { _zod?: { def?: Record<string, unknown> } }
+  )._zod?.def;
+
+  // zodToJsonSchema in this file accepts both spellings of a def tag
+  // (`typeName: "ZodString"` and `type: "string"`). Do the same here: checking
+  // `typeName` alone means a schema tagged only via `type` is never recognized
+  // as optional, and getToolDefinitions then lists its key under `required`.
+  const tags = [def?.typeName, def?.type];
+  const isOptionalWrapper = tags.some(
+    (tag) =>
+      tag === "ZodOptional" ||
+      tag === "ZodDefault" ||
+      tag === "optional" ||
+      tag === "default",
+  );
+  if (isOptionalWrapper) {
+    return true;
+  }
+
+  // Any other wrapper: keep looking one level down.
+  const inner = def?.innerType;
+  if (inner instanceof z.ZodType) {
+    return isOptional(inner);
+  }
+
+  return false;
+}
+
+/**
+ * Execute a tool by name with the given input and auth context.
+ *
+ * Never throws: an unknown tool name or a handler failure is reported as a
+ * JSON string of the form `{"error": "..."}` so the caller can hand it
+ * straight back to the model as the tool result.
+ *
+ * @param toolName - Name of the tool to run; matched against `tools[].name`.
+ * @param toolInput - Arguments forwarded unchanged to the tool handler.
+ * @param ctx - Auth context forwarded to the tool handler.
+ * @param tools - Tool definitions to search.
+ * @returns The handler's text content parts joined with newlines, or a JSON
+ *   error string.
+ */
+export async function executeTool({
+  toolName,
+  toolInput,
+  ctx,
+  tools,
+}: {
+  toolName: string;
+  toolInput: Record<string, unknown>;
+  ctx: McpContext;
+  tools: ToolDefinition[];
+}): Promise<string> {
+  const tool = tools.find((t) => t.name === toolName);
+  if (!tool) {
+    return JSON.stringify({ error: `Unknown tool: ${toolName}` });
+  }
+
+  try {
+    const result = await tool.handler(toolInput, ctx);
+    return result.content.map((c) => c.text).join("\n");
+  } catch (error) {
+    console.error(`[voice/tool] Error executing ${toolName}:`, error);
+    return JSON.stringify({
+      error: `Tool execution failed: ${error instanceof Error ? error.message : "Unknown error"}`,
+    });
+  }
+}
+
+/**
+ * Convert tool definitions to Anthropic SDK tool format.
+ */ +export function toAnthropicTools(tools: ToolDefinition[]): Anthropic.Tool[] { + return tools.map((t) => ({ + name: t.name, + description: t.description, + input_schema: t.input_schema, + })); +} diff --git a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts new file mode 100644 index 00000000000..a3e0901d84a --- /dev/null +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -0,0 +1,142 @@ +import Anthropic from "@anthropic-ai/sdk"; +import { OpenAI } from "openai"; +import { env } from "@/env"; +import type { McpContext } from "@/lib/mcp/auth"; +import { + executeTool, + getToolDefinitions, + toAnthropicTools, +} from "./tool-adapter"; + +const SYSTEM_PROMPT = `You are a helpful voice assistant for Superset, a project management tool. You have access to tools for creating and managing tasks, workspaces, and other organizational resources. Keep responses concise and conversational — the user is speaking to you, so respond in 1-3 sentences unless the question requires more detail. When you use tools, briefly confirm what you did.`; + +/** + * SSE event types emitted during the voice pipeline. + */ +interface SSEWriter { + write(event: string, data: unknown): void; +} + +/** + * Transcribes audio using OpenAI Whisper API. + */ +async function transcribeAudio(audioBuffer: Uint8Array): Promise { + const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); + + const blob = new Blob([audioBuffer], { type: "audio/wav" }); + const file = new File([blob], "audio.wav", { type: "audio/wav" }); + + const result = await openai.audio.transcriptions.create({ + model: "whisper-1", + file, + }); + + // Strip wake word from transcription + let text = result.text.trim(); + text = text.replace(/^hey\s*jarvis[,.\s!?]*/i, "").trim(); + return text; +} + +/** + * Runs the full voice pipeline: transcription → Claude with tools → streaming response. + * Writes SSE events to the provided writer throughout. 
+ */ +export async function runVoicePipeline({ + audioBuffer, + ctx, + sse, +}: { + audioBuffer: Uint8Array; + ctx: McpContext; + sse: SSEWriter; +}): Promise { + // 1. Transcribe + const transcription = await transcribeAudio(audioBuffer); + sse.write("transcription", { text: transcription }); + + if (!transcription) { + sse.write("done", { fullResponse: "" }); + return; + } + + // 2. Load tools + const toolDefs = await getToolDefinitions(); + const anthropicTools = toAnthropicTools(toolDefs); + + // 3. Stream Claude response with tool use loop + const anthropic = new Anthropic({ apiKey: env.ANTHROPIC_API_KEY }); + + const messages: Anthropic.MessageParam[] = [ + { role: "user", content: transcription }, + ]; + + let fullResponse = ""; + + // Tool use loop — Claude may call tools, then we feed results back + const MAX_TOOL_ROUNDS = 5; + for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { + const stream = anthropic.messages.stream({ + model: "claude-sonnet-4-20250514", + max_tokens: 1024, + system: SYSTEM_PROMPT, + messages, + tools: anthropicTools.length > 0 ? 
anthropicTools : undefined, + }); + + for await (const event of stream) { + if (event.type === "content_block_delta") { + if (event.delta.type === "text_delta") { + fullResponse += event.delta.text; + sse.write("text_delta", { delta: event.delta.text }); + } + } + } + + // Collect the final message to check for tool use + const finalMessage = await stream.finalMessage(); + const contentBlocks = finalMessage.content; + + // Check for tool use blocks + const toolUseBlocks = contentBlocks.filter( + (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", + ); + + if (toolUseBlocks.length === 0) { + break; + } + + // Execute each tool call and collect results + const toolResults: Anthropic.ToolResultBlockParam[] = []; + + for (const toolBlock of toolUseBlocks) { + sse.write("tool_use", { + toolName: toolBlock.name, + toolInput: toolBlock.input, + }); + + const result = await executeTool({ + toolName: toolBlock.name, + toolInput: toolBlock.input as Record, + ctx, + tools: toolDefs, + }); + + sse.write("tool_result", { + toolName: toolBlock.name, + result, + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolBlock.id, + content: result, + }); + } + + // Feed tool results back into conversation for next iteration + messages.push({ role: "assistant", content: contentBlocks }); + messages.push({ role: "user", content: toolResults }); + } + + sse.write("done", { fullResponse }); +} diff --git a/apps/api/src/env.ts b/apps/api/src/env.ts index a5e2bbc5b0f..0db088c82f5 100644 --- a/apps/api/src/env.ts +++ b/apps/api/src/env.ts @@ -40,6 +40,8 @@ export const env = createEnv({ STRIPE_PRO_MONTHLY_PRICE_ID: z.string(), STRIPE_PRO_YEARLY_PRICE_ID: z.string(), SENTRY_AUTH_TOKEN: z.string().optional(), + OPENAI_API_KEY: z.string().min(1), + ANTHROPIC_API_KEY: z.string().min(1), }, client: { NEXT_PUBLIC_API_URL: z.string().url(), diff --git a/apps/desktop/electron-builder.ts b/apps/desktop/electron-builder.ts index 9fb84f56634..cf146fb3701 100644 --- 
a/apps/desktop/electron-builder.ts +++ b/apps/desktop/electron-builder.ts @@ -56,6 +56,12 @@ const config: Configuration = { to: "resources/migrations", filter: ["**/*"], }, + // Voice sidecar binary (built by PyInstaller via scripts/build-voice-sidecar.sh) + { + from: "dist/voice-sidecar/voice-sidecar", + to: "voice-sidecar", + filter: ["**/*"], + }, ], files: [ @@ -125,6 +131,9 @@ const config: Configuration = { "Superset needs access to your local network to discover and connect to development servers running on your network.", // Bonjour service types to browse for (triggers the permission prompt) NSBonjourServices: ["_http._tcp", "_https._tcp"], + // Required for microphone access (voice commands) + NSMicrophoneUsageDescription: + "Superset uses the microphone for voice commands to interact with your development environment.", }, }, diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh new file mode 100755 index 00000000000..f84edd7e978 --- /dev/null +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash +# Builds the voice sidecar Python script into a standalone binary using PyInstaller. +# The output binary is placed in dist/voice-sidecar/ and gets bundled into +# the Electron app's extraResources by electron-builder. +# +# Prerequisites: +# pip install pyinstaller (in the voice python venv) +# +# Usage: +# ./scripts/build-voice-sidecar.sh + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +DESKTOP_DIR="$(cd "$SCRIPT_DIR/.." && pwd)" +PYTHON_DIR="$DESKTOP_DIR/src/main/lib/voice/python" +VENV_DIR="$PYTHON_DIR/.venv" +OUTPUT_DIR="$DESKTOP_DIR/dist/voice-sidecar" + +if [ ! 
-d "$VENV_DIR" ]; then + echo "Error: Python venv not found at $VENV_DIR" + echo "Create it with: python3 -m venv $VENV_DIR && $VENV_DIR/bin/pip install openwakeword sounddevice numpy" + exit 1 +fi + +PYTHON="$VENV_DIR/bin/python3" +PIP="$VENV_DIR/bin/pip" + +# Ensure PyInstaller is installed +if ! "$PYTHON" -c "import PyInstaller" 2>/dev/null; then + echo "Installing PyInstaller..." + "$PIP" install pyinstaller +fi + +echo "Building voice sidecar binary..." + +"$PYTHON" -m PyInstaller \ + --name voice-sidecar \ + --onedir \ + --noconfirm \ + --clean \ + --distpath "$OUTPUT_DIR" \ + --workpath "$DESKTOP_DIR/dist/voice-sidecar-build" \ + --specpath "$DESKTOP_DIR/dist" \ + --collect-data openwakeword \ + "$PYTHON_DIR/main.py" + +echo "Voice sidecar binary built at: $OUTPUT_DIR/voice-sidecar/" +echo "Contents:" +ls -la "$OUTPUT_DIR/voice-sidecar/" diff --git a/apps/desktop/src/lib/trpc/routers/index.ts b/apps/desktop/src/lib/trpc/routers/index.ts index 2545a0f404b..1c216be4355 100644 --- a/apps/desktop/src/lib/trpc/routers/index.ts +++ b/apps/desktop/src/lib/trpc/routers/index.ts @@ -16,6 +16,7 @@ import { createRingtoneRouter } from "./ringtone"; import { createSettingsRouter } from "./settings"; import { createTerminalRouter } from "./terminal"; import { createUiStateRouter } from "./ui-state"; +import { createVoiceRouter } from "./voice"; import { createWindowRouter } from "./window"; import { createWorkspacesRouter } from "./workspaces"; @@ -39,6 +40,7 @@ export const createAppRouter = (getWindow: () => BrowserWindow | null) => { config: createConfigRouter(), uiState: createUiStateRouter(), ringtone: createRingtoneRouter(), + voice: createVoiceRouter(), }); }; diff --git a/apps/desktop/src/lib/trpc/routers/settings/index.ts b/apps/desktop/src/lib/trpc/routers/settings/index.ts index ea2166ba14e..3cdf84d394e 100644 --- a/apps/desktop/src/lib/trpc/routers/settings/index.ts +++ b/apps/desktop/src/lib/trpc/routers/settings/index.ts @@ -405,5 +405,25 @@ export const 
createSettingsRouter = () => {
 				return { success: true };
 			}),
+
+		getVoiceCommandsEnabled: publicProcedure.query(() => {
+			const row = getSettings();
+			return row.voiceCommandsEnabled ?? false;
+		}),
+
+		setVoiceCommandsEnabled: publicProcedure
+			.input(z.object({ enabled: z.boolean() }))
+			.mutation(({ input }) => {
+				localDb
+					.insert(settings)
+					.values({ id: 1, voiceCommandsEnabled: input.enabled })
+					.onConflictDoUpdate({
+						target: settings.id,
+						set: { voiceCommandsEnabled: input.enabled },
+					})
+					.run();
+
+				return { success: true };
+			}),
 	});
 };
diff --git a/apps/desktop/src/lib/trpc/routers/voice/index.ts b/apps/desktop/src/lib/trpc/routers/voice/index.ts
new file mode 100644
index 00000000000..7a706147599
--- /dev/null
+++ b/apps/desktop/src/lib/trpc/routers/voice/index.ts
@@ -0,0 +1,59 @@
+import { observable } from "@trpc/server/observable";
+import {
+	getCurrentVoiceState,
+	startVoiceProcess,
+	stopVoiceProcess,
+	voiceProcessEmitter,
+} from "main/lib/voice/voice-process";
+import type { VoiceSidecarEvent } from "shared/voice";
+import { publicProcedure, router } from "../..";
+
+/**
+ * tRPC router for the voice sidecar. The sidecar process is reference-counted
+ * by subscription: it starts with the first subscriber and stops when the last
+ * one disconnects.
+ */
+export const createVoiceRouter = () => {
+	let subscriberCount = 0;
+
+	return router({
+		subscribe: publicProcedure.subscription(() => {
+			return observable<VoiceSidecarEvent>((emit) => {
+				subscriberCount++;
+
+				// Auto-start the voice process when first subscriber connects
+				if (subscriberCount === 1) {
+					startVoiceProcess();
+				}
+
+				emit.next(getCurrentVoiceState());
+
+				const onVoiceEvent = (event: VoiceSidecarEvent) => {
+					emit.next(event);
+				};
+
+				voiceProcessEmitter.on("voice-event", onVoiceEvent);
+
+				return () => {
+					voiceProcessEmitter.off("voice-event", onVoiceEvent);
+					subscriberCount--;
+
+					// Auto-stop when last subscriber disconnects
+					if (subscriberCount === 0) {
+						stopVoiceProcess();
+					}
+				};
+			});
+		}),
+
+		start: publicProcedure.mutation(() => {
+			startVoiceProcess();
+			return { success: true as const };
+		}),
+
+		stop: publicProcedure.mutation(() => {
+			stopVoiceProcess();
+			return { success: true as const };
+		}),
+	});
+};
diff --git a/apps/desktop/src/main/index.ts b/apps/desktop/src/main/index.ts
index 2a46c20f4e5..842202fdf33 100644
--- a/apps/desktop/src/main/index.ts
+++ b/apps/desktop/src/main/index.ts
@@ -17,6 +17,7 @@ import {
 	shutdownOrphanedDaemon,
 } from "./lib/terminal";
 import { disposeTray, initTray } from "./lib/tray";
+import { stopVoiceProcess } from "./lib/voice/voice-process";
 import { MainWindow } from "./windows/main";
 
 // Initialize local SQLite database (runs migrations + legacy data migration on import)
@@ -158,8 +159,8 @@ app.on("before-quit", async (event) => {
 	}
 
 	// Quit confirmed or no confirmation needed - exit immediately
-	// Let OS clean up child processes, tray, etc.
 	isQuitting = true;
+	stopVoiceProcess();
 	disposeTray();
 	app.exit(0);
 });
diff --git a/apps/desktop/src/main/lib/voice/python/.gitignore b/apps/desktop/src/main/lib/voice/python/.gitignore
new file mode 100644
index 00000000000..77ac75498fb
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/.gitignore
@@ -0,0 +1,3 @@
+.venv/
+__pycache__/
+*.pyc
diff --git a/apps/desktop/src/main/lib/voice/python/audio.py b/apps/desktop/src/main/lib/voice/python/audio.py
new file mode 100644
index 00000000000..5d8d96409fb
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/audio.py
@@ -0,0 +1,60 @@
+import queue
+from typing import Optional
+
+import numpy as np
+import sounddevice as sd
+
+from config import Config
+
+
+class AudioStream:
+    """Context manager for capturing audio from the microphone."""
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+        self._queue: queue.Queue[np.ndarray] = queue.Queue()
+        self._stream: Optional[sd.InputStream] = None
+
+    def _callback(
+        self,
+        indata: np.ndarray,
+        frames: int,
+        time_info: object,
+        status: sd.CallbackFlags,
+    ) -> None:
+        if status:
+            _emit_error(f"audio callback: {status}")
+        self._queue.put(indata.copy())
+
+    def read_chunk(self, timeout: float = 2.0) -> Optional[np.ndarray]:
+        """Read the next audio chunk from the queue. Returns None on timeout."""
+        try:
+            return self._queue.get(timeout=timeout)
+        except queue.Empty:
+            return None
+
+    def __enter__(self) -> "AudioStream":
+        self._stream = sd.InputStream(
+            samplerate=self._config.sample_rate,
+            channels=self._config.channels,
+            dtype=self._config.dtype,
+            blocksize=self._config.chunk_size,
+            callback=self._callback,
+        )
+        self._stream.start()
+        return self
+
+    def __exit__(self, *exc: object) -> None:
+        if self._stream is not None:
+            self._stream.stop()
+            self._stream.close()
+            self._stream = None
+
+
+def _emit_error(message: str) -> None:
+    """Helper to emit error via stdout JSON (imported lazily to avoid circular imports)."""
+    import json
+    import sys
+
+    sys.stdout.write(json.dumps({"event": "error", "message": message}) + "\n")
+    sys.stdout.flush()
diff --git a/apps/desktop/src/main/lib/voice/python/config.py b/apps/desktop/src/main/lib/voice/python/config.py
new file mode 100644
index 00000000000..6192fce0cfc
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/config.py
@@ -0,0 +1,26 @@
+from dataclasses import dataclass
+
+
+@dataclass(frozen=True)
+class Config:
+    # Audio
+    sample_rate: int = 16000
+    channels: int = 1
+    dtype: str = "int16"
+    chunk_duration_ms: int = 80
+    chunk_size: int = 0  # computed in __post_init__
+
+    # Wake word
+    wake_word_model: str = "hey_jarvis"
+    wake_word_threshold: float = 0.5
+
+    # Speech capture
+    pre_buffer_chunks: int = 63  # ~5s of audio to carry over into speech capture
+    min_capture_s: float = 1.5  # don't end capture until this much live time has passed
+    silence_threshold_rms: float = 200.0
+    silence_duration_s: float = 1.5
+    max_speech_duration_s: float = 30.0
+
+    def __post_init__(self) -> None:
+        computed = int(self.sample_rate * self.chunk_duration_ms / 1000)
+        object.__setattr__(self, "chunk_size", computed)
diff --git a/apps/desktop/src/main/lib/voice/python/main.py
b/apps/desktop/src/main/lib/voice/python/main.py
new file mode 100644
index 00000000000..e02f7842bae
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/main.py
@@ -0,0 +1,163 @@
+"""
+Voice sidecar process — wake word detection + audio capture.
+
+Communicates with the parent Node.js process via stdio JSON lines.
+
+Stdout events:
+    {"event": "ready"}
+    {"event": "recording"}
+    {"event": "audio_captured", "audio_b64": "<base64 WAV>", "duration_s": 3.2}
+    {"event": "error", "message": "..."}
+    {"event": "idle"}
+
+Stdin commands:
+    {"cmd": "start"}  (currently a no-op)
+    {"cmd": "stop"}
+
+EOF on stdin (parent exited or closed the pipe) is treated like "stop".
+"""
+
+import base64
+import collections
+import io
+import json
+import sys
+import threading
+import time
+import wave
+from typing import Any
+
+import numpy as np
+
+from audio import AudioStream
+from config import Config
+from speech_capture import CaptureStatus, SpeechCapture
+from wake_word import WakeWordDetector
+
+
+def emit(event: str, **kwargs: Any) -> None:
+    """Write a JSON event to stdout."""
+    msg = {"event": event, **kwargs}
+    sys.stdout.write(json.dumps(msg) + "\n")
+    sys.stdout.flush()
+
+
+def to_wav_b64(audio: np.ndarray, config: Config) -> str:
+    """Convert int16 numpy array to base64-encoded WAV."""
+    buf = io.BytesIO()
+    with wave.open(buf, "wb") as wf:
+        wf.setnchannels(config.channels)
+        wf.setsampwidth(2)  # 16-bit = 2 bytes
+        wf.setframerate(config.sample_rate)
+        wf.writeframes(audio.tobytes())
+    return base64.b64encode(buf.getvalue()).decode("ascii")
+
+
+def stdin_reader(stop_event: threading.Event) -> None:
+    """Read stdin commands in a background thread.
+
+    Sets ``stop_event`` on a ``{"cmd": "stop"}`` line and on EOF, so the sidecar
+    shuts down (closing the audio stream) when the parent exits or closes the
+    pipe instead of listening forever as an orphan.
+    """
+    for line in sys.stdin:
+        line = line.strip()
+        if not line:
+            continue
+        try:
+            cmd = json.loads(line)
+        except json.JSONDecodeError as e:
+            print(f"[stdin] Invalid JSON: {e}", file=sys.stderr)
+            continue
+        # Valid JSON need not be an object (e.g. `[]`), so guard before .get().
+        if isinstance(cmd, dict) and cmd.get("cmd") == "stop":
+            break
+
+    # Reached on an explicit stop command and on EOF.
+    stop_event.set()
+
+
+def main() -> None:
+    """Run the wake word -> speech capture loop until a stop is requested."""
+    config = Config()
+
+    # Load wake word model
+    detector = WakeWordDetector(config)
+    try:
+        detector.load()
+    except Exception as e:
+        emit("error", message=f"Failed to load wake word model: {e}")
+        sys.exit(1)
+
+    capturer = SpeechCapture(config)
+
+    # Listen for stop commands from parent process
+    stop_event = threading.Event()
+    stdin_thread = threading.Thread(target=stdin_reader, args=(stop_event,), daemon=True)
+    stdin_thread.start()
+
+    emit("ready")
+
+    pre_buffer: collections.deque[Any] = collections.deque(maxlen=config.pre_buffer_chunks)
+
+    try:
+        with AudioStream(config) as stream:
+            emit("idle")
+
+            while not stop_event.is_set():
+                chunk = stream.read_chunk()
+                if chunk is None:
+                    continue
+
+                pre_buffer.append(chunk.copy())
+
+                # Wake word detection
+                result = detector.process_chunk(chunk)
+                if not result.detected:
+                    continue
+
+                # Speech capture
+                emit("recording")
+                capturer.start()
+                for buffered_chunk in pre_buffer:
+                    capturer.add_prebuffer(buffered_chunk)
+                pre_buffer.clear()
+
+                while not stop_event.is_set():
+                    audio_chunk = stream.read_chunk()
+                    if audio_chunk is None:
+                        continue
+                    status = capturer.add_chunk(audio_chunk)
+                    if status != CaptureStatus.CAPTURING:
+                        break
+
+                if stop_event.is_set():
+                    # Stop requested mid-capture: discard the partial recording.
+                    break
+
+                speech_audio = capturer.get_audio()
+
+                if speech_audio.size == 0:
+                    emit("idle")
+                    detector.reset()
+                    continue
+
+                # Convert to WAV and emit
+                audio_b64 = to_wav_b64(speech_audio, config)
+                emit(
+                    "audio_captured",
+                    audio_b64=audio_b64,
+                    duration_s=round(capturer.duration_s, 2),
+                )
+
+                # Reset for next cycle
+                detector.reset()
+                time.sleep(0.5)
+                emit("idle")
+
+    except Exception as e:
+        emit("error", message=str(e))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/apps/desktop/src/main/lib/voice/python/pyproject.toml b/apps/desktop/src/main/lib/voice/python/pyproject.toml
new file mode 100644
index 00000000000..9766a581781
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/pyproject.toml
@@ -0,0 +1,11 @@
+[project]
+name = "superset-voice-sidecar"
+version = "0.1.0"
+description = "Wake word detection + audio capture sidecar for Superset desktop"
+requires-python = ">=3.10"
+
+dependencies = [
+
"openwakeword>=0.6.0",
+    "sounddevice>=0.5.0",
+    "numpy>=1.24.0",
+]
diff --git a/apps/desktop/src/main/lib/voice/python/speech_capture.py b/apps/desktop/src/main/lib/voice/python/speech_capture.py
new file mode 100644
index 00000000000..a5fbc04fb35
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/speech_capture.py
@@ -0,0 +1,77 @@
+import enum
+import time
+
+import numpy as np
+
+from config import Config
+
+
+class CaptureStatus(enum.Enum):
+    CAPTURING = "capturing"
+    SPEECH_ENDED = "speech_ended"
+    MAX_DURATION = "max_duration"
+
+
+class SpeechCapture:
+    """Accumulates audio after wake word trigger, detects silence to end capture."""
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+        self._buffers: list[np.ndarray] = []
+        self._start_time: float = 0.0
+        self._last_speech_time: float = 0.0
+        self._active = False
+
+    def start(self) -> None:
+        """Begin a new speech capture session."""
+        self._buffers = []
+        self._start_time = time.perf_counter()
+        self._last_speech_time = self._start_time
+        self._active = True
+
+    def add_prebuffer(self, chunk: np.ndarray) -> None:
+        """Add a pre-buffered chunk (audio only, no silence detection)."""
+        if not self._active:
+            raise RuntimeError("Capture not started. Call start() first.")
+        self._buffers.append(chunk.copy())
+
+    def add_chunk(self, chunk: np.ndarray) -> CaptureStatus:
+        """Add a live chunk and return the current capture status."""
+        if not self._active:
+            raise RuntimeError("Capture not started. Call start() first.")
+
+        self._buffers.append(chunk.copy())
+        now = time.perf_counter()
+        elapsed = now - self._start_time
+
+        if elapsed >= self._config.max_speech_duration_s:
+            self._active = False
+            return CaptureStatus.MAX_DURATION
+
+        rms = np.sqrt(np.mean(chunk.astype(np.float64) ** 2))
+        if rms > self._config.silence_threshold_rms:
+            self._last_speech_time = now
+
+        if elapsed < self._config.min_capture_s:
+            return CaptureStatus.CAPTURING
+
+        silence_duration = now - self._last_speech_time
+        if silence_duration >= self._config.silence_duration_s:
+            self._active = False
+            return CaptureStatus.SPEECH_ENDED
+
+        return CaptureStatus.CAPTURING
+
+    def get_audio(self) -> np.ndarray:
+        """Return all captured audio as a single array."""
+        if not self._buffers:
+            return np.array([], dtype=np.int16)
+        return np.concatenate(self._buffers).flatten()
+
+    @property
+    def duration_s(self) -> float:
+        if not self._buffers:
+            return 0.0
+        # Count frames (rows), not samples, so multi-channel audio isn't over-counted.
+        total_frames = sum(b.shape[0] for b in self._buffers)
+        return total_frames / self._config.sample_rate
diff --git a/apps/desktop/src/main/lib/voice/python/wake_word.py b/apps/desktop/src/main/lib/voice/python/wake_word.py
new file mode 100644
index 00000000000..c66013ef64c
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/python/wake_word.py
@@ -0,0 +1,46 @@
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+from openwakeword.model import Model
+
+from config import Config
+
+
+@dataclass
+class WakeWordResult:
+    detected: bool
+    confidence: float
+
+
+class WakeWordDetector:
+    """Wraps openwakeword for wake word detection."""
+
+    def __init__(self, config: Config) -> None:
+        self._config = config
+        self._model: Optional[Model] = None
+
+    def load(self) -> None:
+        self._model = Model(
+            wakeword_models=[self._config.wake_word_model],
+            inference_framework="onnx",
+        )
+
+    def process_chunk(self, chunk: np.ndarray) -> WakeWordResult:
+        """Process an audio chunk and return detection result."""
+        if self._model is None:
+            raise RuntimeError("Model not loaded. Call load() first.")
+
+        audio = chunk.flatten().astype(np.int16)
+        self._model.predict(audio)
+
+        scores = self._model.prediction_buffer.get(self._config.wake_word_model, [])
+        confidence = scores[-1] if scores else 0.0
+        detected = confidence >= self._config.wake_word_threshold
+
+        return WakeWordResult(detected=detected, confidence=confidence)
+
+    def reset(self) -> None:
+        """Reset the model's prediction buffer for a new detection cycle."""
+        if self._model is not None:
+            self._model.reset()
diff --git a/apps/desktop/src/main/lib/voice/voice-process-paths.ts b/apps/desktop/src/main/lib/voice/voice-process-paths.ts
new file mode 100644
index 00000000000..a65c37c2b9d
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/voice-process-paths.ts
@@ -0,0 +1,86 @@
+import { existsSync } from "node:fs";
+import { join } from "node:path";
+import { app } from "electron";
+import { env } from "main/env.main";
+
+export interface VoiceSpawnConfig {
+	/** The command to execute (python path or PyInstaller binary). */
+	command: string;
+	/** Arguments to pass (e.g. ["main.py"] for dev, [] for binary). */
+	args: string[];
+	/** Working directory for the spawned process. */
+	cwd: string;
+}
+
+/**
+ * Returns the spawn configuration for the voice sidecar process.
+ *
+ * Production (packaged): PyInstaller binary at process.resourcesPath/voice-sidecar/voice-sidecar
+ * Development: .venv/bin/python3 main.py in the source directory
+ * Preview: Similar to dev, resolves relative to dist/
+ */
+export function getVoiceSpawnConfig(): VoiceSpawnConfig {
+	if (app.isPackaged) {
+		return getPackagedConfig();
+	}
+
+	const isDev = env.NODE_ENV === "development";
+	if (isDev) {
+		return getDevConfig();
+	}
+
+	return getPreviewConfig();
+}
+
+function getPackagedConfig(): VoiceSpawnConfig {
+	const binaryDir = join(process.resourcesPath, "voice-sidecar");
+	const binaryName =
+		process.platform === "win32" ? "voice-sidecar.exe" : "voice-sidecar";
+	const binaryPath = join(binaryDir, binaryName);
+
+	if (existsSync(binaryPath)) {
+		return { command: binaryPath, args: [], cwd: binaryDir };
+	}
+
+	// Fallback: try system python3 with unpacked script
+	console.warn(
+		"[voice-paths] PyInstaller binary not found, falling back to system python3",
+	);
+	const scriptDir = join(
+		process.resourcesPath,
+		"app.asar.unpacked/src/main/lib/voice/python",
+	);
+	return {
+		command: "python3",
+		args: [join(scriptDir, "main.py")],
+		cwd: scriptDir,
+	};
+}
+
+function getDevConfig(): VoiceSpawnConfig {
+	const scriptDir = join(app.getAppPath(), "src/main/lib/voice/python");
+	const venvPython = join(scriptDir, ".venv/bin/python3");
+
+	if (existsSync(venvPython)) {
+		return { command: venvPython, args: ["main.py"], cwd: scriptDir };
+	}
+
+	console.warn(
+		"[voice-paths] Dev venv not found, falling back to system python3",
+	);
+	return { command: "python3", args: ["main.py"], cwd: scriptDir };
+}
+
+function getPreviewConfig(): VoiceSpawnConfig {
+	const previewDir = join(__dirname, "../lib/voice/python");
+	const srcDir = join(app.getAppPath(), "src/main/lib/voice/python");
+
+	const scriptDir = existsSync(previewDir) ? previewDir : srcDir;
+	const venvPython = join(srcDir, ".venv/bin/python3");
+
+	if (existsSync(venvPython)) {
+		return { command: venvPython, args: ["main.py"], cwd: scriptDir };
+	}
+
+	return { command: "python3", args: ["main.py"], cwd: scriptDir };
+}
diff --git a/apps/desktop/src/main/lib/voice/voice-process.ts b/apps/desktop/src/main/lib/voice/voice-process.ts
new file mode 100644
index 00000000000..f6af4827b06
--- /dev/null
+++ b/apps/desktop/src/main/lib/voice/voice-process.ts
@@ -0,0 +1,185 @@
+import type { ChildProcess } from "node:child_process";
+import { spawn } from "node:child_process";
+import { EventEmitter } from "node:events";
+import { createInterface } from "node:readline";
+import type { PythonVoiceEvent, VoiceSidecarEvent } from "shared/voice";
+import { getVoiceSpawnConfig } from "./voice-process-paths";
+
+export const voiceProcessEmitter = new EventEmitter();
+
+let childProcess: ChildProcess | null = null;
+let isRunning = false;
+// True from stopVoiceProcess() until the sidecar's "exit" event.
+let isStopping = false;
+// Set when startVoiceProcess() is called while the old sidecar is still exiting.
+let restartRequested = false;
+let lastEvent: VoiceSidecarEvent = { type: "idle" };
+
+function parsePythonEvent(raw: PythonVoiceEvent): VoiceSidecarEvent | null {
+	switch (raw.event) {
+		case "ready":
+			return { type: "ready" };
+		case "recording":
+			return { type: "recording" };
+		case "audio_captured":
+			if (raw.audio_b64 && raw.duration_s !== undefined) {
+				return {
+					type: "audio_captured",
+					audioB64: raw.audio_b64,
+					durationS: raw.duration_s,
+				};
+			}
+			return null;
+		case "error":
+			return { type: "error", message: raw.message ?? "Unknown error" };
+		case "idle":
+			return { type: "idle" };
+		default:
+			return null;
+	}
+}
+
+/**
+ * Spawns the voice sidecar unless one is already running.
+ *
+ * If the previous sidecar is still shutting down, the start is deferred until
+ * its "exit" event instead of being dropped.
+ */
+export function startVoiceProcess(): void {
+	if (childProcess) {
+		if (isStopping) {
+			// e.g. the setting was toggled off and on again quickly
+			restartRequested = true;
+			return;
+		}
+		console.warn("[voice-process] Already running");
+		return;
+	}
+
+	const config = getVoiceSpawnConfig();
+
+	console.log(
+		`[voice-process] Starting: ${config.command} ${config.args.join(" ")}`,
+	);
+
+	const proc = spawn(config.command, config.args, {
+		cwd: config.cwd,
+		stdio: ["pipe", "pipe", "pipe"],
+		env: { ...process.env },
+	});
+	childProcess = proc;
+	isRunning = true;
+
+	// Writing to a sidecar that already exited fails asynchronously (EPIPE) via
+	// an "error" event on stdin; unhandled, that would crash the main process.
+	proc.stdin?.on("error", (err) => {
+		console.warn("[voice-process] stdin error:", err.message);
+	});
+
+	// Parse stdout JSON lines
+	if (proc.stdout) {
+		const rl = createInterface({ input: proc.stdout });
+		rl.on("line", (line) => {
+			try {
+				const raw = JSON.parse(line) as PythonVoiceEvent;
+				const event = parsePythonEvent(raw);
+				if (event) {
+					lastEvent = event;
+					voiceProcessEmitter.emit("voice-event", event);
+				}
+			} catch {
+				console.warn("[voice-process] Non-JSON stdout:", line);
+			}
+		});
+	}
+
+	// Log stderr
+	if (proc.stderr) {
+		const rl = createInterface({ input: proc.stderr });
+		rl.on("line", (line) => {
+			console.error("[voice-process/stderr]", line);
+		});
+	}
+
+	proc.on("error", (err) => {
+		console.error("[voice-process] Spawn error:", err.message);
+		voiceProcessEmitter.emit("voice-event", {
+			type: "error",
+			message: `Process error: ${err.message}`,
+		} satisfies VoiceSidecarEvent);
+		if (childProcess === proc) {
+			cleanup();
+		}
+	});
+
+	proc.on("exit", (code, signal) => {
+		console.log(`[voice-process] Exited with code=${code} signal=${signal}`);
+		if (childProcess !== proc) {
+			return;
+		}
+
+		const shouldRestart = restartRequested;
+		cleanup();
+		// The sidecar may have died mid-recording; tell subscribers it is idle so
+		// the renderer does not keep showing the recording indicator.
+		voiceProcessEmitter.emit("voice-event", {
+			type: "idle",
+		} satisfies VoiceSidecarEvent);
+
+		if (shouldRestart) {
+			startVoiceProcess();
+		}
+	});
+}
+
+/**
+ * Asks the sidecar to stop (stdin command + SIGTERM) and force-kills it if it
+ * has not exited after 3 seconds. Cancels any pending deferred start.
+ */
+export function stopVoiceProcess(): void {
+	restartRequested = false;
+
+	const proc = childProcess;
+	if (!proc || isStopping) {
+		return;
+	}
+	isStopping = true;
+
+	// Send stop command via stdin
+	if (proc.stdin && !proc.stdin.destroyed) {
+		try {
+			proc.stdin.write(`${JSON.stringify({ cmd: "stop" })}\n`);
+		} catch {
+			// stdin may be closed already
+		}
+	}
+
+	// Force kill if it has not exited within 3s
+	const timeout = setTimeout(() => {
+		proc.kill("SIGKILL");
+	}, 3000);
+
+	proc.once("exit", () => {
+		clearTimeout(timeout);
+	});
+
+	proc.kill("SIGTERM");
+}
+
+export function getVoiceProcessStatus(): {
+	running: boolean;
+} {
+	return { running: isRunning };
+}
+
+export function getCurrentVoiceState(): VoiceSidecarEvent {
+	return lastEvent;
+}
+
+function cleanup(): void {
+	childProcess = null;
+	isRunning = false;
+	isStopping = false;
+	restartRequested = false;
+	lastEvent = { type: "idle" };
+}
diff --git a/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx
new file mode 100644
index 00000000000..829cf7644ab
--- /dev/null
+++ b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx
@@ -0,0 +1,95 @@
+import { toast } from "@superset/ui/sonner";
+import { useEffect, useRef } from "react";
+import { electronTrpc } from "renderer/lib/electron-trpc";
+import { RecordingIndicator } from "./components/RecordingIndicator";
+import { ResponsePanel } from "./components/ResponsePanel";
+
+/**
+ * Single component that queries the voiceCommandsEnabled setting and
+ * passes it as the `enabled` flag to `useSubscription`. This avoids
+ * conditional rendering / mount-unmount cycles — the subscription hook
+ * is always called (React rules-of-hooks) but only connects when the
+ * setting is true.
+ */
+export function VoiceListener() {
+	const { data: voiceEnabled } =
+		electronTrpc.settings.getVoiceCommandsEnabled.useQuery();
+
+	const indicatorToastRef = useRef<string | number | null>(null);
+	const responseToastRef = useRef<string | number | null>(null);
+
+	// Dismiss any lingering toasts when voice is disabled
+	useEffect(() => {
+		if (!voiceEnabled) {
+			dismissAll(indicatorToastRef, responseToastRef);
+		}
+	}, [voiceEnabled]);
+
+	electronTrpc.voice.subscribe.useSubscription(undefined, {
+		enabled: !!voiceEnabled,
+		onData: (event) => {
+			switch (event.type) {
+				case "recording": {
+					dismissAll(indicatorToastRef, responseToastRef);
+
+					const toastId = toast.custom(
+						(id) => <RecordingIndicator toastId={id} />,
+						{
+							duration: Number.POSITIVE_INFINITY,
+							position: "bottom-center",
+							unstyled: true,
+						},
+					);
+					indicatorToastRef.current = toastId;
+					break;
+				}
+
+				case "audio_captured": {
+					if (indicatorToastRef.current !== null) {
+						toast.dismiss(indicatorToastRef.current);
+						indicatorToastRef.current = null;
+					}
+
+					const toastId = toast.custom(
+						(id) => <ResponsePanel toastId={id} audioB64={event.audioB64} />,
+						{
+							duration: Number.POSITIVE_INFINITY,
+							position: "bottom-center",
+							unstyled: true,
+						},
+					);
+					responseToastRef.current = toastId;
+					break;
+				}
+
+				case "idle": {
+					if (indicatorToastRef.current !== null) {
+						toast.dismiss(indicatorToastRef.current);
+						indicatorToastRef.current = null;
+					}
+					break;
+				}
+
+				case "error": {
+					dismissAll(indicatorToastRef, responseToastRef);
+					console.error("[voice-listener] Sidecar error:", event.message);
+					break;
+				}
+			}
+		},
+		onError: (error) => {
+			console.error("[voice-listener] Subscription error:", error);
+		},
+	});
+
+	return null;
+}
+
+function dismissAll(...refs: React.RefObject<string | number | null>[]): void {
+	for (const ref of refs) {
+		if (ref.current !== null) {
+			toast.dismiss(ref.current);
+			ref.current = null;
+		}
+	}
+}
diff --git a/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx
new file mode
100644 index 00000000000..de55ed1d3e1 --- /dev/null +++ b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx @@ -0,0 +1,26 @@ +import { toast } from "@superset/ui/sonner"; +import { HiMiniMicrophone } from "react-icons/hi2"; + +interface RecordingIndicatorProps { + toastId: string | number; +} + +export function RecordingIndicator({ toastId }: RecordingIndicatorProps) { + return ( +
+ + + + + + Listening... + +
+ ); +} diff --git a/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/index.ts b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/index.ts new file mode 100644 index 00000000000..a7487ae9461 --- /dev/null +++ b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/index.ts @@ -0,0 +1 @@ +export { RecordingIndicator } from "./RecordingIndicator"; diff --git a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/ResponsePanel.tsx b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/ResponsePanel.tsx new file mode 100644 index 00000000000..c2bc6967461 --- /dev/null +++ b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/ResponsePanel.tsx @@ -0,0 +1,122 @@ +import { toast } from "@superset/ui/sonner"; +import { useEffect } from "react"; +import { HiMiniMicrophone, HiMiniXMark } from "react-icons/hi2"; +import { useVoicePipeline } from "./hooks/useVoicePipeline"; + +interface ResponsePanelProps { + toastId: string | number; + audioB64: string; +} + +export function ResponsePanel({ toastId, audioB64 }: ResponsePanelProps) { + const { + status, + transcription, + toolCalls, + responseText, + error, + processAudio, + abort, + } = useVoicePipeline(); + + // Start processing when mounted + useEffect(() => { + processAudio(audioB64); + }, [audioB64, processAudio]); + + // Auto-dismiss after done + useEffect(() => { + if (status === "done") { + const timer = setTimeout(() => { + toast.dismiss(toastId); + }, 8000); + return () => clearTimeout(timer); + } + }, [status, toastId]); + + const handleDismiss = () => { + abort(); + toast.dismiss(toastId); + }; + + return ( +
+ + + {/* Header */} +
+ + Voice Command +
+ + {/* Status indicator */} + {status === "transcribing" && ( +
+ + + + + Transcribing... +
+ )} + + {/* Transcription */} + {transcription && ( +
+ “{transcription}” +
+ )} + + {/* Tool calls */} + {toolCalls.length > 0 && ( +
+ {toolCalls.map((tc, i) => ( +
+ + {tc.toolName} + {tc.result && done} +
+ ))} +
+ )} + + {/* Streaming response */} + {(status === "streaming" || status === "done") && responseText && ( +
+ {responseText} + {status === "streaming" && ( + + )} +
+ )} + + {/* Processing indicator */} + {status === "processing" && !responseText && ( +
+ + + + + Thinking... +
+ )} + + {/* Error */} + {status === "error" && ( +
+ {error || "Something went wrong"} +
+ )} +
+	);
+}
diff --git a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/index.ts b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/index.ts
new file mode 100644
index 00000000000..e0ce15fb143
--- /dev/null
+++ b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/index.ts
@@ -0,0 +1 @@
+export { useVoicePipeline } from "./useVoicePipeline";
diff --git a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts
new file mode 100644
index 00000000000..0146d32343a
--- /dev/null
+++ b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts
@@ -0,0 +1,226 @@
+import { useCallback, useRef, useState } from "react";
+import { env } from "renderer/env.renderer";
+
+type PipelineStatus =
+	| "idle"
+	| "transcribing"
+	| "processing"
+	| "streaming"
+	| "done"
+	| "error";
+
+interface ToolCall {
+	toolName: string;
+	toolInput?: unknown;
+	result?: string;
+}
+
+interface VoicePipelineState {
+	status: PipelineStatus;
+	transcription: string | null;
+	toolCalls: ToolCall[];
+	responseText: string;
+	error: string | null;
+}
+
+/**
+ * Uploads captured audio to the voice API and exposes the streamed
+ * transcription, tool calls and response text as React state.
+ */
+export function useVoicePipeline() {
+	const [state, setState] = useState<VoicePipelineState>({
+		status: "idle",
+		transcription: null,
+		toolCalls: [],
+		responseText: "",
+		error: null,
+	});
+
+	const abortRef = useRef<AbortController | null>(null);
+
+	const processAudio = useCallback(async (audioB64: string) => {
+		// Abort any in-flight request
+		abortRef.current?.abort();
+
+		// Reset state
+		setState({
+			status: "transcribing",
+			transcription: null,
+			toolCalls: [],
+			responseText: "",
+			error: null,
+		});
+
+		// Decode base64 to binary
+		const binaryStr = atob(audioB64);
+		const bytes = new Uint8Array(binaryStr.length);
+		for (let i = 0; i < binaryStr.length; i++) {
+			bytes[i] = binaryStr.charCodeAt(i);
+		}
+
+		// Build form data
+		const formData = new FormData();
+		formData.append(
+			"audio",
+			new Blob([bytes], { type: "audio/wav" }),
+			"audio.wav",
+		);
+
+		// POST to API with SSE response
+		const abortController = new AbortController();
+		abortRef.current = abortController;
+
+		try {
+			const response = await fetch(`${env.NEXT_PUBLIC_API_URL}/api/voice`, {
+				method: "POST",
+				body: formData,
+				credentials: "include",
+				signal: abortController.signal,
+			});
+
+			if (!response.ok) {
+				const text = await response.text();
+				setState((prev) => ({
+					...prev,
+					status: "error",
+					error: `API error: ${response.status} ${text}`,
+				}));
+				return;
+			}
+
+			if (!response.body) {
+				setState((prev) => ({
+					...prev,
+					status: "error",
+					error: "No response body",
+				}));
+				return;
+			}
+
+			// Read SSE stream
+			const reader = response.body.getReader();
+			const decoder = new TextDecoder();
+			let buffer = "";
+			// Kept across reads: an "event:" line and its "data:" line can arrive
+			// in different network chunks.
+			let eventType = "";
+
+			while (true) {
+				const { done, value } = await reader.read();
+				if (done) break;
+
+				buffer += decoder.decode(value, { stream: true });
+
+				// Parse SSE events from buffer
+				const lines = buffer.split("\n");
+				buffer = lines.pop() ?? "";
+
+				for (const line of lines) {
+					if (line.startsWith("event: ")) {
+						eventType = line.slice(7).trim();
+					} else if (line.startsWith("data: ") && eventType) {
+						try {
+							const data = JSON.parse(line.slice(6));
+							handleSSEEvent(eventType, data, setState);
+						} catch {
+							// Skip malformed data
+						}
+						eventType = "";
+					}
+				}
+			}
+
+			// Ensure we end in done state
+			setState((prev) => {
+				if (prev.status !== "error") {
+					return { ...prev, status: "done" };
+				}
+				return prev;
+			});
+		} catch (error) {
+			if (abortController.signal.aborted) return;
+			setState((prev) => ({
+				...prev,
+				status: "error",
+				error: error instanceof Error ? error.message : "Request failed",
+			}));
+		}
+	}, []);
+
+	const abort = useCallback(() => {
+		abortRef.current?.abort();
+	}, []);
+
+	return { ...state, processAudio, abort };
+}
+
+function handleSSEEvent(
+	event: string,
+	data: Record<string, unknown>,
+	setState: React.Dispatch<React.SetStateAction<VoicePipelineState>>,
+) {
+	switch (event) {
+		case "transcription":
+			setState((prev) => ({
+				...prev,
+				status: "processing",
+				transcription: data.text as string,
+			}));
+			break;
+
+		case "tool_use":
+			setState((prev) => ({
+				...prev,
+				status: "processing",
+				toolCalls: [
+					...prev.toolCalls,
+					{
+						toolName: data.toolName as string,
+						toolInput: data.toolInput,
+					},
+				],
+			}));
+			break;
+
+		case "tool_result":
+			setState((prev) => {
+				// Complete only the oldest pending call with this name; marking
+				// every match would flag later calls as done before they finish.
+				const index = prev.toolCalls.findIndex(
+					(tc) => tc.toolName === data.toolName && tc.result === undefined,
+				);
+				if (index === -1) return prev;
+				return {
+					...prev,
+					toolCalls: prev.toolCalls.map((tc, i) =>
+						i === index ? { ...tc, result: data.result as string } : tc,
+					),
+				};
+			});
+			break;
+
+		case "text_delta":
+			setState((prev) => ({
+				...prev,
+				status: "streaming",
+				responseText: prev.responseText + (data.delta as string),
+			}));
+			break;
+
+		case "done":
+			setState((prev) => ({
+				...prev,
+				status: "done",
+				responseText: (data.fullResponse as string) || prev.responseText,
+			}));
+			break;
+
+		case "error":
+			setState((prev) => ({
+				...prev,
+				status: "error",
+				error: data.message as string,
+			}));
+			break;
+	}
+}
diff --git a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/index.ts b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/index.ts
new file mode 100644
index 00000000000..d34650661d5
--- /dev/null
+++ b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/index.ts
@@ -0,0 +1 @@
+export { ResponsePanel } from "./ResponsePanel";
diff --git a/apps/desktop/src/renderer/components/Voice/index.ts b/apps/desktop/src/renderer/components/Voice/index.ts
new file mode 100644
index 00000000000..7ecc6a6c63a
--- /dev/null
+++ b/apps/desktop/src/renderer/components/Voice/index.ts
@@ -0,0 +1 @@
+export { VoiceListener } from "./VoiceListener";
diff
--git a/apps/desktop/src/renderer/routes/_authenticated/layout.tsx b/apps/desktop/src/renderer/routes/_authenticated/layout.tsx index 146b5b6de52..3100b3a070f 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/layout.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/layout.tsx @@ -8,6 +8,7 @@ import { DndProvider } from "react-dnd"; import { NewWorkspaceModal } from "renderer/components/NewWorkspaceModal"; import { Paywall } from "renderer/components/Paywall"; import { useUpdateListener } from "renderer/components/UpdateToast"; +import { VoiceListener } from "renderer/components/Voice"; import { env } from "renderer/env.renderer"; import { authClient } from "renderer/lib/auth-client"; import { dragDropManager } from "renderer/lib/dnd"; @@ -78,6 +79,7 @@ function AuthenticatedLayout() { + diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx index 25a17e454d7..a7ee0a66102 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx @@ -32,6 +32,10 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { SETTING_ITEM_ID.BEHAVIOR_BRANCH_PREFIX, visibleItems, ); + const showVoiceCommands = isItemVisible( + SETTING_ITEM_ID.BEHAVIOR_VOICE_COMMANDS, + visibleItems, + ); const utils = electronTrpc.useUtils(); @@ -58,6 +62,33 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { setConfirmOnQuit.mutate({ enabled }); }; + const { data: voiceCommandsEnabled, isLoading: isVoiceLoading } = + electronTrpc.settings.getVoiceCommandsEnabled.useQuery(); + const setVoiceCommandsEnabled = + electronTrpc.settings.setVoiceCommandsEnabled.useMutation({ 
+ onMutate: async ({ enabled }) => { + await utils.settings.getVoiceCommandsEnabled.cancel(); + const previous = utils.settings.getVoiceCommandsEnabled.getData(); + utils.settings.getVoiceCommandsEnabled.setData(undefined, enabled); + return { previous }; + }, + onError: (_err, _vars, context) => { + if (context?.previous !== undefined) { + utils.settings.getVoiceCommandsEnabled.setData( + undefined, + context.previous, + ); + } + }, + onSettled: () => { + utils.settings.getVoiceCommandsEnabled.invalidate(); + }, + }); + + const handleVoiceToggle = (enabled: boolean) => { + setVoiceCommandsEnabled.mutate({ enabled }); + }; + const { data: branchPrefix, isLoading: isBranchPrefixLoading } = electronTrpc.settings.getBranchPrefix.useQuery(); const { data: gitInfo } = electronTrpc.settings.getGitInfo.useQuery(); @@ -137,6 +168,25 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { )} + {showVoiceCommands && ( +
+
+ +

+ Enable wake word detection and voice commands +

+
+ +
+ )} + {showBranchPrefix && (
diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/utils/settings-search/settings-search.ts b/apps/desktop/src/renderer/routes/_authenticated/settings/utils/settings-search/settings-search.ts index 568eeaa9f60..e106975cd1b 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/utils/settings-search/settings-search.ts +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/utils/settings-search/settings-search.ts @@ -21,6 +21,7 @@ export const SETTING_ITEM_ID = { KEYBOARD_SHORTCUTS: "keyboard-shortcuts", BEHAVIOR_CONFIRM_QUIT: "behavior-confirm-quit", + BEHAVIOR_VOICE_COMMANDS: "behavior-voice-commands", BEHAVIOR_BRANCH_PREFIX: "behavior-branch-prefix", TERMINAL_PRESETS: "terminal-presets", @@ -308,6 +309,23 @@ export const SETTINGS_ITEMS: SettingsItem[] = [ "unsaved", ], }, + { + id: SETTING_ITEM_ID.BEHAVIOR_VOICE_COMMANDS, + section: "behavior", + title: "Voice Commands", + description: "Enable wake word detection and voice commands", + keywords: [ + "features", + "voice", + "commands", + "wake word", + "microphone", + "speech", + "audio", + "jarvis", + "sidecar", + ], + }, { id: SETTING_ITEM_ID.BEHAVIOR_BRANCH_PREFIX, section: "behavior", diff --git a/apps/desktop/src/shared/voice.ts b/apps/desktop/src/shared/voice.ts new file mode 100644 index 00000000000..3699e394307 --- /dev/null +++ b/apps/desktop/src/shared/voice.ts @@ -0,0 +1,29 @@ +/** + * Voice sidecar events emitted by the Python child process via stdio JSON lines. + * These are the events the tRPC subscription forwards to the renderer. + */ + +export type VoiceSidecarEvent = + | { type: "ready" } + | { type: "recording" } + | { type: "audio_captured"; audioB64: string; durationS: number } + | { type: "error"; message: string } + | { type: "idle" }; + +/** + * Raw JSON events from the Python process stdout. + * Converted to VoiceSidecarEvent by voice-process.ts. 
+ */ +export interface PythonVoiceEvent { + event: "ready" | "recording" | "audio_captured" | "error" | "idle"; + audio_b64?: string; + duration_s?: number; + message?: string; +} + +/** + * Commands sent to the Python process via stdin. + */ +export interface PythonVoiceCommand { + cmd: "start" | "stop"; +} diff --git a/bun.lock b/bun.lock index 03d8b44a23f..b75f168dfe2 100644 --- a/bun.lock +++ b/bun.lock @@ -85,6 +85,7 @@ "lodash.chunk": "^4.2.0", "mcp-handler": "^1.0.7", "next": "^16.0.10", + "openai": "^6.17.0", "react": "19.1.0", "react-dom": "19.1.0", "require-in-the-middle": "8.0.1", @@ -3948,6 +3949,8 @@ "open": ["open@7.4.2", "", { "dependencies": { "is-docker": "^2.0.0", "is-wsl": "^2.1.1" } }, "sha512-MVHddDVweXZF3awtlAS+6pgKLlm/JgxZ90+/NBurBoQctVOOB/zDdVjcyPzQ+0laDGbsWgrRkflI65sQeOgT9Q=="], + "openai": ["openai@6.17.0", "", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-NHRpPEUPzAvFOAFs9+9pC6+HCw/iWsYsKCMPXH5Kw7BpMxqd8g/A07/1o7Gx2TWtCnzevVRyKMRFqyiHyAlqcA=="], + "opentype.js": ["opentype.js@0.8.0", "", { "dependencies": { "tiny-inflate": "^1.0.2" }, "bin": { "ot": "./bin/ot" } }, "sha512-FQHR4oGP+a0m/f6yHoRpBOIbn/5ZWxKd4D/djHVJu8+KpBTYrJda0b7mLcgDEMWXE9xBCJm+qb0yv6FcvPjukg=="], "ora": ["ora@8.2.0", "", { "dependencies": { "chalk": "^5.3.0", "cli-cursor": "^5.0.0", "cli-spinners": "^2.9.2", "is-interactive": "^2.0.0", "is-unicode-supported": "^2.0.0", "log-symbols": "^6.0.0", "stdin-discarder": "^0.2.2", "string-width": "^7.2.0", "strip-ansi": "^7.1.0" } }, "sha512-weP+BZ8MVNnlCm8c0Qdc1WSWq4Qn7I+9CJGm7Qali6g44e/PUzbjNqJX5NJ9ljlNMosfJvg1fKEGILklK9cwnw=="], diff --git a/packages/local-db/drizzle/0016_add_voice_commands_enabled.sql b/packages/local-db/drizzle/0016_add_voice_commands_enabled.sql new file mode 100644 index 00000000000..76838600137 --- /dev/null +++ b/packages/local-db/drizzle/0016_add_voice_commands_enabled.sql @@ -0,0 +1 @@ +ALTER TABLE `settings` 
ADD `voice_commands_enabled` integer DEFAULT false; \ No newline at end of file diff --git a/packages/local-db/drizzle/meta/0016_snapshot.json b/packages/local-db/drizzle/meta/0016_snapshot.json new file mode 100644 index 00000000000..de165c0f6e9 --- /dev/null +++ b/packages/local-db/drizzle/meta/0016_snapshot.json @@ -0,0 +1,1057 @@ +{ + "version": "6", + "dialect": "sqlite", + "id": "c5371ab6-1178-4cb5-b635-ab01e82cb6a0", + "prevId": "2c6f4b00-72ca-4cc3-bc0a-f25a40163119", + "tables": { + "organization_members": { + "name": "organization_members", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "user_id": { + "name": "user_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "role": { + "name": "role", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "organization_members_organization_id_idx": { + "name": "organization_members_organization_id_idx", + "columns": [ + "organization_id" + ], + "isUnique": false + }, + "organization_members_user_id_idx": { + "name": "organization_members_user_id_idx", + "columns": [ + "user_id" + ], + "isUnique": false + } + }, + "foreignKeys": { + "organization_members_organization_id_organizations_id_fk": { + "name": "organization_members_organization_id_organizations_id_fk", + "tableFrom": "organization_members", + "tableTo": "organizations", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "organization_members_user_id_users_id_fk": { + "name": "organization_members_user_id_users_id_fk", + 
"tableFrom": "organization_members", + "tableTo": "users", + "columnsFrom": [ + "user_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "organizations": { + "name": "organizations", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "clerk_org_id": { + "name": "clerk_org_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "slug": { + "name": "slug", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "github_org": { + "name": "github_org", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "updated_at": { + "name": "updated_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "organizations_clerk_org_id_unique": { + "name": "organizations_clerk_org_id_unique", + "columns": [ + "clerk_org_id" + ], + "isUnique": true + }, + "organizations_slug_unique": { + "name": "organizations_slug_unique", + "columns": [ + "slug" + ], + "isUnique": true + }, + "organizations_slug_idx": { + "name": "organizations_slug_idx", + "columns": [ + "slug" + ], + "isUnique": false + }, + "organizations_clerk_org_id_idx": { + "name": "organizations_clerk_org_id_idx", + "columns": [ + "clerk_org_id" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + 
"checkConstraints": {} + }, + "projects": { + "name": "projects", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "main_repo_path": { + "name": "main_repo_path", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "color": { + "name": "color", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "tab_order": { + "name": "tab_order", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "last_opened_at": { + "name": "last_opened_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "config_toast_dismissed": { + "name": "config_toast_dismissed", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "default_branch": { + "name": "default_branch", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "github_owner": { + "name": "github_owner", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "branch_prefix_mode": { + "name": "branch_prefix_mode", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "branch_prefix_custom": { + "name": "branch_prefix_custom", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "projects_main_repo_path_idx": { + "name": "projects_main_repo_path_idx", + "columns": [ + "main_repo_path" + ], + "isUnique": false + }, + "projects_last_opened_at_idx": { + "name": "projects_last_opened_at_idx", + "columns": [ + "last_opened_at" 
+ ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "settings": { + "name": "settings", + "columns": { + "id": { + "name": "id", + "type": "integer", + "primaryKey": true, + "notNull": true, + "autoincrement": false, + "default": 1 + }, + "last_active_workspace_id": { + "name": "last_active_workspace_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "last_used_app": { + "name": "last_used_app", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "terminal_presets": { + "name": "terminal_presets", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "terminal_presets_initialized": { + "name": "terminal_presets_initialized", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "selected_ringtone_id": { + "name": "selected_ringtone_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "active_organization_id": { + "name": "active_organization_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "confirm_on_quit": { + "name": "confirm_on_quit", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "terminal_link_behavior": { + "name": "terminal_link_behavior", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "persist_terminal": { + "name": "persist_terminal", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false, + "default": true + }, + "auto_apply_default_preset": { + "name": "auto_apply_default_preset", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "branch_prefix_mode": { + "name": "branch_prefix_mode", + "type": "text", + "primaryKey": false, + 
"notNull": false, + "autoincrement": false + }, + "branch_prefix_custom": { + "name": "branch_prefix_custom", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "notification_sounds_muted": { + "name": "notification_sounds_muted", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "voice_commands_enabled": { + "name": "voice_commands_enabled", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false, + "default": false + } + }, + "indexes": {}, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "tasks": { + "name": "tasks", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "slug": { + "name": "slug", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "title": { + "name": "title", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "description": { + "name": "description", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "status": { + "name": "status", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "status_color": { + "name": "status_color", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "status_type": { + "name": "status_type", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "status_position": { + "name": "status_position", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "priority": { + "name": "priority", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "organization_id": { + "name": "organization_id", + "type": "text", + "primaryKey": false, + "notNull": true, + 
"autoincrement": false + }, + "repository_id": { + "name": "repository_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "assignee_id": { + "name": "assignee_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "creator_id": { + "name": "creator_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "estimate": { + "name": "estimate", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "due_date": { + "name": "due_date", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "labels": { + "name": "labels", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "branch": { + "name": "branch", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "pr_url": { + "name": "pr_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "external_provider": { + "name": "external_provider", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "external_id": { + "name": "external_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "external_key": { + "name": "external_key", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "external_url": { + "name": "external_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "last_synced_at": { + "name": "last_synced_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "sync_error": { + "name": "sync_error", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "started_at": { + "name": "started_at", + "type": "text", + "primaryKey": false, + "notNull": 
false, + "autoincrement": false + }, + "completed_at": { + "name": "completed_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "deleted_at": { + "name": "deleted_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "updated_at": { + "name": "updated_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "tasks_slug_unique": { + "name": "tasks_slug_unique", + "columns": [ + "slug" + ], + "isUnique": true + }, + "tasks_slug_idx": { + "name": "tasks_slug_idx", + "columns": [ + "slug" + ], + "isUnique": false + }, + "tasks_organization_id_idx": { + "name": "tasks_organization_id_idx", + "columns": [ + "organization_id" + ], + "isUnique": false + }, + "tasks_assignee_id_idx": { + "name": "tasks_assignee_id_idx", + "columns": [ + "assignee_id" + ], + "isUnique": false + }, + "tasks_status_idx": { + "name": "tasks_status_idx", + "columns": [ + "status" + ], + "isUnique": false + }, + "tasks_created_at_idx": { + "name": "tasks_created_at_idx", + "columns": [ + "created_at" + ], + "isUnique": false + } + }, + "foreignKeys": { + "tasks_organization_id_organizations_id_fk": { + "name": "tasks_organization_id_organizations_id_fk", + "tableFrom": "tasks", + "tableTo": "organizations", + "columnsFrom": [ + "organization_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "tasks_assignee_id_users_id_fk": { + "name": "tasks_assignee_id_users_id_fk", + "tableFrom": "tasks", + "tableTo": "users", + "columnsFrom": [ + "assignee_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "set null", + "onUpdate": "no action" + }, + "tasks_creator_id_users_id_fk": { + "name": "tasks_creator_id_users_id_fk", + "tableFrom": "tasks", + "tableTo": "users", + "columnsFrom": 
[ + "creator_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "users": { + "name": "users", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "clerk_id": { + "name": "clerk_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "email": { + "name": "email", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "avatar_url": { + "name": "avatar_url", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "deleted_at": { + "name": "deleted_at", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "updated_at": { + "name": "updated_at", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + } + }, + "indexes": { + "users_clerk_id_unique": { + "name": "users_clerk_id_unique", + "columns": [ + "clerk_id" + ], + "isUnique": true + }, + "users_email_unique": { + "name": "users_email_unique", + "columns": [ + "email" + ], + "isUnique": true + }, + "users_email_idx": { + "name": "users_email_idx", + "columns": [ + "email" + ], + "isUnique": false + }, + "users_clerk_id_idx": { + "name": "users_clerk_id_idx", + "columns": [ + "clerk_id" + ], + "isUnique": false + } + }, + "foreignKeys": {}, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "workspaces": { + "name": "workspaces", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + 
"autoincrement": false + }, + "project_id": { + "name": "project_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "worktree_id": { + "name": "worktree_id", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "type": { + "name": "type", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "branch": { + "name": "branch", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "name": { + "name": "name", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "tab_order": { + "name": "tab_order", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "updated_at": { + "name": "updated_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "last_opened_at": { + "name": "last_opened_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "is_unread": { + "name": "is_unread", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false, + "default": false + }, + "deleting_at": { + "name": "deleting_at", + "type": "integer", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "workspaces_project_id_idx": { + "name": "workspaces_project_id_idx", + "columns": [ + "project_id" + ], + "isUnique": false + }, + "workspaces_worktree_id_idx": { + "name": "workspaces_worktree_id_idx", + "columns": [ + "worktree_id" + ], + "isUnique": false + }, + "workspaces_last_opened_at_idx": { + "name": "workspaces_last_opened_at_idx", + "columns": [ + "last_opened_at" + ], + "isUnique": false + } + }, + "foreignKeys": { + "workspaces_project_id_projects_id_fk": { + 
"name": "workspaces_project_id_projects_id_fk", + "tableFrom": "workspaces", + "tableTo": "projects", + "columnsFrom": [ + "project_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + }, + "workspaces_worktree_id_worktrees_id_fk": { + "name": "workspaces_worktree_id_worktrees_id_fk", + "tableFrom": "workspaces", + "tableTo": "worktrees", + "columnsFrom": [ + "worktree_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + }, + "worktrees": { + "name": "worktrees", + "columns": { + "id": { + "name": "id", + "type": "text", + "primaryKey": true, + "notNull": true, + "autoincrement": false + }, + "project_id": { + "name": "project_id", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "path": { + "name": "path", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "branch": { + "name": "branch", + "type": "text", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "base_branch": { + "name": "base_branch", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "created_at": { + "name": "created_at", + "type": "integer", + "primaryKey": false, + "notNull": true, + "autoincrement": false + }, + "git_status": { + "name": "git_status", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + }, + "github_status": { + "name": "github_status", + "type": "text", + "primaryKey": false, + "notNull": false, + "autoincrement": false + } + }, + "indexes": { + "worktrees_project_id_idx": { + "name": "worktrees_project_id_idx", + "columns": [ + "project_id" + ], + "isUnique": false + }, + "worktrees_branch_idx": { + "name": "worktrees_branch_idx", + "columns": [ + "branch" + ], + "isUnique": false + } + }, + "foreignKeys": { + 
"worktrees_project_id_projects_id_fk": { + "name": "worktrees_project_id_projects_id_fk", + "tableFrom": "worktrees", + "tableTo": "projects", + "columnsFrom": [ + "project_id" + ], + "columnsTo": [ + "id" + ], + "onDelete": "cascade", + "onUpdate": "no action" + } + }, + "compositePrimaryKeys": {}, + "uniqueConstraints": {}, + "checkConstraints": {} + } + }, + "views": {}, + "enums": {}, + "_meta": { + "schemas": {}, + "tables": {}, + "columns": {} + }, + "internal": { + "indexes": {} + } +} \ No newline at end of file diff --git a/packages/local-db/drizzle/meta/_journal.json b/packages/local-db/drizzle/meta/_journal.json index 84c19fde780..a72ac8c1217 100644 --- a/packages/local-db/drizzle/meta/_journal.json +++ b/packages/local-db/drizzle/meta/_journal.json @@ -113,6 +113,13 @@ "when": 1769649140217, "tag": "0015_add_notification_sounds_muted", "breakpoints": true + }, + { + "idx": 16, + "version": "6", + "when": 1769708198787, + "tag": "0016_add_voice_commands_enabled", + "breakpoints": true } ] } \ No newline at end of file diff --git a/packages/local-db/src/schema/schema.ts b/packages/local-db/src/schema/schema.ts index 7b0ff0a1ef7..e7ec1208827 100644 --- a/packages/local-db/src/schema/schema.ts +++ b/packages/local-db/src/schema/schema.ts @@ -153,6 +153,9 @@ export const settings = sqliteTable("settings", { notificationSoundsMuted: integer("notification_sounds_muted", { mode: "boolean", }), + voiceCommandsEnabled: integer("voice_commands_enabled", { + mode: "boolean", + }).default(false), }); export type InsertSettings = typeof settings.$inferInsert; From e883e67c151357cc67948d7b4feae81264f5633b Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 12:15:11 -0800 Subject: [PATCH 02/18] ci: add voice sidecar build step to desktop workflow Sets up Python 3.11, creates a venv with openwakeword/sounddevice/numpy, and runs the PyInstaller build script so the voice-sidecar binary gets bundled into the Electron app's extraResources. 
--- .github/workflows/build-desktop.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/build-desktop.yml b/.github/workflows/build-desktop.yml index c811334bcaa..f0d634ee33a 100644 --- a/.github/workflows/build-desktop.yml +++ b/.github/workflows/build-desktop.yml @@ -81,6 +81,18 @@ jobs: working-directory: apps/desktop run: bun run clean:dev + - name: Setup Python for voice sidecar + uses: actions/setup-python@v5 + with: + python-version: "3.11" + + - name: Build voice sidecar + working-directory: apps/desktop + run: | + python3 -m venv src/main/lib/voice/python/.venv + src/main/lib/voice/python/.venv/bin/pip install openwakeword sounddevice numpy pyinstaller + bash scripts/build-voice-sidecar.sh + - name: Compile app with electron-vite working-directory: apps/desktop env: From 24f2f4ed083160cba5954759a21373d317853829 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 12:17:16 -0800 Subject: [PATCH 03/18] refactor: make voice sidecar build self-contained MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The build script now auto-creates the venv and installs dependencies, so CI doesn't need to know about Python internals. It runs as part of prepackage — CI only needs setup-python to ensure python3 is on PATH. 
--- .github/workflows/build-desktop.yml | 9 +----- apps/desktop/package.json | 2 +- apps/desktop/scripts/build-voice-sidecar.sh | 32 ++++++++++++--------- 3 files changed, 21 insertions(+), 22 deletions(-) diff --git a/.github/workflows/build-desktop.yml b/.github/workflows/build-desktop.yml index f0d634ee33a..1ba50b02dc1 100644 --- a/.github/workflows/build-desktop.yml +++ b/.github/workflows/build-desktop.yml @@ -81,18 +81,11 @@ jobs: working-directory: apps/desktop run: bun run clean:dev - - name: Setup Python for voice sidecar + - name: Setup Python uses: actions/setup-python@v5 with: python-version: "3.11" - - name: Build voice sidecar - working-directory: apps/desktop - run: | - python3 -m venv src/main/lib/voice/python/.venv - src/main/lib/voice/python/.venv/bin/pip install openwakeword sounddevice numpy pyinstaller - bash scripts/build-voice-sidecar.sh - - name: Compile app with electron-vite working-directory: apps/desktop env: diff --git a/apps/desktop/package.json b/apps/desktop/package.json index 288a116f80f..973cacd5bce 100644 --- a/apps/desktop/package.json +++ b/apps/desktop/package.json @@ -22,7 +22,7 @@ "copy:native-modules": "bun run scripts/copy-native-modules.ts", "prebuild": "bun run clean:dev && bun run compile:app && bun run copy:native-modules", "build": "cross-env CSC_IDENTITY_AUTO_DISCOVERY=false electron-builder --publish never", - "prepackage": "bun run copy:native-modules", + "prepackage": "bun run copy:native-modules && bash scripts/build-voice-sidecar.sh", "package": "electron-builder --config electron-builder.ts", "install:deps": "electron-builder install-app-deps", "release": "electron-builder --publish always", diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index f84edd7e978..6e14d823d55 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -3,8 +3,9 @@ # The output binary is placed in dist/voice-sidecar/ and gets bundled 
into # the Electron app's extraResources by electron-builder. # -# Prerequisites: -# pip install pyinstaller (in the voice python venv) +# This script is self-contained: it creates the venv and installs all +# dependencies automatically if they are missing. The only prerequisite +# is that `python3` is available on the PATH. # # Usage: # ./scripts/build-voice-sidecar.sh @@ -17,22 +18,28 @@ PYTHON_DIR="$DESKTOP_DIR/src/main/lib/voice/python" VENV_DIR="$PYTHON_DIR/.venv" OUTPUT_DIR="$DESKTOP_DIR/dist/voice-sidecar" +PYTHON="$VENV_DIR/bin/python3" +PIP="$VENV_DIR/bin/pip" + +# Create venv if it doesn't exist if [ ! -d "$VENV_DIR" ]; then - echo "Error: Python venv not found at $VENV_DIR" - echo "Create it with: python3 -m venv $VENV_DIR && $VENV_DIR/bin/pip install openwakeword sounddevice numpy" - exit 1 + echo "[voice-sidecar] Creating Python venv..." + python3 -m venv "$VENV_DIR" fi -PYTHON="$VENV_DIR/bin/python3" -PIP="$VENV_DIR/bin/pip" +# Install runtime dependencies +if ! "$PYTHON" -c "import openwakeword" 2>/dev/null; then + echo "[voice-sidecar] Installing dependencies..." + "$PIP" install --quiet openwakeword sounddevice numpy +fi -# Ensure PyInstaller is installed +# Install PyInstaller if ! "$PYTHON" -c "import PyInstaller" 2>/dev/null; then - echo "Installing PyInstaller..." - "$PIP" install pyinstaller + echo "[voice-sidecar] Installing PyInstaller..." + "$PIP" install --quiet pyinstaller fi -echo "Building voice sidecar binary..." +echo "[voice-sidecar] Building binary..." "$PYTHON" -m PyInstaller \ --name voice-sidecar \ @@ -45,6 +52,5 @@ echo "Building voice sidecar binary..." 
--collect-data openwakeword \ "$PYTHON_DIR/main.py" -echo "Voice sidecar binary built at: $OUTPUT_DIR/voice-sidecar/" -echo "Contents:" +echo "[voice-sidecar] Built at: $OUTPUT_DIR/voice-sidecar/" ls -la "$OUTPUT_DIR/voice-sidecar/" From 8d87aed9ca8b83790f05f27e0f0ee16d0dbde664 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 13:47:57 -0800 Subject: [PATCH 04/18] feat(desktop): gate voice commands on macOS microphone permission Request mic access before starting the voice sidecar, show guidance when permission is denied, and re-check on window focus so users can grant access in System Settings and return to a working toggle. --- .../src/lib/trpc/routers/voice/index.ts | 35 +++++++- .../components/Voice/VoiceListener.tsx | 17 +++- .../BehaviorSettings/BehaviorSettings.tsx | 89 +++++++++++++++---- apps/desktop/src/shared/voice.ts | 6 ++ 4 files changed, 127 insertions(+), 20 deletions(-) diff --git a/apps/desktop/src/lib/trpc/routers/voice/index.ts b/apps/desktop/src/lib/trpc/routers/voice/index.ts index 7a706147599..1a83da523a5 100644 --- a/apps/desktop/src/lib/trpc/routers/voice/index.ts +++ b/apps/desktop/src/lib/trpc/routers/voice/index.ts @@ -1,13 +1,23 @@ import { observable } from "@trpc/server/observable"; +import { systemPreferences } from "electron"; import { getCurrentVoiceState, startVoiceProcess, stopVoiceProcess, voiceProcessEmitter, } from "main/lib/voice/voice-process"; -import type { VoiceSidecarEvent } from "shared/voice"; +import type { MicPermissionStatus, VoiceSidecarEvent } from "shared/voice"; import { publicProcedure, router } from "../.."; +function getMicStatus(): MicPermissionStatus { + if (process.platform !== "darwin") { + return "granted"; + } + return systemPreferences.getMediaAccessStatus( + "microphone", + ) as MicPermissionStatus; +} + export const createVoiceRouter = () => { let subscriberCount = 0; @@ -50,5 +60,28 @@ export const createVoiceRouter = () => { stopVoiceProcess(); return { success: true as const }; }), 
+ + getMicPermission: publicProcedure.query((): MicPermissionStatus => { + return getMicStatus(); + }), + + requestMicPermission: publicProcedure.mutation( + async (): Promise<{ granted: boolean; status: MicPermissionStatus }> => { + const current = getMicStatus(); + + if (current === "granted") { + return { granted: true, status: "granted" }; + } + + if (current !== "not-determined") { + return { granted: false, status: current }; + } + + const granted = await systemPreferences.askForMediaAccess("microphone"); + const status = getMicStatus(); + + return { granted, status }; + }, + ), }); }; diff --git a/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx index 829cf7644ab..a0aed59188d 100644 --- a/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx +++ b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx @@ -15,18 +15,27 @@ export function VoiceListener() { const { data: voiceEnabled } = electronTrpc.settings.getVoiceCommandsEnabled.useQuery(); + const { data: micPermission } = electronTrpc.voice.getMicPermission.useQuery( + undefined, + { + refetchOnWindowFocus: true, + }, + ); + + const canListen = !!voiceEnabled && micPermission === "granted"; + const indicatorToastRef = useRef(null); const responseToastRef = useRef(null); - // Dismiss any lingering toasts when voice is disabled + // Dismiss any lingering toasts when voice is disabled or permission revoked useEffect(() => { - if (!voiceEnabled) { + if (!canListen) { dismissAll(indicatorToastRef, responseToastRef); } - }, [voiceEnabled]); + }, [canListen]); electronTrpc.voice.subscribe.useSubscription(undefined, { - enabled: !!voiceEnabled, + enabled: canListen, onData: (event) => { switch (event.type) { case "recording": { diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx 
b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx index a7ee0a66102..2ca4d63ff05 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx @@ -85,8 +85,43 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { }, }); + const { data: micPermission } = electronTrpc.voice.getMicPermission.useQuery( + undefined, + { + refetchOnWindowFocus: true, + }, + ); + + const requestMicPermission = + electronTrpc.voice.requestMicPermission.useMutation({ + onSuccess: ({ granted }) => { + utils.voice.getMicPermission.invalidate(); + if (granted) { + setVoiceCommandsEnabled.mutate({ enabled: true }); + } + }, + }); + + const openUrl = electronTrpc.external.openUrl.useMutation(); + + const micDenied = + micPermission === "denied" || micPermission === "restricted"; + const handleVoiceToggle = (enabled: boolean) => { - setVoiceCommandsEnabled.mutate({ enabled }); + if (!enabled) { + setVoiceCommandsEnabled.mutate({ enabled: false }); + return; + } + + if (micPermission === "granted") { + setVoiceCommandsEnabled.mutate({ enabled: true }); + return; + } + + if (micPermission === "not-determined") { + requestMicPermission.mutate(); + return; + } }; const { data: branchPrefix, isLoading: isBranchPrefixLoading } = @@ -169,21 +204,45 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { )} {showVoiceCommands && ( -
-
- -

- Enable wake word detection and voice commands -

+
+
+
+ +

+ Enable wake word detection and voice commands +

+
+
- + {micDenied && ( +

+ Microphone access was denied.{" "} + {" "} + to grant access, then return to this window. +

+ )}
)} diff --git a/apps/desktop/src/shared/voice.ts b/apps/desktop/src/shared/voice.ts index 3699e394307..dcece441f1f 100644 --- a/apps/desktop/src/shared/voice.ts +++ b/apps/desktop/src/shared/voice.ts @@ -1,3 +1,9 @@ +export type MicPermissionStatus = + | "not-determined" + | "granted" + | "denied" + | "restricted"; + /** * Voice sidecar events emitted by the Python child process via stdio JSON lines. * These are the events the tRPC subscription forwards to the renderer. From d5d40dda95df7b9acb101c84f361d823f9ce7443 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 14:23:22 -0800 Subject: [PATCH 05/18] fix(desktop): preserve base extendInfo in canary build config The canary config was replacing the entire extendInfo object, dropping NSMicrophoneUsageDescription and NSLocalNetworkUsageDescription from the base config. Spread the base extendInfo so these plist keys are preserved. --- apps/desktop/electron-builder.canary.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/desktop/electron-builder.canary.ts b/apps/desktop/electron-builder.canary.ts index fe20ad0d2c1..edfded17095 100644 --- a/apps/desktop/electron-builder.canary.ts +++ b/apps/desktop/electron-builder.canary.ts @@ -31,6 +31,7 @@ const config: Configuration = { icon: join(pkg.resources, "build/icons/icon-canary.icns"), artifactName: `Superset-Canary-\${version}-\${arch}.\${ext}`, extendInfo: { + ...baseConfig.mac?.extendInfo, CFBundleName: productName, CFBundleDisplayName: productName, }, From 93fc92445dcea590b3b625c9eab5c6cdd1b89f45 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 14:59:47 -0800 Subject: [PATCH 06/18] fix(desktop): add audio-input entitlement for microphone access Hardened runtime blocks microphone access without the com.apple.security.device.audio-input entitlement, causing getMediaAccessStatus to return "denied" regardless of TCC state. Add an explicit entitlements plist with the audio-input entitlement. 
--- apps/desktop/electron-builder.ts | 2 ++ .../src/resources/build/entitlements.mac.plist | 14 ++++++++++++++ 2 files changed, 16 insertions(+) create mode 100644 apps/desktop/src/resources/build/entitlements.mac.plist diff --git a/apps/desktop/electron-builder.ts b/apps/desktop/electron-builder.ts index cf146fb3701..55c80133476 100644 --- a/apps/desktop/electron-builder.ts +++ b/apps/desktop/electron-builder.ts @@ -123,6 +123,8 @@ const config: Configuration = { hardenedRuntime: true, gatekeeperAssess: false, notarize: true, + entitlements: join(pkg.resources, "build/entitlements.mac.plist"), + entitlementsInherit: join(pkg.resources, "build/entitlements.mac.plist"), extendInfo: { CFBundleName: productName, CFBundleDisplayName: productName, diff --git a/apps/desktop/src/resources/build/entitlements.mac.plist b/apps/desktop/src/resources/build/entitlements.mac.plist new file mode 100644 index 00000000000..f7d1e352274 --- /dev/null +++ b/apps/desktop/src/resources/build/entitlements.mac.plist @@ -0,0 +1,14 @@ + + + + + com.apple.security.cs.allow-jit + + com.apple.security.cs.allow-unsigned-executable-memory + + com.apple.security.cs.disable-library-validation + + com.apple.security.device.audio-input + + + From 053cfe77339d685d005c2dc29edfb5125fa3c976 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 16:17:33 -0800 Subject: [PATCH 07/18] ci: add voice sidecar build step to desktop workflow The workflow had Setup Python but never ran build-voice-sidecar.sh, so the sidecar binary was packaged without the openwakeword model data. Add the missing build step before electron-vite compilation. 
--- .github/workflows/build-desktop.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-desktop.yml b/.github/workflows/build-desktop.yml index 1ba50b02dc1..8f515539eea 100644 --- a/.github/workflows/build-desktop.yml +++ b/.github/workflows/build-desktop.yml @@ -86,6 +86,10 @@ jobs: with: python-version: "3.11" + - name: Build voice sidecar + working-directory: apps/desktop + run: bash scripts/build-voice-sidecar.sh + - name: Compile app with electron-vite working-directory: apps/desktop env: From e99ac4fe12a869c3a65f2fdf5e3c3dfe1c5be888 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 16:26:16 -0800 Subject: [PATCH 08/18] fix(desktop): explicitly add openwakeword data to PyInstaller bundle --collect-data openwakeword alone was not including the model .onnx files in the PyInstaller bundle. Add --add-data with the resolved package path as a fallback, and verify the hey_jarvis model exists in the output before finishing the build. Also revert the redundant CI workflow step since the sidecar build is already wired through prepackage in package.json. 
--- .github/workflows/build-desktop.yml | 4 ---- apps/desktop/scripts/build-voice-sidecar.sh | 12 ++++++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-desktop.yml b/.github/workflows/build-desktop.yml index 8f515539eea..1ba50b02dc1 100644 --- a/.github/workflows/build-desktop.yml +++ b/.github/workflows/build-desktop.yml @@ -86,10 +86,6 @@ jobs: with: python-version: "3.11" - - name: Build voice sidecar - working-directory: apps/desktop - run: bash scripts/build-voice-sidecar.sh - - name: Compile app with electron-vite working-directory: apps/desktop env: diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index 6e14d823d55..88566d79d91 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -41,6 +41,10 @@ fi echo "[voice-sidecar] Building binary..." +# Resolve the openwakeword package directory so we can explicitly add its +# data files. --collect-data alone can miss them depending on PyInstaller version. +OWW_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") + "$PYTHON" -m PyInstaller \ --name voice-sidecar \ --onedir \ @@ -50,7 +54,15 @@ echo "[voice-sidecar] Building binary..." --workpath "$DESKTOP_DIR/dist/voice-sidecar-build" \ --specpath "$DESKTOP_DIR/dist" \ --collect-data openwakeword \ + --add-data "$OWW_DIR:openwakeword" \ "$PYTHON_DIR/main.py" echo "[voice-sidecar] Built at: $OUTPUT_DIR/voice-sidecar/" ls -la "$OUTPUT_DIR/voice-sidecar/" + +# Verify the wake word model was bundled +if [ ! -f "$OUTPUT_DIR/voice-sidecar/_internal/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then + echo "[voice-sidecar] ERROR: hey_jarvis model not found in bundle!" + exit 1 +fi +echo "[voice-sidecar] Verified hey_jarvis model is bundled." 
From 308c69805080a08973a3057a90207beae6b5f1fd Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 16:43:35 -0800 Subject: [PATCH 09/18] fix(desktop): copy openwakeword data as fallback in sidecar build PyInstaller's --collect-data silently misses openwakeword model files. Fall back to manually copying the package data into _internal/ when the model isn't found after the initial build. --- apps/desktop/scripts/build-voice-sidecar.sh | 25 +++++++++++++-------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index 88566d79d91..ab20ecdc342 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -41,10 +41,6 @@ fi echo "[voice-sidecar] Building binary..." -# Resolve the openwakeword package directory so we can explicitly add its -# data files. --collect-data alone can miss them depending on PyInstaller version. -OWW_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") - "$PYTHON" -m PyInstaller \ --name voice-sidecar \ --onedir \ @@ -54,14 +50,25 @@ OWW_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakew --workpath "$DESKTOP_DIR/dist/voice-sidecar-build" \ --specpath "$DESKTOP_DIR/dist" \ --collect-data openwakeword \ - --add-data "$OWW_DIR:openwakeword" \ "$PYTHON_DIR/main.py" -echo "[voice-sidecar] Built at: $OUTPUT_DIR/voice-sidecar/" -ls -la "$OUTPUT_DIR/voice-sidecar/" +BUNDLE_DIR="$OUTPUT_DIR/voice-sidecar" +INTERNAL_DIR="$BUNDLE_DIR/_internal" + +echo "[voice-sidecar] Built at: $BUNDLE_DIR/" +ls -la "$BUNDLE_DIR/" + +# PyInstaller's --collect-data may miss openwakeword's data files. +# Copy them manually as a guaranteed fallback. +if [ ! -f "$INTERNAL_DIR/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then + echo "[voice-sidecar] Model not found in bundle, copying openwakeword data manually..." 
+ OWW_PKG_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") + mkdir -p "$INTERNAL_DIR/openwakeword" + cp -R "$OWW_PKG_DIR/"* "$INTERNAL_DIR/openwakeword/" +fi -# Verify the wake word model was bundled -if [ ! -f "$OUTPUT_DIR/voice-sidecar/_internal/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then +# Final verification +if [ ! -f "$INTERNAL_DIR/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then echo "[voice-sidecar] ERROR: hey_jarvis model not found in bundle!" exit 1 fi From 29100c9f6a9af5b98f8dbf1492b28c5adb98ae53 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 17:02:41 -0800 Subject: [PATCH 10/18] fix(desktop): fix openwakeword copy in sidecar build script rm the target first so cp -R creates a clean copy of the package directory instead of nesting it inside an existing partial dir. --- apps/desktop/scripts/build-voice-sidecar.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index ab20ecdc342..9ec70ea1e23 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -63,8 +63,10 @@ ls -la "$BUNDLE_DIR/" if [ ! -f "$INTERNAL_DIR/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then echo "[voice-sidecar] Model not found in bundle, copying openwakeword data manually..." 
OWW_PKG_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") - mkdir -p "$INTERNAL_DIR/openwakeword" - cp -R "$OWW_PKG_DIR/"* "$INTERNAL_DIR/openwakeword/" + echo "[voice-sidecar] Copying from: $OWW_PKG_DIR" + # Remove any partial directory and copy the full package tree + rm -rf "$INTERNAL_DIR/openwakeword" + cp -R "$OWW_PKG_DIR" "$INTERNAL_DIR/openwakeword" fi # Final verification From 06ada2893bd92425fe9f1db90ddb73eb947d3304 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 17:17:47 -0800 Subject: [PATCH 11/18] fix(desktop): add debug output for openwakeword model copy --- apps/desktop/scripts/build-voice-sidecar.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index 9ec70ea1e23..8fad4853db4 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -64,9 +64,13 @@ if [ ! -f "$INTERNAL_DIR/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; echo "[voice-sidecar] Model not found in bundle, copying openwakeword data manually..." OWW_PKG_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") echo "[voice-sidecar] Copying from: $OWW_PKG_DIR" + echo "[voice-sidecar] Source package contents:" + find "$OWW_PKG_DIR" -name "*.onnx" 2>/dev/null || echo "[voice-sidecar] No .onnx files in source!" # Remove any partial directory and copy the full package tree rm -rf "$INTERNAL_DIR/openwakeword" cp -R "$OWW_PKG_DIR" "$INTERNAL_DIR/openwakeword" + echo "[voice-sidecar] After copy:" + find "$INTERNAL_DIR/openwakeword" -name "*.onnx" 2>/dev/null || echo "[voice-sidecar] No .onnx files after copy!" 
fi # Final verification From e71edc029d6039dc56ccac30e49158d37eee35df Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Thu, 29 Jan 2026 17:38:22 -0800 Subject: [PATCH 12/18] fix(desktop): download openwakeword models in sidecar build openwakeword >=0.6.0 no longer ships pre-trained models in the pip package. Download hey_jarvis, melspectrogram, and embedding_model from the v0.5.1 GitHub release during the sidecar build. --- apps/desktop/scripts/build-voice-sidecar.sh | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/apps/desktop/scripts/build-voice-sidecar.sh b/apps/desktop/scripts/build-voice-sidecar.sh index 8fad4853db4..648e6559a52 100755 --- a/apps/desktop/scripts/build-voice-sidecar.sh +++ b/apps/desktop/scripts/build-voice-sidecar.sh @@ -33,6 +33,20 @@ if ! "$PYTHON" -c "import openwakeword" 2>/dev/null; then "$PIP" install --quiet openwakeword sounddevice numpy fi +# openwakeword >=0.6.0 no longer ships pre-trained models in the pip package. +# Download the required models into the package's resources directory. +OWW_PKG_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") +OWW_MODELS_DIR="$OWW_PKG_DIR/resources/models" +mkdir -p "$OWW_MODELS_DIR" + +OWW_BASE_URL="https://github.com/dscripka/openWakeWord/releases/download/v0.5.1" +for model in hey_jarvis_v0.1.onnx melspectrogram.onnx embedding_model.onnx; do + if [ ! -f "$OWW_MODELS_DIR/$model" ]; then + echo "[voice-sidecar] Downloading model: $model" + curl -sL "$OWW_BASE_URL/$model" -o "$OWW_MODELS_DIR/$model" + fi +done + # Install PyInstaller if ! "$PYTHON" -c "import PyInstaller" 2>/dev/null; then echo "[voice-sidecar] Installing PyInstaller..." @@ -63,14 +77,9 @@ ls -la "$BUNDLE_DIR/" if [ ! -f "$INTERNAL_DIR/openwakeword/resources/models/hey_jarvis_v0.1.onnx" ]; then echo "[voice-sidecar] Model not found in bundle, copying openwakeword data manually..." 
OWW_PKG_DIR=$("$PYTHON" -c "import openwakeword, os; print(os.path.dirname(openwakeword.__file__))") - echo "[voice-sidecar] Copying from: $OWW_PKG_DIR" - echo "[voice-sidecar] Source package contents:" - find "$OWW_PKG_DIR" -name "*.onnx" 2>/dev/null || echo "[voice-sidecar] No .onnx files in source!" - # Remove any partial directory and copy the full package tree + echo "[voice-sidecar] Copying openwakeword package from: $OWW_PKG_DIR" rm -rf "$INTERNAL_DIR/openwakeword" cp -R "$OWW_PKG_DIR" "$INTERNAL_DIR/openwakeword" - echo "[voice-sidecar] After copy:" - find "$INTERNAL_DIR/openwakeword" -name "*.onnx" 2>/dev/null || echo "[voice-sidecar] No .onnx files after copy!" fi # Final verification From 9b7e000055a0a65064113b2898d33ae5aa3365f5 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Fri, 30 Jan 2026 14:02:20 -0800 Subject: [PATCH 13/18] fix(api): update voice imports to use @superset/mcp package After rebase onto main, update voice API imports from @/lib/mcp/* to @superset/mcp and @superset/mcp/auth. Remove duplicate ANTHROPIC_API_KEY in env schema (already added by Slack integration). 
--- apps/api/src/app/api/voice/route.ts | 2 +- apps/api/src/app/api/voice/tool-adapter.ts | 4 ++-- apps/api/src/app/api/voice/voice-service.ts | 2 +- apps/api/src/env.ts | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/apps/api/src/app/api/voice/route.ts b/apps/api/src/app/api/voice/route.ts index 8464966517d..0b7146734d3 100644 --- a/apps/api/src/app/api/voice/route.ts +++ b/apps/api/src/app/api/voice/route.ts @@ -1,5 +1,5 @@ import { auth } from "@superset/auth/server"; -import type { McpContext } from "@/lib/mcp/auth"; +import type { McpContext } from "@superset/mcp/auth"; import { runVoicePipeline } from "./voice-service"; async function authenticate(request: Request): Promise { diff --git a/apps/api/src/app/api/voice/tool-adapter.ts b/apps/api/src/app/api/voice/tool-adapter.ts index d52e4e7f3c3..1c64b142af4 100644 --- a/apps/api/src/app/api/voice/tool-adapter.ts +++ b/apps/api/src/app/api/voice/tool-adapter.ts @@ -1,6 +1,6 @@ import type Anthropic from "@anthropic-ai/sdk"; import { z } from "zod"; -import type { McpContext } from "@/lib/mcp/auth"; +import type { McpContext } from "@superset/mcp/auth"; type ToolHandler = ( params: Record, @@ -77,7 +77,7 @@ export async function getToolDefinitions(): Promise { }, }; - const { registerTools } = await import("@/lib/mcp/tools"); + const { registerTools } = await import("@superset/mcp"); registerTools(interceptServer as never); cachedTools = tools; diff --git a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts index a3e0901d84a..99355f0626e 100644 --- a/apps/api/src/app/api/voice/voice-service.ts +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -1,7 +1,7 @@ import Anthropic from "@anthropic-ai/sdk"; import { OpenAI } from "openai"; import { env } from "@/env"; -import type { McpContext } from "@/lib/mcp/auth"; +import type { McpContext } from "@superset/mcp/auth"; import { executeTool, getToolDefinitions, diff --git a/apps/api/src/env.ts 
b/apps/api/src/env.ts index 0db088c82f5..17a0e09cdb1 100644 --- a/apps/api/src/env.ts +++ b/apps/api/src/env.ts @@ -41,7 +41,6 @@ export const env = createEnv({ STRIPE_PRO_YEARLY_PRICE_ID: z.string(), SENTRY_AUTH_TOKEN: z.string().optional(), OPENAI_API_KEY: z.string().min(1), - ANTHROPIC_API_KEY: z.string().min(1), }, client: { NEXT_PUBLIC_API_URL: z.string().url(), From 8c00fdbdbe1f6da7a640d7d28513b6b427e17c54 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Fri, 30 Jan 2026 14:13:12 -0800 Subject: [PATCH 14/18] fix(api): resolve type errors in voice tool-adapter and voice-service Cast ZodType through unknown for internal _zod access, and cast Uint8Array to BlobPart for Blob constructor compatibility. --- apps/api/src/app/api/voice/tool-adapter.ts | 5 +++-- apps/api/src/app/api/voice/voice-service.ts | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/app/api/voice/tool-adapter.ts b/apps/api/src/app/api/voice/tool-adapter.ts index 1c64b142af4..e9ebc433ca6 100644 --- a/apps/api/src/app/api/voice/tool-adapter.ts +++ b/apps/api/src/app/api/voice/tool-adapter.ts @@ -97,8 +97,9 @@ function zodToJsonSchema(schema: z.ZodType): Record { // Unwrap optional/default wrappers const innerSchema = unwrapZod(schema); - const innerDef = (innerSchema as { _zod?: { def?: Record } }) - ._zod?.def; + const innerDef = ( + innerSchema as unknown as { _zod?: { def?: Record } } + )._zod?.def; const typeName = (innerDef?.typeName ?? def?.type ?? def?.typeName ?? 
diff --git a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts index 99355f0626e..99c3c7469e7 100644 --- a/apps/api/src/app/api/voice/voice-service.ts +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -23,7 +23,7 @@ interface SSEWriter { async function transcribeAudio(audioBuffer: Uint8Array): Promise { const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); - const blob = new Blob([audioBuffer], { type: "audio/wav" }); + const blob = new Blob([audioBuffer as BlobPart], { type: "audio/wav" }); const file = new File([blob], "audio.wav", { type: "audio/wav" }); const result = await openai.audio.transcriptions.create({ From 9f853dce2412bfe0e35db668706194a10dd7a285 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Fri, 30 Jan 2026 14:20:05 -0800 Subject: [PATCH 15/18] fix(desktop): harden voice process lifecycle against rapid start/stop - Capture process reference locally in stopVoiceProcess and clear childProcess immediately, so startVoiceProcess can proceed if called while the old process is still shutting down - Guard exit/error handlers with identity check (childProcess === proc) to avoid cleaning up a newer process when a stale one exits - Emit idle event on cleanup so subscribers know process stopped - Fix import ordering (biome auto-fix) --- apps/api/src/app/api/voice/tool-adapter.ts | 2 +- apps/api/src/app/api/voice/voice-service.ts | 2 +- .../src/main/lib/voice/voice-process.ts | 44 ++++++++++++------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/apps/api/src/app/api/voice/tool-adapter.ts b/apps/api/src/app/api/voice/tool-adapter.ts index e9ebc433ca6..5b1cc873122 100644 --- a/apps/api/src/app/api/voice/tool-adapter.ts +++ b/apps/api/src/app/api/voice/tool-adapter.ts @@ -1,6 +1,6 @@ import type Anthropic from "@anthropic-ai/sdk"; -import { z } from "zod"; import type { McpContext } from "@superset/mcp/auth"; +import { z } from "zod"; type ToolHandler = ( params: Record, diff --git 
a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts index 99c3c7469e7..e9b5641d989 100644 --- a/apps/api/src/app/api/voice/voice-service.ts +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -1,7 +1,7 @@ import Anthropic from "@anthropic-ai/sdk"; +import type { McpContext } from "@superset/mcp/auth"; import { OpenAI } from "openai"; import { env } from "@/env"; -import type { McpContext } from "@superset/mcp/auth"; import { executeTool, getToolDefinitions, diff --git a/apps/desktop/src/main/lib/voice/voice-process.ts b/apps/desktop/src/main/lib/voice/voice-process.ts index f6af4827b06..30d61db386f 100644 --- a/apps/desktop/src/main/lib/voice/voice-process.ts +++ b/apps/desktop/src/main/lib/voice/voice-process.ts @@ -47,17 +47,18 @@ export function startVoiceProcess(): void { `[voice-process] Starting: ${config.command} ${config.args.join(" ")}`, ); - childProcess = spawn(config.command, config.args, { + const proc = spawn(config.command, config.args, { cwd: config.cwd, stdio: ["pipe", "pipe", "pipe"], env: { ...process.env }, }); + childProcess = proc; isRunning = true; // Parse stdout JSON lines - if (childProcess.stdout) { - const rl = createInterface({ input: childProcess.stdout }); + if (proc.stdout) { + const rl = createInterface({ input: proc.stdout }); rl.on("line", (line) => { try { const raw = JSON.parse(line) as PythonVoiceEvent; @@ -73,25 +74,32 @@ export function startVoiceProcess(): void { } // Log stderr - if (childProcess.stderr) { - const rl = createInterface({ input: childProcess.stderr }); + if (proc.stderr) { + const rl = createInterface({ input: proc.stderr }); rl.on("line", (line) => { console.error("[voice-process/stderr]", line); }); } - childProcess.on("error", (err) => { + // Only run cleanup if this process is still the active one. + // A newer process may have been spawned after stopVoiceProcess() + // cleared the reference. 
+ proc.on("error", (err) => { console.error("[voice-process] Spawn error:", err.message); voiceProcessEmitter.emit("voice-event", { type: "error", message: `Process error: ${err.message}`, } satisfies VoiceSidecarEvent); - cleanup(); + if (childProcess === proc) { + cleanup(); + } }); - childProcess.on("exit", (code, signal) => { + proc.on("exit", (code, signal) => { console.log(`[voice-process] Exited with code=${code} signal=${signal}`); - cleanup(); + if (childProcess === proc) { + cleanup(); + } }); } @@ -100,10 +108,15 @@ export function stopVoiceProcess(): void { return; } + // Capture reference and clear immediately so startVoiceProcess() + // can proceed if called while this process is still shutting down. + const proc = childProcess; + cleanup(); + // Send stop command via stdin - if (childProcess.stdin && !childProcess.stdin.destroyed) { + if (proc.stdin && !proc.stdin.destroyed) { try { - childProcess.stdin.write(`${JSON.stringify({ cmd: "stop" })}\n`); + proc.stdin.write(`${JSON.stringify({ cmd: "stop" })}\n`); } catch { // stdin may be closed already } @@ -111,16 +124,16 @@ export function stopVoiceProcess(): void { // Give it a moment to exit gracefully, then force kill const timeout = setTimeout(() => { - if (childProcess) { - childProcess.kill("SIGKILL"); + if (!proc.killed) { + proc.kill("SIGKILL"); } }, 3000); - childProcess.once("exit", () => { + proc.once("exit", () => { clearTimeout(timeout); }); - childProcess.kill("SIGTERM"); + proc.kill("SIGTERM"); } export function getVoiceProcessStatus(): { @@ -137,4 +150,5 @@ function cleanup(): void { childProcess = null; isRunning = false; lastEvent = { type: "idle" }; + voiceProcessEmitter.emit("voice-event", lastEvent); } From 0eef5ee6d1fbbdcf24b2e6612e1d18e15414d3af Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Fri, 30 Jan 2026 14:40:48 -0800 Subject: [PATCH 16/18] refactor(api): replace voice tool-adapter with in-memory MCP client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Delete the 257-line tool-adapter.ts that hand-rolled Zod-to-JSON-Schema conversion and MCP tool interception. Replace with createInMemoryMcpClient from @superset/mcp/in-memory, using client.listTools() and client.callTool() directly — the same pattern as the Slack agent. --- apps/api/src/app/api/voice/tool-adapter.ts | 258 -------------------- apps/api/src/app/api/voice/voice-service.ts | 196 ++++++++------- 2 files changed, 113 insertions(+), 341 deletions(-) delete mode 100644 apps/api/src/app/api/voice/tool-adapter.ts diff --git a/apps/api/src/app/api/voice/tool-adapter.ts b/apps/api/src/app/api/voice/tool-adapter.ts deleted file mode 100644 index 5b1cc873122..00000000000 --- a/apps/api/src/app/api/voice/tool-adapter.ts +++ /dev/null @@ -1,258 +0,0 @@ -import type Anthropic from "@anthropic-ai/sdk"; -import type { McpContext } from "@superset/mcp/auth"; -import { z } from "zod"; - -type ToolHandler = ( - params: Record, - ctx: McpContext, -) => Promise<{ - content: Array<{ type: "text"; text: string }>; - isError?: boolean; -}>; - -interface ToolDefinition { - name: string; - description: string; - input_schema: Anthropic.Tool["input_schema"]; - handler: ToolHandler; -} - -let cachedTools: ToolDefinition[] | null = null; - -/** - * Builds tool definitions by intercepting MCP tool registration. - * Converts Zod input schemas to JSON Schema for the Anthropic SDK. - * Results are cached since tool definitions are static. 
- */ -export async function getToolDefinitions(): Promise { - if (cachedTools) return cachedTools; - - const tools: ToolDefinition[] = []; - - const interceptServer = { - tool( - name: string, - description: string, - inputSchema: Record, - handler: ( - params: Record, - extra: { - authInfo?: { extra?: { mcpContext?: McpContext } }; - }, - ) => Promise<{ - content: Array<{ type: "text"; text: string }>; - isError?: boolean; - }>, - ) { - // Convert Zod schemas to JSON Schema properties - const properties: Record = {}; - const required: string[] = []; - - for (const [key, schema] of Object.entries(inputSchema)) { - try { - properties[key] = zodToJsonSchema(schema); - if (!isOptional(schema)) { - required.push(key); - } - } catch { - // Fallback for schemas that can't be converted - properties[key] = { type: "string" }; - } - } - - tools.push({ - name, - description, - input_schema: { - type: "object" as const, - properties, - ...(required.length > 0 ? { required } : {}), - }, - handler: async (params, ctx) => { - return handler(params, { - authInfo: { extra: { mcpContext: ctx } }, - }); - }, - }); - }, - }; - - const { registerTools } = await import("@superset/mcp"); - registerTools(interceptServer as never); - - cachedTools = tools; - return tools; -} - -/** - * Convert a Zod schema to a basic JSON Schema representation. - */ -function zodToJsonSchema(schema: z.ZodType): Record { - const def = ( - schema as unknown as { - _zod?: { def?: { type?: string; typeName?: string } }; - } - )._zod?.def; - const description = schema.description; - - // Unwrap optional/default wrappers - const innerSchema = unwrapZod(schema); - const innerDef = ( - innerSchema as unknown as { _zod?: { def?: Record } } - )._zod?.def; - const typeName = (innerDef?.typeName ?? - def?.type ?? - def?.typeName ?? 
- "") as string; - - const result: Record = {}; - - switch (typeName) { - case "ZodString": - case "string": - result.type = "string"; - break; - case "ZodNumber": - case "number": - result.type = "number"; - break; - case "ZodBoolean": - case "boolean": - result.type = "boolean"; - break; - case "ZodArray": - case "array": { - result.type = "array"; - const itemSchema = - (innerDef as Record)?.innerType ?? - (innerDef as Record)?.type; - if (itemSchema && itemSchema instanceof z.ZodType) { - result.items = zodToJsonSchema(itemSchema); - } - break; - } - case "ZodEnum": - case "enum": { - result.type = "string"; - const values = - (innerDef as Record)?.entries ?? - (innerDef as Record)?.values; - if (Array.isArray(values)) { - result.enum = values; - } else if (values && typeof values === "object") { - result.enum = Object.keys(values); - } - break; - } - case "ZodObject": - case "object": { - result.type = "object"; - const shape = (innerDef as Record)?.shape; - if (shape && typeof shape === "object") { - const props: Record = {}; - for (const [k, v] of Object.entries( - shape as Record, - )) { - props[k] = zodToJsonSchema(v); - } - result.properties = props; - } - break; - } - default: - result.type = "string"; - break; - } - - if (description) { - result.description = description; - } - - return result; -} - -/** - * Unwrap optional/default/nullable wrappers to get the inner type. - */ -function unwrapZod(schema: z.ZodType): z.ZodType { - const def = ( - schema as unknown as { _zod?: { def?: Record } } - )._zod?.def; - const typeName = (def?.typeName ?? "") as string; - - if ( - typeName === "ZodOptional" || - typeName === "ZodDefault" || - typeName === "ZodNullable" - ) { - const inner = def?.innerType; - if (inner && inner instanceof z.ZodType) { - return unwrapZod(inner); - } - } - - return schema; -} - -/** - * Check if a Zod schema is optional. 
- */ -function isOptional(schema: z.ZodType): boolean { - const def = ( - schema as unknown as { _zod?: { def?: Record } } - )._zod?.def; - const typeName = (def?.typeName ?? "") as string; - - if (typeName === "ZodOptional" || typeName === "ZodDefault") { - return true; - } - - const inner = def?.innerType; - if (inner && inner instanceof z.ZodType) { - return isOptional(inner); - } - - return false; -} - -/** - * Execute a tool by name with the given input and auth context. - */ -export async function executeTool({ - toolName, - toolInput, - ctx, - tools, -}: { - toolName: string; - toolInput: Record; - ctx: McpContext; - tools: ToolDefinition[]; -}): Promise { - const tool = tools.find((t) => t.name === toolName); - if (!tool) { - return JSON.stringify({ error: `Unknown tool: ${toolName}` }); - } - - try { - const result = await tool.handler(toolInput, ctx); - const text = result.content.map((c) => c.text).join("\n"); - return text; - } catch (error) { - console.error(`[voice/tool] Error executing ${toolName}:`, error); - return JSON.stringify({ - error: `Tool execution failed: ${error instanceof Error ? error.message : "Unknown error"}`, - }); - } -} - -/** - * Convert tool definitions to Anthropic SDK tool format. 
- */ -export function toAnthropicTools(tools: ToolDefinition[]): Anthropic.Tool[] { - return tools.map((t) => ({ - name: t.name, - description: t.description, - input_schema: t.input_schema, - })); -} diff --git a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts index e9b5641d989..181c63e215a 100644 --- a/apps/api/src/app/api/voice/voice-service.ts +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -1,25 +1,22 @@ import Anthropic from "@anthropic-ai/sdk"; import type { McpContext } from "@superset/mcp/auth"; +import { createInMemoryMcpClient } from "@superset/mcp/in-memory"; import { OpenAI } from "openai"; import { env } from "@/env"; -import { - executeTool, - getToolDefinitions, - toAnthropicTools, -} from "./tool-adapter"; const SYSTEM_PROMPT = `You are a helpful voice assistant for Superset, a project management tool. You have access to tools for creating and managing tasks, workspaces, and other organizational resources. Keep responses concise and conversational — the user is speaking to you, so respond in 1-3 sentences unless the question requires more detail. When you use tools, briefly confirm what you did.`; -/** - * SSE event types emitted during the voice pipeline. - */ +// Desktop-only tools that don't make sense in voice context +const DENIED_TOOLS = new Set([ + "navigate_to_workspace", + "switch_workspace", + "get_app_context", +]); + interface SSEWriter { write(event: string, data: unknown): void; } -/** - * Transcribes audio using OpenAI Whisper API. - */ async function transcribeAudio(audioBuffer: Uint8Array): Promise { const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); @@ -38,8 +35,7 @@ async function transcribeAudio(audioBuffer: Uint8Array): Promise { } /** - * Runs the full voice pipeline: transcription → Claude with tools → streaming response. - * Writes SSE events to the provided writer throughout. + * Runs the full voice pipeline: transcription → Claude with MCP tools → streaming SSE. 
*/ export async function runVoicePipeline({ audioBuffer, @@ -59,84 +55,118 @@ export async function runVoicePipeline({ return; } - // 2. Load tools - const toolDefs = await getToolDefinitions(); - const anthropicTools = toAnthropicTools(toolDefs); - - // 3. Stream Claude response with tool use loop - const anthropic = new Anthropic({ apiKey: env.ANTHROPIC_API_KEY }); - - const messages: Anthropic.MessageParam[] = [ - { role: "user", content: transcription }, - ]; - - let fullResponse = ""; - - // Tool use loop — Claude may call tools, then we feed results back - const MAX_TOOL_ROUNDS = 5; - for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { - const stream = anthropic.messages.stream({ - model: "claude-sonnet-4-20250514", - max_tokens: 1024, - system: SYSTEM_PROMPT, - messages, - tools: anthropicTools.length > 0 ? anthropicTools : undefined, - }); - - for await (const event of stream) { - if (event.type === "content_block_delta") { - if (event.delta.type === "text_delta") { - fullResponse += event.delta.text; - sse.write("text_delta", { delta: event.delta.text }); - } - } - } - - // Collect the final message to check for tool use - const finalMessage = await stream.finalMessage(); - const contentBlocks = finalMessage.content; + // 2. Create in-memory MCP client for tool access + const { client: mcpClient, cleanup } = await createInMemoryMcpClient({ + userId: ctx.userId, + organizationId: ctx.organizationId, + }); - // Check for tool use blocks - const toolUseBlocks = contentBlocks.filter( - (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", - ); + try { + const { tools: mcpTools } = await mcpClient.listTools(); + + const anthropicTools: Anthropic.Tool[] = mcpTools + .filter((t) => !DENIED_TOOLS.has(t.name)) + .map((t) => ({ + name: t.name, + description: t.description ?? "", + input_schema: t.inputSchema as Anthropic.Tool.InputSchema, + })); + + // 3. 
Stream Claude response with tool use loop + const anthropic = new Anthropic({ apiKey: env.ANTHROPIC_API_KEY }); + + const messages: Anthropic.MessageParam[] = [ + { role: "user", content: transcription }, + ]; + + let fullResponse = ""; + + const MAX_TOOL_ROUNDS = 5; + for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { + const stream = anthropic.messages.stream({ + model: "claude-sonnet-4-20250514", + max_tokens: 1024, + system: SYSTEM_PROMPT, + messages, + tools: anthropicTools.length > 0 ? anthropicTools : undefined, + }); - if (toolUseBlocks.length === 0) { - break; - } + for await (const event of stream) { + if (event.type === "content_block_delta") { + if (event.delta.type === "text_delta") { + fullResponse += event.delta.text; + sse.write("text_delta", { delta: event.delta.text }); + } + } + } - // Execute each tool call and collect results - const toolResults: Anthropic.ToolResultBlockParam[] = []; + const finalMessage = await stream.finalMessage(); + const contentBlocks = finalMessage.content; - for (const toolBlock of toolUseBlocks) { - sse.write("tool_use", { - toolName: toolBlock.name, - toolInput: toolBlock.input, - }); + const toolUseBlocks = contentBlocks.filter( + (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", + ); - const result = await executeTool({ - toolName: toolBlock.name, - toolInput: toolBlock.input as Record, - ctx, - tools: toolDefs, - }); + if (toolUseBlocks.length === 0) { + break; + } - sse.write("tool_result", { - toolName: toolBlock.name, - result, - }); + const toolResults: Anthropic.ToolResultBlockParam[] = []; + + for (const toolBlock of toolUseBlocks) { + sse.write("tool_use", { + toolName: toolBlock.name, + toolInput: toolBlock.input, + }); + + try { + const result = await mcpClient.callTool({ + name: toolBlock.name, + arguments: toolBlock.input as Record, + }); + + const resultText = JSON.stringify(result.content); + + sse.write("tool_result", { + toolName: toolBlock.name, + result: resultText, + }); 
+ + toolResults.push({ + type: "tool_result", + tool_use_id: toolBlock.id, + content: resultText, + }); + } catch (error) { + console.error( + `[voice/tool] Error executing ${toolBlock.name}:`, + error, + ); + const errorText = JSON.stringify({ + error: + error instanceof Error ? error.message : "Tool execution failed", + }); + + sse.write("tool_result", { + toolName: toolBlock.name, + result: errorText, + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolBlock.id, + content: errorText, + is_error: true, + }); + } + } - toolResults.push({ - type: "tool_result", - tool_use_id: toolBlock.id, - content: result, - }); + messages.push({ role: "assistant", content: contentBlocks }); + messages.push({ role: "user", content: toolResults }); } - // Feed tool results back into conversation for next iteration - messages.push({ role: "assistant", content: contentBlocks }); - messages.push({ role: "user", content: toolResults }); + sse.write("done", { fullResponse }); + } finally { + await cleanup().catch(() => {}); } - - sse.write("done", { fullResponse }); } From 0c0de550150cd0495607bf04dbf4d2c1bc431a6a Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Fri, 30 Jan 2026 14:59:28 -0800 Subject: [PATCH 17/18] refactor(desktop): co-locate voice types and remove dead code - Move VoiceSidecarEvent and PythonVoiceEvent into voice-process.ts - Inline MicPermissionStatus in voice router - Delete shared/voice.ts (renderer gets types via tRPC inference) - Remove unused getVoiceProcessStatus(), isRunning, start/stop mutations - Remove unused PythonVoiceCommand type --- .../src/lib/trpc/routers/voice/index.ts | 18 ++++------ .../src/main/lib/voice/voice-process.ts | 24 +++++++------ apps/desktop/src/shared/voice.ts | 35 ------------------- 3 files changed, 21 insertions(+), 56 deletions(-) delete mode 100644 apps/desktop/src/shared/voice.ts diff --git a/apps/desktop/src/lib/trpc/routers/voice/index.ts b/apps/desktop/src/lib/trpc/routers/voice/index.ts index 
1a83da523a5..d999d96004f 100644 --- a/apps/desktop/src/lib/trpc/routers/voice/index.ts +++ b/apps/desktop/src/lib/trpc/routers/voice/index.ts @@ -4,11 +4,17 @@ import { getCurrentVoiceState, startVoiceProcess, stopVoiceProcess, + type VoiceSidecarEvent, voiceProcessEmitter, } from "main/lib/voice/voice-process"; -import type { MicPermissionStatus, VoiceSidecarEvent } from "shared/voice"; import { publicProcedure, router } from "../.."; +type MicPermissionStatus = + | "not-determined" + | "granted" + | "denied" + | "restricted"; + function getMicStatus(): MicPermissionStatus { if (process.platform !== "darwin") { return "granted"; @@ -51,16 +57,6 @@ export const createVoiceRouter = () => { }); }), - start: publicProcedure.mutation(() => { - startVoiceProcess(); - return { success: true as const }; - }), - - stop: publicProcedure.mutation(() => { - stopVoiceProcess(); - return { success: true as const }; - }), - getMicPermission: publicProcedure.query((): MicPermissionStatus => { return getMicStatus(); }), diff --git a/apps/desktop/src/main/lib/voice/voice-process.ts b/apps/desktop/src/main/lib/voice/voice-process.ts index 30d61db386f..41e779e5c04 100644 --- a/apps/desktop/src/main/lib/voice/voice-process.ts +++ b/apps/desktop/src/main/lib/voice/voice-process.ts @@ -2,13 +2,25 @@ import type { ChildProcess } from "node:child_process"; import { spawn } from "node:child_process"; import { EventEmitter } from "node:events"; import { createInterface } from "node:readline"; -import type { PythonVoiceEvent, VoiceSidecarEvent } from "shared/voice"; import { getVoiceSpawnConfig } from "./voice-process-paths"; +export type VoiceSidecarEvent = + | { type: "ready" } + | { type: "recording" } + | { type: "audio_captured"; audioB64: string; durationS: number } + | { type: "error"; message: string } + | { type: "idle" }; + +interface PythonVoiceEvent { + event: "ready" | "recording" | "audio_captured" | "error" | "idle"; + audio_b64?: string; + duration_s?: number; + message?: 
string; +} + export const voiceProcessEmitter = new EventEmitter(); let childProcess: ChildProcess | null = null; -let isRunning = false; let lastEvent: VoiceSidecarEvent = { type: "idle" }; function parsePythonEvent(raw: PythonVoiceEvent): VoiceSidecarEvent | null { @@ -54,7 +66,6 @@ export function startVoiceProcess(): void { }); childProcess = proc; - isRunning = true; // Parse stdout JSON lines if (proc.stdout) { @@ -136,19 +147,12 @@ export function stopVoiceProcess(): void { proc.kill("SIGTERM"); } -export function getVoiceProcessStatus(): { - running: boolean; -} { - return { running: isRunning }; -} - export function getCurrentVoiceState(): VoiceSidecarEvent { return lastEvent; } function cleanup(): void { childProcess = null; - isRunning = false; lastEvent = { type: "idle" }; voiceProcessEmitter.emit("voice-event", lastEvent); } diff --git a/apps/desktop/src/shared/voice.ts b/apps/desktop/src/shared/voice.ts deleted file mode 100644 index dcece441f1f..00000000000 --- a/apps/desktop/src/shared/voice.ts +++ /dev/null @@ -1,35 +0,0 @@ -export type MicPermissionStatus = - | "not-determined" - | "granted" - | "denied" - | "restricted"; - -/** - * Voice sidecar events emitted by the Python child process via stdio JSON lines. - * These are the events the tRPC subscription forwards to the renderer. - */ - -export type VoiceSidecarEvent = - | { type: "ready" } - | { type: "recording" } - | { type: "audio_captured"; audioB64: string; durationS: number } - | { type: "error"; message: string } - | { type: "idle" }; - -/** - * Raw JSON events from the Python process stdout. - * Converted to VoiceSidecarEvent by voice-process.ts. - */ -export interface PythonVoiceEvent { - event: "ready" | "recording" | "audio_captured" | "error" | "idle"; - audio_b64?: string; - duration_s?: number; - message?: string; -} - -/** - * Commands sent to the Python process via stdin. 
- */ -export interface PythonVoiceCommand { - cmd: "start" | "stop"; -} From 6d07a9e62ea1821be739af3d5b1dd54a4c1281b2 Mon Sep 17 00:00:00 2001 From: Satya Patel Date: Mon, 2 Feb 2026 11:00:06 -0800 Subject: [PATCH 18/18] WIP --- apps/api/src/app/api/voice/route.ts | 49 ++--- apps/api/src/app/api/voice/voice-service.ts | 176 ++++++++++-------- .../components/Voice/VoiceListener.tsx | 16 +- .../RecordingIndicator/RecordingIndicator.tsx | 6 +- .../ResponsePanel/ResponsePanel.tsx | 153 +++++++-------- .../useVoicePipeline/useVoicePipeline.ts | 176 ++++++++---------- .../BehaviorSettings/BehaviorSettings.tsx | 2 +- .../utils/settings-search/settings-search.ts | 4 +- 8 files changed, 298 insertions(+), 284 deletions(-) diff --git a/apps/api/src/app/api/voice/route.ts b/apps/api/src/app/api/voice/route.ts index 0b7146734d3..1e63fc3c88e 100644 --- a/apps/api/src/app/api/voice/route.ts +++ b/apps/api/src/app/api/voice/route.ts @@ -1,34 +1,17 @@ import { auth } from "@superset/auth/server"; -import type { McpContext } from "@superset/mcp/auth"; import { runVoicePipeline } from "./voice-service"; -async function authenticate(request: Request): Promise { - // Try session auth +export async function POST(request: Request) { const session = await auth.api.getSession({ headers: request.headers }); - if (session?.session) { - const extendedSession = session.session as { - activeOrganizationId?: string; - }; - if (!extendedSession.activeOrganizationId) { - return null; - } - return { - userId: session.user.id, - organizationId: extendedSession.activeOrganizationId, - }; + if (!session?.user) { + return Response.json({ error: "Unauthorized" }, { status: 401 }); } - return null; -} - -export async function POST(request: Request) { - // 1. 
Authenticate - const ctx = await authenticate(request); - if (!ctx) { - return Response.json({ error: "Unauthorized" }, { status: 401 }); + const organizationId = session.session.activeOrganizationId; + if (!organizationId) { + return Response.json({ error: "No active organization" }, { status: 400 }); } - // 2. Parse multipart form data let formData: FormData; try { formData = await request.formData(); @@ -57,7 +40,6 @@ export async function POST(request: Request) { const audioBuffer = new Uint8Array(await audioFile.arrayBuffer()); - // 3. Stream SSE response const encoder = new TextEncoder(); const stream = new ReadableStream({ @@ -70,13 +52,20 @@ export async function POST(request: Request) { }; try { - await runVoicePipeline({ audioBuffer, ctx, sse }); - } catch (error) { - console.error("[voice/route] Pipeline error:", error); - sse.write("error", { - message: - error instanceof Error ? error.message : "Voice pipeline failed", + await runVoicePipeline({ + audioBuffer, + ctx: { userId: session.user.id, organizationId }, + sse, + signal: request.signal, }); + } catch (error) { + if (!request.signal.aborted) { + console.error("[voice/route] Pipeline error:", error); + sse.write("error", { + message: + error instanceof Error ? 
error.message : "Voice pipeline failed", + }); + } } finally { controller.close(); } diff --git a/apps/api/src/app/api/voice/voice-service.ts b/apps/api/src/app/api/voice/voice-service.ts index 181c63e215a..9dc823965c6 100644 --- a/apps/api/src/app/api/voice/voice-service.ts +++ b/apps/api/src/app/api/voice/voice-service.ts @@ -17,16 +17,25 @@ interface SSEWriter { write(event: string, data: unknown): void; } -async function transcribeAudio(audioBuffer: Uint8Array): Promise { +async function transcribeAudio({ + audioBuffer, + signal, +}: { + audioBuffer: Uint8Array; + signal?: AbortSignal; +}): Promise { const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY }); const blob = new Blob([audioBuffer as BlobPart], { type: "audio/wav" }); const file = new File([blob], "audio.wav", { type: "audio/wav" }); - const result = await openai.audio.transcriptions.create({ - model: "whisper-1", - file, - }); + const result = await openai.audio.transcriptions.create( + { + model: "whisper-1", + file, + }, + { signal }, + ); // Strip wake word from transcription let text = result.text.trim(); @@ -41,13 +50,15 @@ export async function runVoicePipeline({ audioBuffer, ctx, sse, + signal, }: { audioBuffer: Uint8Array; ctx: McpContext; sse: SSEWriter; + signal?: AbortSignal; }): Promise { // 1. Transcribe - const transcription = await transcribeAudio(audioBuffer); + const transcription = await transcribeAudio({ audioBuffer, signal }); sse.write("transcription", { text: transcription }); if (!transcription) { @@ -81,88 +92,103 @@ export async function runVoicePipeline({ let fullResponse = ""; - const MAX_TOOL_ROUNDS = 5; - for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { - const stream = anthropic.messages.stream({ - model: "claude-sonnet-4-20250514", - max_tokens: 1024, - system: SYSTEM_PROMPT, - messages, - tools: anthropicTools.length > 0 ? 
anthropicTools : undefined, - }); - - for await (const event of stream) { - if (event.type === "content_block_delta") { - if (event.delta.type === "text_delta") { - fullResponse += event.delta.text; - sse.write("text_delta", { delta: event.delta.text }); + try { + const MAX_TOOL_ROUNDS = 5; + for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { + if (signal?.aborted) return; + + const stream = anthropic.messages.stream( + { + model: "claude-sonnet-4-20250514", + max_tokens: 1024, + system: SYSTEM_PROMPT, + messages, + tools: anthropicTools.length > 0 ? anthropicTools : undefined, + }, + { signal }, + ); + + for await (const event of stream) { + if (event.type === "content_block_delta") { + if (event.delta.type === "text_delta") { + fullResponse += event.delta.text; + sse.write("text_delta", { delta: event.delta.text }); + } } } - } - - const finalMessage = await stream.finalMessage(); - const contentBlocks = finalMessage.content; - - const toolUseBlocks = contentBlocks.filter( - (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", - ); - - if (toolUseBlocks.length === 0) { - break; - } - - const toolResults: Anthropic.ToolResultBlockParam[] = []; - for (const toolBlock of toolUseBlocks) { - sse.write("tool_use", { - toolName: toolBlock.name, - toolInput: toolBlock.input, - }); + const finalMessage = await stream.finalMessage(); + const contentBlocks = finalMessage.content; - try { - const result = await mcpClient.callTool({ - name: toolBlock.name, - arguments: toolBlock.input as Record, - }); + const toolUseBlocks = contentBlocks.filter( + (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", + ); - const resultText = JSON.stringify(result.content); + if (toolUseBlocks.length === 0) { + break; + } - sse.write("tool_result", { - toolName: toolBlock.name, - result: resultText, - }); + const toolResults: Anthropic.ToolResultBlockParam[] = []; - toolResults.push({ - type: "tool_result", - tool_use_id: toolBlock.id, - content: 
resultText, - }); - } catch (error) { - console.error( - `[voice/tool] Error executing ${toolBlock.name}:`, - error, - ); - const errorText = JSON.stringify({ - error: - error instanceof Error ? error.message : "Tool execution failed", - }); + for (const toolBlock of toolUseBlocks) { + if (signal?.aborted) return; - sse.write("tool_result", { + sse.write("tool_use", { toolName: toolBlock.name, - result: errorText, + toolInput: toolBlock.input, }); - toolResults.push({ - type: "tool_result", - tool_use_id: toolBlock.id, - content: errorText, - is_error: true, - }); + try { + const result = await mcpClient.callTool({ + name: toolBlock.name, + arguments: toolBlock.input as Record, + }); + + const resultText = JSON.stringify(result.content); + + sse.write("tool_result", { + toolName: toolBlock.name, + result: resultText, + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolBlock.id, + content: resultText, + }); + } catch (error) { + if (signal?.aborted) return; + console.error( + `[voice/tool] Error executing ${toolBlock.name}:`, + error, + ); + const errorText = JSON.stringify({ + error: + error instanceof Error + ? 
error.message + : "Tool execution failed", + }); + + sse.write("tool_result", { + toolName: toolBlock.name, + result: errorText, + }); + + toolResults.push({ + type: "tool_result", + tool_use_id: toolBlock.id, + content: errorText, + is_error: true, + }); + } } - } - messages.push({ role: "assistant", content: contentBlocks }); - messages.push({ role: "user", content: toolResults }); + messages.push({ role: "assistant", content: contentBlocks }); + messages.push({ role: "user", content: toolResults }); + } + } catch (error) { + if (signal?.aborted) return; + throw error; } sse.write("done", { fullResponse }); diff --git a/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx index a0aed59188d..4105461f3af 100644 --- a/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx +++ b/apps/desktop/src/renderer/components/Voice/VoiceListener.tsx @@ -45,8 +45,14 @@ export function VoiceListener() { (id) => , { duration: Number.POSITIVE_INFINITY, - position: "bottom-center", + position: "top-center", unstyled: true, + style: { + left: 0, + right: 0, + display: "flex", + justifyContent: "center", + }, }, ); indicatorToastRef.current = toastId; @@ -63,8 +69,14 @@ export function VoiceListener() { (id) => , { duration: Number.POSITIVE_INFINITY, - position: "bottom-center", + position: "top-center", unstyled: true, + style: { + left: 0, + right: 0, + display: "flex", + justifyContent: "center", + }, }, ); responseToastRef.current = toastId; diff --git a/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx index de55ed1d3e1..1d91a5a469d 100644 --- a/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx +++ b/apps/desktop/src/renderer/components/Voice/components/RecordingIndicator/RecordingIndicator.tsx @@ -9,10 +9,10 @@ 
export function RecordingIndicator({ toastId }: RecordingIndicatorProps) { return (
- - + + - + Listening... +
+
+ {/* Header */} +
+ + Voice Command +
- {/* Header */} -
- - Voice Command -
+ {/* Status indicator */} + {status === "transcribing" && ( +
+ + + + + Transcribing... +
+ )} - {/* Status indicator */} - {status === "transcribing" && ( -
- - - - - Transcribing... -
- )} + {/* Transcription */} + {transcription && ( +
+ “{transcription}” +
+ )} - {/* Transcription */} - {transcription && ( -
- “{transcription}” -
- )} + {/* Tool calls */} + {toolCalls.length > 0 && ( +
+ {toolCalls.map((tc, i) => ( +
+ + {tc.toolName} + {tc.result && done} +
+ ))} +
+ )} - {/* Tool calls */} - {toolCalls.length > 0 && ( -
- {toolCalls.map((tc, i) => ( -
- - {tc.toolName} - {tc.result && done} -
- ))} -
- )} + {/* Streaming response */} + {(status === "streaming" || status === "done") && responseText && ( +
+ {responseText} + {status === "streaming" && ( + + )} +
+ )} - {/* Streaming response */} - {(status === "streaming" || status === "done") && responseText && ( -
- {responseText} - {status === "streaming" && ( - - )} -
- )} + {/* Processing indicator */} + {status === "processing" && !responseText && ( +
+ + + + + Thinking... +
+ )} - {/* Processing indicator */} - {status === "processing" && !responseText && ( -
- - - - - Thinking... -
- )} + {/* Error */} + {status === "error" && ( +
+ {error || "Something went wrong"} +
+ )} +
- {/* Error */} - {status === "error" && ( -
- {error || "Something went wrong"} + {/* Stop button footer */} + {isActive && ( +
+
)}
diff --git a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts index 0146d32343a..8250ba7d630 100644 --- a/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts +++ b/apps/desktop/src/renderer/components/Voice/components/ResponsePanel/hooks/useVoicePipeline/useVoicePipeline.ts @@ -1,4 +1,5 @@ import { useCallback, useRef, useState } from "react"; +import { getAuthToken } from "renderer/lib/auth-client"; import { env } from "renderer/env.renderer"; type PipelineStatus = @@ -23,38 +24,28 @@ interface VoicePipelineState { error: string | null; } -export function useVoicePipeline() { - const [state, setState] = useState({ - status: "idle", - transcription: null, - toolCalls: [], - responseText: "", - error: null, - }); +const INITIAL_STATE: VoicePipelineState = { + status: "idle", + transcription: null, + toolCalls: [], + responseText: "", + error: null, +}; +export function useVoicePipeline() { + const [state, setState] = useState(INITIAL_STATE); const abortRef = useRef(null); const processAudio = useCallback(async (audioB64: string) => { - // Abort any in-flight request abortRef.current?.abort(); + setState({ ...INITIAL_STATE, status: "transcribing" }); - // Reset state - setState({ - status: "transcribing", - transcription: null, - toolCalls: [], - responseText: "", - error: null, - }); - - // Decode base64 to binary const binaryStr = atob(audioB64); const bytes = new Uint8Array(binaryStr.length); for (let i = 0; i < binaryStr.length; i++) { bytes[i] = binaryStr.charCodeAt(i); } - // Build form data const formData = new FormData(); formData.append( "audio", @@ -62,15 +53,21 @@ export function useVoicePipeline() { "audio.wav", ); - // POST to API with SSE response const abortController = new AbortController(); abortRef.current = 
abortController; try { + const headers: HeadersInit = {}; + const token = getAuthToken(); + if (token) { + headers.Authorization = `Bearer ${token}`; + } + const response = await fetch(`${env.NEXT_PUBLIC_API_URL}/api/voice`, { method: "POST", body: formData, credentials: "include", + headers, signal: abortController.signal, }); @@ -93,7 +90,6 @@ export function useVoicePipeline() { return; } - // Read SSE stream const reader = response.body.getReader(); const decoder = new TextDecoder(); let buffer = ""; @@ -103,8 +99,6 @@ export function useVoicePipeline() { if (done) break; buffer += decoder.decode(value, { stream: true }); - - // Parse SSE events from buffer const lines = buffer.split("\n"); buffer = lines.pop() ?? ""; @@ -114,8 +108,7 @@ export function useVoicePipeline() { eventType = line.slice(7).trim(); } else if (line.startsWith("data: ") && eventType) { try { - const data = JSON.parse(line.slice(6)); - handleSSEEvent(eventType, data, setState); + handleSSEEvent(eventType, JSON.parse(line.slice(6))); } catch { // Skip malformed data } @@ -124,13 +117,9 @@ export function useVoicePipeline() { } } - // Ensure we end in done state - setState((prev) => { - if (prev.status !== "error") { - return { ...prev, status: "done" }; - } - return prev; - }); + setState((prev) => + prev.status !== "error" ? { ...prev, status: "done" } : prev, + ); } catch (error) { if (abortController.signal.aborted) return; setState((prev) => ({ @@ -143,72 +132,63 @@ export function useVoicePipeline() { const abort = useCallback(() => { abortRef.current?.abort(); + setState((prev) => + prev.status !== "error" && + prev.status !== "done" && + prev.status !== "idle" + ? 
{ ...prev, status: "done" } + : prev, + ); }, []); - return { ...state, processAudio, abort }; -} - -function handleSSEEvent( - event: string, - data: Record, - setState: React.Dispatch>, -) { - switch (event) { - case "transcription": - setState((prev) => ({ - ...prev, - status: "processing", - transcription: data.text as string, - })); - break; - - case "tool_use": - setState((prev) => ({ - ...prev, - status: "processing", - toolCalls: [ - ...prev.toolCalls, - { - toolName: data.toolName as string, - toolInput: data.toolInput, - }, - ], - })); - break; - - case "tool_result": - setState((prev) => ({ - ...prev, - toolCalls: prev.toolCalls.map((tc) => - tc.toolName === data.toolName && !tc.result - ? { ...tc, result: data.result as string } - : tc, - ), - })); - break; - - case "text_delta": - setState((prev) => ({ - ...prev, - status: "streaming", - responseText: prev.responseText + (data.delta as string), - })); - break; - - case "done": - setState((prev) => ({ - ...prev, - status: "done", - responseText: (data.fullResponse as string) || prev.responseText, - })); - break; - - case "error": - setState((prev) => ({ - ...prev, - status: "error", - error: data.message as string, - })); - break; + function handleSSEEvent(event: string, data: Record) { + switch (event) { + case "transcription": + setState((prev) => ({ + ...prev, + status: "processing", + transcription: data.text as string, + })); + break; + case "tool_use": + setState((prev) => ({ + ...prev, + status: "processing", + toolCalls: [ + ...prev.toolCalls, + { toolName: data.toolName as string, toolInput: data.toolInput }, + ], + })); + break; + case "tool_result": + setState((prev) => ({ + ...prev, + toolCalls: prev.toolCalls.map((tc) => + tc.toolName === data.toolName && !tc.result + ? 
{ ...tc, result: data.result as string } + : tc, + ), + })); + break; + case "text_delta": + setState((prev) => ({ + ...prev, + status: "streaming", + responseText: prev.responseText + (data.delta as string), + })); + break; + case "done": + setState((prev) => ({ ...prev, status: "done" })); + break; + case "error": + setState((prev) => ({ + ...prev, + status: "error", + error: data.message as string, + })); + break; + } } + + return { ...state, processAudio, abort }; } diff --git a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx index 2ca4d63ff05..7c0255ce992 100644 --- a/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx +++ b/apps/desktop/src/renderer/routes/_authenticated/settings/behavior/components/BehaviorSettings/BehaviorSettings.tsx @@ -211,7 +211,7 @@ export function BehaviorSettings({ visibleItems }: BehaviorSettingsProps) { Voice Commands

- Enable wake word detection and voice commands + Say "Hey Jarvis" to control Superset with your voice