-
Notifications
You must be signed in to change notification settings - Fork 905
feat(desktop): add voice commands with wake word detection and mic permission UX #1055
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8470643
e883e67
24f2f4e
8d87aed
d5d40dd
93fc924
053cfe7
e99ac4f
308c698
29100c9
06ada28
e71edc0
9b7e000
8c00fdb
9f853dc
0eef5ee
0c0de55
6d07a9e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,82 @@ | ||
| import { auth } from "@superset/auth/server"; | ||
| import { runVoicePipeline } from "./voice-service"; | ||
|
|
||
/**
 * Accepts a recorded voice command and streams the voice pipeline's output
 * back to the client as Server-Sent Events.
 *
 * The caller must be signed in and have an active organization, and the body
 * must be multipart form data whose `audio` field is a file of at most 5 MB.
 *
 * @param request - Incoming request. Its headers are used for the session
 *   lookup and its abort signal is forwarded to the pipeline.
 * @returns A JSON error response (401, 400 or 413) when validation fails;
 *   otherwise a `text/event-stream` response fed by `runVoicePipeline`.
 */
export async function POST(request: Request): Promise<Response> {
  const session = await auth.api.getSession({ headers: request.headers });
  if (!session?.user) {
    return Response.json({ error: "Unauthorized" }, { status: 401 });
  }

  const organizationId = session.session.activeOrganizationId;
  if (!organizationId) {
    return Response.json({ error: "No active organization" }, { status: 400 });
  }

  let formData: FormData;
  try {
    formData = await request.formData();
  } catch (error) {
    // Keep the cause in the server log; the client only gets a generic hint.
    console.error("[voice/route] Form data parsing failed:", error);
    return Response.json(
      { error: "Expected multipart form data with audio file" },
      { status: 400 },
    );
  }

  const audioFile = formData.get("audio");
  if (!audioFile || !(audioFile instanceof File)) {
    return Response.json(
      { error: "Missing 'audio' file in form data" },
      { status: 400 },
    );
  }

  const MAX_AUDIO_SIZE = 5 * 1024 * 1024; // 5 MB
  if (audioFile.size > MAX_AUDIO_SIZE) {
    return Response.json(
      { error: "Audio file too large (max 5 MB)" },
      { status: 413 },
    );
  }

  const audioBuffer = new Uint8Array(await audioFile.arrayBuffer());
  const encoder = new TextEncoder();

  // True once the stream was closed here or cancelled by the consumer.
  // enqueue()/close() throw on a stream that is no longer readable, so every
  // controller call below is guarded by this flag.
  let streamClosed = false;

  const stream = new ReadableStream<Uint8Array>({
    async start(controller) {
      const sse = {
        write(event: string, data: unknown) {
          if (streamClosed) return;
          const payload = `event: ${event}\ndata: ${JSON.stringify(data)}\n\n`;
          controller.enqueue(encoder.encode(payload));
        },
      };

      try {
        await runVoicePipeline({
          audioBuffer,
          ctx: { userId: session.user.id, organizationId },
          sse,
          signal: request.signal,
        });
      } catch (error) {
        // An aborted request has nobody left to notify; stay quiet.
        if (!request.signal.aborted) {
          console.error("[voice/route] Pipeline error:", error);
          sse.write("error", {
            message:
              error instanceof Error ? error.message : "Voice pipeline failed",
          });
        }
      } finally {
        if (!streamClosed) {
          streamClosed = true;
          controller.close();
        }
      }
    },
    cancel() {
      // The consumer went away (e.g. client disconnect): stop writing.
      streamClosed = true;
    },
  });

  return new Response(stream, {
    headers: {
      "Content-Type": "text/event-stream",
      "Cache-Control": "no-cache",
      Connection: "keep-alive",
    },
  });
}
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,198 @@ | ||
| import Anthropic from "@anthropic-ai/sdk"; | ||
| import type { McpContext } from "@superset/mcp/auth"; | ||
| import { createInMemoryMcpClient } from "@superset/mcp/in-memory"; | ||
| import { OpenAI } from "openai"; | ||
| import { env } from "@/env"; | ||
|
|
||
/** System prompt sent to Claude with every voice request. */
const SYSTEM_PROMPT = [
  "You are a helpful voice assistant for Superset, a project management tool.",
  "You have access to tools for creating and managing tasks, workspaces, and other organizational resources.",
  "Keep responses concise and conversational — the user is speaking to you, so respond in 1-3 sentences unless the question requires more detail.",
  "When you use tools, briefly confirm what you did.",
].join(" ");

/** MCP tool names that are filtered out before tools are offered to the model (desktop-only tools that don't make sense in voice context). */
const DENIED_TOOLS = new Set<string>([
  "navigate_to_workspace",
  "switch_workspace",
  "get_app_context",
]);

/** Minimal sink for Server-Sent Events: one named event with a JSON-serializable payload per call. */
interface SSEWriter {
  write(event: string, data: unknown): void;
}
|
|
||
/**
 * Removes a leading "hey jarvis" wake word (case-insensitive) from a transcription.
 *
 * Whitespace and the punctuation `, . ! ?` are tolerated both between the two
 * words and after them, so "Hey, Jarvis, add a task" becomes "add a task".
 */
function stripWakeWord(text: string): string {
  return text
    .trim()
    .replace(/^hey[\s,.!?]*jarvis[\s,.!?]*/i, "")
    .trim();
}

/**
 * Transcribes recorded audio with OpenAI's "whisper-1" model and returns the
 * text with the leading wake word removed.
 *
 * The bytes are uploaded as a file named "audio.wav" with MIME type "audio/wav".
 * Errors from the OpenAI call (including aborts) are not caught here.
 *
 * @param audioBuffer - Raw audio bytes to transcribe.
 * @param signal - Optional abort signal passed to the OpenAI request.
 * @returns The trimmed transcription; may be an empty string.
 */
async function transcribeAudio({
  audioBuffer,
  signal,
}: {
  audioBuffer: Uint8Array;
  signal?: AbortSignal;
}): Promise<string> {
  const openai = new OpenAI({ apiKey: env.OPENAI_API_KEY });

  const blob = new Blob([audioBuffer as BlobPart], { type: "audio/wav" });
  const file = new File([blob], "audio.wav", { type: "audio/wav" });

  const result = await openai.audio.transcriptions.create(
    {
      model: "whisper-1",
      file,
    },
    { signal },
  );

  return stripWakeWord(result.text);
}
|
|
||
| /** | ||
| * Runs the full voice pipeline: transcription → Claude with MCP tools → streaming SSE. | ||
| * | ||
| * Events written through `sse.write`: "transcription" (always, once transcription succeeds), | ||
| * "text_delta" per streamed text chunk, "tool_use" and "tool_result" per tool call, and a final "done". | ||
| * An empty transcription ends the run right after a "done" event carrying an empty response. | ||
| * | ||
| * Tools named in DENIED_TOOLS are never offered to the model, and at most MAX_TOOL_ROUNDS (5) | ||
| * model round-trips are made. A failing tool call is reported back to the model as an error result | ||
| * instead of being thrown. | ||
| * | ||
| * Once the Claude loop has started, an aborted `signal` makes the function return without a "done" | ||
| * event; any other error is rethrown to the caller. Transcription and MCP client setup errors are | ||
| * not caught here. The MCP client's `cleanup` always runs, and its own failures are ignored. | ||
| */ | ||
| export async function runVoicePipeline({ | ||
| audioBuffer, | ||
| ctx, | ||
| sse, | ||
| signal, | ||
| }: { | ||
| audioBuffer: Uint8Array; | ||
| ctx: McpContext; | ||
| sse: SSEWriter; | ||
| signal?: AbortSignal; | ||
| }): Promise<void> { | ||
| // 1. Transcribe | ||
| const transcription = await transcribeAudio({ audioBuffer, signal }); | ||
| sse.write("transcription", { text: transcription }); | ||
|
|
||
| if (!transcription) { | ||
| sse.write("done", { fullResponse: "" }); | ||
| return; | ||
| } | ||
|
Comment on lines
+59
to
+67
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wrap transcription call in try/catch to prevent pipeline crash. If Proposed fix ): Promise<void> {
// 1. Transcribe
+ let transcription: string;
+ try {
- const transcription = await transcribeAudio(audioBuffer);
+ transcription = await transcribeAudio(audioBuffer);
+ } catch (error) {
+ console.error("[voice/pipeline] Transcription failed:", error);
+ sse.write("error", {
+ message: error instanceof Error ? error.message : "Transcription failed",
+ });
+ sse.write("done", { fullResponse: "" });
+ return;
+ }
sse.write("transcription", { text: transcription });🤖 Prompt for AI Agents |
||
|
|
||
| // 2. Create in-memory MCP client for tool access | ||
| const { client: mcpClient, cleanup } = await createInMemoryMcpClient({ | ||
| userId: ctx.userId, | ||
| organizationId: ctx.organizationId, | ||
| }); | ||
|
|
||
| try { | ||
| const { tools: mcpTools } = await mcpClient.listTools(); | ||
|
|
||
| const anthropicTools: Anthropic.Tool[] = mcpTools | ||
| .filter((t) => !DENIED_TOOLS.has(t.name)) | ||
| .map((t) => ({ | ||
| name: t.name, | ||
| description: t.description ?? "", | ||
| input_schema: t.inputSchema as Anthropic.Tool.InputSchema, | ||
| })); | ||
|
|
||
| // 3. Stream Claude response with tool use loop | ||
| const anthropic = new Anthropic({ apiKey: env.ANTHROPIC_API_KEY }); | ||
|
|
||
| const messages: Anthropic.MessageParam[] = [ | ||
| { role: "user", content: transcription }, | ||
| ]; | ||
|
|
||
| let fullResponse = ""; | ||
|
|
||
| try { | ||
| const MAX_TOOL_ROUNDS = 5; | ||
| for (let round = 0; round < MAX_TOOL_ROUNDS; round++) { | ||
| if (signal?.aborted) return; | ||
|
|
||
| const stream = anthropic.messages.stream( | ||
| { | ||
| model: "claude-sonnet-4-20250514", | ||
| max_tokens: 1024, | ||
| system: SYSTEM_PROMPT, | ||
| messages, | ||
| tools: anthropicTools.length > 0 ? anthropicTools : undefined, | ||
| }, | ||
| { signal }, | ||
| ); | ||
|
|
||
| for await (const event of stream) { | ||
| if (event.type === "content_block_delta") { | ||
| if (event.delta.type === "text_delta") { | ||
| fullResponse += event.delta.text; | ||
| sse.write("text_delta", { delta: event.delta.text }); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| const finalMessage = await stream.finalMessage(); | ||
| const contentBlocks = finalMessage.content; | ||
|
|
||
| const toolUseBlocks = contentBlocks.filter( | ||
| (block): block is Anthropic.ToolUseBlock => block.type === "tool_use", | ||
| ); | ||
|
|
||
| if (toolUseBlocks.length === 0) { | ||
| break; | ||
| } | ||
|
|
||
| const toolResults: Anthropic.ToolResultBlockParam[] = []; | ||
|
|
||
| for (const toolBlock of toolUseBlocks) { | ||
| if (signal?.aborted) return; | ||
|
|
||
| sse.write("tool_use", { | ||
| toolName: toolBlock.name, | ||
| toolInput: toolBlock.input, | ||
| }); | ||
|
|
||
| try { | ||
| const result = await mcpClient.callTool({ | ||
| name: toolBlock.name, | ||
| arguments: toolBlock.input as Record<string, unknown>, | ||
| }); | ||
|
|
||
| const resultText = JSON.stringify(result.content); | ||
|
|
||
| sse.write("tool_result", { | ||
| toolName: toolBlock.name, | ||
| result: resultText, | ||
| }); | ||
|
|
||
| toolResults.push({ | ||
| type: "tool_result", | ||
| tool_use_id: toolBlock.id, | ||
| content: resultText, | ||
| }); | ||
| } catch (error) { | ||
| if (signal?.aborted) return; | ||
| console.error( | ||
| `[voice/tool] Error executing ${toolBlock.name}:`, | ||
| error, | ||
| ); | ||
| const errorText = JSON.stringify({ | ||
| error: | ||
| error instanceof Error | ||
| ? error.message | ||
| : "Tool execution failed", | ||
| }); | ||
|
|
||
| sse.write("tool_result", { | ||
| toolName: toolBlock.name, | ||
| result: errorText, | ||
| }); | ||
|
|
||
| toolResults.push({ | ||
| type: "tool_result", | ||
| tool_use_id: toolBlock.id, | ||
| content: errorText, | ||
| is_error: true, | ||
| }); | ||
| } | ||
| } | ||
|
|
||
| messages.push({ role: "assistant", content: contentBlocks }); | ||
| messages.push({ role: "user", content: toolResults }); | ||
| } | ||
| } catch (error) { | ||
| if (signal?.aborted) return; | ||
| throw error; | ||
| } | ||
|
|
||
| sse.write("done", { fullResponse }); | ||
| } finally { | ||
| await cleanup().catch(() => {}); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Log the error when form data parsing fails.
The catch block discards the error, making debugging difficult if unexpected parsing failures occur. As per coding guidelines, errors should be logged at minimum.
Proposed fix
```diff
 try {
   formData = await request.formData();
-} catch {
+} catch (error) {
+  console.error("[voice/route] Form data parsing failed:", error);
   return Response.json(
     { error: "Expected multipart form data with audio file" },
     { status: 400 },
   );
 }
```

📝 Committable suggestion
🤖 Prompt for AI Agents