From 84cad6a99fea084e8ddd20ee618e09b539de05ad Mon Sep 17 00:00:00 2001 From: tulsi Date: Wed, 15 Apr 2026 13:54:07 -0700 Subject: [PATCH 01/30] feat: wire up voice dictation in goose2 via ACP Add voice dictation support to the goose2 Tauri app by exposing transcription and config as ACP custom methods, then wiring the frontend to use them. Backend (crates/): - Add DictationTranscribeRequest/Response and DictationConfigRequest/Response types to goose-sdk custom_requests.rs with model metadata fields - Add #[custom_method] handlers in goose-acp server.rs for transcribe (OpenAI, Groq, ElevenLabs, Local) and config - Register methods in acp-meta.json - Forward local-inference feature from goose-cli to goose-acp Tauri (ui/goose2/src-tauri/): - Rewrite dictation.rs to use call_ext_method via ACP instead of importing goose crate directly - Add generic CallExt command to ACP manager with method name normalization (strips leading _ to avoid double-prefix) - Register get_dictation_config and transcribe_dictation commands Frontend (ui/goose2/src/): - Wire useDictationRecorder + useVoiceInputPreferences into ChatInput - Replace placeholder mic button with working toggle (recording/ transcribing states, auto-submit on keyword) - Stop recording on manual send and on auto-submit keyword - Show "Listening..."/"Transcribing..." placeholder in textarea - Add Voice section to SettingsModal with VoiceInputSettings - Add all voice i18n strings (en + es) - Fix pre-existing type errors in dictationVad.ts and VoiceInputSettings Known issue: Local Whisper reports configured: false despite model being downloaded and config set. The is_downloaded() path check needs investigation in a follow-up. Co-Authored-By: Claude Opus 4.6 (1M context) --- Cargo.lock | 1 + crates/goose-acp/Cargo.toml | 2 + crates/goose-acp/acp-meta.json | 10 + crates/goose-acp/src/server.rs | 201 ++++++++ crates/goose-cli/Cargo.toml | 2 +- crates/goose-sdk/src/custom_requests.rs | 60 +++ ui/goose2/src-tauri/Info.plist | 8 + .../src-tauri/src/services/provider_defs.rs | 11 + .../chat/hooks/useDictationRecorder.ts | 317 ++++++++++++ .../features/chat/hooks/useVoiceDictation.ts | 117 +++++ .../chat/hooks/useVoiceInputPreferences.ts | 161 ++++++ .../features/chat/lib/dictationVad.test.ts | 51 ++ .../src/features/chat/lib/dictationVad.ts | 147 ++++++ .../src/features/chat/lib/voiceInput.test.ts | 85 ++++ ui/goose2/src/features/chat/lib/voiceInput.ts | 177 +++++++ ui/goose2/src/features/chat/ui/ChatInput.tsx | 38 +- .../src/features/chat/ui/ChatInputToolbar.tsx | 33 +- .../features/settings/ui/SettingsModal.tsx | 4 + .../settings/ui/VoiceInputSettings.tsx | 465 ++++++++++++++++++ ui/goose2/src/shared/api/dictation.ts | 100 ++++ .../src/shared/i18n/locales/en/chat.json | 6 +- .../src/shared/i18n/locales/en/settings.json | 46 +- .../src/shared/i18n/locales/es/chat.json | 6 +- .../src/shared/i18n/locales/es/settings.json | 46 +- ui/goose2/src/shared/types/dictation.ts | 51 ++ 25 files changed, 2134 insertions(+), 11 deletions(-) create mode 100644 ui/goose2/src-tauri/Info.plist create mode 100644 ui/goose2/src/features/chat/hooks/useDictationRecorder.ts create mode 100644 ui/goose2/src/features/chat/hooks/useVoiceDictation.ts create mode 100644 ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts create mode 100644 ui/goose2/src/features/chat/lib/dictationVad.test.ts create mode 100644 ui/goose2/src/features/chat/lib/dictationVad.ts create mode 100644 ui/goose2/src/features/chat/lib/voiceInput.test.ts create mode 100644 
ui/goose2/src/features/chat/lib/voiceInput.ts create mode 100644 ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx create mode 100644 ui/goose2/src/shared/api/dictation.ts create mode 100644 ui/goose2/src/shared/types/dictation.ts diff --git a/Cargo.lock b/Cargo.lock index 093e9658825e..e38f0b0f0aa3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4454,6 +4454,7 @@ dependencies = [ "async-stream", "async-trait", "axum", + "base64 0.22.1", "fs-err", "futures", "goose", diff --git a/crates/goose-acp/Cargo.toml b/crates/goose-acp/Cargo.toml index 8bc2b1e7eed5..a7200146b8a8 100644 --- a/crates/goose-acp/Cargo.toml +++ b/crates/goose-acp/Cargo.toml @@ -14,6 +14,7 @@ path = "src/bin/generate_acp_schema.rs" [features] default = ["code-mode", "rustls-tls"] code-mode = ["goose/code-mode"] +local-inference = ["goose/local-inference"] rustls-tls = ["goose/rustls-tls", "goose-mcp/rustls-tls"] native-tls = ["goose/native-tls", "goose-mcp/native-tls"] @@ -48,6 +49,7 @@ uuid = { workspace = true, features = ["v7"] } schemars = { workspace = true, features = ["derive"] } goose-acp-macros = { path = "../goose-acp-macros" } goose-sdk = { path = "../goose-sdk" } +base64 = { workspace = true } [dev-dependencies] async-trait = { workspace = true } diff --git a/crates/goose-acp/acp-meta.json b/crates/goose-acp/acp-meta.json index 944d227b663f..3cd63e5f726f 100644 --- a/crates/goose-acp/acp-meta.json +++ b/crates/goose-acp/acp-meta.json @@ -104,6 +104,16 @@ "method": "_goose/session/unarchive", "requestType": "UnarchiveSessionRequest", "responseType": "EmptyResponse" + }, + { + "method": "_goose/dictation/transcribe", + "requestType": "DictationTranscribeRequest", + "responseType": "DictationTranscribeResponse" + }, + { + "method": "_goose/dictation/config", + "requestType": "DictationConfigRequest", + "responseType": "DictationConfigResponse" } ] } diff --git a/crates/goose-acp/src/server.rs b/crates/goose-acp/src/server.rs index d1a8212c7507..6176b02f8f85 100644 --- a/crates/goose-acp/src/server.rs +++ b/crates/goose-acp/src/server.rs @@ -16,6 +16,13 @@ use goose::config::paths::Paths; use goose::config::permission::PermissionManager; use goose::config::{Config, GooseMode}; use goose::conversation::message::{ActionRequiredData, Message, MessageContent}; +#[cfg(feature = "local-inference")] +use goose::dictation::providers::transcribe_local; +use goose::dictation::providers::{ + all_providers, is_configured, transcribe_with_provider, DictationProvider, +}; +#[cfg(feature = "local-inference")] +use goose::dictation::whisper; use goose::mcp_utils::ToolResult; use goose::permission::permission_confirmation::PrincipalType; use goose::permission::{Permission, PermissionConfirmation}; @@ -68,6 +75,9 @@ pub type AcpProviderFactory = Arc< const DEFAULT_PROVIDER_ID: &str = "goose"; const DEFAULT_PROVIDER_LABEL: &str = "Goose (Default)"; +const OPENAI_TRANSCRIPTION_MODEL: &str = "whisper-1"; +const GROQ_TRANSCRIPTION_MODEL: &str = "whisper-large-v3-turbo"; +const ELEVENLABS_TRANSCRIPTION_MODEL: &str = "scribe_v1"; /// In-memory state for an active ACP session. 
///
@@ -2904,6 +2914,197 @@ impl GooseAcpAgent {
         .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
         Ok(EmptyResponse {})
     }
+
+    #[custom_method(DictationTranscribeRequest)]
+    async fn on_dictation_transcribe(
+        &self,
+        req: DictationTranscribeRequest,
+    ) -> Result<DictationTranscribeResponse, sacp::Error> {
+        use base64::{engine::general_purpose::STANDARD as BASE64, Engine};
+
+        let provider: DictationProvider = serde_json::from_value(serde_json::Value::String(
+            req.provider.clone(),
+        ))
+        .map_err(|_| {
+            sacp::Error::invalid_params().data(format!("Unknown provider: {}", req.provider))
+        })?;
+
+        let audio_bytes = BASE64
+            .decode(&req.audio)
+            .map_err(|_| sacp::Error::invalid_params().data("Invalid base64 audio data"))?;
+
+        if audio_bytes.len() > 50 * 1024 * 1024 {
+            return Err(sacp::Error::invalid_params().data("Audio too large (max 50MB)"));
+        }
+
+        let extension = match req.mime_type.as_str() {
+            "audio/webm" | "audio/webm;codecs=opus" => "webm",
+            "audio/mp4" => "mp4",
+            "audio/mpeg" | "audio/mpga" => "mp3",
+            "audio/m4a" => "m4a",
+            "audio/wav" | "audio/x-wav" => "wav",
+            other => {
+                return Err(
+                    sacp::Error::invalid_params().data(format!("Unsupported format: {other}"))
+                )
+            }
+        };
+
+        let text = match provider {
+            DictationProvider::OpenAI => {
+                transcribe_with_provider(
+                    DictationProvider::OpenAI,
+                    "model".to_string(),
+                    OPENAI_TRANSCRIPTION_MODEL.to_string(),
+                    audio_bytes,
+                    extension,
+                    &req.mime_type,
+                )
+                .await
+            }
+            DictationProvider::Groq => {
+                transcribe_with_provider(
+                    DictationProvider::Groq,
+                    "model".to_string(),
+                    GROQ_TRANSCRIPTION_MODEL.to_string(),
+                    audio_bytes,
+                    extension,
+                    &req.mime_type,
+                )
+                .await
+            }
+            DictationProvider::ElevenLabs => {
+                transcribe_with_provider(
+                    DictationProvider::ElevenLabs,
+                    "model_id".to_string(),
+                    ELEVENLABS_TRANSCRIPTION_MODEL.to_string(),
+                    audio_bytes,
+                    extension,
+                    &req.mime_type,
+                )
+                .await
+            }
+            #[cfg(feature = "local-inference")]
+            DictationProvider::Local => transcribe_local(audio_bytes).await,
+            #[cfg(not(feature = "local-inference"))]
+            DictationProvider::Local => {
+                return Err(sacp::Error::invalid_params()
+                    .data("Local inference is not available in this build"));
+            }
+        }
+        .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
+
+        Ok(DictationTranscribeResponse { text })
+    }
+
+    #[custom_method(DictationConfigRequest)]
+    async fn on_dictation_config(
+        &self,
+        _req: DictationConfigRequest,
+    ) -> Result<DictationConfigResponse, sacp::Error> {
+        let config = goose::config::Config::global();
+        let mut providers = std::collections::HashMap::new();
+
+        for def in all_providers() {
+            let provider = def.provider;
+            let host = if let Some(host_key) = def.host_key {
+                config
+                    .get(host_key, false)
+                    .ok()
+                    .and_then(|v| v.as_str().map(|s| s.to_string()))
+            } else {
+                None
+            };
+
+            let provider_key = serde_json::to_value(provider)
+                .ok()
+                .and_then(|v| v.as_str().map(|s| s.to_string()))
+                .unwrap_or_else(|| format!("{:?}", provider).to_lowercase());
+            providers.insert(
+                provider_key,
+                DictationProviderStatusEntry {
+                    configured: is_configured(provider),
+                    host,
+                    description: def.description.to_string(),
+                    uses_provider_config: def.uses_provider_config,
+                    settings_path: def.settings_path.map(|s| s.to_string()),
+                    config_key: if !def.uses_provider_config {
+                        Some(def.config_key.to_string())
+                    } else {
+                        None
+                    },
+                    model_config_key: dictation_model_config_key(provider),
+                    default_model: dictation_default_model(provider),
+                    selected_model: dictation_selected_model(&config, provider),
+                    available_models: dictation_available_models(provider),
+                },
+            );
+        }
+
+        Ok(DictationConfigResponse { providers })
+    }
+}
+
+fn dictation_model_config_key(provider: DictationProvider) -> Option<String> {
+    #[cfg(feature = "local-inference")]
+    if provider == DictationProvider::Local {
+        return Some(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY.to_string());
+    }
+
+    None
+}
+
+fn dictation_default_model(provider: DictationProvider) -> Option<String> {
+    match provider {
+        DictationProvider::OpenAI => Some(OPENAI_TRANSCRIPTION_MODEL.to_string()),
+        DictationProvider::Groq => Some(GROQ_TRANSCRIPTION_MODEL.to_string()),
+        DictationProvider::ElevenLabs => Some(ELEVENLABS_TRANSCRIPTION_MODEL.to_string()),
+        #[cfg(feature = "local-inference")]
+        DictationProvider::Local => Some(whisper::recommend_model().to_string()),
+        #[cfg(not(feature = "local-inference"))]
+        DictationProvider::Local => None,
+    }
+}
+
+fn dictation_selected_model(config: &Config, provider: DictationProvider) -> Option<String> {
+    #[cfg(feature = "local-inference")]
+    if provider == DictationProvider::Local {
+        return config
+            .get(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY, false)
+            .ok()
+            .and_then(|value| value.as_str().map(str::to_owned))
+            .filter(|model_id| whisper::get_model(model_id).is_some())
+            .or_else(|| dictation_default_model(provider));
+    }
+
+    dictation_default_model(provider)
+}
+
+fn dictation_available_models(provider: DictationProvider) -> Vec<DictationModelOption> {
+    match provider {
+        DictationProvider::OpenAI => vec![DictationModelOption {
+            id: OPENAI_TRANSCRIPTION_MODEL.to_string(),
+            label: "Whisper-1".to_string(),
+            description: "OpenAI's hosted Whisper transcription model.".to_string(),
+        }],
+        DictationProvider::Groq => vec![DictationModelOption {
+            id: GROQ_TRANSCRIPTION_MODEL.to_string(),
+            label: "Whisper Large V3 Turbo".to_string(),
+            description: "Groq's fast hosted Whisper transcription model.".to_string(),
+        }],
+        DictationProvider::ElevenLabs => vec![DictationModelOption {
+            id: ELEVENLABS_TRANSCRIPTION_MODEL.to_string(),
+            label: "Scribe v1".to_string(),
+            description: "ElevenLabs' hosted speech-to-text model.".to_string(),
+        }],
+        #[cfg(feature = "local-inference")]
+        DictationProvider::Local => whisper::available_models()
+            .iter()
+            .map(|model| DictationModelOption {
+                id: model.id.to_string(),
+                label: model.id.to_string(),
+                description: model.description.to_string(),
+            })
+            .collect(),
+        #[cfg(not(feature = "local-inference"))]
+        DictationProvider::Local => Vec::new(),
+    }
 }
 
 pub struct GooseAcpHandler {
diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml
index 6c20a644912a..369cd59606cb 100644
--- a/crates/goose-cli/Cargo.toml
+++ b/crates/goose-cli/Cargo.toml
@@ -71,7 +71,7 @@ winapi = { workspace = true }
 [features]
 default = ["code-mode", "local-inference", "aws-providers", "telemetry", "otel", "rustls-tls"]
 code-mode = ["goose/code-mode", "goose-acp/code-mode"]
-local-inference = ["goose/local-inference"]
+local-inference = ["goose/local-inference", "goose-acp/local-inference"]
 aws-providers = ["goose/aws-providers"]
 cuda = ["goose/cuda", "local-inference"]
 telemetry = ["goose/telemetry"]
diff --git a/crates/goose-sdk/src/custom_requests.rs b/crates/goose-sdk/src/custom_requests.rs
index bbc375be09f3..46359100a3bf 100644
--- a/crates/goose-sdk/src/custom_requests.rs
+++ b/crates/goose-sdk/src/custom_requests.rs
@@ -309,6 +309,66 @@ pub struct ProviderConfigKey {
     pub primary: bool,
 }
 
+/// Transcribe audio via a dictation provider.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/transcribe", response = DictationTranscribeResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationTranscribeRequest {
+    /// Base64-encoded audio data
+    pub audio: String,
+    /// MIME type (e.g. "audio/wav", "audio/webm")
"audio/wav", "audio/webm") + pub mime_type: String, + /// Provider to use: "openai", "groq", "elevenlabs", or "local" + pub provider: String, +} + +/// Transcription result. +#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)] +pub struct DictationTranscribeResponse { + pub text: String, +} + +/// Get the configuration status of all dictation providers. +#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)] +#[request(method = "_goose/dictation/config", response = DictationConfigResponse)] +pub struct DictationConfigRequest {} + +#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)] +pub struct DictationModelOption { + pub id: String, + pub label: String, + pub description: String, +} + +/// Per-provider configuration status. +#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)] +#[serde(rename_all = "camelCase")] +pub struct DictationProviderStatusEntry { + pub configured: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub host: Option, + pub description: String, + pub uses_provider_config: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub settings_path: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub config_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub model_config_key: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub default_model: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub selected_model: Option, + #[serde(default)] + pub available_models: Vec, +} + +/// Dictation config response — map of provider name to status. +#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)] +pub struct DictationConfigResponse { + pub providers: HashMap, +} + /// Empty success response for operations that return no data. #[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)] pub struct EmptyResponse {} diff --git a/ui/goose2/src-tauri/Info.plist b/ui/goose2/src-tauri/Info.plist new file mode 100644 index 000000000000..8588d2d741c4 --- /dev/null +++ b/ui/goose2/src-tauri/Info.plist @@ -0,0 +1,8 @@ + + + + + NSMicrophoneUsageDescription + Goose uses your microphone to capture voice input for dictation. 
diff --git a/ui/goose2/src-tauri/Info.plist b/ui/goose2/src-tauri/Info.plist
new file mode 100644
index 000000000000..8588d2d741c4
--- /dev/null
+++ b/ui/goose2/src-tauri/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>NSMicrophoneUsageDescription</key>
+  <string>Goose uses your microphone to capture voice input for dictation.</string>
+</dict>
+</plist>
diff --git a/ui/goose2/src-tauri/src/services/provider_defs.rs b/ui/goose2/src-tauri/src/services/provider_defs.rs
index 0a2a326eaf00..5eea0c0a5a64 100644
--- a/ui/goose2/src-tauri/src/services/provider_defs.rs
+++ b/ui/goose2/src-tauri/src/services/provider_defs.rs
@@ -125,6 +125,17 @@ pub(crate) static PROVIDER_CONFIG_DEFS: &[ProviderConfigDef] = &[
         keys: &[],
         oauth_cache_path: None,
     },
+    // Dictation providers (voice input)
+    ProviderConfigDef {
+        id: "dictation_groq",
+        keys: &[key("GROQ_API_KEY", true, true)],
+        oauth_cache_path: None,
+    },
+    ProviderConfigDef {
+        id: "dictation_elevenlabs",
+        keys: &[key("ELEVENLABS_API_KEY", true, true)],
+        oauth_cache_path: None,
+    },
 ];
 
 pub(crate) fn find_config_key(key_name: &str) -> Option<&'static ConfigKey> {
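The recorder hook added below does not stream audio: it buffers VAD-approved frames, merges them into one utterance, WAV-encodes at 16 kHz, and ships a single base64 chunk per flush. A condensed sketch of that flush path; `encodeWavSketch` is a stand-in for the patch's own `encodeWav`, and `transcribeDictation` is the real API wrapper added later in this patch:

```ts
import { transcribeDictation } from "@/shared/api/dictation";

// Stand-in for the patch's encodeWav (16-bit PCM, mono WAV container).
declare function encodeWavSketch(
  samples: Float32Array,
  sampleRate: number,
): ArrayBuffer;

async function flushChunk(
  chunks: Float32Array[],
  provider: "openai" | "groq" | "elevenlabs" | "local",
): Promise<string> {
  // Merge the VAD-approved frames into one contiguous buffer.
  const total = chunks.reduce((n, c) => n + c.length, 0);
  const merged = new Float32Array(total);
  let offset = 0;
  for (const chunk of chunks) {
    merged.set(chunk, offset);
    offset += chunk.length;
  }

  // WAV-encode at the capture rate, then base64 via a data URL.
  const blob = new Blob([encodeWavSketch(merged, 16000)], {
    type: "audio/wav",
  });
  const base64 = await new Promise<string>((resolve, reject) => {
    const reader = new FileReader();
    // Data URL looks like "data:audio/wav;base64,XXXX"; keep only the payload.
    reader.onloadend = () => resolve(String(reader.result).split(",")[1] ?? "");
    reader.onerror = () => reject(reader.error);
    reader.readAsDataURL(blob);
  });

  const response = await transcribeDictation({
    audio: base64,
    mimeType: "audio/wav",
    provider,
  });
  return response.text;
}
```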
""); + reader.onerror = () => reject(reader.error); + reader.readAsDataURL(blob); + }); +} + +function toErrorMessage(error: unknown) { + if (error instanceof Error && error.message) { + return error.message; + } + + return "Voice input failed"; +} + +export function useDictationRecorder({ + provider, + providerConfigured, + preferredMicrophoneId, + onError, + onTranscription, +}: UseDictationRecorderOptions) { + const [isRecording, setIsRecording] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const audioContextRef = useRef(null); + const processorRef = useRef(null); + const sourceRef = useRef(null); + const streamRef = useRef(null); + const samplesRef = useRef([]); + const vadStateRef = useRef(createInitialVadState()); + const pendingTranscriptionsRef = useRef(0); + const generationRef = useRef(0); + const providerRef = useRef(provider); + providerRef.current = provider; + const onErrorRef = useRef(onError); + onErrorRef.current = onError; + const onTranscriptionRef = useRef(onTranscription); + onTranscriptionRef.current = onTranscription; + + const isEnabled = Boolean(provider && providerConfigured); + + const cleanupAudioGraph = useCallback(() => { + processorRef.current?.disconnect(); + processorRef.current = null; + sourceRef.current?.disconnect(); + sourceRef.current = null; + void audioContextRef.current?.close(); + audioContextRef.current = null; + streamRef.current?.getTracks().forEach((track) => { + track.stop(); + }); + streamRef.current = null; + }, []); + + const transcribeChunk = useCallback(async (samples: Float32Array) => { + const activeProvider = providerRef.current; + if (!activeProvider) { + return; + } + + const gen = generationRef.current; + pendingTranscriptionsRef.current += 1; + setIsTranscribing(true); + + try { + const wavBlob = new Blob([encodeWav(samples, SAMPLE_RATE)], { + type: "audio/wav", + }); + const audio = await blobToBase64(wavBlob); + const response = await transcribeDictation({ + audio, + mimeType: "audio/wav", + provider: activeProvider, + }); + + if (gen !== generationRef.current) { + return; + } + + if (response.text.trim()) { + onTranscriptionRef.current(response.text); + } + } catch (error) { + onErrorRef.current(toErrorMessage(error)); + } finally { + pendingTranscriptionsRef.current -= 1; + if (pendingTranscriptionsRef.current === 0) { + setIsTranscribing(false); + } + } + }, []); + + const flushPendingSamples = useCallback(() => { + const chunks = samplesRef.current; + if (chunks.length === 0) { + return; + } + + const totalSamples = chunks.reduce( + (count, chunk) => count + chunk.length, + 0, + ); + const merged = new Float32Array(totalSamples); + let offset = 0; + for (const chunk of chunks) { + merged.set(chunk, offset); + offset += chunk.length; + } + + samplesRef.current = []; + void transcribeChunk(merged); + }, [transcribeChunk]); + + const stopRecording = useCallback( + (options?: { flushPending?: boolean }) => { + const flushPending = options?.flushPending ?? 
true; + + if (flushPending && samplesRef.current.length > 0) { + flushPendingSamples(); + } else if (!flushPending) { + samplesRef.current = []; + generationRef.current += 1; + } + + vadStateRef.current = createInitialVadState(); + cleanupAudioGraph(); + setIsRecording(false); + }, + [cleanupAudioGraph, flushPendingSamples], + ); + + const handleFrame = useCallback( + (samples: Float32Array) => { + const { decision, nextState } = advanceVadState( + vadStateRef.current, + getFrameRms(samples), + ); + vadStateRef.current = nextState; + + if (decision === "ignore") { + return; + } + + if (decision === "discard") { + samplesRef.current = []; + return; + } + + samplesRef.current.push(new Float32Array(samples)); + + if (decision === "append_and_flush") { + flushPendingSamples(); + } + }, + [flushPendingSamples], + ); + + const startRecording = useCallback(async () => { + if (!isEnabled || !provider) { + onError("Voice input is not configured"); + return; + } + + try { + const audioConstraints: MediaTrackConstraints = { + autoGainControl: true, + echoCancellation: true, + noiseSuppression: true, + }; + + if (preferredMicrophoneId) { + audioConstraints.deviceId = { exact: preferredMicrophoneId }; + } + + let stream: MediaStream; + try { + stream = await navigator.mediaDevices.getUserMedia({ + audio: audioConstraints, + }); + } catch (error) { + if ( + preferredMicrophoneId && + error instanceof DOMException && + (error.name === "NotFoundError" || + error.name === "OverconstrainedError") + ) { + delete audioConstraints.deviceId; + stream = await navigator.mediaDevices.getUserMedia({ + audio: audioConstraints, + }); + } else { + throw error; + } + } + + streamRef.current = stream; + samplesRef.current = []; + vadStateRef.current = createInitialVadState(); + + const context = new AudioContext({ sampleRate: SAMPLE_RATE }); + audioContextRef.current = context; + await context.resume(); + + const source = context.createMediaStreamSource(stream); + const processor = context.createScriptProcessor(1024, 1, 1); + const silence = context.createGain(); + silence.gain.value = 0; + + processor.onaudioprocess = (event) => { + const channel = event.inputBuffer.getChannelData(0); + handleFrame(new Float32Array(channel)); + }; + + source.connect(processor); + processor.connect(silence); + silence.connect(context.destination); + + sourceRef.current = source; + processorRef.current = processor; + setIsRecording(true); + } catch (error) { + stopRecording({ flushPending: false }); + onError(toErrorMessage(error)); + } + }, [ + handleFrame, + isEnabled, + onError, + preferredMicrophoneId, + provider, + stopRecording, + ]); + + const toggleRecording = useCallback(() => { + if (isRecording) { + stopRecording(); + } else { + void startRecording(); + } + }, [isRecording, startRecording, stopRecording]); + + useEffect( + () => () => { + stopRecording({ flushPending: false }); + }, + [stopRecording], + ); + + useEffect(() => { + if (!provider && isRecording) { + stopRecording({ flushPending: false }); + } + }, [isRecording, provider, stopRecording]); + + return { + isEnabled, + isRecording, + isTranscribing, + startRecording, + stopRecording, + toggleRecording, + }; +} diff --git a/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts b/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts new file mode 100644 index 000000000000..12fe9ce1f25f --- /dev/null +++ b/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts @@ -0,0 +1,117 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { 
  getDictationConfig } from "@/shared/api/dictation";
+import type {
+  DictationProvider,
+  DictationProviderStatus,
+} from "@/shared/types/dictation";
+import type { ChatAttachmentDraft } from "@/shared/types/messages";
+import { useDictationRecorder } from "./useDictationRecorder";
+import { useVoiceInputPreferences } from "./useVoiceInputPreferences";
+import {
+  appendTranscribedText,
+  getAutoSubmitMatch,
+  getDefaultDictationProvider,
+  VOICE_DICTATION_CONFIG_EVENT,
+} from "../lib/voiceInput";
+
+interface UseVoiceDictationOptions {
+  text: string;
+  setText: (value: string) => void;
+  attachments: ChatAttachmentDraft[];
+  clearAttachments: () => void;
+  selectedPersonaId: string | null;
+  onSend: (
+    text: string,
+    personaId?: string,
+    attachments?: ChatAttachmentDraft[],
+  ) => void;
+  resetTextarea: () => void;
+}
+
+export function useVoiceDictation({
+  text,
+  setText,
+  attachments,
+  clearAttachments,
+  selectedPersonaId,
+  onSend,
+  resetTextarea,
+}: UseVoiceDictationOptions) {
+  const voicePrefs = useVoiceInputPreferences();
+  const [providerStatuses, setProviderStatuses] = useState<
+    Partial<Record<DictationProvider, DictationProviderStatus>>
+  >({});
+
+  const fetchDictationConfig = useCallback(() => {
+    getDictationConfig()
+      .then(setProviderStatuses)
+      .catch(() => {});
+  }, []);
+
+  useEffect(() => {
+    fetchDictationConfig();
+    window.addEventListener(VOICE_DICTATION_CONFIG_EVENT, fetchDictationConfig);
+    return () =>
+      window.removeEventListener(
+        VOICE_DICTATION_CONFIG_EVENT,
+        fetchDictationConfig,
+      );
+  }, [fetchDictationConfig]);
+
+  const activeVoiceProvider =
+    voicePrefs.selectedProvider ??
+    (voicePrefs.hasStoredProviderPreference
+      ? null
+      : getDefaultDictationProvider(providerStatuses));
+
+  const providerConfigured =
+    activeVoiceProvider != null &&
+    providerStatuses[activeVoiceProvider]?.configured === true;
+
+  const stopRecordingRef = useRef<
+    (options?: { flushPending?: boolean }) => void
+  >(() => {});
+
+  const handleTranscription = useCallback(
+    (fragment: string) => {
+      const match = getAutoSubmitMatch(fragment, voicePrefs.autoSubmitPhrases);
+      if (match) {
+        const merged = appendTranscribedText(text, match.textWithoutPhrase);
+        if (merged.trim()) {
+          stopRecordingRef.current({ flushPending: false });
+          onSend(
+            merged.trim(),
+            selectedPersonaId ?? undefined,
+            attachments.length > 0 ?
attachments : undefined, + ); + setText(""); + clearAttachments(); + resetTextarea(); + } + } else { + const merged = appendTranscribedText(text, fragment); + setText(merged); + } + }, + [ + attachments, + clearAttachments, + onSend, + resetTextarea, + selectedPersonaId, + setText, + text, + voicePrefs.autoSubmitPhrases, + ], + ); + + const handleVoiceError = useCallback((_message: string) => {}, []); + + const dictation = useDictationRecorder({ + provider: activeVoiceProvider, + providerConfigured, + preferredMicrophoneId: voicePrefs.preferredMicrophoneId, + onError: handleVoiceError, + onTranscription: handleTranscription, + }); + stopRecordingRef.current = dictation.stopRecording; + + return dictation; +} diff --git a/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts b/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts new file mode 100644 index 000000000000..602c125e58ee --- /dev/null +++ b/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts @@ -0,0 +1,161 @@ +import { useCallback, useEffect, useMemo, useState } from "react"; +import { + DISABLED_DICTATION_PROVIDER_STORAGE_VALUE, + DEFAULT_AUTO_SUBMIT_PHRASES_RAW, + VOICE_AUTO_SUBMIT_PHRASES_STORAGE_KEY, + VOICE_DICTATION_PREFERRED_MIC_STORAGE_KEY, + VOICE_DICTATION_PROVIDER_STORAGE_KEY, + normalizeDictationProvider, + parseAutoSubmitPhrases, +} from "../lib/voiceInput"; +import type { DictationProvider } from "@/shared/types/dictation"; + +const VOICE_INPUT_PREFERENCES_EVENT = "goose:voice-input-preferences"; + +function readStoredAutoSubmitPhrases() { + try { + return ( + window.localStorage.getItem(VOICE_AUTO_SUBMIT_PHRASES_STORAGE_KEY) ?? + DEFAULT_AUTO_SUBMIT_PHRASES_RAW + ); + } catch { + return DEFAULT_AUTO_SUBMIT_PHRASES_RAW; + } +} + +function readStoredDictationProvider(): DictationProvider | null { + try { + const storedValue = window.localStorage.getItem( + VOICE_DICTATION_PROVIDER_STORAGE_KEY, + ); + + if (storedValue === DISABLED_DICTATION_PROVIDER_STORAGE_VALUE) { + return null; + } + + return normalizeDictationProvider(storedValue); + } catch { + return null; + } +} + +function readHasStoredDictationProviderPreference() { + try { + return ( + window.localStorage.getItem(VOICE_DICTATION_PROVIDER_STORAGE_KEY) !== null + ); + } catch { + return false; + } +} + +function readStoredPreferredMicrophoneId() { + try { + return window.localStorage.getItem( + VOICE_DICTATION_PREFERRED_MIC_STORAGE_KEY, + ); + } catch { + return null; + } +} + +export function useVoiceInputPreferences() { + const [rawAutoSubmitPhrases, setRawAutoSubmitPhrasesState] = useState( + readStoredAutoSubmitPhrases, + ); + const [selectedProvider, setSelectedProviderState] = useState( + readStoredDictationProvider, + ); + const [hasStoredProviderPreference, setHasStoredProviderPreferenceState] = + useState(readHasStoredDictationProviderPreference); + const [preferredMicrophoneId, setPreferredMicrophoneIdState] = useState( + readStoredPreferredMicrophoneId, + ); + + useEffect(() => { + const syncFromStorage = () => { + setRawAutoSubmitPhrasesState(readStoredAutoSubmitPhrases()); + setSelectedProviderState(readStoredDictationProvider()); + setHasStoredProviderPreferenceState( + readHasStoredDictationProviderPreference(), + ); + setPreferredMicrophoneIdState(readStoredPreferredMicrophoneId()); + }; + + window.addEventListener("storage", syncFromStorage); + window.addEventListener( + VOICE_INPUT_PREFERENCES_EVENT, + syncFromStorage as EventListener, + ); + + return () => { + window.removeEventListener("storage", 
syncFromStorage); + window.removeEventListener( + VOICE_INPUT_PREFERENCES_EVENT, + syncFromStorage as EventListener, + ); + }; + }, []); + + const setRawAutoSubmitPhrases = useCallback((value: string) => { + setRawAutoSubmitPhrasesState(value); + + try { + window.localStorage.setItem(VOICE_AUTO_SUBMIT_PHRASES_STORAGE_KEY, value); + window.dispatchEvent(new Event(VOICE_INPUT_PREFERENCES_EVENT)); + } catch { + // localStorage may be unavailable + } + }, []); + + const setSelectedProvider = useCallback((value: DictationProvider | null) => { + setSelectedProviderState(value); + setHasStoredProviderPreferenceState(true); + + try { + window.localStorage.setItem( + VOICE_DICTATION_PROVIDER_STORAGE_KEY, + value ?? DISABLED_DICTATION_PROVIDER_STORAGE_VALUE, + ); + window.dispatchEvent(new Event(VOICE_INPUT_PREFERENCES_EVENT)); + } catch { + // localStorage may be unavailable + } + }, []); + + const setPreferredMicrophoneId = useCallback((value: string | null) => { + setPreferredMicrophoneIdState(value); + + try { + if (value) { + window.localStorage.setItem( + VOICE_DICTATION_PREFERRED_MIC_STORAGE_KEY, + value, + ); + } else { + window.localStorage.removeItem( + VOICE_DICTATION_PREFERRED_MIC_STORAGE_KEY, + ); + } + window.dispatchEvent(new Event(VOICE_INPUT_PREFERENCES_EVENT)); + } catch { + // localStorage may be unavailable + } + }, []); + + const autoSubmitPhrases = useMemo( + () => parseAutoSubmitPhrases(rawAutoSubmitPhrases), + [rawAutoSubmitPhrases], + ); + + return { + autoSubmitPhrases, + hasStoredProviderPreference, + preferredMicrophoneId, + rawAutoSubmitPhrases, + selectedProvider, + setPreferredMicrophoneId, + setRawAutoSubmitPhrases, + setSelectedProvider, + }; +} diff --git a/ui/goose2/src/features/chat/lib/dictationVad.test.ts b/ui/goose2/src/features/chat/lib/dictationVad.test.ts new file mode 100644 index 000000000000..89e96045c507 --- /dev/null +++ b/ui/goose2/src/features/chat/lib/dictationVad.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from "vitest"; +import { advanceVadState, createInitialVadState } from "./dictationVad"; + +function runFrames(levels: number[]) { + const decisions: string[] = []; + let state = createInitialVadState(); + + for (const level of levels) { + const result = advanceVadState(state, level); + decisions.push(result.decision); + state = result.nextState; + } + + return decisions; +} + +describe("dictationVad", () => { + it("ignores silence-only audio", () => { + expect(runFrames([0, 0, 0, 0])).toEqual([ + "ignore", + "ignore", + "ignore", + "ignore", + ]); + }); + + it("discards short noise bursts that never confirm speech", () => { + expect(runFrames([0.03, 0, 0, 0])).toEqual([ + "append", + "append", + "append", + "discard", + ]); + }); + + it("flushes a chunk after speech followed by trailing silence", () => { + expect(runFrames([0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0])).toContain( + "append_and_flush", + ); + }); + + it("returns to ignoring silence after a flush, ready for another chunk", () => { + const decisions = runFrames([ + 0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0, 0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0, + ]); + + expect( + decisions.filter((decision) => decision === "append_and_flush"), + ).toHaveLength(2); + }); +}); diff --git a/ui/goose2/src/features/chat/lib/dictationVad.ts b/ui/goose2/src/features/chat/lib/dictationVad.ts new file mode 100644 index 000000000000..0b4561e8cbae --- /dev/null +++ b/ui/goose2/src/features/chat/lib/dictationVad.ts @@ -0,0 +1,147 @@ +export type VadPhase = "idle" | "primed" | "speaking" | "trailing"; + +export 
type VadDecision = "ignore" | "append" | "append_and_flush" | "discard"; + +export interface VadState { + phase: VadPhase; + speechFrames: number; + silenceFrames: number; + framesInChunk: number; +} + +const SPEECH_RMS_THRESHOLD = 0.018; +const SPEECH_CONFIRMATION_FRAMES = 2; +const MAX_PRIMED_SILENCE_FRAMES = 2; +const TRAILING_SILENCE_FRAMES = 6; +const MIN_SPEECH_FRAMES = 3; + +export function createInitialVadState(): VadState { + return { + phase: "idle", + speechFrames: 0, + silenceFrames: 0, + framesInChunk: 0, + }; +} + +export function getFrameRms(samples: Float32Array): number { + let sum = 0; + for (let index = 0; index < samples.length; index += 1) { + const value = samples[index] ?? 0; + sum += value * value; + } + + return Math.sqrt(sum / Math.max(samples.length, 1)); +} + +export function advanceVadState( + state: VadState, + frameRms: number, +): { decision: VadDecision; nextState: VadState } { + const isSpeech = frameRms >= SPEECH_RMS_THRESHOLD; + + if (state.phase === "idle") { + if (!isSpeech) { + return { decision: "ignore" as const, nextState: state }; + } + + return { + decision: "append" as const, + nextState: { + phase: "primed" as const, + speechFrames: 1, + silenceFrames: 0, + framesInChunk: 1, + }, + }; + } + + if (state.phase === "primed") { + if (isSpeech) { + const speechFrames = state.speechFrames + 1; + return { + decision: "append" as const, + nextState: { + phase: + speechFrames >= SPEECH_CONFIRMATION_FRAMES ? "speaking" : "primed", + speechFrames, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + const silenceFrames = state.silenceFrames + 1; + if (silenceFrames > MAX_PRIMED_SILENCE_FRAMES) { + return { + decision: "discard" as const, + nextState: createInitialVadState(), + }; + } + + return { + decision: "append" as const, + nextState: { + ...state, + silenceFrames, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + if (state.phase === "speaking") { + if (isSpeech) { + return { + decision: "append" as const, + nextState: { + phase: "speaking" as const, + speechFrames: state.speechFrames + 1, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + return { + decision: "append" as const, + nextState: { + phase: "trailing" as const, + speechFrames: state.speechFrames, + silenceFrames: 1, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + if (isSpeech) { + return { + decision: "append" as const, + nextState: { + phase: "speaking" as const, + speechFrames: state.speechFrames + 1, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + const silenceFrames = state.silenceFrames + 1; + if (silenceFrames < TRAILING_SILENCE_FRAMES) { + return { + decision: "append" as const, + nextState: { + ...state, + silenceFrames, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + return { + decision: + state.speechFrames >= MIN_SPEECH_FRAMES + ? 
("append_and_flush" as const) + : ("discard" as const), + nextState: createInitialVadState(), + }; +} diff --git a/ui/goose2/src/features/chat/lib/voiceInput.test.ts b/ui/goose2/src/features/chat/lib/voiceInput.test.ts new file mode 100644 index 000000000000..6ca3ae799d86 --- /dev/null +++ b/ui/goose2/src/features/chat/lib/voiceInput.test.ts @@ -0,0 +1,85 @@ +import { describe, expect, it } from "vitest"; +import { + appendTranscribedText, + getDefaultDictationProvider, + getAutoSubmitMatch, + parseAutoSubmitPhrases, + replaceTrailingTranscribedText, +} from "./voiceInput"; + +describe("voiceInput helpers", () => { + it("parses comma-separated auto-submit phrases", () => { + expect(parseAutoSubmitPhrases(" submit, Ship It ,submit ,, ")).toEqual([ + "submit", + "ship it", + ]); + }); + + it("appends dictated text without smashing words together", () => { + expect(appendTranscribedText("hello", "world")).toBe("hello world"); + expect(appendTranscribedText("hello ", "world")).toBe("hello world"); + expect(appendTranscribedText("hello", ", world")).toBe("hello, world"); + }); + + it("replaces only the trailing dictated segment", () => { + expect( + replaceTrailingTranscribedText( + "draft dictated text", + "dictated text", + "dictated text submit", + ), + ).toBe("draft dictated text submit"); + }); + + it("matches auto-submit phrases only at the end of dictated text", () => { + expect(getAutoSubmitMatch("please submit now", ["submit"])).toBeNull(); + expect(getAutoSubmitMatch("please SUBMIT.", ["submit"])).toEqual({ + matchedPhrase: "submit", + textWithoutPhrase: "please", + }); + }); + + it("picks the first configured dictation provider by priority", () => { + expect( + getDefaultDictationProvider({ + openai: { + configured: false, + description: "OpenAI", + usesProviderConfig: true, + availableModels: [], + }, + groq: { + configured: true, + description: "Groq", + usesProviderConfig: false, + availableModels: [], + }, + local: { + configured: true, + description: "Local", + usesProviderConfig: false, + availableModels: [], + }, + }), + ).toBe("groq"); + }); + + it("falls back to the first available provider when none are configured", () => { + expect( + getDefaultDictationProvider({ + elevenlabs: { + configured: false, + description: "ElevenLabs", + usesProviderConfig: false, + availableModels: [], + }, + local: { + configured: false, + description: "Local", + usesProviderConfig: false, + availableModels: [], + }, + }), + ).toBe("local"); + }); +}); diff --git a/ui/goose2/src/features/chat/lib/voiceInput.ts b/ui/goose2/src/features/chat/lib/voiceInput.ts new file mode 100644 index 000000000000..9997c451311a --- /dev/null +++ b/ui/goose2/src/features/chat/lib/voiceInput.ts @@ -0,0 +1,177 @@ +import type { + DictationProvider, + DictationProviderStatus, +} from "@/shared/types/dictation"; + +export const VOICE_AUTO_SUBMIT_PHRASES_STORAGE_KEY = + "goose:voice-auto-submit-phrases"; +export const VOICE_DICTATION_PROVIDER_STORAGE_KEY = + "goose:voice-dictation-provider"; +export const VOICE_DICTATION_PREFERRED_MIC_STORAGE_KEY = + "goose:voice-dictation-preferred-mic"; +export const VOICE_DICTATION_CONFIG_EVENT = "goose:voice-dictation-config"; +export const DISABLED_DICTATION_PROVIDER_STORAGE_VALUE = "__disabled__"; + +export const DEFAULT_AUTO_SUBMIT_PHRASES_RAW = "submit"; + +const TRAILING_PUNCTUATION_REGEX = /[\s"'`.,!?;:)\]}]+$/u; + +function normalizePhrase(value: string): string { + return value + .toLowerCase() + .replace(/\s+/g, " ") + .trim() + .replace(TRAILING_PUNCTUATION_REGEX, "") + 
.trim(); +} + +export function parseAutoSubmitPhrases(rawValue: string | null | undefined) { + if (!rawValue) { + return []; + } + + return Array.from( + new Set( + rawValue + .split(",") + .map((value) => normalizePhrase(value)) + .filter(Boolean), + ), + ); +} + +export function normalizeDictationProvider( + value: string | null | undefined, +): DictationProvider | null { + if ( + value === "openai" || + value === "groq" || + value === "elevenlabs" || + value === "local" + ) { + return value; + } + + return null; +} + +export function getDefaultDictationProvider( + providerStatuses: Partial>, +): DictationProvider | null { + const configuredProviderPriority: DictationProvider[] = [ + "openai", + "groq", + "elevenlabs", + "local", + ]; + const fallbackProviderPriority: DictationProvider[] = [ + "local", + "openai", + "groq", + "elevenlabs", + ]; + + for (const provider of configuredProviderPriority) { + if (providerStatuses[provider]?.configured) { + return provider; + } + } + + for (const provider of fallbackProviderPriority) { + if (providerStatuses[provider]) { + return provider; + } + } + + return null; +} + +export function appendTranscribedText(baseText: string, fragment: string) { + const normalizedFragment = fragment.replace(/\s+/g, " ").trim(); + if (!normalizedFragment) { + return baseText; + } + + if (!baseText.trim()) { + return normalizedFragment; + } + + if (/[\s([{/-]$/.test(baseText) || /^[,.;!?)]/.test(normalizedFragment)) { + return `${baseText}${normalizedFragment}`; + } + + return `${baseText} ${normalizedFragment}`; +} + +export function replaceTrailingTranscribedText( + fullText: string, + previousTranscribedText: string, + nextTranscribedText: string, +) { + if (!previousTranscribedText) { + return appendTranscribedText(fullText, nextTranscribedText); + } + + if (fullText.endsWith(previousTranscribedText)) { + return appendTranscribedText( + fullText.slice(0, -previousTranscribedText.length), + nextTranscribedText, + ); + } + + const trimmedPreviousText = previousTranscribedText.trim(); + if (trimmedPreviousText && fullText.endsWith(trimmedPreviousText)) { + return appendTranscribedText( + fullText.slice(0, -trimmedPreviousText.length), + nextTranscribedText, + ); + } + + return appendTranscribedText(fullText, nextTranscribedText); +} + +export function getAutoSubmitMatch( + transcribedText: string, + autoSubmitPhrases: string[], +) { + const normalizedTranscribedText = normalizePhrase(transcribedText); + if (!normalizedTranscribedText) { + return null; + } + + const sortedPhrases = [...autoSubmitPhrases].sort( + (left, right) => right.length - left.length, + ); + + for (const phrase of sortedPhrases) { + if (!normalizedTranscribedText.endsWith(phrase)) { + continue; + } + + const phraseStartIndex = normalizedTranscribedText.length - phrase.length; + if ( + phraseStartIndex > 0 && + normalizedTranscribedText[phraseStartIndex - 1] !== " " + ) { + continue; + } + + const trimmedText = transcribedText.replace(TRAILING_PUNCTUATION_REGEX, ""); + const textWithoutPhrase = trimmedText.slice(0, -phrase.length).trimEnd(); + + return { + matchedPhrase: phrase, + textWithoutPhrase, + }; + } + + return null; +} + +export function notifyVoiceDictationConfigChanged() { + try { + window.dispatchEvent(new Event(VOICE_DICTATION_CONFIG_EVENT)); + } catch { + // no-op + } +} diff --git a/ui/goose2/src/features/chat/ui/ChatInput.tsx b/ui/goose2/src/features/chat/ui/ChatInput.tsx index 9b40f2b768f3..8013c6f7eb02 100644 --- a/ui/goose2/src/features/chat/ui/ChatInput.tsx +++ 
b/ui/goose2/src/features/chat/ui/ChatInput.tsx @@ -22,6 +22,7 @@ import { } from "../hooks/useChatInputAttachments"; import type { ModelOption } from "../types"; import { ChatInputAttachments } from "./ChatInputAttachments"; +import { useVoiceDictation } from "../hooks/useVoiceDictation"; export interface ProjectOption { id: string; @@ -121,6 +122,22 @@ export function ChatInput({ clearAttachments, } = useChatInputAttachments(); + const resetTextarea = useCallback(() => { + if (textareaRef.current) { + textareaRef.current.style.height = "auto"; + } + }, []); + + const dictation = useVoiceDictation({ + text, + setText, + attachments, + clearAttachments, + selectedPersonaId, + onSend, + resetTextarea, + }); + const activePersona = useMemo( () => personas.find((persona) => persona.id === selectedPersonaId) ?? null, [personas, selectedPersonaId], @@ -178,6 +195,14 @@ export function ChatInput({ useEffect(() => textareaRef.current?.focus(), []); const handleSend = useCallback(() => { + // If recording, stop and flush — the transcription callback will + // append text and may auto-submit. Don't send the current text yet + // because the final transcription hasn't arrived. + if (dictation.isRecording || dictation.isTranscribing) { + dictation.stopRecording(); + return; + } + if (!canSend) { return; } @@ -196,6 +221,7 @@ export function ChatInput({ attachments, canSend, clearAttachments, + dictation, onSend, selectedPersonaId, setText, @@ -408,7 +434,13 @@ export function ChatInput({ onChange={handleInput} onKeyDown={handleKeyDown} onPaste={handlePaste} - placeholder={effectivePlaceholder} + placeholder={ + dictation.isRecording + ? t("toolbar.voiceInputRecording") + : dictation.isTranscribing + ? t("toolbar.voiceInputTranscribing") + : effectivePlaceholder + } disabled={disabled} rows={1} className="mb-3 min-h-[36px] max-h-[200px] w-full resize-none bg-transparent px-1 text-[14px] leading-relaxed text-foreground placeholder:font-light placeholder:text-muted-foreground/60 focus:outline-none focus-visible:ring-0 focus-visible:ring-offset-0 disabled:opacity-60" @@ -447,6 +479,10 @@ export function ChatInput({ onSend={handleSend} onStop={onStop} isCompact={isCompact} + voiceEnabled={dictation.isEnabled} + voiceRecording={dictation.isRecording} + voiceTranscribing={dictation.isTranscribing} + onVoiceToggle={dictation.toggleRecording} /> diff --git a/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx b/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx index 3e25b8f084ce..e5b553569a93 100644 --- a/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx +++ b/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx @@ -90,6 +90,11 @@ interface ChatInputToolbarProps { onAttachFiles?: () => void; onAttachFolders?: () => void; disabled?: boolean; + // Voice + voiceEnabled?: boolean; + voiceRecording?: boolean; + voiceTranscribing?: boolean; + onVoiceToggle?: () => void; // Layout isCompact: boolean; } @@ -124,6 +129,10 @@ export function ChatInputToolbar({ onAttachFiles, onAttachFolders, disabled = false, + voiceEnabled = false, + voiceRecording = false, + voiceTranscribing = false, + onVoiceToggle, isCompact, }: ChatInputToolbarProps) { const { t } = useTranslation("chat"); @@ -384,14 +393,32 @@ export function ChatInputToolbar({ type="button" variant="ghost" size="icon-sm" - disabled - aria-label={t("toolbar.voiceInputSoon")} + disabled={!voiceEnabled || disabled} + onClick={onVoiceToggle} + aria-label={ + voiceRecording + ? 
t("toolbar.voiceInputRecording") + : t("toolbar.voiceInput") + } + className={cn( + voiceRecording && + "bg-destructive/10 text-destructive hover:bg-destructive/20 hover:text-destructive", + voiceTranscribing && "animate-pulse", + )} > - {t("toolbar.voiceInputSoon")} + + {!voiceEnabled + ? t("toolbar.voiceInputDisabled") + : voiceRecording + ? t("toolbar.voiceInputRecording") + : voiceTranscribing + ? t("toolbar.voiceInputTranscribing") + : t("toolbar.voiceInput")} + diff --git a/ui/goose2/src/features/settings/ui/SettingsModal.tsx b/ui/goose2/src/features/settings/ui/SettingsModal.tsx index 65ab6b6aff76..03400ccef214 100644 --- a/ui/goose2/src/features/settings/ui/SettingsModal.tsx +++ b/ui/goose2/src/features/settings/ui/SettingsModal.tsx @@ -21,6 +21,7 @@ import { SelectValue, } from "@/shared/ui/select"; import { + Mic, Palette, Settings2, FolderKanban, @@ -34,6 +35,7 @@ import { AppearanceSettings } from "./AppearanceSettings"; import { DoctorSettings } from "./DoctorSettings"; import { ProvidersSettings } from "./ProvidersSettings"; import { ExtensionsSettings } from "@/features/extensions/ui/ExtensionsSettings"; +import { VoiceInputSettings } from "./VoiceInputSettings"; import { listArchivedProjects, restoreProject, @@ -50,6 +52,7 @@ const NAV_ITEMS = [ { id: "appearance", labelKey: "nav.appearance", icon: Palette }, { id: "providers", labelKey: "nav.providers", icon: IconPlug }, { id: "extensions", labelKey: "nav.extensions", icon: IconPuzzle }, + { id: "voice", labelKey: "nav.voice", icon: Mic }, { id: "general", labelKey: "nav.general", icon: Settings2 }, { id: "projects", labelKey: "nav.projects", icon: FolderKanban }, { id: "chats", labelKey: "nav.chats", icon: MessageSquare }, @@ -241,6 +244,7 @@ export function SettingsModal({ {activeSection === "appearance" && } {activeSection === "providers" && } {activeSection === "extensions" && } + {activeSection === "voice" && } {activeSection === "doctor" && } {activeSection === "general" && (
diff --git a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx
new file mode 100644
index 000000000000..8ccc14908ecb
--- /dev/null
+++ b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx
@@ -0,0 +1,465 @@
+import { useCallback, useEffect, useMemo, useState } from "react";
+import { useTranslation } from "react-i18next";
+import {
+  deleteDictationProviderSecret,
+  getDictationConfig,
+  saveDictationModelSelection,
+  saveDictationProviderSecret,
+} from "@/shared/api/dictation";
+import {
+  notifyVoiceDictationConfigChanged,
+  getDefaultDictationProvider,
+} from "@/features/chat/lib/voiceInput";
+import { useVoiceInputPreferences } from "@/features/chat/hooks/useVoiceInputPreferences";
+import type {
+  DictationProvider,
+  DictationProviderStatus,
+} from "@/shared/types/dictation";
+import { useAudioDevices } from "@/shared/ui/ai-elements/mic-selector";
+import { Button } from "@/shared/ui/button";
+import { Input } from "@/shared/ui/input";
+import {
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+} from "@/shared/ui/select";
+
+const DISABLED_PROVIDER = "__disabled__";
+
+export function VoiceInputSettings() {
+  const { t } = useTranslation(["settings", "chat", "common"]);
+  const {
+    hasStoredProviderPreference,
+    preferredMicrophoneId,
+    rawAutoSubmitPhrases,
+    selectedProvider,
+    setPreferredMicrophoneId,
+    setRawAutoSubmitPhrases,
+    setSelectedProvider,
+  } = useVoiceInputPreferences();
+  const [providerStatuses, setProviderStatuses] = useState<
+    Record<DictationProvider, DictationProviderStatus>
+  >({} as Record<DictationProvider, DictationProviderStatus>);
+  const [apiKeyInput, setApiKeyInput] = useState("");
+  const [isEditingApiKey, setIsEditingApiKey] = useState(false);
+  const [loading, setLoading] = useState(true);
+  const [error, setError] = useState<string | null>(null);
+  const {
+    devices,
+    error: devicesError,
+    hasPermission,
+    loadDevices,
+    loading: loadingDevices,
+  } = useAudioDevices();
+  const isMicrophoneSupported =
+    typeof navigator !== "undefined" && !!navigator.mediaDevices;
+  const permissionStatus = hasPermission ? "authorized" : "not_determined";
+  const requestPermission = loadDevices;
+
+  const refreshConfig = useCallback(async () => {
+    const nextConfig = await getDictationConfig();
+    setProviderStatuses(nextConfig);
+
+    if (!hasStoredProviderPreference) {
+      const defaultProvider = getDefaultDictationProvider(nextConfig);
+      if (defaultProvider) {
+        setSelectedProvider(defaultProvider);
+      }
+      return;
+    }
+
+    if (!selectedProvider) {
+      return;
+    }
+
+    if (!nextConfig[selectedProvider]) {
+      setSelectedProvider(null);
+    }
+  }, [hasStoredProviderPreference, selectedProvider, setSelectedProvider]);
+
+  useEffect(() => {
+    const load = async () => {
+      setLoading(true);
+      setError(null);
+
+      try {
+        await refreshConfig();
+      } catch (caughtError) {
+        setError(
+          caughtError instanceof Error
+            ? caughtError.message
+            : t("general.voiceInput.loadError"),
+        );
+      } finally {
+        setLoading(false);
+      }
+    };
+
+    void load();
+  }, [refreshConfig, t]);
+
+  const selectedStatus = selectedProvider
+    ? providerStatuses[selectedProvider]
+    : null;
+
+  const providerOptions = useMemo(
+    () =>
+      Object.entries(providerStatuses) as Array<
+        [DictationProvider, DictationProviderStatus]
+      >,
+    [providerStatuses],
+  );
+
+  const currentModelValue =
+    selectedStatus?.selectedModel ?? selectedStatus?.defaultModel ??
""; + + const saveApiKey = useCallback(async () => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await saveDictationProviderSecret( + selectedProvider, + apiKeyInput, + selectedStatus?.configKey ?? undefined, + ); + setApiKeyInput(""); + setIsEditingApiKey(false); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.saveError"), + ); + } + }, [apiKeyInput, refreshConfig, selectedProvider, selectedStatus, t]); + + const removeApiKey = useCallback(async () => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await deleteDictationProviderSecret( + selectedProvider, + selectedStatus?.configKey ?? undefined, + ); + setApiKeyInput(""); + setIsEditingApiKey(false); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.deleteError"), + ); + } + }, [refreshConfig, selectedProvider, selectedStatus, t]); + + const handleModelChange = useCallback( + async (modelId: string) => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await saveDictationModelSelection(selectedProvider, modelId); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.saveError"), + ); + } + }, + [refreshConfig, selectedProvider, t], + ); + + const selectedMicrophoneLabel = useMemo(() => { + if (!preferredMicrophoneId) { + return t("general.voiceInput.systemMicrophone"); + } + + return ( + devices.find((device) => device.deviceId === preferredMicrophoneId) + ?.label || t("general.voiceInput.systemMicrophone") + ); + }, [devices, preferredMicrophoneId, t]); + + if (loading) { + return ( +
+      <div>
+        <h3>{t("general.voiceInput.label")}</h3>
+        <p>{t("common:labels.loading")}</p>
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      {/* Markup reconstructed; layout classes and a few literal labels (marked "…") were lost in the source. */}
+      <div>
+        <h3>{t("general.voiceInput.label")}</h3>
+        <p>{t("general.voiceInput.description")}</p>
+      </div>
+
+      <div>
+        <label>{t("general.voiceInput.providerLabel")}</label>
+        <Select
+          value={selectedProvider ?? DISABLED_PROVIDER}
+          onValueChange={(value) =>
+            setSelectedProvider(
+              value === DISABLED_PROVIDER ? null : (value as DictationProvider),
+            )
+          }
+        >
+          <SelectTrigger>
+            <SelectValue />
+          </SelectTrigger>
+          <SelectContent>
+            <SelectItem value={DISABLED_PROVIDER}>{/* … */}</SelectItem>
+            {providerOptions.map(([provider, status]) => (
+              <SelectItem key={provider} value={provider}>
+                {status.description}
+              </SelectItem>
+            ))}
+          </SelectContent>
+        </Select>
+      </div>
+
+      {selectedStatus ? (
+        <>
+          {!selectedStatus.usesProviderConfig &&
+          selectedProvider !== "local" ? (
+            <div>
+              {isEditingApiKey ? (
+                <>
+                  <div>
+                    <h4>{t("general.voiceInput.apiKeyLabel")}</h4>
+                    <p>{t("general.voiceInput.apiKeyDescription")}</p>
+                  </div>
+                  <Input
+                    type="password"
+                    value={apiKeyInput}
+                    onChange={(event) => setApiKeyInput(event.target.value)}
+                    placeholder={t("general.voiceInput.apiKeyPlaceholder")}
+                    className="max-w-sm"
+                  />
+                  <div>
+                    <Button onClick={() => void saveApiKey()}>{/* … */}</Button>
+                    <Button
+                      variant="ghost"
+                      onClick={() => {
+                        setApiKeyInput("");
+                        setIsEditingApiKey(false);
+                      }}
+                    >
+                      {/* … */}
+                    </Button>
+                  </div>
+                </>
+              ) : (
+                <div>
+                  <div>
+                    <h4>{t("general.voiceInput.apiKeyLabel")}</h4>
+                    <p>
+                      {selectedStatus.configured
+                        ? t("general.voiceInput.apiKeyConfigured")
+                        : t("general.voiceInput.apiKeyDescription")}
+                    </p>
+                  </div>
+                  <div>
+                    <Button onClick={() => setIsEditingApiKey(true)}>
+                      {/* … */}
+                    </Button>
+                    {selectedStatus.configured ? (
+                      <Button
+                        variant="ghost"
+                        onClick={() => void removeApiKey()}
+                      >
+                        {/* … */}
+                      </Button>
+                    ) : null}
+                  </div>
+                </div>
+              )}
+            </div>
+          ) : null}
+
+          {selectedProvider === "local" ? (
+            <div>
+              <h4>{t("general.voiceInput.localModelLabel")}</h4>
+              <p>{t("general.voiceInput.localModelUnavailable")}</p>
+            </div>
+          ) : (selectedStatus.availableModels ?? []).length > 0 ? (
+            <div>
+              <label>{t("general.voiceInput.modelLabel")}</label>
+              <Select
+                value={currentModelValue}
+                onValueChange={(value) => void handleModelChange(value)}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  {(selectedStatus.availableModels ?? []).map((model) => (
+                    <SelectItem key={model.id} value={model.id}>
+                      {model.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+              <p>
+                {(selectedStatus.availableModels ?? []).find(
+                  (model) => model.id === currentModelValue,
+                )?.description ?? ""}
+              </p>
+            </div>
+          ) : null}
+        </>
+      ) : null}
+
+      <div>
+        <div>
+          <h4>{t("general.voiceInput.microphoneLabel")}</h4>
+          <p>
+            {isMicrophoneSupported
+              ? t("general.voiceInput.microphoneDescription")
+              : t("general.voiceInput.microphoneUnavailable")}
+          </p>
+        </div>
+        {isMicrophoneSupported && !hasPermission ? (
+          <Button onClick={() => void requestPermission()}>{/* … */}</Button>
+        ) : null}
+
+        {!devicesError &&
+        !hasPermission &&
+        permissionStatus === "not_determined" ? (
+          <p>{t("general.voiceInput.microphoneAccessPrompt")}</p>
+        ) : null}
+
+        {devicesError ? <p>{devicesError}</p> : null}
+
+        {isMicrophoneSupported && hasPermission ? (
+          <Select
+            value={preferredMicrophoneId ?? ""}
+            onValueChange={(value) => setPreferredMicrophoneId(value || null)}
+            disabled={loadingDevices}
+          >
+            <SelectTrigger>
+              <SelectValue placeholder={selectedMicrophoneLabel} />
+            </SelectTrigger>
+            <SelectContent>
+              {devices.map((device) => (
+                <SelectItem key={device.deviceId} value={device.deviceId}>
+                  {device.label || t("general.voiceInput.systemMicrophone")}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        ) : null}
+      </div>
+
+      <div>
+        <p>{t("general.voiceInput.autoSubmitDescription")}</p>
+        <Input
+          value={rawAutoSubmitPhrases}
+          onChange={(event) => setRawAutoSubmitPhrases(event.target.value)}
+          placeholder={t("general.voiceInput.placeholder")}
+          className="max-w-sm"
+        />
+      </div>
+
+      {error ? <p>{error}</p> : null}
+    </div>
+  );
+}
+ ); +} diff --git a/ui/goose2/src/shared/api/dictation.ts b/ui/goose2/src/shared/api/dictation.ts new file mode 100644 index 000000000000..4473225b2684 --- /dev/null +++ b/ui/goose2/src/shared/api/dictation.ts @@ -0,0 +1,100 @@ +import { invoke } from "@tauri-apps/api/core"; +import type { + DictationDownloadProgress, + DictationProvider, + DictationProviderStatus, + DictationTranscribeResponse, + MicrophonePermissionStatus, + WhisperModelStatus, +} from "@/shared/types/dictation"; + +export async function getDictationConfig(): Promise< + Record +> { + return invoke("get_dictation_config"); +} + +export async function transcribeDictation(request: { + audio: string; + mimeType: string; + provider: DictationProvider; +}): Promise { + return invoke("transcribe_dictation", { + request: { + audio: request.audio, + mimeType: request.mimeType, + provider: request.provider, + }, + }); +} + +export async function saveDictationModelSelection( + provider: DictationProvider, + modelId: string, +): Promise { + return invoke("save_dictation_model_selection", { provider, modelId }); +} + +export async function saveDictationProviderSecret( + _provider: DictationProvider, + value: string, + configKey?: string, +): Promise { + if (!configKey) { + throw new Error("No config key for this provider"); + } + return invoke("save_provider_field", { key: configKey, value }); +} + +export async function deleteDictationProviderSecret( + provider: DictationProvider, + _configKey?: string, +): Promise { + const providerIdMap: Record = { + groq: "dictation_groq", + elevenlabs: "dictation_elevenlabs", + }; + const providerId = providerIdMap[provider]; + if (!providerId) { + throw new Error("Cannot delete secrets for this provider"); + } + return invoke("delete_provider_config", { providerId }); +} + +export async function listDictationLocalModels(): Promise< + WhisperModelStatus[] +> { + return invoke("list_dictation_local_models"); +} + +export async function downloadDictationLocalModel( + modelId: string, +): Promise { + return invoke("download_dictation_local_model", { modelId }); +} + +export async function getDictationLocalModelDownloadProgress( + modelId: string, +): Promise { + return invoke("get_dictation_local_model_download_progress", { modelId }); +} + +export async function cancelDictationLocalModelDownload( + modelId: string, +): Promise { + return invoke("cancel_dictation_local_model_download", { modelId }); +} + +export async function deleteDictationLocalModel( + modelId: string, +): Promise { + return invoke("delete_dictation_local_model", { modelId }); +} + +export async function getMicrophonePermissionStatus(): Promise { + return invoke("get_microphone_permission_status"); +} + +export async function requestMicrophonePermission(): Promise { + return invoke("request_microphone_permission"); +} diff --git a/ui/goose2/src/shared/i18n/locales/en/chat.json b/ui/goose2/src/shared/i18n/locales/en/chat.json index efe6776e3d87..424007cc8c5c 100644 --- a/ui/goose2/src/shared/i18n/locales/en/chat.json +++ b/ui/goose2/src/shared/i18n/locales/en/chat.json @@ -169,7 +169,11 @@ "selectProject": "Select project", "sendMessage": "Send message", "stopGeneration": "Stop generation", - "voiceInputSoon": "Voice input (coming soon)" + "voiceInput": "Voice dictation", + "voiceInputDisabled": "Configure a voice provider in Settings to enable dictation", + "voiceInputRecording": "Listening...", + "voiceInputTranscribing": "Transcribing...", + "voiceInputAutoSubmitHint": "Say \"submit\" to send" }, "tools": { "fileNotFound": 
"File not found: {{path}}", diff --git a/ui/goose2/src/shared/i18n/locales/en/settings.json b/ui/goose2/src/shared/i18n/locales/en/settings.json index be55f4766a1d..e4c15409aa29 100644 --- a/ui/goose2/src/shared/i18n/locales/en/settings.json +++ b/ui/goose2/src/shared/i18n/locales/en/settings.json @@ -124,7 +124,48 @@ "spanish": "Spanish", "system": "System default ({{language}})" }, - "title": "General" + "title": "General", + "voiceInput": { + "label": "Voice Input", + "description": "Configure voice dictation for hands-free input.", + "providerLabel": "Transcription Provider", + "disabled": "Disabled", + "active": "Active", + "notConfiguredSuffix": "(not configured)", + "placeholder": "Select a provider", + "modelLabel": "Model", + "apiKeyLabel": "API Key", + "apiKeyDescription": "Enter your API key for this provider.", + "apiKeyPlaceholder": "sk-...", + "apiKeyConfigured": "API key configured", + "addApiKey": "Add API key", + "updateApiKey": "Update API key", + "removeApiKey": "Remove API key", + "localModelLabel": "Local Whisper Model", + "localModelUnavailable": "Local model download is not yet available. Use the Goose CLI to download a Whisper model first.", + "download": "Download", + "recommended": "Recommended", + "microphoneLabel": "Microphone", + "microphoneDescription": "Choose which microphone to use for voice input.", + "microphoneUnavailable": "Microphone access is not available in this environment.", + "microphoneAccessPrompt": "Click \"Grant access\" to allow microphone use.", + "grantMicrophone": "Grant access", + "systemMicrophone": "System default", + "unknownMicrophone": "Unknown microphone", + "autoSubmitLabel": "Auto-submit Phrases", + "autoSubmitDescription": "Comma-separated words that trigger automatic send (e.g. \"submit\").", + "providers": { + "openai": "OpenAI Whisper", + "groq": "Groq", + "elevenlabs": "ElevenLabs", + "local": "Local Whisper" + }, + "providerSetupHint": "This provider uses your main provider config. Check {{settingsPath}} to configure it.", + "downloadProgress": "Downloading... {{percent}}%", + "loadError": "Failed to load voice settings.", + "saveError": "Failed to save.", + "deleteError": "Failed to delete." 
+ } }, "nav": { "about": "About", @@ -134,7 +175,8 @@ "general": "General", "projects": "Projects", "extensions": "Extensions", - "providers": "Providers" + "providers": "Providers", + "voice": "Voice" }, "projects": { "description": "Manage your projects.", diff --git a/ui/goose2/src/shared/i18n/locales/es/chat.json b/ui/goose2/src/shared/i18n/locales/es/chat.json index 3a5760189e23..5bd93d8a560d 100644 --- a/ui/goose2/src/shared/i18n/locales/es/chat.json +++ b/ui/goose2/src/shared/i18n/locales/es/chat.json @@ -169,7 +169,11 @@ "selectProject": "Seleccionar proyecto", "sendMessage": "Enviar mensaje", "stopGeneration": "Detener generación", - "voiceInputSoon": "Entrada de voz (pronto)" + "voiceInput": "Dictado por voz", + "voiceInputDisabled": "Configura un proveedor de voz en Ajustes para activar el dictado", + "voiceInputRecording": "Escuchando...", + "voiceInputTranscribing": "Transcribiendo...", + "voiceInputAutoSubmitHint": "Di \"enviar\" para enviar" }, "tools": { "fileNotFound": "Archivo no encontrado: {{path}}", diff --git a/ui/goose2/src/shared/i18n/locales/es/settings.json b/ui/goose2/src/shared/i18n/locales/es/settings.json index 8b2b85236ece..33bef38d3078 100644 --- a/ui/goose2/src/shared/i18n/locales/es/settings.json +++ b/ui/goose2/src/shared/i18n/locales/es/settings.json @@ -124,7 +124,48 @@ "spanish": "Español", "system": "Predeterminado del sistema ({{language}})" }, - "title": "General" + "title": "General", + "voiceInput": { + "label": "Entrada de voz", + "description": "Configura el dictado por voz para entrada manos libres.", + "providerLabel": "Proveedor de transcripción", + "disabled": "Desactivado", + "active": "Activo", + "notConfiguredSuffix": "(no configurado)", + "placeholder": "Selecciona un proveedor", + "modelLabel": "Modelo", + "apiKeyLabel": "Clave API", + "apiKeyDescription": "Ingresa tu clave API para este proveedor.", + "apiKeyPlaceholder": "sk-...", + "apiKeyConfigured": "Clave API configurada", + "addApiKey": "Agregar clave API", + "updateApiKey": "Actualizar clave API", + "removeApiKey": "Eliminar clave API", + "localModelLabel": "Modelo Whisper local", + "localModelUnavailable": "La descarga de modelos locales aún no está disponible. Usa la CLI de Goose para descargar un modelo Whisper primero.", + "download": "Descargar", + "recommended": "Recomendado", + "microphoneLabel": "Micrófono", + "microphoneDescription": "Elige qué micrófono usar para la entrada de voz.", + "microphoneUnavailable": "El acceso al micrófono no está disponible en este entorno.", + "microphoneAccessPrompt": "Haz clic en \"Permitir acceso\" para usar el micrófono.", + "grantMicrophone": "Permitir acceso", + "systemMicrophone": "Predeterminado del sistema", + "unknownMicrophone": "Micrófono desconocido", + "autoSubmitLabel": "Frases de envío automático", + "autoSubmitDescription": "Palabras separadas por coma que activan el envío automático (ej. \"enviar\").", + "providers": { + "openai": "OpenAI Whisper", + "groq": "Groq", + "elevenlabs": "ElevenLabs", + "local": "Whisper local" + }, + "providerSetupHint": "Este proveedor usa tu configuración principal. Revisa {{settingsPath}} para configurarlo.", + "downloadProgress": "Descargando... {{percent}}%", + "loadError": "Error al cargar ajustes de voz.", + "saveError": "Error al guardar.", + "deleteError": "Error al eliminar." 
+ } }, "nav": { "about": "Acerca de", @@ -134,7 +175,8 @@ "general": "General", "projects": "Proyectos", "extensions": "Extensiones", - "providers": "Proveedores" + "providers": "Proveedores", + "voice": "Voz" }, "projects": { "description": "Administra tus proyectos.", diff --git a/ui/goose2/src/shared/types/dictation.ts b/ui/goose2/src/shared/types/dictation.ts new file mode 100644 index 000000000000..acf617b3fec5 --- /dev/null +++ b/ui/goose2/src/shared/types/dictation.ts @@ -0,0 +1,51 @@ +export type DictationProvider = "openai" | "groq" | "elevenlabs" | "local"; + +export interface DictationModelOption { + id: string; + label: string; + description: string; +} + +export interface DictationProviderStatus { + configured: boolean; + host?: string | null; + description: string; + usesProviderConfig: boolean; + settingsPath?: string | null; + configKey?: string | null; + modelConfigKey?: string | null; + defaultModel?: string | null; + selectedModel?: string | null; + availableModels: DictationModelOption[]; +} + +export interface DictationTranscribeResponse { + text: string; +} + +export type MicrophonePermissionStatus = + | "not_determined" + | "authorized" + | "denied" + | "restricted" + | "unsupported"; + +export interface WhisperModelStatus { + id: string; + sizeMb: number; + url: string; + description: string; + downloaded: boolean; + recommended: boolean; +} + +export interface DictationDownloadProgress { + modelId: string; + status: string; + bytesDownloaded: number; + totalBytes: number; + progressPercent: number; + speedBps?: number | null; + etaSeconds?: number | null; + error?: string | null; +} From 4c32ee2fb188f62bff8eb1b9fa3fbf82d80981bc Mon Sep 17 00:00:00 2001 From: tulsi Date: Thu, 16 Apr 2026 11:08:56 -0700 Subject: [PATCH 02/30] chore(goose-acp): regenerate acp-schema.json with dictation methods Picks up DictationTranscribeRequest/Response, DictationConfigRequest/Response, and DictationProviderStatusEntry entries. Required for the @aaif/goose-sdk TypeScript generator in ui/sdk to see the new methods. --- crates/goose-acp/acp-schema.json | 178 +++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/crates/goose-acp/acp-schema.json b/crates/goose-acp/acp-schema.json index 0f0db1759a37..547b93a6f009 100644 --- a/crates/goose-acp/acp-schema.json +++ b/crates/goose-acp/acp-schema.json @@ -607,6 +607,150 @@ "x-side": "agent", "x-method": "_goose/session/unarchive" }, + "DictationTranscribeRequest": { + "type": "object", + "properties": { + "audio": { + "type": "string", + "description": "Base64-encoded audio data" + }, + "mimeType": { + "type": "string", + "description": "MIME type (e.g. 
\"audio/wav\", \"audio/webm\")" + }, + "provider": { + "type": "string", + "description": "Provider to use: \"openai\", \"groq\", \"elevenlabs\", or \"local\"" + } + }, + "required": [ + "audio", + "mimeType", + "provider" + ], + "description": "Transcribe audio via a dictation provider.", + "x-side": "agent", + "x-method": "_goose/dictation/transcribe" + }, + "DictationTranscribeResponse": { + "type": "object", + "properties": { + "text": { + "type": "string" + } + }, + "required": [ + "text" + ], + "description": "Transcription result.", + "x-side": "agent", + "x-method": "_goose/dictation/transcribe" + }, + "DictationConfigRequest": { + "type": "object", + "description": "Get the configuration status of all dictation providers.", + "x-side": "agent", + "x-method": "_goose/dictation/config" + }, + "DictationConfigResponse": { + "type": "object", + "properties": { + "providers": { + "type": "object", + "additionalProperties": { + "$ref": "#/$defs/DictationProviderStatusEntry" + } + } + }, + "required": [ + "providers" + ], + "description": "Dictation config response — map of provider name to status.", + "x-side": "agent", + "x-method": "_goose/dictation/config" + }, + "DictationProviderStatusEntry": { + "type": "object", + "properties": { + "configured": { + "type": "boolean" + }, + "host": { + "type": [ + "string", + "null" + ] + }, + "description": { + "type": "string" + }, + "usesProviderConfig": { + "type": "boolean" + }, + "settingsPath": { + "type": [ + "string", + "null" + ] + }, + "configKey": { + "type": [ + "string", + "null" + ] + }, + "modelConfigKey": { + "type": [ + "string", + "null" + ] + }, + "defaultModel": { + "type": [ + "string", + "null" + ] + }, + "selectedModel": { + "type": [ + "string", + "null" + ] + }, + "availableModels": { + "type": "array", + "items": { + "$ref": "#/$defs/DictationModelOption" + }, + "default": [] + } + }, + "required": [ + "configured", + "description", + "usesProviderConfig" + ], + "description": "Per-provider configuration status." + }, + "DictationModelOption": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "description": { + "type": "string" + } + }, + "required": [ + "id", + "label", + "description" + ] + }, "ExtRequest": { "properties": { "id": { @@ -807,6 +951,24 @@ ], "description": "Params for _goose/session/unarchive", "title": "UnarchiveSessionRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationTranscribeRequest" + } + ], + "description": "Params for _goose/dictation/transcribe", + "title": "DictationTranscribeRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationConfigRequest" + } + ], + "description": "Params for _goose/dictation/config", + "title": "DictationConfigRequest" } ] }, @@ -933,6 +1095,22 @@ } ], "title": "ImportSessionResponse" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationTranscribeResponse" + } + ], + "title": "DictationTranscribeResponse" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationConfigResponse" + } + ], + "title": "DictationConfigResponse" } ] }, From 0332e18e76c42df0fb131f36cfa3db3e182f89de Mon Sep 17 00:00:00 2001 From: tulsi Date: Thu, 16 Apr 2026 11:17:16 -0700 Subject: [PATCH 03/30] refactor(goose2): call dictation config/transcribe via SDK client Replaces Tauri invoke() with client.goose.GooseDictationConfig() and GooseDictationTranscribe() for the two ACP methods registered on the goose server. 
Matches the post-8549/8582 pattern: frontend talks directly to goose
serve over WebSocket, no Tauri middleware.

The remaining seven functions in dictation.ts still call invoke() for
Tauri commands that no longer exist; those migrate to ACP methods added
in a later commit.
---
 .../shared/api/__tests__/dictation.test.ts | 49 +++++++++++++++++++
 ui/goose2/src/shared/api/dictation.ts      | 16 +++---
 2 files changed, 58 insertions(+), 7 deletions(-)
 create mode 100644 ui/goose2/src/shared/api/__tests__/dictation.test.ts

diff --git a/ui/goose2/src/shared/api/__tests__/dictation.test.ts b/ui/goose2/src/shared/api/__tests__/dictation.test.ts
new file mode 100644
index 000000000000..b4d7cd501237
--- /dev/null
+++ b/ui/goose2/src/shared/api/__tests__/dictation.test.ts
@@ -0,0 +1,49 @@
+import { describe, it, expect, vi, beforeEach } from "vitest";
+import { getDictationConfig, transcribeDictation } from "../dictation";
+import { getClient } from "../acpConnection";
+
+vi.mock("../acpConnection", () => ({
+  getClient: vi.fn(),
+}));
+
+describe("dictation SDK wiring", () => {
+  let client: any;
+  beforeEach(() => {
+    client = {
+      goose: {
+        GooseDictationConfig: vi.fn().mockResolvedValue({
+          providers: {
+            openai: {
+              configured: true,
+              description: "OpenAI transcription",
+              usesProviderConfig: true,
+              availableModels: [],
+            },
+          },
+        }),
+        GooseDictationTranscribe: vi.fn().mockResolvedValue({ text: "hello" }),
+      },
+    };
+    vi.mocked(getClient).mockResolvedValue(client);
+  });
+
+  it("getDictationConfig calls GooseDictationConfig and returns providers map", async () => {
+    const result = await getDictationConfig();
+    expect(client.goose.GooseDictationConfig).toHaveBeenCalledWith({});
+    expect(result.openai.configured).toBe(true);
+  });
+
+  it("transcribeDictation forwards audio + mimeType + provider", async () => {
+    const result = await transcribeDictation({
+      audio: "base64==",
+      mimeType: "audio/webm",
+      provider: "openai" as any,
+    });
+    expect(client.goose.GooseDictationTranscribe).toHaveBeenCalledWith({
+      audio: "base64==",
+      mimeType: "audio/webm",
+      provider: "openai",
+    });
+    expect(result.text).toBe("hello");
+  });
+});
diff --git a/ui/goose2/src/shared/api/dictation.ts b/ui/goose2/src/shared/api/dictation.ts
index 4473225b2684..793503f3c1dc 100644
--- a/ui/goose2/src/shared/api/dictation.ts
+++ b/ui/goose2/src/shared/api/dictation.ts
@@ -7,11 +7,14 @@ import type {
   MicrophonePermissionStatus,
   WhisperModelStatus,
 } from "@/shared/types/dictation";
+import { getClient } from "./acpConnection";
 
 export async function getDictationConfig(): Promise<
   Record<DictationProvider, DictationProviderStatus>
 > {
-  return invoke("get_dictation_config");
+  const client = await getClient();
+  const response = await client.goose.GooseDictationConfig({});
+  return response.providers as Record<DictationProvider, DictationProviderStatus>;
 }
 
 export async function transcribeDictation(request: {
@@ -19,12 +22,11 @@ export async function transcribeDictation(request: {
   mimeType: string;
   provider: DictationProvider;
 }): Promise<DictationTranscribeResponse> {
-  return invoke("transcribe_dictation", {
-    request: {
-      audio: request.audio,
-      mimeType: request.mimeType,
-      provider: request.provider,
-    },
+  const client = await getClient();
+  return client.goose.GooseDictationTranscribe({
+    audio: request.audio,
+    mimeType: request.mimeType,
+    provider: request.provider,
   });
 }

From 72601fbc9e645c4b9b201d4d778cd5e658a950da Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 11:34:33 -0700
Subject: [PATCH 04/30] feat(goose-acp): add dictation model management ACP
 methods

Adds six custom methods so the goose2 frontend can list,
download, track, cancel, delete, and select local Whisper models through the same WebSocket channel it already uses for transcription: _goose/dictation/models/list _goose/dictation/models/download _goose/dictation/models/download/progress _goose/dictation/models/cancel _goose/dictation/models/delete _goose/dictation/model/select All local-model operations are gated on the local-inference feature; without it they return "Local inference not enabled". The select method accepts any dictation provider (openai, groq, elevenlabs, local) and writes to the appropriate config key. Replaces the previous plan to expose these as Tauri commands -- following the post-8549/8582 pattern of ACP-from-frontend-direct. Signed-off-by: tulsi --- crates/goose-acp/acp-meta.json | 30 +++ crates/goose-acp/acp-schema.json | 249 ++++++++++++++++++++++++ crates/goose-acp/src/server.rs | 233 ++++++++++++++++++++-- crates/goose-sdk/src/custom_requests.rs | 88 +++++++++ 4 files changed, 586 insertions(+), 14 deletions(-) diff --git a/crates/goose-acp/acp-meta.json b/crates/goose-acp/acp-meta.json index 3cd63e5f726f..75f28ef60a98 100644 --- a/crates/goose-acp/acp-meta.json +++ b/crates/goose-acp/acp-meta.json @@ -114,6 +114,36 @@ "method": "_goose/dictation/config", "requestType": "DictationConfigRequest", "responseType": "DictationConfigResponse" + }, + { + "method": "_goose/dictation/models/list", + "requestType": "DictationModelsListRequest", + "responseType": "DictationModelsListResponse" + }, + { + "method": "_goose/dictation/models/download", + "requestType": "DictationModelDownloadRequest", + "responseType": "EmptyResponse" + }, + { + "method": "_goose/dictation/models/download/progress", + "requestType": "DictationModelDownloadProgressRequest", + "responseType": "DictationModelDownloadProgressResponse" + }, + { + "method": "_goose/dictation/models/cancel", + "requestType": "DictationModelCancelRequest", + "responseType": "EmptyResponse" + }, + { + "method": "_goose/dictation/models/delete", + "requestType": "DictationModelDeleteRequest", + "responseType": "EmptyResponse" + }, + { + "method": "_goose/dictation/model/select", + "requestType": "DictationModelSelectRequest", + "responseType": "EmptyResponse" } ] } diff --git a/crates/goose-acp/acp-schema.json b/crates/goose-acp/acp-schema.json index 547b93a6f009..821de4145e74 100644 --- a/crates/goose-acp/acp-schema.json +++ b/crates/goose-acp/acp-schema.json @@ -751,6 +751,185 @@ "description" ] }, + "DictationModelsListRequest": { + "type": "object", + "description": "List available local Whisper models with their download status.", + "x-side": "agent", + "x-method": "_goose/dictation/models/list" + }, + "DictationModelsListResponse": { + "type": "object", + "properties": { + "models": { + "type": "array", + "items": { + "$ref": "#/$defs/DictationLocalModelStatus" + } + } + }, + "required": [ + "models" + ], + "x-side": "agent", + "x-method": "_goose/dictation/models/list" + }, + "DictationLocalModelStatus": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "label": { + "type": "string" + }, + "description": { + "type": "string" + }, + "sizeMb": { + "type": "integer", + "minimum": 0 + }, + "downloaded": { + "type": "boolean" + }, + "downloadInProgress": { + "type": "boolean" + } + }, + "required": [ + "id", + "label", + "description", + "sizeMb", + "downloaded", + "downloadInProgress" + ] + }, + "DictationModelDownloadRequest": { + "type": "object", + "properties": { + "modelId": { + "type": "string" + } + }, + "required": [ + "modelId" + 
], + "description": "Kick off a background download of a local Whisper model.", + "x-side": "agent", + "x-method": "_goose/dictation/models/download" + }, + "DictationModelDownloadProgressRequest": { + "type": "object", + "properties": { + "modelId": { + "type": "string" + } + }, + "required": [ + "modelId" + ], + "description": "Poll the progress of an in-flight download.", + "x-side": "agent", + "x-method": "_goose/dictation/models/download/progress" + }, + "DictationModelDownloadProgressResponse": { + "type": "object", + "properties": { + "progress": { + "anyOf": [ + { + "$ref": "#/$defs/DictationDownloadProgress" + }, + { + "type": "null" + } + ], + "description": "None when no download is active for this model id." + } + }, + "x-side": "agent", + "x-method": "_goose/dictation/models/download/progress" + }, + "DictationDownloadProgress": { + "type": "object", + "properties": { + "bytesDownloaded": { + "type": "integer", + "minimum": 0 + }, + "totalBytes": { + "type": "integer", + "minimum": 0 + }, + "progressPercent": { + "type": "number", + "format": "float" + }, + "status": { + "type": "string", + "description": "serde lowercase of DownloadStatus: \"downloading\" | \"completed\" | \"failed\" | \"cancelled\"" + }, + "error": { + "type": [ + "string", + "null" + ] + } + }, + "required": [ + "bytesDownloaded", + "totalBytes", + "progressPercent", + "status" + ] + }, + "DictationModelCancelRequest": { + "type": "object", + "properties": { + "modelId": { + "type": "string" + } + }, + "required": [ + "modelId" + ], + "description": "Cancel an in-flight download.", + "x-side": "agent", + "x-method": "_goose/dictation/models/cancel" + }, + "DictationModelDeleteRequest": { + "type": "object", + "properties": { + "modelId": { + "type": "string" + } + }, + "required": [ + "modelId" + ], + "description": "Delete a downloaded local Whisper model from disk.", + "x-side": "agent", + "x-method": "_goose/dictation/models/delete" + }, + "DictationModelSelectRequest": { + "type": "object", + "properties": { + "provider": { + "type": "string" + }, + "modelId": { + "type": "string" + } + }, + "required": [ + "provider", + "modelId" + ], + "description": "Persist the user's model selection for a given provider.", + "x-side": "agent", + "x-method": "_goose/dictation/model/select" + }, "ExtRequest": { "properties": { "id": { @@ -969,6 +1148,60 @@ ], "description": "Params for _goose/dictation/config", "title": "DictationConfigRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelsListRequest" + } + ], + "description": "Params for _goose/dictation/models/list", + "title": "DictationModelsListRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelDownloadRequest" + } + ], + "description": "Params for _goose/dictation/models/download", + "title": "DictationModelDownloadRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelDownloadProgressRequest" + } + ], + "description": "Params for _goose/dictation/models/download/progress", + "title": "DictationModelDownloadProgressRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelCancelRequest" + } + ], + "description": "Params for _goose/dictation/models/cancel", + "title": "DictationModelCancelRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelDeleteRequest" + } + ], + "description": "Params for _goose/dictation/models/delete", + "title": "DictationModelDeleteRequest" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelSelectRequest" + } + ], + "description": "Params for 
_goose/dictation/model/select", + "title": "DictationModelSelectRequest" } ] }, @@ -1111,6 +1344,22 @@ } ], "title": "DictationConfigResponse" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelsListResponse" + } + ], + "title": "DictationModelsListResponse" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelDownloadProgressResponse" + } + ], + "title": "DictationModelDownloadProgressResponse" } ] }, diff --git a/crates/goose-acp/src/server.rs b/crates/goose-acp/src/server.rs index 6176b02f8f85..40cf27bcef7d 100644 --- a/crates/goose-acp/src/server.rs +++ b/crates/goose-acp/src/server.rs @@ -75,6 +75,9 @@ pub type AcpProviderFactory = Arc< const DEFAULT_PROVIDER_ID: &str = "goose"; const DEFAULT_PROVIDER_LABEL: &str = "Goose (Default)"; +const OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "OPENAI_TRANSCRIPTION_MODEL"; +const GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "GROQ_TRANSCRIPTION_MODEL"; +const ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "ELEVENLABS_TRANSCRIPTION_MODEL"; const OPENAI_TRANSCRIPTION_MODEL: &str = "whisper-1"; const GROQ_TRANSCRIPTION_MODEL: &str = "whisper-large-v3-turbo"; const ELEVENLABS_TRANSCRIPTION_MODEL: &str = "scribe_v1"; @@ -2921,6 +2924,13 @@ impl GooseAcpAgent { req: DictationTranscribeRequest, ) -> Result { use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; + let config = goose::config::Config::global(); + + #[cfg(not(feature = "local-inference"))] + if req.provider == "local" { + return Err(sacp::Error::invalid_params() + .data("Local inference is not available in this build")); + } let provider: DictationProvider = serde_json::from_value(serde_json::Value::String( req.provider.clone(), @@ -2952,10 +2962,12 @@ impl GooseAcpAgent { let text = match provider { DictationProvider::OpenAI => { + let model = dictation_selected_model(config, DictationProvider::OpenAI) + .unwrap_or_else(|| OPENAI_TRANSCRIPTION_MODEL.to_string()); transcribe_with_provider( DictationProvider::OpenAI, "model".to_string(), - "whisper-1".to_string(), + model, audio_bytes, extension, &req.mime_type, @@ -2963,10 +2975,12 @@ impl GooseAcpAgent { .await } DictationProvider::Groq => { + let model = dictation_selected_model(config, DictationProvider::Groq) + .unwrap_or_else(|| GROQ_TRANSCRIPTION_MODEL.to_string()); transcribe_with_provider( DictationProvider::Groq, "model".to_string(), - "whisper-large-v3-turbo".to_string(), + model, audio_bytes, extension, &req.mime_type, @@ -2974,10 +2988,12 @@ impl GooseAcpAgent { .await } DictationProvider::ElevenLabs => { + let model = dictation_selected_model(config, DictationProvider::ElevenLabs) + .unwrap_or_else(|| ELEVENLABS_TRANSCRIPTION_MODEL.to_string()); transcribe_with_provider( DictationProvider::ElevenLabs, "model_id".to_string(), - "scribe_v1".to_string(), + model, audio_bytes, extension, &req.mime_type, @@ -2986,11 +3002,6 @@ impl GooseAcpAgent { } #[cfg(feature = "local-inference")] DictationProvider::Local => transcribe_local(audio_bytes).await, - #[cfg(not(feature = "local-inference"))] - DictationProvider::Local => { - return Err(sacp::Error::invalid_params() - .data("Local inference is not available in this build")); - } } .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; @@ -3043,15 +3054,202 @@ impl GooseAcpAgent { Ok(DictationConfigResponse { providers }) } + + #[custom_method(DictationModelsListRequest)] + async fn on_dictation_models_list( + &self, + _req: DictationModelsListRequest, + ) -> Result { + #[cfg(feature = "local-inference")] + { + use 
goose::download_manager::{get_download_manager, DownloadStatus}; + + let manager = get_download_manager(); + let models = whisper::available_models() + .iter() + .map(|model| DictationLocalModelStatus { + id: model.id.to_string(), + label: model.id.to_string(), + description: model.description.to_string(), + size_mb: model.size_mb, + downloaded: model.is_downloaded(), + download_in_progress: manager + .get_progress(model.id) + .map(|progress| progress.status == DownloadStatus::Downloading) + .unwrap_or(false), + }) + .collect(); + + return Ok(DictationModelsListResponse { models }); + } + + #[cfg(not(feature = "local-inference"))] + Ok(DictationModelsListResponse::default()) + } + + #[custom_method(DictationModelDownloadRequest)] + async fn on_dictation_model_download( + &self, + _req: DictationModelDownloadRequest, + ) -> Result { + #[cfg(feature = "local-inference")] + { + use goose::download_manager::get_download_manager; + + let model = whisper::get_model(&_req.model_id) + .ok_or_else(|| sacp::Error::invalid_params().data("Unknown model id"))?; + let manager = get_download_manager(); + let model_id_for_config = model.id.to_string(); + + manager + .download_model( + model.id.to_string(), + model.url.to_string(), + model.local_path(), + Some(Box::new(move || { + if let Err(e) = goose::config::Config::global().set_param( + whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY, + model_id_for_config.clone(), + ) { + error!("Failed to save LOCAL_WHISPER_MODEL after download: {}", e); + } + })), + ) + .await + .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; + + return Ok(EmptyResponse {}); + } + + #[cfg(not(feature = "local-inference"))] + Err(sacp::Error::invalid_params().data("Local inference not enabled")) + } + + #[custom_method(DictationModelDownloadProgressRequest)] + async fn on_dictation_model_download_progress( + &self, + _req: DictationModelDownloadProgressRequest, + ) -> Result { + #[cfg(feature = "local-inference")] + { + use goose::download_manager::get_download_manager; + + let manager = get_download_manager(); + let progress = + manager + .get_progress(&_req.model_id) + .map(|progress| DictationDownloadProgress { + bytes_downloaded: progress.bytes_downloaded, + total_bytes: progress.total_bytes, + progress_percent: progress.progress_percent, + status: serde_json::to_value(&progress.status) + .ok() + .and_then(|value| value.as_str().map(ToOwned::to_owned)) + .unwrap_or_else(|| "unknown".to_string()), + error: progress.error, + }); + + return Ok(DictationModelDownloadProgressResponse { progress }); + } + + #[cfg(not(feature = "local-inference"))] + Ok(DictationModelDownloadProgressResponse { progress: None }) + } + + #[custom_method(DictationModelCancelRequest)] + async fn on_dictation_model_cancel( + &self, + _req: DictationModelCancelRequest, + ) -> Result { + #[cfg(feature = "local-inference")] + { + use goose::download_manager::get_download_manager; + + let manager = get_download_manager(); + manager + .cancel_download(&_req.model_id) + .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; + + return Ok(EmptyResponse {}); + } + + #[cfg(not(feature = "local-inference"))] + Err(sacp::Error::invalid_params().data("Local inference not enabled")) + } + + #[custom_method(DictationModelDeleteRequest)] + async fn on_dictation_model_delete( + &self, + _req: DictationModelDeleteRequest, + ) -> Result { + #[cfg(feature = "local-inference")] + { + let model = whisper::get_model(&_req.model_id) + .ok_or_else(|| sacp::Error::invalid_params().data("Unknown model id"))?; + 
let path = model.local_path(); + + if !path.exists() { + return Err(sacp::Error::invalid_params().data("Model not downloaded")); + } + + std::fs::remove_file(path) + .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; + + return Ok(EmptyResponse {}); + } + + #[cfg(not(feature = "local-inference"))] + Err(sacp::Error::invalid_params().data("Local inference not enabled")) + } + + #[custom_method(DictationModelSelectRequest)] + async fn on_dictation_model_select( + &self, + req: DictationModelSelectRequest, + ) -> Result { + #[cfg(not(feature = "local-inference"))] + if req.provider == "local" { + return Err(sacp::Error::invalid_params().data("Local inference not enabled")); + } + + let provider: DictationProvider = serde_json::from_value(serde_json::Value::String( + req.provider.clone(), + )) + .map_err(|_| { + sacp::Error::invalid_params().data(format!("Unknown provider: {}", req.provider)) + })?; + + let key = match provider { + DictationProvider::OpenAI => OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY, + DictationProvider::Groq => GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY, + DictationProvider::ElevenLabs => ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY, + #[cfg(feature = "local-inference")] + DictationProvider::Local => { + if whisper::get_model(&req.model_id).is_none() { + return Err(sacp::Error::invalid_params().data("Unknown model id")); + } + whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY + } + }; + + goose::config::Config::global() + .set_param(key, req.model_id) + .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; + + Ok(EmptyResponse {}) + } } fn dictation_model_config_key(provider: DictationProvider) -> Option { - #[cfg(feature = "local-inference")] - if provider == DictationProvider::Local { - return Some(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY.to_string()); + match provider { + DictationProvider::OpenAI => Some(OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()), + DictationProvider::Groq => Some(GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()), + DictationProvider::ElevenLabs => { + Some(ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()) + } + #[cfg(feature = "local-inference")] + DictationProvider::Local => Some(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY.to_string()), } - - None } fn dictation_default_model(provider: DictationProvider) -> Option { @@ -3075,7 +3273,14 @@ fn dictation_selected_model(config: &Config, provider: DictationProvider) -> Opt .or_else(|| dictation_default_model(provider)); } - dictation_default_model(provider) + dictation_model_config_key(provider) + .and_then(|key| { + config + .get(&key, false) + .ok() + .and_then(|value| value.as_str().map(str::to_owned)) + }) + .or_else(|| dictation_default_model(provider)) } fn dictation_available_models(provider: DictationProvider) -> Vec { diff --git a/crates/goose-sdk/src/custom_requests.rs b/crates/goose-sdk/src/custom_requests.rs index 46359100a3bf..af14fd9cc189 100644 --- a/crates/goose-sdk/src/custom_requests.rs +++ b/crates/goose-sdk/src/custom_requests.rs @@ -372,3 +372,91 @@ pub struct DictationConfigResponse { /// Empty success response for operations that return no data. #[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)] pub struct EmptyResponse {} + +/// List available local Whisper models with their download status. 
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(
+    method = "_goose/dictation/models/list",
+    response = DictationModelsListResponse
+)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelsListRequest {}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelsListResponse {
+    pub models: Vec<DictationLocalModelStatus>,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationLocalModelStatus {
+    pub id: String,
+    pub label: String,
+    pub description: String,
+    pub size_mb: u32,
+    pub downloaded: bool,
+    pub download_in_progress: bool,
+}
+
+/// Kick off a background download of a local Whisper model.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/download", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadRequest {
+    pub model_id: String,
+}
+
+/// Poll the progress of an in-flight download.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(
+    method = "_goose/dictation/models/download/progress",
+    response = DictationModelDownloadProgressResponse
+)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadProgressRequest {
+    pub model_id: String,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadProgressResponse {
+    /// None when no download is active for this model id.
+    pub progress: Option<DictationDownloadProgress>,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationDownloadProgress {
+    pub bytes_downloaded: u64,
+    pub total_bytes: u64,
+    pub progress_percent: f32,
+    /// serde lowercase of DownloadStatus: "downloading" | "completed" | "failed" | "cancelled"
+    pub status: String,
+    pub error: Option<String>,
+}
+
+/// Cancel an in-flight download.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/cancel", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelCancelRequest {
+    pub model_id: String,
+}
+
+/// Delete a downloaded local Whisper model from disk.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/delete", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDeleteRequest {
+    pub model_id: String,
+}
+
+/// Persist the user's model selection for a given provider.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/model/select", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelSelectRequest {
+    pub provider: String,
+    pub model_id: String,
+}

From 1b9d22fed31082cc1c24169c0074d5e5843d2412 Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 11:53:57 -0700
Subject: [PATCH 05/30] refactor(goose2): route local-model dictation through
 SDK client
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Migrates six dictation.ts functions off Tauri invoke() onto the
regenerated @aaif/goose-sdk client:

  saveDictationModelSelection, listDictationLocalModels,
  downloadDictationLocalModel, getDictationLocalModelDownloadProgress,
  cancelDictationLocalModelDownload, deleteDictationLocalModel

Leaves alone:

  saveDictationProviderSecret / deleteDictationProviderSecret — use the
  generic save_provider_field / delete_provider_config Tauri commands

  getMicrophonePermissionStatus / requestMicrophonePermission — OS-bound;
  browser APIs handle the mic prompt in VoiceInputSettings

Each migrated function uses a type cast at the SDK boundary because the
regenerated types don't fully overlap with the hand-written local types
(e.g., WhisperModelStatus has url/recommended fields the SDK's
DictationLocalModelStatus doesn't). Consumers that read missing fields
will get undefined at runtime; end-to-end verification in a later task
will surface any breakage.

Signed-off-by: tulsi
---
 .../shared/api/__tests__/dictation.test.ts | 87 ++++++++++++++++++-
 ui/goose2/src/shared/api/dictation.ts      | 22 +++--
 2 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/ui/goose2/src/shared/api/__tests__/dictation.test.ts b/ui/goose2/src/shared/api/__tests__/dictation.test.ts
index b4d7cd501237..27f501a851b8 100644
--- a/ui/goose2/src/shared/api/__tests__/dictation.test.ts
+++ b/ui/goose2/src/shared/api/__tests__/dictation.test.ts
@@ -1,5 +1,14 @@
 import { describe, it, expect, vi, beforeEach } from "vitest";
-import { getDictationConfig, transcribeDictation } from "../dictation";
+import {
+  cancelDictationLocalModelDownload,
+  deleteDictationLocalModel,
+  downloadDictationLocalModel,
+  getDictationConfig,
+  getDictationLocalModelDownloadProgress,
+  listDictationLocalModels,
+  saveDictationModelSelection,
+  transcribeDictation,
+} from "../dictation";
 import { getClient } from "../acpConnection";
@@ -46,4 +55,80 @@
     });
     expect(result.text).toBe("hello");
   });
+
+  it("saveDictationModelSelection calls GooseDictationModelSelect", async () => {
+    client.goose.GooseDictationModelSelect = vi.fn().mockResolvedValue({});
+    await saveDictationModelSelection("local" as any, "tiny");
+    expect(client.goose.GooseDictationModelSelect).toHaveBeenCalledWith({
+      provider: "local",
+      modelId: "tiny",
+    });
+  });
+
+  it("listDictationLocalModels returns the models array", async () => {
+    client.goose.GooseDictationModelsList = vi.fn().mockResolvedValue({
+      models: [
+        {
+          id: "tiny",
+          description: "Tiny",
+          sizeMb: 75,
+          downloaded: true,
+          downloadInProgress: false,
+        },
+      ],
+    });
+    const result = await listDictationLocalModels();
+    expect(client.goose.GooseDictationModelsList).toHaveBeenCalledWith({});
+    expect(result).toHaveLength(1);
+    expect(result[0].id).toBe("tiny");
+  });
+
+  it("downloadDictationLocalModel forwards modelId", async () => {
+    client.goose.GooseDictationModelsDownload =
+      vi.fn().mockResolvedValue({});
+    await downloadDictationLocalModel("tiny");
+    expect(client.goose.GooseDictationModelsDownload).toHaveBeenCalledWith({
+      modelId: "tiny",
+    });
+  });
+
+  it("getDictationLocalModelDownloadProgress returns progress or null", async () => {
+    client.goose.GooseDictationModelsDownloadProgress = vi.fn().mockResolvedValue({
+      progress: {
+        bytesDownloaded: 100,
+        totalBytes: 1000,
+        progressPercent: 10,
+        status: "downloading",
+        error: null,
+      },
+    });
+    const result = await getDictationLocalModelDownloadProgress("tiny");
+    expect(result?.bytesDownloaded).toBe(100);
+    expect(client.goose.GooseDictationModelsDownloadProgress).toHaveBeenCalledWith({
+      modelId: "tiny",
+    });
+  });
+
+  it("getDictationLocalModelDownloadProgress returns null when no download", async () => {
+    client.goose.GooseDictationModelsDownloadProgress = vi.fn().mockResolvedValue({
+      progress: undefined,
+    });
+    const result = await getDictationLocalModelDownloadProgress("tiny");
+    expect(result).toBeNull();
+  });
+
+  it("cancelDictationLocalModelDownload forwards modelId", async () => {
+    client.goose.GooseDictationModelsCancel = vi.fn().mockResolvedValue({});
+    await cancelDictationLocalModelDownload("tiny");
+    expect(client.goose.GooseDictationModelsCancel).toHaveBeenCalledWith({
+      modelId: "tiny",
+    });
+  });
+
+  it("deleteDictationLocalModel forwards modelId", async () => {
+    client.goose.GooseDictationModelsDelete = vi.fn().mockResolvedValue({});
+    await deleteDictationLocalModel("tiny");
+    expect(client.goose.GooseDictationModelsDelete).toHaveBeenCalledWith({
+      modelId: "tiny",
+    });
+  });
 });
diff --git a/ui/goose2/src/shared/api/dictation.ts b/ui/goose2/src/shared/api/dictation.ts
index 793503f3c1dc..0031d120fad0 100644
--- a/ui/goose2/src/shared/api/dictation.ts
+++ b/ui/goose2/src/shared/api/dictation.ts
@@ -34,7 +34,8 @@ export async function saveDictationModelSelection(
   provider: DictationProvider,
   modelId: string,
 ): Promise<void> {
-  return invoke("save_dictation_model_selection", { provider, modelId });
+  const client = await getClient();
+  await client.goose.GooseDictationModelSelect({ provider, modelId });
 }
 
 export async function saveDictationProviderSecret(
@@ -66,31 +67,40 @@ export async function deleteDictationProviderSecret(
 export async function listDictationLocalModels(): Promise<
   WhisperModelStatus[]
 > {
-  return invoke("list_dictation_local_models");
+  const client = await getClient();
+  const response = await client.goose.GooseDictationModelsList({});
+  return response.models as unknown as WhisperModelStatus[];
 }
 
 export async function downloadDictationLocalModel(
   modelId: string,
 ): Promise<void> {
-  return invoke("download_dictation_local_model", { modelId });
+  const client = await getClient();
+  await client.goose.GooseDictationModelsDownload({ modelId });
 }
 
 export async function getDictationLocalModelDownloadProgress(
   modelId: string,
 ): Promise<DictationDownloadProgress | null> {
-  return invoke("get_dictation_local_model_download_progress", { modelId });
+  const client = await getClient();
+  const response = await client.goose.GooseDictationModelsDownloadProgress({
+    modelId,
+  });
+  return (response.progress ?? null) as DictationDownloadProgress | null;
 }
 
 export async function cancelDictationLocalModelDownload(
   modelId: string,
 ): Promise<void> {
-  return invoke("cancel_dictation_local_model_download", { modelId });
+  const client = await getClient();
+  await client.goose.GooseDictationModelsCancel({ modelId });
 }
 
 export async function deleteDictationLocalModel(
   modelId: string,
 ): Promise<void> {
-  return invoke("delete_dictation_local_model", { modelId });
+  const client = await getClient();
+  await client.goose.GooseDictationModelsDelete({ modelId });
 }

From d33a6ba9a0efaabf6eb1b122569e2c26dc4a673b Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 11:56:13 -0700
Subject: [PATCH 06/30] refactor(goose2): remove dead microphone permission
 exports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

getMicrophonePermissionStatus and requestMicrophonePermission had zero
callers after the voice-input work settled — VoiceInputSettings derives
permission status directly from the browser's
navigator.mediaDevices.getUserMedia rather than routing through Tauri.
Drop the exports and the now-unused MicrophonePermissionStatus type
import.

The type itself stays defined in shared/types/dictation.ts for any
future consumer; only the Tauri-routed helpers are removed.
---
 ui/goose2/src/shared/api/dictation.ts | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/ui/goose2/src/shared/api/dictation.ts b/ui/goose2/src/shared/api/dictation.ts
index 0031d120fad0..3ace459b8523 100644
--- a/ui/goose2/src/shared/api/dictation.ts
+++ b/ui/goose2/src/shared/api/dictation.ts
@@ -4,7 +4,6 @@ import type {
   DictationProvider,
   DictationProviderStatus,
   DictationTranscribeResponse,
-  MicrophonePermissionStatus,
   WhisperModelStatus,
 } from "@/shared/types/dictation";
 import { getClient } from "./acpConnection";
@@ -102,11 +101,3 @@ export async function deleteDictationLocalModel(
   const client = await getClient();
   await client.goose.GooseDictationModelsDelete({ modelId });
 }
-
-export async function getMicrophonePermissionStatus(): Promise<MicrophonePermissionStatus> {
-  return invoke("get_microphone_permission_status");
-}
-
-export async function requestMicrophonePermission(): Promise<MicrophonePermissionStatus> {
-  return invoke("request_microphone_permission");
-}

From e47fdf9979ea877caedcdd72e380561513e06244 Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 12:14:08 -0700
Subject: [PATCH 07/30] feat(goose2): local Whisper model download/select/delete
 UI

Replaces the "Local model download is not yet available" placeholder in
VoiceInputSettings with a working LocalWhisperModels component that
drives the six ACP methods added upstream: list, download, progress,
cancel, delete, select.

Per-row UI state machine:
- not downloaded -> Download button
- downloading -> progress bar + Cancel button (polls every 750ms)
- downloaded + selected -> "Selected" badge + Delete
- downloaded + unselected -> Select + Delete

Progress polling auto-stops when no active downloads remain (sketched
below). Download completion refreshes the model list and notifies the
parent config so the mic button in chat enables without a manual reload.

i18n keys added for EN and ES; obsolete localModelUnavailable key left
in place (unused now) to avoid gratuitous deletion.
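A minimal sketch of that polling loop, assuming only the helpers exported
from ui/goose2/src/shared/api/dictation.ts; the pollDownload and onDone
names are illustrative and do not appear in the component, which inlines
the same logic in a useEffect:

    import { getDictationLocalModelDownloadProgress } from "@/shared/api/dictation";

    const POLL_INTERVAL_MS = 750;

    // Poll one model's download until it leaves the "downloading" state,
    // then fire onDone so the caller can refresh its model list and config.
    export function pollDownload(modelId: string, onDone: () => void): () => void {
      const interval = setInterval(() => {
        void (async () => {
          const progress = await getDictationLocalModelDownloadProgress(modelId);
          // Null progress means the download manager no longer tracks this id.
          if (!progress || progress.status !== "downloading") {
            clearInterval(interval);
            onDone();
          }
        })();
      }, POLL_INTERVAL_MS);
      // The returned cleanup lets an unmounting component stop polling early.
      return () => clearInterval(interval);
    }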
--- .../settings/ui/LocalWhisperModels.tsx | 324 ++++++++++++++++++ .../settings/ui/VoiceInputSettings.tsx | 14 +- .../src/shared/i18n/locales/en/settings.json | 5 + .../src/shared/i18n/locales/es/settings.json | 5 + 4 files changed, 340 insertions(+), 8 deletions(-) create mode 100644 ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx diff --git a/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx b/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx new file mode 100644 index 000000000000..4fa3f8dadb56 --- /dev/null +++ b/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx @@ -0,0 +1,324 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { useTranslation } from "react-i18next"; +import { Button } from "@/shared/ui/button"; +import { + cancelDictationLocalModelDownload, + deleteDictationLocalModel, + downloadDictationLocalModel, + getDictationLocalModelDownloadProgress, + listDictationLocalModels, +} from "@/shared/api/dictation"; + +type LocalModel = { + id: string; + description: string; + sizeMb: number; + downloaded: boolean; + downloadInProgress: boolean; +}; + +type DownloadProgress = { + bytesDownloaded: number; + totalBytes: number; + progressPercent: number; + status: string; + error?: string | null; +}; + +const POLL_INTERVAL_MS = 750; + +interface LocalWhisperModelsProps { + selectedModelId: string; + onSelectModel: (modelId: string) => void | Promise; + onModelsChanged: () => void | Promise; +} + +export function LocalWhisperModels({ + selectedModelId, + onSelectModel, + onModelsChanged, +}: LocalWhisperModelsProps) { + const { t } = useTranslation(["settings", "common"]); + const [models, setModels] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [downloadingIds, setDownloadingIds] = useState>(new Set()); + const [progresses, setProgresses] = useState>( + new Map(), + ); + const onModelsChangedRef = useRef(onModelsChanged); + onModelsChangedRef.current = onModelsChanged; + + const refresh = useCallback(async () => { + try { + const list = + (await listDictationLocalModels()) as unknown as LocalModel[]; + setModels(list); + setDownloadingIds((prev) => { + const next = new Set(prev); + for (const m of list) { + if (m.downloadInProgress) next.add(m.id); + } + return next; + }); + } catch (err) { + setError( + err instanceof Error + ? 
err.message + : t("general.voiceInput.loadError"), + ); + } + }, [t]); + + useEffect(() => { + const load = async () => { + setLoading(true); + setError(null); + await refresh(); + setLoading(false); + }; + void load(); + }, [refresh]); + + useEffect(() => { + if (downloadingIds.size === 0) return; + let cancelled = false; + + const tick = async () => { + const next = new Map(); + const stillActive = new Set(); + const finishedIds: string[] = []; + + for (const id of downloadingIds) { + try { + const progress = + (await getDictationLocalModelDownloadProgress( + id, + )) as unknown as DownloadProgress | null; + if (!progress) { + finishedIds.push(id); + continue; + } + next.set(id, progress); + if (progress.status === "downloading") { + stillActive.add(id); + } else { + finishedIds.push(id); + } + } catch { + stillActive.add(id); + } + } + if (cancelled) return; + setProgresses(next); + if (finishedIds.length > 0) { + await refresh(); + await onModelsChangedRef.current(); + } + setDownloadingIds(stillActive); + }; + + const interval = window.setInterval(() => { + void tick(); + }, POLL_INTERVAL_MS); + return () => { + cancelled = true; + window.clearInterval(interval); + }; + }, [downloadingIds, refresh]); + + const startDownload = useCallback( + async (modelId: string) => { + setError(null); + try { + await downloadDictationLocalModel(modelId); + setDownloadingIds((prev) => new Set(prev).add(modelId)); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.saveError"), + ); + } + }, + [t], + ); + + const cancelDownload = useCallback( + async (modelId: string) => { + setError(null); + try { + await cancelDictationLocalModelDownload(modelId); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.saveError"), + ); + } finally { + setDownloadingIds((prev) => { + const next = new Set(prev); + next.delete(modelId); + return next; + }); + await refresh(); + } + }, + [refresh, t], + ); + + const deleteModel = useCallback( + async (modelId: string) => { + setError(null); + try { + await deleteDictationLocalModel(modelId); + await refresh(); + await onModelsChanged(); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.deleteError"), + ); + } + }, + [onModelsChanged, refresh, t], + ); + + if (loading) { + return ( +
+

+ {t("common:labels.loading")} +

+
+ ); + } + + if (models.length === 0) { + return ( +
+

+ {t("general.voiceInput.noLocalModels")} +

+
+ ); + } + + return ( +
+
+

+ {t("general.voiceInput.localModelLabel")} +

+

+ {t("general.voiceInput.localModelDescription")} +

+
+ +
    + {models.map((model) => { + const progress = progresses.get(model.id); + const isDownloading = + downloadingIds.has(model.id) || + progress?.status === "downloading" || + model.downloadInProgress; + const isSelected = + model.downloaded && model.id === selectedModelId; + return ( +
  • +
    +
    +

    + {model.id} +

    + + {model.sizeMb} MB + + {isSelected ? ( + + {t("general.voiceInput.selectedModel")} + + ) : null} +
    +

    + {model.description} +

    + {isDownloading && progress ? ( +
    +
    +
    +
    +

    + {t("general.voiceInput.downloadProgress", { + percent: Math.round(progress.progressPercent), + })} +

    +
    + ) : null} + {progress?.status === "failed" && progress.error ? ( +

    + {progress.error} +

    + ) : null} +
    + +
    + {isDownloading ? ( + + ) : model.downloaded ? ( + <> + {!isSelected ? ( + + ) : null} + + + ) : ( + + )} +
    +
  • + ); + })} +
+ + {error ?

{error}

: null} +
+ ); +} diff --git a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx index 8ccc14908ecb..cc485cbb4c75 100644 --- a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx +++ b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx @@ -17,6 +17,7 @@ import type { } from "@/shared/types/dictation"; import { useAudioDevices } from "@/shared/ui/ai-elements/mic-selector"; import { Button } from "@/shared/ui/button"; +import { LocalWhisperModels } from "./LocalWhisperModels"; import { Input } from "@/shared/ui/input"; import { Select, @@ -338,14 +339,11 @@ export function VoiceInputSettings() { ) : null} {selectedProvider === "local" ? ( -
-

- {t("general.voiceInput.localModelLabel")} -

-

- {t("general.voiceInput.localModelUnavailable")} -

-
+ handleModelChange(modelId)} + onModelsChanged={() => refreshConfig()} + /> ) : (selectedStatus.availableModels ?? []).length > 0 ? (

diff --git a/ui/goose2/src/shared/i18n/locales/en/settings.json b/ui/goose2/src/shared/i18n/locales/en/settings.json index e4c15409aa29..6dfbd824ace5 100644 --- a/ui/goose2/src/shared/i18n/locales/en/settings.json +++ b/ui/goose2/src/shared/i18n/locales/en/settings.json @@ -142,8 +142,13 @@ "updateApiKey": "Update API key", "removeApiKey": "Remove API key", "localModelLabel": "Local Whisper Model", + "localModelDescription": "Download a Whisper model to run transcription locally. Selecting a model sets it as your active local transcription model.", "localModelUnavailable": "Local model download is not yet available. Use the Goose CLI to download a Whisper model first.", + "noLocalModels": "No local Whisper models available.", "download": "Download", + "selectModel": "Select", + "selectedModel": "Selected", + "deleteModel": "Delete", "recommended": "Recommended", "microphoneLabel": "Microphone", "microphoneDescription": "Choose which microphone to use for voice input.", diff --git a/ui/goose2/src/shared/i18n/locales/es/settings.json b/ui/goose2/src/shared/i18n/locales/es/settings.json index 33bef38d3078..2178a44e80fb 100644 --- a/ui/goose2/src/shared/i18n/locales/es/settings.json +++ b/ui/goose2/src/shared/i18n/locales/es/settings.json @@ -142,8 +142,13 @@ "updateApiKey": "Actualizar clave API", "removeApiKey": "Eliminar clave API", "localModelLabel": "Modelo Whisper local", + "localModelDescription": "Descarga un modelo Whisper para transcribir localmente. Seleccionar un modelo lo establece como tu modelo de transcripción local activo.", "localModelUnavailable": "La descarga de modelos locales aún no está disponible. Usa la CLI de Goose para descargar un modelo Whisper primero.", + "noLocalModels": "No hay modelos Whisper locales disponibles.", "download": "Descargar", + "selectModel": "Seleccionar", + "selectedModel": "Seleccionado", + "deleteModel": "Eliminar", "recommended": "Recomendado", "microphoneLabel": "Micrófono", "microphoneDescription": "Elige qué micrófono usar para la entrada de voz.", From 608e812864b06adb57c3759d3cf0b66b1efae1b0 Mon Sep 17 00:00:00 2001 From: tulsi Date: Thu, 16 Apr 2026 12:31:13 -0700 Subject: [PATCH 08/30] fix(goose2): notify chat input when local model download/delete completes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The onModelsChanged callback only called refreshConfig() — it didn't emit notifyVoiceDictationConfigChanged(). Result: after downloading a local Whisper model, the chat page's useVoiceDictation hook kept stale providerStatuses and left the mic button disabled until the window was reloaded. Symmetric with how handleModelChange already notifies on cloud-provider model changes. Now both paths emit the same event. --- ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx index cc485cbb4c75..399897a9683d 100644 --- a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx +++ b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx @@ -342,7 +342,10 @@ export function VoiceInputSettings() { handleModelChange(modelId)} - onModelsChanged={() => refreshConfig()} + onModelsChanged={async () => { + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + }} /> ) : (selectedStatus.availableModels ?? []).length > 0 ? (

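Patch 08 leans on a small broadcast contract: any code path that mutates
dictation config calls notifyVoiceDictationConfigChanged(), and
useVoiceDictation re-fetches provider statuses when it hears the event. The
helper itself is not quoted in these patches, so the sketch below is an
assumption: the window-event transport and the event name are illustrative,
and only the notifyVoiceDictationConfigChanged and useVoiceDictation names
come from the commits.

    // Hypothetical sketch (TypeScript) of the notify/subscribe pair the
    // patch assumes. Event name and window-event transport are guesses.
    const VOICE_DICTATION_CONFIG_CHANGED = "voice-dictation-config-changed";

    export function notifyVoiceDictationConfigChanged(): void {
      // Broadcast to every listener in this window (chat input, settings).
      window.dispatchEvent(new Event(VOICE_DICTATION_CONFIG_CHANGED));
    }

    export function onVoiceDictationConfigChanged(
      listener: () => void,
    ): () => void {
      window.addEventListener(VOICE_DICTATION_CONFIG_CHANGED, listener);
      // Return an unsubscribe function, convenient as a useEffect cleanup.
      return () =>
        window.removeEventListener(VOICE_DICTATION_CONFIG_CHANGED, listener);
    }

Under that contract the fix is mechanical: the local-model download/delete
path and the cloud-provider model-change path both end in the same notifier,
so the chat page's mic button re-enables without a window reload.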
From 0a8fbe3722d729a8980facbf15254a3a0af41136 Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 12:35:42 -0700
Subject: [PATCH 09/30] fix(goose2): one-click send while mic is still
 recording
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

ChatInput's handleSend used to early-return when isRecording or
isTranscribing, which meant clicking Send during active dictation only
stopped the mic — you had to click Send a second time to actually send.

Remove the early return. If recording is still live, stop it with
flushPending:false and send whatever's already transcribed into the
textarea. Any in-flight audio the user spoke AFTER clicking Send is
intentionally dropped — by the time the user clicks Send, what's in the
textarea is what they want to send.

Empty-send is still blocked by the canSend guard, so an accidental Send
with no transcription is a no-op.
---
 ui/goose2/src/features/chat/ui/ChatInput.tsx | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/ui/goose2/src/features/chat/ui/ChatInput.tsx b/ui/goose2/src/features/chat/ui/ChatInput.tsx
index 8013c6f7eb02..e6de03928207 100644
--- a/ui/goose2/src/features/chat/ui/ChatInput.tsx
+++ b/ui/goose2/src/features/chat/ui/ChatInput.tsx
@@ -195,12 +195,12 @@ export function ChatInput({
   useEffect(() => textareaRef.current?.focus(), []);

   const handleSend = useCallback(() => {
-    // If recording, stop and flush — the transcription callback will
-    // append text and may auto-submit. Don't send the current text yet
-    // because the final transcription hasn't arrived.
+    // If recording, stop without waiting for final flush and send what's
+    // already transcribed into the textarea. This makes Send a single click
+    // even while the mic is hot; any in-flight audio after the user clicked
+    // Send is intentionally dropped.
     if (dictation.isRecording || dictation.isTranscribing) {
-      dictation.stopRecording();
-      return;
+      dictation.stopRecording({ flushPending: false });
     }

     if (!canSend) {

From 48009ab1e6eeabf1469036cc02a9a4578cb9a2e5 Mon Sep 17 00:00:00 2001
From: tulsi
Date: Thu, 16 Apr 2026 12:39:24 -0700
Subject: [PATCH 10/30] fix(goose2): microphone UX in Voice settings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two fixes:

1. useAudioDevices now subscribes to navigator.permissions.query for
   'microphone' and reflects the live OS-level permission state. Before,
   hasPermission only became true when the user clicked 'Grant access'
   from this component — if they'd already granted mic permission via
   the chat input's getUserMedia call, Voice settings still showed the
   Grant access button with no effect. Now opening Voice settings shows
   the correct state immediately and updates reactively if permission
   changes elsewhere.

2. Move the Microphone block above the per-provider (API key / model)
   config block so its visual position reflects what it is: a voice-level
   setting that applies regardless of selected provider, not a
   provider-specific detail.
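Fix 1 relies on the Permissions API. The hook's new internals are not
reproduced in this excerpt, so the following is only a sketch under
assumptions: watchMicrophonePermission and its onChange callback are
illustrative names, not the actual useAudioDevices API. Two real constraints
it encodes: TypeScript's PermissionName union does not yet include
"microphone", so a cast is required, and permissions.query can reject in
environments that do not expose it, in which case the pre-existing
getUserMedia-driven behavior remains the fallback.

    // Sketch (TypeScript): mirror the live OS-level microphone permission.
    export function watchMicrophonePermission(
      onChange: (state: PermissionState) => void,
    ): () => void {
      let status: PermissionStatus | undefined;
      const handle = () => {
        if (status) onChange(status.state);
      };

      navigator.permissions
        .query({ name: "microphone" as PermissionName }) // cast: not in lib.dom yet
        .then((s) => {
          status = s;
          onChange(s.state); // show the correct state immediately on open
          s.addEventListener("change", handle); // update if it changes elsewhere
        })
        .catch(() => {
          // Permissions API unavailable: keep the legacy behavior where
          // permission is only learned from a successful getUserMedia call.
        });

      return () => status?.removeEventListener("change", handle);
    }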
---
 .../settings/ui/VoiceInputSettings.tsx       | 126 +++++++++---------
 .../shared/ui/ai-elements/mic-selector.tsx   |  31 ++++-
 2 files changed, 93 insertions(+), 64 deletions(-)

diff --git a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx
index 399897a9683d..696d3ae162b4 100644
--- a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx
+++ b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx
@@ -253,6 +253,69 @@ export function VoiceInputSettings() {
+      <div ...>
+        <div ...>
+          <div ...>
+            <Label ...>
+              {t("general.voiceInput.microphoneLabel")}
+            </Label>
+            <p ...>
+              {isMicrophoneSupported
+                ? t("general.voiceInput.microphoneDescription")
+                : t("general.voiceInput.microphoneUnavailable")}
+            </p>
+          </div>
+          {isMicrophoneSupported && !hasPermission ? (
+            <Button ...>...</Button>
+          ) : null}
+        </div>

+        {!devicesError &&
+        !hasPermission &&
+        permissionStatus === "not_determined" ? (
+          <p ...>
+            {t("general.voiceInput.microphoneAccessPrompt")}
+          </p>
+        ) : null}

+        {devicesError ? (
+          <p ...>{devicesError}</p>
+        ) : null}

+        {isMicrophoneSupported && hasPermission ? (
+          <MicSelector ... />
+        ) : null}
+      </div>
+
       {selectedStatus ? (
         <>
           {!selectedStatus.usesProviderConfig &&
@@ -377,69 +377,6 @@ export function VoiceInputSettings() {
       ) : null}
-
-      <div ...>
-        <div ...>
-          <div ...>
-            <Label ...>
-              {t("general.voiceInput.microphoneLabel")}
-            </Label>
-            <p ...>
-              {isMicrophoneSupported
-                ? t("general.voiceInput.microphoneDescription")
-                : t("general.voiceInput.microphoneUnavailable")}
-            </p>
-          </div>
-          {isMicrophoneSupported && !hasPermission ? (
-            <Button ...>...</Button>
-          ) : null}
-        </div>

-        {!devicesError &&
-        !hasPermission &&
-        permissionStatus === "not_determined" ? (
-          <p ...>
-            {t("general.voiceInput.microphoneAccessPrompt")}
-          </p>
-        ) : null}

-        {devicesError ? (
-          <p ...>{devicesError}</p>
-        ) : null}

-        {isMicrophoneSupported && hasPermission ? (
-          <MicSelector ... />
-        ) : null}
-      </div>