diff --git a/Cargo.lock b/Cargo.lock
index 093e9658825e..e38f0b0f0aa3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4454,6 +4454,7 @@ dependencies = [
  "async-stream",
  "async-trait",
  "axum",
+ "base64 0.22.1",
  "fs-err",
  "futures",
  "goose",
diff --git a/crates/goose-acp/Cargo.toml b/crates/goose-acp/Cargo.toml
index 8bc2b1e7eed5..a7200146b8a8 100644
--- a/crates/goose-acp/Cargo.toml
+++ b/crates/goose-acp/Cargo.toml
@@ -14,6 +14,7 @@ path = "src/bin/generate_acp_schema.rs"
 [features]
 default = ["code-mode", "rustls-tls"]
 code-mode = ["goose/code-mode"]
+local-inference = ["goose/local-inference"]
 rustls-tls = ["goose/rustls-tls", "goose-mcp/rustls-tls"]
 native-tls = ["goose/native-tls", "goose-mcp/native-tls"]
@@ -48,6 +49,7 @@ uuid = { workspace = true, features = ["v7"] }
 schemars = { workspace = true, features = ["derive"] }
 goose-acp-macros = { path = "../goose-acp-macros" }
 goose-sdk = { path = "../goose-sdk" }
+base64 = { workspace = true }
 
 [dev-dependencies]
 async-trait = { workspace = true }
diff --git a/crates/goose-acp/acp-meta.json b/crates/goose-acp/acp-meta.json
index 944d227b663f..75f28ef60a98 100644
--- a/crates/goose-acp/acp-meta.json
+++ b/crates/goose-acp/acp-meta.json
@@ -104,6 +104,46 @@
       "method": "_goose/session/unarchive",
       "requestType": "UnarchiveSessionRequest",
       "responseType": "EmptyResponse"
+    },
+    {
+      "method": "_goose/dictation/transcribe",
+      "requestType": "DictationTranscribeRequest",
+      "responseType": "DictationTranscribeResponse"
+    },
+    {
+      "method": "_goose/dictation/config",
+      "requestType": "DictationConfigRequest",
+      "responseType": "DictationConfigResponse"
+    },
+    {
+      "method": "_goose/dictation/models/list",
+      "requestType": "DictationModelsListRequest",
+      "responseType": "DictationModelsListResponse"
+    },
+    {
+      "method": "_goose/dictation/models/download",
+      "requestType": "DictationModelDownloadRequest",
+      "responseType": "EmptyResponse"
+    },
+    {
+      "method": "_goose/dictation/models/download/progress",
+      "requestType": "DictationModelDownloadProgressRequest",
+      "responseType": "DictationModelDownloadProgressResponse"
+    },
+    {
+      "method": "_goose/dictation/models/cancel",
+      "requestType": "DictationModelCancelRequest",
+      "responseType": "EmptyResponse"
+    },
+    {
+      "method": "_goose/dictation/models/delete",
+      "requestType": "DictationModelDeleteRequest",
+      "responseType": "EmptyResponse"
+    },
+    {
+      "method": "_goose/dictation/model/select",
+      "requestType": "DictationModelSelectRequest",
+      "responseType": "EmptyResponse"
+    }
   ]
 }
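The eight `_goose/dictation/*` entries above are plain JSON-RPC request/response pairs, so any ACP client can call them directly. A minimal TypeScript sketch of the transcribe round-trip; the `Rpc` interface is an illustrative stand-in for whatever transport the client already has, while the method name and the `audio`/`mimeType`/`provider` fields come from the schema below:

// Hypothetical JSON-RPC plumbing; only the method name and the params
// shape are defined by this change.
interface Rpc {
  sendRequest(method: string, params: unknown): Promise<unknown>;
}

async function transcribeClip(rpc: Rpc, wav: Uint8Array): Promise<string> {
  // Base64-encode the raw bytes, chunked so String.fromCharCode never
  // sees an argument list longer than 32k entries.
  let binary = "";
  for (let i = 0; i < wav.length; i += 0x8000) {
    binary += String.fromCharCode(...wav.subarray(i, i + 0x8000));
  }
  const result = (await rpc.sendRequest("_goose/dictation/transcribe", {
    audio: btoa(binary),
    mimeType: "audio/wav",
    provider: "openai",
  })) as { text: string };
  return result.text;
}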
\"audio/wav\", \"audio/webm\")" + }, + "provider": { + "type": "string", + "description": "Provider to use: \"openai\", \"groq\", \"elevenlabs\", or \"local\"" + } + }, + "required": [ + "audio", + "mimeType", + "provider" + ], + "description": "Transcribe audio via a dictation provider.", + "x-side": "agent", + "x-method": "_goose/dictation/transcribe" + }, + "DictationTranscribeResponse": { + "type": "object", + "properties": { + "text": { + "type": "string" + } + }, + "required": [ + "text" + ], + "description": "Transcription result.", + "x-side": "agent", + "x-method": "_goose/dictation/transcribe" + }, + "DictationConfigRequest": { + "type": "object", + "description": "Get the configuration status of all dictation providers.", + "x-side": "agent", + "x-method": "_goose/dictation/config" + }, + "DictationConfigResponse": { + "type": "object", + "properties": { + "providers": { + "type": "object", + "additionalProperties": { + "$ref": "#/$defs/DictationProviderStatusEntry" + } + } + }, + "required": [ + "providers" + ], + "description": "Dictation config response — map of provider name to status.", + "x-side": "agent", + "x-method": "_goose/dictation/config" + }, + "DictationProviderStatusEntry": { + "type": "object", + "properties": { + "configured": { + "type": "boolean" + }, + "host": { + "type": [ + "string", + "null" + ] + }, + "description": { + "type": "string" + }, + "usesProviderConfig": { + "type": "boolean" + }, + "settingsPath": { + "type": [ + "string", + "null" + ] + }, + "configKey": { + "type": [ + "string", + "null" + ] + }, + "modelConfigKey": { + "type": [ + "string", + "null" + ] + }, + "defaultModel": { + "type": [ + "string", + "null" + ] + }, + "selectedModel": { + "type": [ + "string", + "null" + ] + }, + "availableModels": { + "type": "array", + "items": { + "$ref": "#/$defs/DictationModelOption" + }, + "default": [] + } + }, + "required": [ + "configured", + "description", + "usesProviderConfig" + ], + "description": "Per-provider configuration status." 
+    },
+    "DictationModelOption": {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string"
+        },
+        "label": {
+          "type": "string"
+        },
+        "description": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "id",
+        "label",
+        "description"
+      ]
+    },
+    "DictationModelsListRequest": {
+      "type": "object",
+      "description": "List available local Whisper models with their download status.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/list"
+    },
+    "DictationModelsListResponse": {
+      "type": "object",
+      "properties": {
+        "models": {
+          "type": "array",
+          "items": {
+            "$ref": "#/$defs/DictationLocalModelStatus"
+          }
+        }
+      },
+      "required": [
+        "models"
+      ],
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/list"
+    },
+    "DictationLocalModelStatus": {
+      "type": "object",
+      "properties": {
+        "id": {
+          "type": "string"
+        },
+        "label": {
+          "type": "string"
+        },
+        "description": {
+          "type": "string"
+        },
+        "sizeMb": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "downloaded": {
+          "type": "boolean"
+        },
+        "downloadInProgress": {
+          "type": "boolean"
+        }
+      },
+      "required": [
+        "id",
+        "label",
+        "description",
+        "sizeMb",
+        "downloaded",
+        "downloadInProgress"
+      ]
+    },
+    "DictationModelDownloadRequest": {
+      "type": "object",
+      "properties": {
+        "modelId": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "modelId"
+      ],
+      "description": "Kick off a background download of a local Whisper model.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/download"
+    },
+    "DictationModelDownloadProgressRequest": {
+      "type": "object",
+      "properties": {
+        "modelId": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "modelId"
+      ],
+      "description": "Poll the progress of an in-flight download.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/download/progress"
+    },
+    "DictationModelDownloadProgressResponse": {
+      "type": "object",
+      "properties": {
+        "progress": {
+          "anyOf": [
+            {
+              "$ref": "#/$defs/DictationDownloadProgress"
+            },
+            {
+              "type": "null"
+            }
+          ],
+          "description": "None when no download is active for this model id."
+        }
+      },
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/download/progress"
+    },
+    "DictationDownloadProgress": {
+      "type": "object",
+      "properties": {
+        "bytesDownloaded": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "totalBytes": {
+          "type": "integer",
+          "minimum": 0
+        },
+        "progressPercent": {
+          "type": "number",
+          "format": "float"
+        },
+        "status": {
+          "type": "string",
+          "description": "serde lowercase of DownloadStatus: \"downloading\" | \"completed\" | \"failed\" | \"cancelled\""
+        },
+        "error": {
+          "type": [
+            "string",
+            "null"
+          ]
+        }
+      },
+      "required": [
+        "bytesDownloaded",
+        "totalBytes",
+        "progressPercent",
+        "status"
+      ]
+    },
+    "DictationModelCancelRequest": {
+      "type": "object",
+      "properties": {
+        "modelId": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "modelId"
+      ],
+      "description": "Cancel an in-flight download.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/cancel"
+    },
+    "DictationModelDeleteRequest": {
+      "type": "object",
+      "properties": {
+        "modelId": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "modelId"
+      ],
+      "description": "Delete a downloaded local Whisper model from disk.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/models/delete"
+    },
+    "DictationModelSelectRequest": {
+      "type": "object",
+      "properties": {
+        "provider": {
+          "type": "string"
+        },
+        "modelId": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "provider",
+        "modelId"
+      ],
+      "description": "Persist the user's model selection for a given provider.",
+      "x-side": "agent",
+      "x-method": "_goose/dictation/model/select"
+    },
     "ExtRequest": {
       "properties": {
         "id": {
@@ -807,6 +1130,78 @@
           ],
           "description": "Params for _goose/session/unarchive",
           "title": "UnarchiveSessionRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationTranscribeRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/transcribe",
+          "title": "DictationTranscribeRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationConfigRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/config",
+          "title": "DictationConfigRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelsListRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/models/list",
+          "title": "DictationModelsListRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelDownloadRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/models/download",
+          "title": "DictationModelDownloadRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelDownloadProgressRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/models/download/progress",
+          "title": "DictationModelDownloadProgressRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelCancelRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/models/cancel",
+          "title": "DictationModelCancelRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelDeleteRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/models/delete",
+          "title": "DictationModelDeleteRequest"
+        },
+        {
+          "allOf": [
+            {
+              "$ref": "#/$defs/DictationModelSelectRequest"
+            }
+          ],
+          "description": "Params for _goose/dictation/model/select",
+          "title": "DictationModelSelectRequest"
+        }
       ]
     },
@@ -933,6 +1328,38 @@
         }
       ],
       "title": "ImportSessionResponse"
+    },
+    {
+      "allOf": [
+        {
+          "$ref": "#/$defs/DictationTranscribeResponse"
+        }
+      ],
+      "title": "DictationTranscribeResponse"
+    },
+    {
+      "allOf": [
+        {
+          "$ref": "#/$defs/DictationConfigResponse"
+        }
+      ],
+      "title": "DictationConfigResponse"
+    },
+    {
+      "allOf": [
+        {
+          "$ref": "#/$defs/DictationModelsListResponse"
+        }
+      ],
+      "title": "DictationModelsListResponse"
+    },
+    {
+      "allOf": [
+        {
+          "$ref": "#/$defs/DictationModelDownloadProgressResponse"
+        }
+      ],
+      "title": "DictationModelDownloadProgressResponse"
+    }
   ]
 },
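Downloads are fire-and-forget plus polling: `models/download` returns as soon as the background task is queued, and `models/download/progress` reports `null` progress once nothing is in flight. A sketch of the polling side, reusing the illustrative `Rpc` stand-in from above; the status strings are the ones listed in `DictationDownloadProgress`:

async function waitForModel(rpc: Rpc, modelId: string): Promise<void> {
  await rpc.sendRequest("_goose/dictation/models/download", { modelId });
  for (;;) {
    const { progress } = (await rpc.sendRequest(
      "_goose/dictation/models/download/progress",
      { modelId },
    )) as {
      progress: { status: string; error?: string | null } | null;
    };
    // null means no download is active for this model id anymore.
    if (!progress || progress.status === "completed") return;
    if (progress.status === "failed" || progress.status === "cancelled") {
      throw new Error(progress.error ?? `download ${progress.status}`);
    }
    await new Promise((r) => setTimeout(r, 500)); // still "downloading"
  }
}

A real client would also tolerate a brief `null` window right after kick-off, before the download manager registers the task.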
"allOf": [ + { + "$ref": "#/$defs/DictationModelsListResponse" + } + ], + "title": "DictationModelsListResponse" + }, + { + "allOf": [ + { + "$ref": "#/$defs/DictationModelDownloadProgressResponse" + } + ], + "title": "DictationModelDownloadProgressResponse" } ] }, diff --git a/crates/goose-acp/src/server.rs b/crates/goose-acp/src/server.rs index d1a8212c7507..ca7c0f7ff883 100644 --- a/crates/goose-acp/src/server.rs +++ b/crates/goose-acp/src/server.rs @@ -16,6 +16,13 @@ use goose::config::paths::Paths; use goose::config::permission::PermissionManager; use goose::config::{Config, GooseMode}; use goose::conversation::message::{ActionRequiredData, Message, MessageContent}; +#[cfg(feature = "local-inference")] +use goose::dictation::providers::transcribe_local; +use goose::dictation::providers::{ + all_providers, is_configured, transcribe_with_provider, DictationProvider, +}; +#[cfg(feature = "local-inference")] +use goose::dictation::whisper; use goose::mcp_utils::ToolResult; use goose::permission::permission_confirmation::PrincipalType; use goose::permission::{Permission, PermissionConfirmation}; @@ -68,6 +75,12 @@ pub type AcpProviderFactory = Arc< const DEFAULT_PROVIDER_ID: &str = "goose"; const DEFAULT_PROVIDER_LABEL: &str = "Goose (Default)"; +const OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "OPENAI_TRANSCRIPTION_MODEL"; +const GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "GROQ_TRANSCRIPTION_MODEL"; +const ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY: &str = "ELEVENLABS_TRANSCRIPTION_MODEL"; +const OPENAI_TRANSCRIPTION_MODEL: &str = "whisper-1"; +const GROQ_TRANSCRIPTION_MODEL: &str = "whisper-large-v3-turbo"; +const ELEVENLABS_TRANSCRIPTION_MODEL: &str = "scribe_v1"; /// In-memory state for an active ACP session. /// @@ -2904,6 +2917,420 @@ impl GooseAcpAgent { .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; Ok(EmptyResponse {}) } + + #[custom_method(DictationTranscribeRequest)] + async fn on_dictation_transcribe( + &self, + req: DictationTranscribeRequest, + ) -> Result { + use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; + let config = goose::config::Config::global(); + + #[cfg(not(feature = "local-inference"))] + if req.provider == "local" { + return Err(sacp::Error::invalid_params() + .data("Local inference is not available in this build")); + } + + let provider: DictationProvider = serde_json::from_value(serde_json::Value::String( + req.provider.clone(), + )) + .map_err(|_| { + sacp::Error::invalid_params().data(format!("Unknown provider: {}", req.provider)) + })?; + + let audio_bytes = BASE64 + .decode(&req.audio) + .map_err(|_| sacp::Error::invalid_params().data("Invalid base64 audio data"))?; + + if audio_bytes.len() > 50 * 1024 * 1024 { + return Err(sacp::Error::invalid_params().data("Audio too large (max 50MB)")); + } + + let extension = match req.mime_type.as_str() { + "audio/webm" | "audio/webm;codecs=opus" => "webm", + "audio/mp4" => "mp4", + "audio/mpeg" | "audio/mpga" => "mp3", + "audio/m4a" => "m4a", + "audio/wav" | "audio/x-wav" => "wav", + other => { + return Err( + sacp::Error::invalid_params().data(format!("Unsupported format: {other}")) + ) + } + }; + + let text = match provider { + DictationProvider::OpenAI => { + let model = dictation_selected_model(config, DictationProvider::OpenAI) + .unwrap_or_else(|| OPENAI_TRANSCRIPTION_MODEL.to_string()); + transcribe_with_provider( + DictationProvider::OpenAI, + "model".to_string(), + model, + audio_bytes, + extension, + &req.mime_type, + ) + .await + } + DictationProvider::Groq => { + 
+                let model = dictation_selected_model(config, DictationProvider::Groq)
+                    .unwrap_or_else(|| GROQ_TRANSCRIPTION_MODEL.to_string());
+                transcribe_with_provider(
+                    DictationProvider::Groq,
+                    "model".to_string(),
+                    model,
+                    audio_bytes,
+                    extension,
+                    &req.mime_type,
+                )
+                .await
+            }
+            DictationProvider::ElevenLabs => {
+                let model = dictation_selected_model(config, DictationProvider::ElevenLabs)
+                    .unwrap_or_else(|| ELEVENLABS_TRANSCRIPTION_MODEL.to_string());
+                transcribe_with_provider(
+                    DictationProvider::ElevenLabs,
+                    "model_id".to_string(),
+                    model,
+                    audio_bytes,
+                    extension,
+                    &req.mime_type,
+                )
+                .await
+            }
+            #[cfg(feature = "local-inference")]
+            DictationProvider::Local => transcribe_local(audio_bytes).await,
+        }
+        .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
+
+        Ok(DictationTranscribeResponse { text })
+    }
+
+    #[custom_method(DictationConfigRequest)]
+    async fn on_dictation_config(
+        &self,
+        _req: DictationConfigRequest,
+    ) -> Result<DictationConfigResponse, sacp::Error> {
+        let config = goose::config::Config::global();
+        let mut providers = std::collections::HashMap::new();
+
+        for def in all_providers() {
+            let provider = def.provider;
+            let host = if let Some(host_key) = def.host_key {
+                config
+                    .get(host_key, false)
+                    .ok()
+                    .and_then(|v| v.as_str().map(|s| s.to_string()))
+            } else {
+                None
+            };
+
+            let provider_key = serde_json::to_value(provider)
+                .ok()
+                .and_then(|v| v.as_str().map(|s| s.to_string()))
+                .unwrap_or_else(|| format!("{:?}", provider).to_lowercase());
+            providers.insert(
+                provider_key,
+                DictationProviderStatusEntry {
+                    configured: is_configured(provider),
+                    host,
+                    description: def.description.to_string(),
+                    uses_provider_config: def.uses_provider_config,
+                    settings_path: def.settings_path.map(|s| s.to_string()),
+                    config_key: if !def.uses_provider_config {
+                        Some(def.config_key.to_string())
+                    } else {
+                        None
+                    },
+                    model_config_key: dictation_model_config_key(provider),
+                    default_model: dictation_default_model(provider),
+                    selected_model: dictation_selected_model(config, provider),
+                    available_models: dictation_available_models(provider),
+                },
+            );
+        }
+
+        Ok(DictationConfigResponse { providers })
+    }
+
+    #[custom_method(DictationModelsListRequest)]
+    async fn on_dictation_models_list(
+        &self,
+        _req: DictationModelsListRequest,
+    ) -> Result<DictationModelsListResponse, sacp::Error> {
+        #[cfg(feature = "local-inference")]
+        {
+            use goose::download_manager::{get_download_manager, DownloadStatus};
+
+            let manager = get_download_manager();
+            let models = whisper::available_models()
+                .iter()
+                .map(|model| DictationLocalModelStatus {
+                    id: model.id.to_string(),
+                    label: model.id.to_string(),
+                    description: model.description.to_string(),
+                    size_mb: model.size_mb,
+                    downloaded: model.is_downloaded(),
+                    download_in_progress: manager
+                        .get_progress(model.id)
+                        .map(|progress| progress.status == DownloadStatus::Downloading)
+                        .unwrap_or(false),
+                })
+                .collect();
+
+            Ok(DictationModelsListResponse { models })
+        }
+
+        #[cfg(not(feature = "local-inference"))]
+        Ok(DictationModelsListResponse::default())
+    }
+
+    #[custom_method(DictationModelDownloadRequest)]
+    async fn on_dictation_model_download(
+        &self,
+        _req: DictationModelDownloadRequest,
+    ) -> Result<EmptyResponse, sacp::Error> {
+        #[cfg(feature = "local-inference")]
+        {
+            use goose::download_manager::get_download_manager;
+
+            let model = whisper::get_model(&_req.model_id)
+                .ok_or_else(|| sacp::Error::invalid_params().data("Unknown model id"))?;
+            let manager = get_download_manager();
+            let model_id_for_config = model.id.to_string();
+
+            manager
+                .download_model(
+                    model.id.to_string(),
+                    model.url.to_string(),
+                    model.local_path(),
+                    Some(Box::new(move || {
+                        let config = goose::config::Config::global();
+                        // Only auto-select this model if the user has no model
+                        // currently selected. This prevents silently switching
+                        // the active model mid-session when a user downloads an
+                        // additional model while one is already in use.
+                        let already_selected = config
+                            .get(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY, false)
+                            .ok()
+                            .and_then(|value| value.as_str().map(str::to_owned))
+                            .filter(|model_id| {
+                                // Treat a deleted model file as no active selection
+                                // so a fresh download can auto-select cleanly.
+                                whisper::get_model(model_id)
+                                    .is_some_and(|model| model.is_downloaded())
+                            });
+                        if already_selected.is_none() {
+                            if let Err(e) = config.set_param(
+                                whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY,
+                                model_id_for_config.clone(),
+                            ) {
+                                error!("Failed to save LOCAL_WHISPER_MODEL after download: {}", e);
+                            }
+                        }
+                    })),
+                )
+                .await
+                .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
+
+            Ok(EmptyResponse {})
+        }
+
+        #[cfg(not(feature = "local-inference"))]
+        Err(sacp::Error::invalid_params().data("Local inference not enabled"))
+    }
+
+    #[custom_method(DictationModelDownloadProgressRequest)]
+    async fn on_dictation_model_download_progress(
+        &self,
+        _req: DictationModelDownloadProgressRequest,
+    ) -> Result<DictationModelDownloadProgressResponse, sacp::Error> {
+        #[cfg(feature = "local-inference")]
+        {
+            use goose::download_manager::get_download_manager;
+
+            let manager = get_download_manager();
+            let progress =
+                manager
+                    .get_progress(&_req.model_id)
+                    .map(|progress| DictationDownloadProgress {
+                        bytes_downloaded: progress.bytes_downloaded,
+                        total_bytes: progress.total_bytes,
+                        progress_percent: progress.progress_percent,
+                        status: serde_json::to_value(&progress.status)
+                            .ok()
+                            .and_then(|value| value.as_str().map(ToOwned::to_owned))
+                            .unwrap_or_else(|| "unknown".to_string()),
+                        error: progress.error,
+                    });
+
+            Ok(DictationModelDownloadProgressResponse { progress })
+        }
+
+        #[cfg(not(feature = "local-inference"))]
+        Ok(DictationModelDownloadProgressResponse { progress: None })
+    }
+
+    #[custom_method(DictationModelCancelRequest)]
+    async fn on_dictation_model_cancel(
+        &self,
+        _req: DictationModelCancelRequest,
+    ) -> Result<EmptyResponse, sacp::Error> {
+        #[cfg(feature = "local-inference")]
+        {
+            use goose::download_manager::get_download_manager;
+
+            let manager = get_download_manager();
+            manager
+                .cancel_download(&_req.model_id)
+                .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
+
+            Ok(EmptyResponse {})
+        }
+
+        #[cfg(not(feature = "local-inference"))]
+        Err(sacp::Error::invalid_params().data("Local inference not enabled"))
+    }
+
+    #[custom_method(DictationModelDeleteRequest)]
+    async fn on_dictation_model_delete(
+        &self,
+        _req: DictationModelDeleteRequest,
+    ) -> Result<EmptyResponse, sacp::Error> {
+        #[cfg(feature = "local-inference")]
+        {
+            let model = whisper::get_model(&_req.model_id)
+                .ok_or_else(|| sacp::Error::invalid_params().data("Unknown model id"))?;
+            let path = model.local_path();
+
+            if !path.exists() {
+                return Err(sacp::Error::invalid_params().data("Model not downloaded"));
+            }
+
+            std::fs::remove_file(path)
+                .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?;
+
+            Ok(EmptyResponse {})
+        }
+
+        #[cfg(not(feature = "local-inference"))]
+        Err(sacp::Error::invalid_params().data("Local inference not enabled"))
+    }
+
+    #[custom_method(DictationModelSelectRequest)]
+    async fn on_dictation_model_select(
+        &self,
+        req: DictationModelSelectRequest,
+    ) -> Result<EmptyResponse, sacp::Error> {
+        #[cfg(not(feature = "local-inference"))]
req.provider == "local" { + return Err(sacp::Error::invalid_params().data("Local inference not enabled")); + } + + let provider: DictationProvider = serde_json::from_value(serde_json::Value::String( + req.provider.clone(), + )) + .map_err(|_| { + sacp::Error::invalid_params().data(format!("Unknown provider: {}", req.provider)) + })?; + + let key = match provider { + DictationProvider::OpenAI => OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY, + DictationProvider::Groq => GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY, + DictationProvider::ElevenLabs => ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY, + #[cfg(feature = "local-inference")] + DictationProvider::Local => { + let model = whisper::get_model(&req.model_id) + .ok_or_else(|| sacp::Error::invalid_params().data("Unknown model id"))?; + if !model.is_downloaded() { + return Err( + sacp::Error::invalid_params().data("Local Whisper model is not downloaded") + ); + } + whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY + } + }; + + goose::config::Config::global() + .set_param(key, req.model_id) + .map_err(|e| sacp::Error::internal_error().data(e.to_string()))?; + + Ok(EmptyResponse {}) + } +} + +fn dictation_model_config_key(provider: DictationProvider) -> Option { + match provider { + DictationProvider::OpenAI => Some(OPENAI_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()), + DictationProvider::Groq => Some(GROQ_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()), + DictationProvider::ElevenLabs => { + Some(ELEVENLABS_TRANSCRIPTION_MODEL_CONFIG_KEY.to_string()) + } + #[cfg(feature = "local-inference")] + DictationProvider::Local => Some(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY.to_string()), + } +} + +fn dictation_default_model(provider: DictationProvider) -> Option { + match provider { + DictationProvider::OpenAI => Some(OPENAI_TRANSCRIPTION_MODEL.to_string()), + DictationProvider::Groq => Some(GROQ_TRANSCRIPTION_MODEL.to_string()), + DictationProvider::ElevenLabs => Some(ELEVENLABS_TRANSCRIPTION_MODEL.to_string()), + #[cfg(feature = "local-inference")] + DictationProvider::Local => Some(whisper::recommend_model().to_string()), + } +} + +fn dictation_selected_model(config: &Config, provider: DictationProvider) -> Option { + #[cfg(feature = "local-inference")] + if provider == DictationProvider::Local { + return config + .get(whisper::LOCAL_WHISPER_MODEL_CONFIG_KEY, false) + .ok() + .and_then(|value| value.as_str().map(str::to_owned)) + .filter(|model_id| whisper::get_model(model_id).is_some()) + .or_else(|| dictation_default_model(provider)); + } + + dictation_model_config_key(provider) + .and_then(|key| { + config + .get(&key, false) + .ok() + .and_then(|value| value.as_str().map(str::to_owned)) + }) + .or_else(|| dictation_default_model(provider)) +} + +fn dictation_available_models(provider: DictationProvider) -> Vec { + match provider { + DictationProvider::OpenAI => vec![DictationModelOption { + id: OPENAI_TRANSCRIPTION_MODEL.to_string(), + label: "Whisper-1".to_string(), + description: "OpenAI's hosted Whisper transcription model.".to_string(), + }], + DictationProvider::Groq => vec![DictationModelOption { + id: GROQ_TRANSCRIPTION_MODEL.to_string(), + label: "Whisper Large V3 Turbo".to_string(), + description: "Groq's fast hosted Whisper transcription model.".to_string(), + }], + DictationProvider::ElevenLabs => vec![DictationModelOption { + id: ELEVENLABS_TRANSCRIPTION_MODEL.to_string(), + label: "Scribe v1".to_string(), + description: "ElevenLabs' hosted speech-to-text model.".to_string(), + }], + #[cfg(feature = "local-inference")] + DictationProvider::Local => 
diff --git a/crates/goose-cli/Cargo.toml b/crates/goose-cli/Cargo.toml
index 6c20a644912a..369cd59606cb 100644
--- a/crates/goose-cli/Cargo.toml
+++ b/crates/goose-cli/Cargo.toml
@@ -71,7 +71,7 @@ winapi = { workspace = true }
 [features]
 default = ["code-mode", "local-inference", "aws-providers", "telemetry", "otel", "rustls-tls"]
 code-mode = ["goose/code-mode", "goose-acp/code-mode"]
-local-inference = ["goose/local-inference"]
+local-inference = ["goose/local-inference", "goose-acp/local-inference"]
 aws-providers = ["goose/aws-providers"]
 cuda = ["goose/cuda", "local-inference"]
 telemetry = ["goose/telemetry"]
diff --git a/crates/goose-sdk/src/custom_requests.rs b/crates/goose-sdk/src/custom_requests.rs
index bbc375be09f3..609299712c47 100644
--- a/crates/goose-sdk/src/custom_requests.rs
+++ b/crates/goose-sdk/src/custom_requests.rs
@@ -1,6 +1,7 @@
 use sacp::{JsonRpcRequest, JsonRpcResponse};
 use schemars::JsonSchema;
 use serde::{Deserialize, Serialize};
+use std::collections::HashMap;
 
 /// Schema descriptor for a single custom method, produced by the
 /// `#[custom_methods]` macro's generated `custom_method_schemas()` function.
@@ -309,6 +310,154 @@ pub struct ProviderConfigKey {
     pub primary: bool,
 }
 
+/// Transcribe audio via a dictation provider.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/transcribe", response = DictationTranscribeResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationTranscribeRequest {
+    /// Base64-encoded audio data
+    pub audio: String,
+    /// MIME type (e.g. "audio/wav", "audio/webm")
+    pub mime_type: String,
+    /// Provider to use: "openai", "groq", "elevenlabs", or "local"
+    pub provider: String,
+}
+
+/// Transcription result.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+pub struct DictationTranscribeResponse {
+    pub text: String,
+}
+
+/// Get the configuration status of all dictation providers.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/config", response = DictationConfigResponse)]
+pub struct DictationConfigRequest {}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+pub struct DictationModelOption {
+    pub id: String,
+    pub label: String,
+    pub description: String,
+}
+
+/// Per-provider configuration status.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationProviderStatusEntry {
+    pub configured: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub host: Option<String>,
+    pub description: String,
+    pub uses_provider_config: bool,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub settings_path: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub config_key: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub model_config_key: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub default_model: Option<String>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub selected_model: Option<String>,
+    #[serde(default)]
+    pub available_models: Vec<DictationModelOption>,
+}
+
+/// Dictation config response — map of provider name to status.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+pub struct DictationConfigResponse {
+    pub providers: HashMap<String, DictationProviderStatusEntry>,
+}
+
 /// Empty success response for operations that return no data.
 #[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
 pub struct EmptyResponse {}
+
+/// List available local Whisper models with their download status.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(
+    method = "_goose/dictation/models/list",
+    response = DictationModelsListResponse
+)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelsListRequest {}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelsListResponse {
+    pub models: Vec<DictationLocalModelStatus>,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationLocalModelStatus {
+    pub id: String,
+    pub label: String,
+    pub description: String,
+    pub size_mb: u32,
+    pub downloaded: bool,
+    pub download_in_progress: bool,
+}
+
+/// Kick off a background download of a local Whisper model.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/download", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadRequest {
+    pub model_id: String,
+}
+
+/// Poll the progress of an in-flight download.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(
+    method = "_goose/dictation/models/download/progress",
+    response = DictationModelDownloadProgressResponse
+)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadProgressRequest {
+    pub model_id: String,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDownloadProgressResponse {
+    /// None when no download is active for this model id.
+    pub progress: Option<DictationDownloadProgress>,
+}
+
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationDownloadProgress {
+    pub bytes_downloaded: u64,
+    pub total_bytes: u64,
+    pub progress_percent: f32,
+    /// serde lowercase of DownloadStatus: "downloading" | "completed" | "failed" | "cancelled"
+    pub status: String,
+    pub error: Option<String>,
+}
+
+/// Cancel an in-flight download.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/cancel", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelCancelRequest {
+    pub model_id: String,
+}
+
+/// Delete a downloaded local Whisper model from disk.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/models/delete", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelDeleteRequest {
+    pub model_id: String,
+}
+
+/// Persist the user's model selection for a given provider.
+#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema, JsonRpcRequest)]
+#[request(method = "_goose/dictation/model/select", response = EmptyResponse)]
+#[serde(rename_all = "camelCase")]
+pub struct DictationModelSelectRequest {
+    pub provider: String,
+    pub model_id: String,
+}
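All of these structs carry `#[serde(rename_all = "camelCase")]`, so Rust's `model_id` travels as `modelId`. For reference, the matching TypeScript shape for the select call; the interface name and the example model id are illustrative, while the field names and method follow the definitions above:

interface DictationModelSelectParams {
  provider: "openai" | "groq" | "elevenlabs" | "local";
  modelId: string; // Rust field model_id, camelCased by serde
}

// e.g. persist a hypothetical local model choice:
const params: DictationModelSelectParams = {
  provider: "local",
  modelId: "some-downloaded-model-id",
};
// await rpc.sendRequest("_goose/dictation/model/select", params);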
diff --git a/ui/goose2/scripts/check-file-sizes.mjs b/ui/goose2/scripts/check-file-sizes.mjs
index 07e1d124f273..c5c47459595b 100644
--- a/ui/goose2/scripts/check-file-sizes.mjs
+++ b/ui/goose2/scripts/check-file-sizes.mjs
@@ -50,6 +50,16 @@ const EXCEPTIONS = {
     justification:
       "ACP-backed session overlay persistence, draft migration, and sidebar-facing session merge logic live together for now.",
   },
+  "src/features/chat/ui/ChatInput.tsx": {
+    limit: 510,
+    justification:
+      "Voice dictation send/stop guards, attachment handling, and mention/picker coordination still share one chat composer component.",
+  },
+  "src/features/chat/ui/__tests__/ChatInput.test.tsx": {
+    limit: 510,
+    justification:
+      "Composer regression coverage spans personas, queueing, attachments, and voice-input edge cases in one interaction-heavy suite.",
+  },
   "src-tauri/src/commands/projects.rs": {
     limit: 520,
     justification:
diff --git a/ui/goose2/src-tauri/Info.plist b/ui/goose2/src-tauri/Info.plist
new file mode 100644
index 000000000000..8588d2d741c4
--- /dev/null
+++ b/ui/goose2/src-tauri/Info.plist
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+<dict>
+  <key>NSMicrophoneUsageDescription</key>
+  <string>Goose uses your microphone to capture voice input for dictation.</string>
+</dict>
+</plist>
diff --git a/ui/goose2/src-tauri/plugins/app-test-driver/src/lib.rs b/ui/goose2/src-tauri/plugins/app-test-driver/src/lib.rs
index 256b2c29e1f6..0d7c09998b63 100644
--- a/ui/goose2/src-tauri/plugins/app-test-driver/src/lib.rs
+++ b/ui/goose2/src-tauri/plugins/app-test-driver/src/lib.rs
@@ -2,7 +2,9 @@ use serde::{Deserialize, Serialize};
 use std::io::{BufRead, BufReader, Write};
 use std::net::TcpListener;
 use std::sync::Mutex;
-use tauri::{AppHandle, Manager, Runtime, WebviewWindow};
+use tauri::{AppHandle, Manager, Runtime};
+#[cfg(target_os = "macos")]
+use tauri::WebviewWindow;
 
 #[derive(Deserialize, Debug)]
 struct TestCommand {
diff --git a/ui/goose2/src-tauri/src/services/provider_defs.rs b/ui/goose2/src-tauri/src/services/provider_defs.rs
index 0a2a326eaf00..5eea0c0a5a64 100644
--- a/ui/goose2/src-tauri/src/services/provider_defs.rs
+++ b/ui/goose2/src-tauri/src/services/provider_defs.rs
@@ -125,6 +125,17 @@ pub(crate) static PROVIDER_CONFIG_DEFS: &[ProviderConfigDef] = &[
         keys: &[],
         oauth_cache_path: None,
     },
+    // Dictation providers (voice input)
+    ProviderConfigDef {
+        id: "dictation_groq",
+        keys: &[key("GROQ_API_KEY", true, true)],
+        oauth_cache_path: None,
+    },
+    ProviderConfigDef {
+        id: "dictation_elevenlabs",
+        keys: &[key("ELEVENLABS_API_KEY", true, true)],
+        oauth_cache_path: None,
+    },
 ];
 
 pub(crate) fn find_config_key(key_name: &str) -> Option<&'static ConfigKey> {
diff --git a/ui/goose2/src/features/chat/hooks/__tests__/useDictationRecorder.test.ts b/ui/goose2/src/features/chat/hooks/__tests__/useDictationRecorder.test.ts
new file mode 100644
index 000000000000..43b851d3ffc9
--- /dev/null
+++ b/ui/goose2/src/features/chat/hooks/__tests__/useDictationRecorder.test.ts
@@ -0,0 +1,72 @@
+import { act, renderHook, waitFor } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const mockTranscribeDictation = vi.fn();
+
+vi.mock("@/shared/api/dictation", () => ({
+  transcribeDictation: (...args: unknown[]) => mockTranscribeDictation(...args),
+}));
+
+import { useDictationRecorder } from "../useDictationRecorder";
+
+function deferred<T>() {
+  let resolve!: (value: T | PromiseLike<T>) => void;
+  const promise = new Promise<T>((res) => {
+    resolve = res;
+  });
+  return { promise, resolve };
+}
+
+describe("useDictationRecorder", () => {
+  beforeEach(() => {
+    mockTranscribeDictation.mockReset();
+
+    Object.defineProperty(navigator, "mediaDevices", {
+      configurable: true,
+      value: {
+        getUserMedia: vi.fn(),
+      },
+    });
+  });
+
+  it("lets a second toggle cancel a pending startup", async () => {
+    const pendingStream = deferred<MediaStream>();
+    const stopTrack = vi.fn();
+    const stream = {
+      getTracks: () => [{ stop: stopTrack }],
+    } as unknown as MediaStream;
+
+    vi.mocked(navigator.mediaDevices.getUserMedia).mockReturnValue(
+      pendingStream.promise,
+    );
+
+    const { result } = renderHook(() =>
+      useDictationRecorder({
+        onError: vi.fn(),
+        onTranscription: vi.fn(),
+        preferredMicrophoneId: null,
+        provider: "openai",
+        providerConfigured: true,
+      }),
+    );
+
+    act(() => {
+      result.current.toggleRecording();
+    });
+
+    expect(result.current.isStarting()).toBe(true);
+
+    act(() => {
+      result.current.toggleRecording();
+    });
+
+    await act(async () => {
+      pendingStream.resolve(stream);
+      await pendingStream.promise;
+    });
+
+    await waitFor(() => expect(result.current.isStarting()).toBe(false));
+    expect(result.current.isRecording).toBe(false);
+    expect(stopTrack).toHaveBeenCalledTimes(1);
+  });
+});
diff --git a/ui/goose2/src/features/chat/hooks/__tests__/useVoiceDictation.test.ts b/ui/goose2/src/features/chat/hooks/__tests__/useVoiceDictation.test.ts
new file mode 100644
index 000000000000..a030d44b0f3e
--- /dev/null
+++ b/ui/goose2/src/features/chat/hooks/__tests__/useVoiceDictation.test.ts
@@ -0,0 +1,99 @@
+import { renderHook, waitFor } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const mockGetDictationConfig = vi.fn();
+const mockUseDictationRecorder = vi.fn();
+const mockUseVoiceInputPreferences = vi.fn();
+
+vi.mock("@/shared/api/dictation", () => ({
+  getDictationConfig: () => mockGetDictationConfig(),
+}));
+
+vi.mock("../useDictationRecorder", () => ({
+  useDictationRecorder: (options: unknown) => mockUseDictationRecorder(options),
+}));
+
+vi.mock("../useVoiceInputPreferences", () => ({
+  useVoiceInputPreferences: () => mockUseVoiceInputPreferences(),
+}));
+
+import { useVoiceDictation } from "../useVoiceDictation";
+
+describe("useVoiceDictation", () => {
+  beforeEach(() => {
+    mockGetDictationConfig.mockReset();
+    mockUseDictationRecorder.mockReset();
+    mockUseVoiceInputPreferences.mockReset();
+
+    mockUseDictationRecorder.mockReturnValue({
+      isEnabled: false,
+      isRecording: false,
+      isStarting: () => false,
+      isTranscribing: false,
+      startRecording: vi.fn(),
+      stopRecording: vi.fn(),
+      toggleRecording: vi.fn(),
+    });
+  });
+
+  it("defers default provider fallback until preferences hydrate", async () => {
+    const voicePrefs = {
+      autoSubmitPhrases: [],
+      clearSelectedProvider: vi.fn(),
+      hasStoredProviderPreference: false,
+      isHydrated: false,
+      preferredMicrophoneId: null,
+      rawAutoSubmitPhrases: "submit",
+      selectedProvider: null,
+      setPreferredMicrophoneId: vi.fn(),
+      setRawAutoSubmitPhrases: vi.fn(),
+      setSelectedProvider: vi.fn(),
+    };
+
+    mockUseVoiceInputPreferences.mockImplementation(() => voicePrefs);
+    mockGetDictationConfig.mockResolvedValue({
+      openai: {
+        availableModels: [],
+        configured: true,
+        description: "OpenAI",
+        usesProviderConfig: true,
+      },
+    });
+
+    const { rerender } = renderHook(() =>
+      useVoiceDictation({
+        attachments: [],
+        clearAttachments: vi.fn(),
+        onSend: vi.fn(),
+        resetTextarea: vi.fn(),
+        selectedPersonaId: null,
+        setText: vi.fn(),
+        text: "",
+      }),
+    );
+
+    await waitFor(() =>
+      expect(mockGetDictationConfig).toHaveBeenCalledTimes(1),
+    );
+    await waitFor(() =>
+      expect(mockUseDictationRecorder).toHaveBeenLastCalledWith(
+        expect.objectContaining({
+          provider: null,
+          providerConfigured: false,
+        }),
+      ),
+    );
+
+    voicePrefs.isHydrated = true;
+    rerender();
+
+    await waitFor(() =>
+      expect(mockUseDictationRecorder).toHaveBeenLastCalledWith(
+        expect.objectContaining({
+          provider: "openai",
+          providerConfigured: true,
+        }),
+      ),
+    );
+  });
+});
diff --git a/ui/goose2/src/features/chat/hooks/__tests__/useVoiceInputPreferences.test.ts b/ui/goose2/src/features/chat/hooks/__tests__/useVoiceInputPreferences.test.ts
new file mode 100644
index 000000000000..8878ac1195aa
--- /dev/null
+++ b/ui/goose2/src/features/chat/hooks/__tests__/useVoiceInputPreferences.test.ts
@@ -0,0 +1,106 @@
+import { act, renderHook, waitFor } from "@testing-library/react";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const mockGetClient = vi.fn();
+
+vi.mock("@/shared/api/acpConnection", () => ({
+  getClient: () => mockGetClient(),
+}));
+
+import { useVoiceInputPreferences } from "../useVoiceInputPreferences";
+
+function deferred<T>() {
+  let resolve!: (value: T | PromiseLike<T>) => void;
+  const promise = new Promise<T>((res) => {
+    resolve = res;
+  });
+  return { promise, resolve };
+}
+
+describe("useVoiceInputPreferences", () => {
+  beforeEach(() => {
+    mockGetClient.mockReset();
+  });
+
+  it("does not hydrate until provider config can be read successfully", async () => {
+    let shouldFailProviderRead = true;
+
+    mockGetClient.mockResolvedValue({
+      goose: {
+        GooseConfigRead: vi.fn().mockImplementation(({ key }) => {
+          if (key === "VOICE_DICTATION_PROVIDER") {
+            if (shouldFailProviderRead) {
+              return Promise.reject(new Error("temporary acp failure"));
+            }
+            return Promise.resolve({ value: "groq" });
+          }
+          return Promise.resolve({ value: null });
+        }),
+        GooseConfigUpsert: vi.fn().mockResolvedValue({}),
+        GooseConfigRemove: vi.fn().mockResolvedValue({}),
+      },
+    });
+
+    const { result } = renderHook(() => useVoiceInputPreferences());
+
+    await act(async () => {});
+
+    expect(result.current.isHydrated).toBe(false);
+    expect(result.current.selectedProvider).toBeNull();
+
+    shouldFailProviderRead = false;
+
+    await act(async () => {
+      window.dispatchEvent(new Event("goose:voice-input-preferences"));
+    });
+
+    await waitFor(() => expect(result.current.isHydrated).toBe(true));
+    expect(result.current.selectedProvider).toBe("groq");
+    expect(result.current.hasStoredProviderPreference).toBe(true);
+  });
+
+  it("broadcasts preference changes only after config persistence settles", async () => {
+    const upsert = vi.fn();
+    const providerRead = deferred<{ value?: unknown }>();
+    const pendingWrite = deferred<void>();
+
+    mockGetClient.mockResolvedValue({
+      goose: {
+        GooseConfigRead: vi
+          .fn()
+          .mockResolvedValueOnce({ value: null })
+          .mockResolvedValueOnce({ value: null })
+          .mockResolvedValueOnce({ value: null })
+          .mockImplementation(() => providerRead.promise),
+        GooseConfigUpsert: upsert.mockImplementation(
+          () => pendingWrite.promise,
+        ),
+        GooseConfigRemove: vi.fn().mockResolvedValue({}),
+      },
+    });
+
+    const eventListener = vi.fn();
+    window.addEventListener("goose:voice-input-preferences", eventListener);
+
+    const { result } = renderHook(() => useVoiceInputPreferences());
+
+    await waitFor(() => expect(result.current.isHydrated).toBe(true));
+
+    act(() => {
+      result.current.setSelectedProvider("openai");
+    });
+
+    expect(eventListener).not.toHaveBeenCalled();
+    expect(result.current.selectedProvider).toBe("openai");
+
+    await act(async () => {
+      pendingWrite.resolve();
+      await pendingWrite.promise;
+    });
+
+    await waitFor(() => expect(eventListener).toHaveBeenCalledTimes(1));
+
+    providerRead.resolve({ value: "openai" });
+    window.removeEventListener("goose:voice-input-preferences", eventListener);
+  });
+});
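All three suites lean on the same `deferred<T>` helper to freeze an async dependency mid-flight: the test hands the pending promise to the code under test, asserts on the intermediate state, then resolves it on its own schedule. The pattern in isolation, with standalone names rather than code from this change:

function gate<T>() {
  let open!: (value: T) => void;
  const promise = new Promise<T>((res) => {
    open = res;
  });
  return { promise, open };
}

// const g = gate<MediaStream>();
// mock.mockReturnValue(g.promise);  // code under test now blocks here
// ...assert the "still starting" state...
// g.open(fakeStream);               // release it and assert the final state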
""); + reader.onerror = () => reject(reader.error); + reader.readAsDataURL(blob); + }); +} + +function toErrorMessage(error: unknown) { + if (error instanceof Error && error.message) { + return error.message; + } + + return "Voice input failed"; +} + +export function useDictationRecorder({ + provider, + providerConfigured, + preferredMicrophoneId, + onError, + onTranscription, +}: UseDictationRecorderOptions) { + const [isRecording, setIsRecording] = useState(false); + const [isTranscribing, setIsTranscribing] = useState(false); + const audioContextRef = useRef(null); + const processorRef = useRef(null); + const sourceRef = useRef(null); + const streamRef = useRef(null); + const samplesRef = useRef([]); + const vadStateRef = useRef(createInitialVadState()); + const pendingTranscriptionsRef = useRef(0); + const generationRef = useRef(0); + // Per-generation sequence numbers so out-of-order transcription responses + // can be reassembled into the order the chunks were captured. Without this, + // a later chunk whose API call resolves faster can be appended before an + // earlier, slower one — scrambling long dictation sessions with variable + // API latency. Empty transcriptions still occupy a slot so they don't block + // subsequent chunks. + const chunkSeqRef = useRef(0); + const nextExpectedSeqRef = useRef(0); + const pendingResultsRef = useRef>(new Map()); + // Guards against overlapping startRecording calls while getUserMedia is + // pending (user double-clicks the mic before the first startup resolves). + const startingRef = useRef(false); + // Signals to an in-flight startRecording that the user has asked to stop. + // When true, the startup path tears down any just-acquired stream instead + // of flipping isRecording to true — otherwise the OS mic indicator would + // stay on after the user tried to stop/send. + const cancelStartRef = useRef(false); + const providerRef = useRef(provider); + providerRef.current = provider; + const onErrorRef = useRef(onError); + onErrorRef.current = onError; + const onTranscriptionRef = useRef(onTranscription); + onTranscriptionRef.current = onTranscription; + + const isEnabled = Boolean(provider && providerConfigured); + + const cleanupAudioGraph = useCallback(() => { + processorRef.current?.disconnect(); + processorRef.current = null; + sourceRef.current?.disconnect(); + sourceRef.current = null; + void audioContextRef.current?.close(); + audioContextRef.current = null; + streamRef.current?.getTracks().forEach((track) => { + track.stop(); + }); + streamRef.current = null; + }, []); + + const transcribeChunk = useCallback(async (samples: Float32Array) => { + const activeProvider = providerRef.current; + if (!activeProvider) { + return; + } + + const gen = generationRef.current; + const mySeq = chunkSeqRef.current; + chunkSeqRef.current += 1; + pendingTranscriptionsRef.current += 1; + setIsTranscribing(true); + + try { + const wavBlob = new Blob([encodeWav(samples, SAMPLE_RATE)], { + type: "audio/wav", + }); + const audio = await blobToBase64(wavBlob); + const response = await transcribeDictation({ + audio, + mimeType: "audio/wav", + provider: activeProvider, + }); + + if (gen !== generationRef.current) { + return; + } + + // Buffer by sequence number, then drain any contiguous prefix so + // emissions to onTranscription stay in capture order even when API + // responses resolve out of order. 
+      pendingResultsRef.current.set(mySeq, response.text);
+      while (pendingResultsRef.current.has(nextExpectedSeqRef.current)) {
+        const text = pendingResultsRef.current.get(nextExpectedSeqRef.current);
+        pendingResultsRef.current.delete(nextExpectedSeqRef.current);
+        nextExpectedSeqRef.current += 1;
+        if (text?.trim()) {
+          onTranscriptionRef.current(text);
+        }
+      }
+    } catch (error) {
+      onErrorRef.current(toErrorMessage(error));
+      // Unblock the queue so a failure doesn't stall every subsequent chunk.
+      if (gen === generationRef.current) {
+        pendingResultsRef.current.set(mySeq, "");
+        while (pendingResultsRef.current.has(nextExpectedSeqRef.current)) {
+          const text = pendingResultsRef.current.get(
+            nextExpectedSeqRef.current,
+          );
+          pendingResultsRef.current.delete(nextExpectedSeqRef.current);
+          nextExpectedSeqRef.current += 1;
+          if (text?.trim()) {
+            onTranscriptionRef.current(text);
+          }
+        }
+      }
+    } finally {
+      pendingTranscriptionsRef.current -= 1;
+      if (pendingTranscriptionsRef.current === 0) {
+        setIsTranscribing(false);
+      }
+    }
+  }, []);
+
+  const flushPendingSamples = useCallback(() => {
+    const chunks = samplesRef.current;
+    if (chunks.length === 0) {
+      return;
+    }
+
+    const totalSamples = chunks.reduce(
+      (count, chunk) => count + chunk.length,
+      0,
+    );
+    const merged = new Float32Array(totalSamples);
+    let offset = 0;
+    for (const chunk of chunks) {
+      merged.set(chunk, offset);
+      offset += chunk.length;
+    }
+
+    samplesRef.current = [];
+    void transcribeChunk(merged);
+  }, [transcribeChunk]);
+
+  const stopRecording = useCallback(
+    (options?: { flushPending?: boolean }) => {
+      const flushPending = options?.flushPending ?? true;
+
+      // Signal any in-flight startRecording to abort. If getUserMedia is
+      // still pending or the audio graph hasn't been wired up yet, the
+      // startup path will see this flag and clean up the just-acquired
+      // stream instead of flipping isRecording to true.
+      cancelStartRef.current = true;
+
+      if (flushPending && samplesRef.current.length > 0) {
+        flushPendingSamples();
+      } else if (!flushPending) {
+        samplesRef.current = [];
+        generationRef.current += 1;
+        // Reset chunk-ordering state so the new generation starts at seq 0.
+        // In-flight chunks from the old generation bail at the gen check in
+        // transcribeChunk without touching the pending map.
+        chunkSeqRef.current = 0;
+        nextExpectedSeqRef.current = 0;
+        pendingResultsRef.current.clear();
+      }
+
+      vadStateRef.current = createInitialVadState();
+      cleanupAudioGraph();
+      setIsRecording(false);
+    },
+    [cleanupAudioGraph, flushPendingSamples],
+  );
+
+  const handleFrame = useCallback(
+    (samples: Float32Array) => {
+      const { decision, nextState } = advanceVadState(
+        vadStateRef.current,
+        getFrameRms(samples),
+      );
+      vadStateRef.current = nextState;
+
+      if (decision === "ignore") {
+        return;
+      }
+
+      if (decision === "discard") {
+        samplesRef.current = [];
+        return;
+      }
+
+      samplesRef.current.push(new Float32Array(samples));
+
+      if (decision === "append_and_flush") {
+        flushPendingSamples();
+      }
+    },
+    [flushPendingSamples],
+  );
+
+  const startRecording = useCallback(async () => {
+    if (!isEnabled || !provider) {
+      onError("Voice input is not configured");
+      return;
+    }
+
+    // Bail if a startup is already in-flight or we're already recording.
+    // Without this guard, a rapid second click (before getUserMedia resolves)
+    // would kick off a parallel recorder setup and leak a MediaStream — the
+    // OS mic indicator would stay on after the user thought they'd stopped.
+    if (startingRef.current || isRecording) {
+      return;
+    }
+
+    startingRef.current = true;
+    cancelStartRef.current = false;
+
+    try {
+      const audioConstraints: MediaTrackConstraints = {
+        autoGainControl: true,
+        echoCancellation: true,
+        noiseSuppression: true,
+      };
+
+      if (preferredMicrophoneId) {
+        audioConstraints.deviceId = { exact: preferredMicrophoneId };
+      }
+
+      let stream: MediaStream;
+      try {
+        stream = await navigator.mediaDevices.getUserMedia({
+          audio: audioConstraints,
+        });
+      } catch (error) {
+        if (
+          preferredMicrophoneId &&
+          error instanceof DOMException &&
+          (error.name === "NotFoundError" ||
+            error.name === "OverconstrainedError")
+        ) {
+          delete audioConstraints.deviceId;
+          stream = await navigator.mediaDevices.getUserMedia({
+            audio: audioConstraints,
+          });
+        } else {
+          throw error;
+        }
+      }
+
+      // If stopRecording was called while getUserMedia was pending (e.g.,
+      // user clicked Send before the mic finished setting up), tear down
+      // the freshly-acquired stream immediately and bail. Otherwise the
+      // MediaStream tracks stay hot and the OS mic indicator lingers.
+      if (cancelStartRef.current) {
+        stream.getTracks().forEach((track) => {
+          track.stop();
+        });
+        return;
+      }
+
+      streamRef.current = stream;
+      samplesRef.current = [];
+      vadStateRef.current = createInitialVadState();
+
+      const context = new AudioContext({ sampleRate: SAMPLE_RATE });
+      audioContextRef.current = context;
+      await context.resume();
+
+      // Check again after the async context.resume() — stopRecording may
+      // have fired while we were awaiting.
+      if (cancelStartRef.current) {
+        cleanupAudioGraph();
+        return;
+      }
+
+      const source = context.createMediaStreamSource(stream);
+      const processor = context.createScriptProcessor(1024, 1, 1);
+      const silence = context.createGain();
+      silence.gain.value = 0;
+
+      processor.onaudioprocess = (event) => {
+        const channel = event.inputBuffer.getChannelData(0);
+        handleFrame(new Float32Array(channel));
+      };
+
+      source.connect(processor);
+      processor.connect(silence);
+      silence.connect(context.destination);
+
+      sourceRef.current = source;
+      processorRef.current = processor;
+      setIsRecording(true);
+    } catch (error) {
+      stopRecording({ flushPending: false });
+      onError(toErrorMessage(error));
+    } finally {
+      startingRef.current = false;
+    }
+  }, [
+    cleanupAudioGraph,
+    handleFrame,
+    isEnabled,
+    isRecording,
+    onError,
+    preferredMicrophoneId,
+    provider,
+    stopRecording,
+  ]);
+
+  const toggleRecording = useCallback(() => {
+    if (startingRef.current) {
+      stopRecording({ flushPending: false });
+      return;
+    }
+    if (isRecording) {
+      stopRecording();
+    } else {
+      void startRecording();
+    }
+  }, [isRecording, startRecording, stopRecording]);
+
+  useEffect(
+    () => () => {
+      stopRecording({ flushPending: false });
+    },
+    [stopRecording],
+  );
+
+  useEffect(() => {
+    if (!provider && isRecording) {
+      stopRecording({ flushPending: false });
+    }
+  }, [isRecording, provider, stopRecording]);
+
+  // Imperative check for consumers (e.g. handleSend) who need to know at
+  // click time whether a startup is pending. Uses a function rather than a
+  // state value because startingRef is a ref (no render on change) and we
+  // only need the answer when the consumer is deciding what to do *now*.
+  const isStarting = useCallback(() => startingRef.current, []);
+
+  return {
+    isEnabled,
+    isRecording,
+    isTranscribing,
+    isStarting,
+    startRecording,
+    stopRecording,
+    toggleRecording,
+  };
+}
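The ticket-and-drain scheme in `transcribeChunk` is the core of the hook's ordering guarantee and is easier to see stripped of React. A standalone sketch (names are illustrative; the hook additionally gates on a generation counter, omitted here):

const parked = new Map<number, string>();
let nextToEmit = 0;

// Called whenever a chunk's transcription settles, in any order. Results
// park in the map; only a contiguous prefix is released, and empty text
// still consumes its slot so a silent chunk can't stall the queue.
function onChunkDone(seq: number, text: string, emit: (t: string) => void) {
  parked.set(seq, text);
  while (parked.has(nextToEmit)) {
    const t = parked.get(nextToEmit)!;
    parked.delete(nextToEmit);
    nextToEmit += 1;
    if (t.trim()) emit(t);
  }
}

// onChunkDone(1, "world", console.log); // parked: seq 0 not done yet
// onChunkDone(0, "hello", console.log); // emits "hello", then "world"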
diff --git a/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts b/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts
new file mode 100644
index 000000000000..cfd2d67e81a4
--- /dev/null
+++ b/ui/goose2/src/features/chat/hooks/useVoiceDictation.ts
@@ -0,0 +1,181 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { getDictationConfig } from "@/shared/api/dictation";
+import type { DictationProviderStatus } from "@/shared/types/dictation";
+import type { ChatAttachmentDraft } from "@/shared/types/messages";
+import { useDictationRecorder } from "./useDictationRecorder";
+import { useVoiceInputPreferences } from "./useVoiceInputPreferences";
+import {
+  appendTranscribedText,
+  getAutoSubmitMatch,
+  getDefaultDictationProvider,
+  VOICE_DICTATION_CONFIG_EVENT,
+} from "../lib/voiceInput";
+
+interface UseVoiceDictationOptions {
+  text: string;
+  setText: (value: string) => void;
+  attachments: ChatAttachmentDraft[];
+  clearAttachments: () => void;
+  selectedPersonaId: string | null;
+  onSend: (
+    text: string,
+    personaId?: string,
+    attachments?: ChatAttachmentDraft[],
+  ) => void;
+  resetTextarea: () => void;
+  /**
+   * When true, auto-submit on trigger phrase will NOT call `onSend`.
+   * Instead, the trigger phrase is stripped and the remaining transcription
+   * is left in the textarea for the user to review and send manually.
+   * Caller should set this to match `ChatInput`'s own send guards
+   * (queued-message lockout, outer `disabled` state, etc.) so voice
+   * auto-submit can't bypass the UI's protection against extra sends
+   * during an active run.
+   */
+  isSendLocked?: boolean;
+}
+
+export function useVoiceDictation({
+  text,
+  setText,
+  attachments,
+  clearAttachments,
+  selectedPersonaId,
+  onSend,
+  resetTextarea,
+  isSendLocked = false,
+}: UseVoiceDictationOptions) {
+  const voicePrefs = useVoiceInputPreferences();
+  const [providerStatuses, setProviderStatuses] = useState<
+    Partial<Record<string, DictationProviderStatus>>
+  >({});
+
+  const fetchDictationConfig = useCallback(() => {
+    getDictationConfig()
+      .then(setProviderStatuses)
+      .catch(() => {});
+  }, []);
+
+  useEffect(() => {
+    fetchDictationConfig();
+    window.addEventListener(VOICE_DICTATION_CONFIG_EVENT, fetchDictationConfig);
+    return () =>
+      window.removeEventListener(
+        VOICE_DICTATION_CONFIG_EVENT,
+        fetchDictationConfig,
+      );
+  }, [fetchDictationConfig]);
+
+  // Treat the stored preference as valid only when it actually appears in
+  // `providerStatuses`. If the stored value points at a provider that's been
+  // feature-flagged off or removed, fall through to the default so voice
+  // input isn't silently disabled. The explicit "off" state
+  // (`hasStoredProviderPreference && selectedProvider == null`) is preserved.
+  const storedProviderIsPresent =
+    voicePrefs.selectedProvider != null &&
+    providerStatuses[voicePrefs.selectedProvider] !== undefined;
+
+  const activeVoiceProvider = !voicePrefs.isHydrated
+    ? null
+    : storedProviderIsPresent
+      ? voicePrefs.selectedProvider
+      : voicePrefs.hasStoredProviderPreference &&
+          voicePrefs.selectedProvider == null
null + : getDefaultDictationProvider(providerStatuses); + + // If a stored preference points at a provider that's no longer in + // providerStatuses (feature-flagged off, removed), clear it so next boot + // falls through to the default cleanly instead of re-detecting the stale + // value every session. + useEffect(() => { + if ( + voicePrefs.selectedProvider != null && + Object.keys(providerStatuses).length > 0 && + providerStatuses[voicePrefs.selectedProvider] === undefined + ) { + voicePrefs.clearSelectedProvider(); + } + }, [providerStatuses, voicePrefs]); + + const providerConfigured = + activeVoiceProvider != null && + providerStatuses[activeVoiceProvider]?.configured === true; + + const stopRecordingRef = useRef< + (options?: { flushPending?: boolean }) => void + >(() => {}); + + // Mirror `text` in a ref so `handleTranscription` always sees the latest + // value, even when `useDictationRecorder` fires multiple callbacks in the + // same tick before React has applied the first setText. Without this, two + // concurrent callbacks would both read a stale `text` from closure and the + // second would overwrite the first fragment, dropping dictated words. + // + // Assign during render (not in a post-render `useEffect`) so there is no + // commit-window race: if the user types a character in the textarea and a + // transcription callback resolves before the effect runs, the callback + // would otherwise read the previous `text` and clobber the user's edit. + // Writing to `ref.current` during render is explicitly supported by React + // (see `providerRef.current = provider;` in `useDictationRecorder.ts`). + const textRef = useRef(text); + textRef.current = text; + + const handleTranscription = useCallback( + (fragment: string) => { + const latest = textRef.current; + const match = getAutoSubmitMatch(fragment, voicePrefs.autoSubmitPhrases); + if (match) { + const merged = appendTranscribedText(latest, match.textWithoutPhrase); + if (!merged.trim()) { + return; + } + stopRecordingRef.current({ flushPending: false }); + if (isSendLocked) { + // Parent UI is blocking sends (queued message, disabled, etc.). + // Strip the trigger phrase and leave the transcription in the + // textarea so the user can send it manually when the lock clears. + setText(merged); + textRef.current = merged; + return; + } + onSend( + merged.trim(), + selectedPersonaId ?? undefined, + attachments.length > 0 ? 
attachments : undefined, + ); + setText(""); + textRef.current = ""; + clearAttachments(); + resetTextarea(); + } else { + const merged = appendTranscribedText(latest, fragment); + setText(merged); + textRef.current = merged; + } + }, + [ + attachments, + clearAttachments, + isSendLocked, + onSend, + resetTextarea, + selectedPersonaId, + setText, + voicePrefs.autoSubmitPhrases, + ], + ); + + const handleVoiceError = useCallback((_message: string) => {}, []); + + const dictation = useDictationRecorder({ + provider: activeVoiceProvider, + providerConfigured, + preferredMicrophoneId: voicePrefs.preferredMicrophoneId, + onError: handleVoiceError, + onTranscription: handleTranscription, + }); + stopRecordingRef.current = dictation.stopRecording; + + return dictation; +} diff --git a/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts b/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts new file mode 100644 index 000000000000..e2fc66f2472c --- /dev/null +++ b/ui/goose2/src/features/chat/hooks/useVoiceInputPreferences.ts @@ -0,0 +1,211 @@ +import { useCallback, useEffect, useMemo, useState } from "react"; +import { getClient } from "@/shared/api/acpConnection"; +import { + DEFAULT_AUTO_SUBMIT_PHRASES_RAW, + DISABLED_DICTATION_PROVIDER_CONFIG_VALUE, + VOICE_AUTO_SUBMIT_PHRASES_CONFIG_KEY, + VOICE_DICTATION_PREFERRED_MIC_CONFIG_KEY, + VOICE_DICTATION_PROVIDER_CONFIG_KEY, + normalizeDictationProvider, + parseAutoSubmitPhrases, +} from "../lib/voiceInput"; +import type { DictationProvider } from "@/shared/types/dictation"; + +const VOICE_INPUT_PREFERENCES_EVENT = "goose:voice-input-preferences"; + +type ConfigReadResult = { ok: true; value: string | null } | { ok: false }; + +async function readConfigString(key: string): Promise { + try { + const client = await getClient(); + const response = await client.goose.GooseConfigRead({ key }); + return { + ok: true, + value: typeof response.value === "string" ? response.value : null, + }; + } catch { + return { ok: false }; + } +} + +async function writeConfigString(key: string, value: string): Promise { + try { + const client = await getClient(); + await client.goose.GooseConfigUpsert({ key, value }); + } catch { + // goose config may be unavailable + } +} + +async function removeConfigKey(key: string): Promise { + try { + const client = await getClient(); + await client.goose.GooseConfigRemove({ key }); + } catch { + // goose config may be unavailable + } +} + +export function useVoiceInputPreferences() { + const [rawAutoSubmitPhrases, setRawAutoSubmitPhrasesState] = useState( + DEFAULT_AUTO_SUBMIT_PHRASES_RAW, + ); + const [selectedProvider, setSelectedProviderState] = + useState(null); + const [hasStoredProviderPreference, setHasStoredProviderPreferenceState] = + useState(false); + const [preferredMicrophoneId, setPreferredMicrophoneIdState] = useState< + string | null + >(null); + // Flips true after the first syncFromConfig completes so consumers can + // distinguish "no stored preference" from "the ACP round-trip hasn't + // finished yet." Without this, a consumer that auto-writes a default when + // hasStoredProviderPreference is false can race ahead and overwrite the + // user's saved choice before it loads. 
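// Illustrative sketch of the race the flag below guards against (the
// consumer shape is assumed): without a hydration gate, the first render
// sees hasStoredProviderPreference === false (the pre-hydration default)
// and writes a provider, clobbering the user's saved choice before the
// async config read lands.
function maybeApplyDefaultProvider(prefs: {
  isHydrated: boolean;
  hasStoredProviderPreference: boolean;
  setSelectedProvider: (p: "openai" | "groq" | "elevenlabs" | "local") => void;
}) {
  if (!prefs.isHydrated) {
    return; // the config round-trip is still in flight; deciding now races it
  }
  if (!prefs.hasStoredProviderPreference) {
    prefs.setSelectedProvider("openai"); // hypothetical default
  }
}
// The `isHydrated` flag declared next is that gate.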
+ const [isHydrated, setIsHydrated] = useState(false); + + const syncFromConfig = useCallback(async () => { + const [phrasesResult, providerResult, micResult] = await Promise.all([ + readConfigString(VOICE_AUTO_SUBMIT_PHRASES_CONFIG_KEY), + readConfigString(VOICE_DICTATION_PROVIDER_CONFIG_KEY), + readConfigString(VOICE_DICTATION_PREFERRED_MIC_CONFIG_KEY), + ]); + + if (phrasesResult.ok) { + setRawAutoSubmitPhrasesState( + phrasesResult.value ?? DEFAULT_AUTO_SUBMIT_PHRASES_RAW, + ); + } + + if (!providerResult.ok) { + if (micResult.ok) { + setPreferredMicrophoneIdState(micResult.value); + } + return; + } + + if (providerResult.value === DISABLED_DICTATION_PROVIDER_CONFIG_VALUE) { + setSelectedProviderState(null); + setHasStoredProviderPreferenceState(true); + } else if (providerResult.value != null) { + const normalized = normalizeDictationProvider(providerResult.value); + if (normalized !== null) { + setSelectedProviderState(normalized); + setHasStoredProviderPreferenceState(true); + } else { + // Stored value isn't a recognized provider (stale from an older + // build, typo, etc.). Treat as no preference — don't pin the user + // to voice-off — and clear the config key so future boots fall + // through to the default cleanly. + setSelectedProviderState(null); + setHasStoredProviderPreferenceState(false); + void removeConfigKey(VOICE_DICTATION_PROVIDER_CONFIG_KEY); + } + } else { + setSelectedProviderState(null); + setHasStoredProviderPreferenceState(false); + } + + if (micResult.ok) { + setPreferredMicrophoneIdState(micResult.value); + } + setIsHydrated(true); + }, []); + + useEffect(() => { + void syncFromConfig(); + const handler = () => { + void syncFromConfig(); + }; + window.addEventListener( + VOICE_INPUT_PREFERENCES_EVENT, + handler as EventListener, + ); + return () => { + window.removeEventListener( + VOICE_INPUT_PREFERENCES_EVENT, + handler as EventListener, + ); + }; + }, [syncFromConfig]); + + const dispatchPreferencesEvent = useCallback(() => { + window.dispatchEvent(new Event(VOICE_INPUT_PREFERENCES_EVENT)); + }, []); + + const persistAndBroadcast = useCallback( + (operation: Promise) => { + void operation.finally(() => { + dispatchPreferencesEvent(); + }); + }, + [dispatchPreferencesEvent], + ); + + const setRawAutoSubmitPhrases = useCallback( + (value: string) => { + setRawAutoSubmitPhrasesState(value); + persistAndBroadcast( + writeConfigString(VOICE_AUTO_SUBMIT_PHRASES_CONFIG_KEY, value), + ); + }, + [persistAndBroadcast], + ); + + const setSelectedProvider = useCallback( + (value: DictationProvider | null) => { + setSelectedProviderState(value); + setHasStoredProviderPreferenceState(true); + persistAndBroadcast( + writeConfigString( + VOICE_DICTATION_PROVIDER_CONFIG_KEY, + value ?? DISABLED_DICTATION_PROVIDER_CONFIG_VALUE, + ), + ); + }, + [persistAndBroadcast], + ); + + // Remove the stored preference entirely, so the user falls through to the + // default provider on next boot. Distinct from setSelectedProvider(null), + // which pins the user to "voice off" via a sentinel value. 
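// Side-by-side view of the two "no provider" states (values are the
// constants from voiceInput.ts in this patch):
//
//   setSelectedProvider(null)   writes "__disabled__": explicit voice-off,
//                               survives restarts and provider changes
//   clearSelectedProvider()     removes the config key: the next boot falls
//                               through to getDefaultDictationProvider
//
// clearSelectedProvider, defined next, implements the removal path.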
+ const clearSelectedProvider = useCallback(() => { + setSelectedProviderState(null); + setHasStoredProviderPreferenceState(false); + persistAndBroadcast(removeConfigKey(VOICE_DICTATION_PROVIDER_CONFIG_KEY)); + }, [persistAndBroadcast]); + + const setPreferredMicrophoneId = useCallback( + (value: string | null) => { + setPreferredMicrophoneIdState(value); + if (value) { + persistAndBroadcast( + writeConfigString(VOICE_DICTATION_PREFERRED_MIC_CONFIG_KEY, value), + ); + } else { + persistAndBroadcast( + removeConfigKey(VOICE_DICTATION_PREFERRED_MIC_CONFIG_KEY), + ); + } + }, + [persistAndBroadcast], + ); + + const autoSubmitPhrases = useMemo( + () => parseAutoSubmitPhrases(rawAutoSubmitPhrases), + [rawAutoSubmitPhrases], + ); + + return { + autoSubmitPhrases, + clearSelectedProvider, + hasStoredProviderPreference, + isHydrated, + preferredMicrophoneId, + rawAutoSubmitPhrases, + selectedProvider, + setPreferredMicrophoneId, + setRawAutoSubmitPhrases, + setSelectedProvider, + }; +} diff --git a/ui/goose2/src/features/chat/lib/dictationVad.test.ts b/ui/goose2/src/features/chat/lib/dictationVad.test.ts new file mode 100644 index 000000000000..89e96045c507 --- /dev/null +++ b/ui/goose2/src/features/chat/lib/dictationVad.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, it } from "vitest"; +import { advanceVadState, createInitialVadState } from "./dictationVad"; + +function runFrames(levels: number[]) { + const decisions: string[] = []; + let state = createInitialVadState(); + + for (const level of levels) { + const result = advanceVadState(state, level); + decisions.push(result.decision); + state = result.nextState; + } + + return decisions; +} + +describe("dictationVad", () => { + it("ignores silence-only audio", () => { + expect(runFrames([0, 0, 0, 0])).toEqual([ + "ignore", + "ignore", + "ignore", + "ignore", + ]); + }); + + it("discards short noise bursts that never confirm speech", () => { + expect(runFrames([0.03, 0, 0, 0])).toEqual([ + "append", + "append", + "append", + "discard", + ]); + }); + + it("flushes a chunk after speech followed by trailing silence", () => { + expect(runFrames([0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0])).toContain( + "append_and_flush", + ); + }); + + it("returns to ignoring silence after a flush, ready for another chunk", () => { + const decisions = runFrames([ + 0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0, 0.03, 0.03, 0.03, 0, 0, 0, 0, 0, 0, + ]); + + expect( + decisions.filter((decision) => decision === "append_and_flush"), + ).toHaveLength(2); + }); +}); diff --git a/ui/goose2/src/features/chat/lib/dictationVad.ts b/ui/goose2/src/features/chat/lib/dictationVad.ts new file mode 100644 index 000000000000..0b4561e8cbae --- /dev/null +++ b/ui/goose2/src/features/chat/lib/dictationVad.ts @@ -0,0 +1,147 @@ +export type VadPhase = "idle" | "primed" | "speaking" | "trailing"; + +export type VadDecision = "ignore" | "append" | "append_and_flush" | "discard"; + +export interface VadState { + phase: VadPhase; + speechFrames: number; + silenceFrames: number; + framesInChunk: number; +} + +const SPEECH_RMS_THRESHOLD = 0.018; +const SPEECH_CONFIRMATION_FRAMES = 2; +const MAX_PRIMED_SILENCE_FRAMES = 2; +const TRAILING_SILENCE_FRAMES = 6; +const MIN_SPEECH_FRAMES = 3; + +export function createInitialVadState(): VadState { + return { + phase: "idle", + speechFrames: 0, + silenceFrames: 0, + framesInChunk: 0, + }; +} + +export function getFrameRms(samples: Float32Array): number { + let sum = 0; + for (let index = 0; index < samples.length; index += 1) { + const value = samples[index] ?? 
0; + sum += value * value; + } + + return Math.sqrt(sum / Math.max(samples.length, 1)); +} + +export function advanceVadState( + state: VadState, + frameRms: number, +): { decision: VadDecision; nextState: VadState } { + const isSpeech = frameRms >= SPEECH_RMS_THRESHOLD; + + if (state.phase === "idle") { + if (!isSpeech) { + return { decision: "ignore" as const, nextState: state }; + } + + return { + decision: "append" as const, + nextState: { + phase: "primed" as const, + speechFrames: 1, + silenceFrames: 0, + framesInChunk: 1, + }, + }; + } + + if (state.phase === "primed") { + if (isSpeech) { + const speechFrames = state.speechFrames + 1; + return { + decision: "append" as const, + nextState: { + phase: + speechFrames >= SPEECH_CONFIRMATION_FRAMES ? "speaking" : "primed", + speechFrames, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + const silenceFrames = state.silenceFrames + 1; + if (silenceFrames > MAX_PRIMED_SILENCE_FRAMES) { + return { + decision: "discard" as const, + nextState: createInitialVadState(), + }; + } + + return { + decision: "append" as const, + nextState: { + ...state, + silenceFrames, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + if (state.phase === "speaking") { + if (isSpeech) { + return { + decision: "append" as const, + nextState: { + phase: "speaking" as const, + speechFrames: state.speechFrames + 1, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + return { + decision: "append" as const, + nextState: { + phase: "trailing" as const, + speechFrames: state.speechFrames, + silenceFrames: 1, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + if (isSpeech) { + return { + decision: "append" as const, + nextState: { + phase: "speaking" as const, + speechFrames: state.speechFrames + 1, + silenceFrames: 0, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + const silenceFrames = state.silenceFrames + 1; + if (silenceFrames < TRAILING_SILENCE_FRAMES) { + return { + decision: "append" as const, + nextState: { + ...state, + silenceFrames, + framesInChunk: state.framesInChunk + 1, + }, + }; + } + + return { + decision: + state.speechFrames >= MIN_SPEECH_FRAMES + ? 
("append_and_flush" as const) + : ("discard" as const), + nextState: createInitialVadState(), + }; +} diff --git a/ui/goose2/src/features/chat/lib/voiceInput.test.ts b/ui/goose2/src/features/chat/lib/voiceInput.test.ts new file mode 100644 index 000000000000..452e5bf2d189 --- /dev/null +++ b/ui/goose2/src/features/chat/lib/voiceInput.test.ts @@ -0,0 +1,97 @@ +import { describe, expect, it } from "vitest"; +import { + appendTranscribedText, + getDefaultDictationProvider, + getAutoSubmitMatch, + parseAutoSubmitPhrases, + replaceTrailingTranscribedText, +} from "./voiceInput"; + +describe("voiceInput helpers", () => { + it("parses comma-separated auto-submit phrases", () => { + expect(parseAutoSubmitPhrases(" submit, Ship It ,submit ,, ")).toEqual([ + "submit", + "ship it", + ]); + }); + + it("appends dictated text without smashing words together", () => { + expect(appendTranscribedText("hello", "world")).toBe("hello world"); + expect(appendTranscribedText("hello ", "world")).toBe("hello world"); + expect(appendTranscribedText("hello", ", world")).toBe("hello, world"); + }); + + it("replaces only the trailing dictated segment", () => { + expect( + replaceTrailingTranscribedText( + "draft dictated text", + "dictated text", + "dictated text submit", + ), + ).toBe("draft dictated text submit"); + }); + + it("matches auto-submit phrases only at the end of dictated text", () => { + expect(getAutoSubmitMatch("please submit now", ["submit"])).toBeNull(); + expect(getAutoSubmitMatch("please SUBMIT.", ["submit"])).toEqual({ + matchedPhrase: "submit", + textWithoutPhrase: "please", + }); + }); + + it("strips the full raw phrase span when internal whitespace is repeated", () => { + // The phrase "ship it" is matched against the *normalized* text, where + // "ship it" collapses to "ship it" (7 chars). But in the raw text the + // phrase occupies 9 chars — slicing by -phrase.length would leave a + // dangling "sh" on the end. The fix walks the raw text with a regex so + // the slice index reflects the actual phrase span in the raw string. 
+ expect(getAutoSubmitMatch("hello ship it", ["ship it"])).toEqual({ + matchedPhrase: "ship it", + textWithoutPhrase: "hello", + }); + }); + + it("picks the first configured dictation provider by priority", () => { + expect( + getDefaultDictationProvider({ + openai: { + configured: false, + description: "OpenAI", + usesProviderConfig: true, + availableModels: [], + }, + groq: { + configured: true, + description: "Groq", + usesProviderConfig: false, + availableModels: [], + }, + local: { + configured: true, + description: "Local", + usesProviderConfig: false, + availableModels: [], + }, + }), + ).toBe("groq"); + }); + + it("falls back to the first available provider when none are configured", () => { + expect( + getDefaultDictationProvider({ + elevenlabs: { + configured: false, + description: "ElevenLabs", + usesProviderConfig: false, + availableModels: [], + }, + local: { + configured: false, + description: "Local", + usesProviderConfig: false, + availableModels: [], + }, + }), + ).toBe("local"); + }); +}); diff --git a/ui/goose2/src/features/chat/lib/voiceInput.ts b/ui/goose2/src/features/chat/lib/voiceInput.ts new file mode 100644 index 000000000000..b349f89165ca --- /dev/null +++ b/ui/goose2/src/features/chat/lib/voiceInput.ts @@ -0,0 +1,199 @@ +import type { + DictationProvider, + DictationProviderStatus, +} from "@/shared/types/dictation"; + +// goose config keys — stored in the user's goose config.yaml via the +// _goose/config/{read,upsert,remove} ACP methods, not localStorage. +export const VOICE_AUTO_SUBMIT_PHRASES_CONFIG_KEY = "VOICE_AUTO_SUBMIT_PHRASES"; +export const VOICE_DICTATION_PROVIDER_CONFIG_KEY = "VOICE_DICTATION_PROVIDER"; +export const VOICE_DICTATION_PREFERRED_MIC_CONFIG_KEY = + "VOICE_DICTATION_PREFERRED_MIC"; +export const VOICE_DICTATION_CONFIG_EVENT = "goose:voice-dictation-config"; +export const DISABLED_DICTATION_PROVIDER_CONFIG_VALUE = "__disabled__"; + +export const DEFAULT_AUTO_SUBMIT_PHRASES_RAW = "submit"; + +const TRAILING_PUNCTUATION_REGEX = /[\s"'`.,!?;:)\]}]+$/u; + +function escapeRegExp(value: string): string { + return value.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); +} + +function normalizePhrase(value: string): string { + return value + .toLowerCase() + .replace(/\s+/g, " ") + .trim() + .replace(TRAILING_PUNCTUATION_REGEX, "") + .trim(); +} + +export function parseAutoSubmitPhrases(rawValue: string | null | undefined) { + if (!rawValue) { + return []; + } + + return Array.from( + new Set( + rawValue + .split(",") + .map((value) => normalizePhrase(value)) + .filter(Boolean), + ), + ); +} + +export function normalizeDictationProvider( + value: string | null | undefined, +): DictationProvider | null { + if ( + value === "openai" || + value === "groq" || + value === "elevenlabs" || + value === "local" + ) { + return value; + } + + return null; +} + +export function getDefaultDictationProvider( + providerStatuses: Partial>, +): DictationProvider | null { + const configuredProviderPriority: DictationProvider[] = [ + "openai", + "groq", + "elevenlabs", + "local", + ]; + const fallbackProviderPriority: DictationProvider[] = [ + "local", + "openai", + "groq", + "elevenlabs", + ]; + + for (const provider of configuredProviderPriority) { + if (providerStatuses[provider]?.configured) { + return provider; + } + } + + for (const provider of fallbackProviderPriority) { + if (providerStatuses[provider]) { + return provider; + } + } + + return null; +} + +export function appendTranscribedText(baseText: string, fragment: string) { + const normalizedFragment = 
fragment.replace(/\s+/g, " ").trim(); + if (!normalizedFragment) { + return baseText; + } + + if (!baseText.trim()) { + return normalizedFragment; + } + + if (/[\s([{/-]$/.test(baseText) || /^[,.;!?)]/.test(normalizedFragment)) { + return `${baseText}${normalizedFragment}`; + } + + return `${baseText} ${normalizedFragment}`; +} + +export function replaceTrailingTranscribedText( + fullText: string, + previousTranscribedText: string, + nextTranscribedText: string, +) { + if (!previousTranscribedText) { + return appendTranscribedText(fullText, nextTranscribedText); + } + + if (fullText.endsWith(previousTranscribedText)) { + return appendTranscribedText( + fullText.slice(0, -previousTranscribedText.length), + nextTranscribedText, + ); + } + + const trimmedPreviousText = previousTranscribedText.trim(); + if (trimmedPreviousText && fullText.endsWith(trimmedPreviousText)) { + return appendTranscribedText( + fullText.slice(0, -trimmedPreviousText.length), + nextTranscribedText, + ); + } + + return appendTranscribedText(fullText, nextTranscribedText); +} + +export function getAutoSubmitMatch( + transcribedText: string, + autoSubmitPhrases: string[], +) { + const normalizedTranscribedText = normalizePhrase(transcribedText); + if (!normalizedTranscribedText) { + return null; + } + + const sortedPhrases = [...autoSubmitPhrases].sort( + (left, right) => right.length - left.length, + ); + + for (const phrase of sortedPhrases) { + if (!normalizedTranscribedText.endsWith(phrase)) { + continue; + } + + const phraseStartIndex = normalizedTranscribedText.length - phrase.length; + if ( + phraseStartIndex > 0 && + normalizedTranscribedText[phraseStartIndex - 1] !== " " + ) { + continue; + } + + // Map the phrase back to the *raw* transcribed text. `phrase.length` is + // the length in normalized form (whitespace collapsed to single spaces, + // lowercased, trailing punctuation stripped). Applying -phrase.length + // directly to trimmedText undercounts whenever the raw text has repeated + // whitespace or mixed case, chopping off legitimate content. Instead, + // match the phrase at the end of the raw text using a regex that allows + // flexible whitespace between words, so the slice index reflects the + // actual start of the phrase in the raw string. + const trimmedText = transcribedText.replace(TRAILING_PUNCTUATION_REGEX, ""); + const phraseWords = phrase.split(" ").filter(Boolean).map(escapeRegExp); + const phrasePattern = new RegExp( + `(^|\\s)(${phraseWords.join("\\s+")})\\s*$`, + "iu", + ); + const rawMatch = trimmedText.match(phrasePattern); + const phraseStartOffset = + rawMatch && rawMatch.index !== undefined + ? rawMatch.index + (rawMatch[1]?.length ?? 
0) + : trimmedText.length - phrase.length; + const textWithoutPhrase = trimmedText.slice(0, phraseStartOffset).trimEnd(); + + return { + matchedPhrase: phrase, + textWithoutPhrase, + }; + } + + return null; +} + +export function notifyVoiceDictationConfigChanged() { + try { + window.dispatchEvent(new Event(VOICE_DICTATION_CONFIG_EVENT)); + } catch { + // no-op + } +} diff --git a/ui/goose2/src/features/chat/ui/ChatInput.tsx b/ui/goose2/src/features/chat/ui/ChatInput.tsx index 9b40f2b768f3..4a21f9a60c1d 100644 --- a/ui/goose2/src/features/chat/ui/ChatInput.tsx +++ b/ui/goose2/src/features/chat/ui/ChatInput.tsx @@ -22,6 +22,7 @@ import { } from "../hooks/useChatInputAttachments"; import type { ModelOption } from "../types"; import { ChatInputAttachments } from "./ChatInputAttachments"; +import { useVoiceDictation } from "../hooks/useVoiceDictation"; export interface ProjectOption { id: string; @@ -121,6 +122,25 @@ export function ChatInput({ clearAttachments, } = useChatInputAttachments(); + const resetTextarea = useCallback(() => { + if (textareaRef.current) { + textareaRef.current.style.height = "auto"; + } + }, []); + + const hasQueuedMessage = queuedMessage !== null; + + const dictation = useVoiceDictation({ + text, + setText, + attachments, + clearAttachments, + selectedPersonaId, + onSend, + resetTextarea, + isSendLocked: hasQueuedMessage || disabled, + }); + const activePersona = useMemo( () => personas.find((persona) => persona.id === selectedPersonaId) ?? null, [personas, selectedPersonaId], @@ -133,7 +153,6 @@ export function ChatInput({ ); const stickyPersona = activePersona; - const hasQueuedMessage = queuedMessage !== null; const canSend = (text.trim().length > 0 || attachments.length > 0) && !hasQueuedMessage && @@ -182,6 +201,24 @@ export function ChatInput({ return; } + // If recording, stop without waiting for final flush and send what's + // already transcribed into the textarea. This makes Send a single click + // even while the mic is hot; any in-flight audio after the user clicked + // Send is intentionally dropped. + // + // Also handles the edge case where the user clicks Send while a + // getUserMedia startup is still pending (isRecording is still false but + // a stream is about to be acquired) — stopRecording sets the internal + // cancel flag so the pending startup tears itself down instead of + // leaving the OS mic indicator on. + if ( + dictation.isRecording || + dictation.isTranscribing || + dictation.isStarting() + ) { + dictation.stopRecording({ flushPending: false }); + } + onSend( text.trim(), selectedPersonaId ?? undefined, @@ -196,6 +233,7 @@ export function ChatInput({ attachments, canSend, clearAttachments, + dictation, onSend, selectedPersonaId, setText, @@ -408,7 +446,13 @@ export function ChatInput({ onChange={handleInput} onKeyDown={handleKeyDown} onPaste={handlePaste} - placeholder={effectivePlaceholder} + placeholder={ + dictation.isRecording + ? t("toolbar.voiceInputRecording") + : dictation.isTranscribing + ? 
t("toolbar.voiceInputTranscribing") + : effectivePlaceholder + } disabled={disabled} rows={1} className="mb-3 min-h-[36px] max-h-[200px] w-full resize-none bg-transparent px-1 text-[14px] leading-relaxed text-foreground placeholder:font-light placeholder:text-muted-foreground/60 focus:outline-none focus-visible:ring-0 focus-visible:ring-offset-0 disabled:opacity-60" @@ -447,6 +491,10 @@ export function ChatInput({ onSend={handleSend} onStop={onStop} isCompact={isCompact} + voiceEnabled={dictation.isEnabled} + voiceRecording={dictation.isRecording} + voiceTranscribing={dictation.isTranscribing} + onVoiceToggle={dictation.toggleRecording} /> diff --git a/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx b/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx index 3e25b8f084ce..411c002fa1fc 100644 --- a/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx +++ b/ui/goose2/src/features/chat/ui/ChatInputToolbar.tsx @@ -90,6 +90,11 @@ interface ChatInputToolbarProps { onAttachFiles?: () => void; onAttachFolders?: () => void; disabled?: boolean; + // Voice + voiceEnabled?: boolean; + voiceRecording?: boolean; + voiceTranscribing?: boolean; + onVoiceToggle?: () => void; // Layout isCompact: boolean; } @@ -124,6 +129,10 @@ export function ChatInputToolbar({ onAttachFiles, onAttachFolders, disabled = false, + voiceEnabled = false, + voiceRecording = false, + voiceTranscribing = false, + onVoiceToggle, isCompact, }: ChatInputToolbarProps) { const { t } = useTranslation("chat"); @@ -384,14 +393,32 @@ export function ChatInputToolbar({ type="button" variant="ghost" size="icon-sm" - disabled - aria-label={t("toolbar.voiceInputSoon")} + disabled={!voiceRecording && (!voiceEnabled || disabled)} + onClick={onVoiceToggle} + aria-label={ + voiceRecording + ? t("toolbar.voiceInputRecording") + : t("toolbar.voiceInput") + } + className={cn( + voiceRecording && + "bg-destructive/10 text-destructive hover:bg-destructive/20 hover:text-destructive", + voiceTranscribing && "animate-pulse", + )} > - {t("toolbar.voiceInputSoon")} + + {!voiceEnabled + ? t("toolbar.voiceInputDisabled") + : voiceRecording + ? t("toolbar.voiceInputRecording") + : voiceTranscribing + ? 
t("toolbar.voiceInputTranscribing") + : t("toolbar.voiceInput")} + diff --git a/ui/goose2/src/features/chat/ui/__tests__/ChatInput.test.tsx b/ui/goose2/src/features/chat/ui/__tests__/ChatInput.test.tsx index 0561892dd011..6b60e12d7264 100644 --- a/ui/goose2/src/features/chat/ui/__tests__/ChatInput.test.tsx +++ b/ui/goose2/src/features/chat/ui/__tests__/ChatInput.test.tsx @@ -3,8 +3,22 @@ import { fireEvent, render, screen } from "@testing-library/react"; import userEvent from "@testing-library/user-event"; import { useState } from "react"; import { ChatInput } from "../ChatInput"; +import { ChatInputToolbar } from "../ChatInputToolbar"; import type { Persona } from "@/shared/types/agents"; +const mockVoiceDictation = { + isEnabled: true, + isRecording: false, + isTranscribing: false, + isStarting: vi.fn(() => false), + stopRecording: vi.fn(), + toggleRecording: vi.fn(), +}; + +vi.mock("../hooks/useVoiceDictation", () => ({ + useVoiceDictation: () => mockVoiceDictation, +})); + vi.mock("@/features/providers/hooks/useAgentProviderStatus", () => ({ useAgentProviderStatus: () => ({ readyAgentIds: new Set(["goose", "claude-acp", "codex-acp"]), @@ -63,6 +77,13 @@ describe("ChatInput", () => { beforeEach(() => { mockListFilesForMentions.mockClear(); mockListFilesForMentions.mockResolvedValue([]); + mockVoiceDictation.isEnabled = true; + mockVoiceDictation.isRecording = false; + mockVoiceDictation.isTranscribing = false; + mockVoiceDictation.isStarting.mockReset(); + mockVoiceDictation.isStarting.mockReturnValue(false); + mockVoiceDictation.stopRecording.mockReset(); + mockVoiceDictation.toggleRecording.mockReset(); }); it("renders with default placeholder", () => { @@ -418,6 +439,53 @@ describe("ChatInput", () => { expect(onSend).not.toHaveBeenCalled(); }); + it("does not stop dictation when send is blocked", async () => { + const onSend = vi.fn(); + const user = userEvent.setup(); + mockVoiceDictation.isRecording = true; + + render( + , + ); + + await user.type(screen.getByRole("textbox"), "another message"); + await user.keyboard("{Enter}"); + + expect(onSend).not.toHaveBeenCalled(); + expect(mockVoiceDictation.stopRecording).not.toHaveBeenCalled(); + }); + + it("keeps the mic toggle enabled while recording even if voice input becomes unavailable", () => { + render( + , + ); + + expect(screen.getByRole("button", { name: "Listening..." 
})).toBeEnabled(); + }); + it("keeps the selected assistant chip after sending subsequent messages", async () => { const onSend = vi.fn(); const user = userEvent.setup(); diff --git a/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx b/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx new file mode 100644 index 000000000000..751ef8a628b7 --- /dev/null +++ b/ui/goose2/src/features/settings/ui/LocalWhisperModels.tsx @@ -0,0 +1,325 @@ +import { useCallback, useEffect, useRef, useState } from "react"; +import { useTranslation } from "react-i18next"; +import { Button } from "@/shared/ui/button"; +import { + cancelDictationLocalModelDownload, + deleteDictationLocalModel, + downloadDictationLocalModel, + getDictationLocalModelDownloadProgress, + listDictationLocalModels, +} from "@/shared/api/dictation"; + +type LocalModel = { + id: string; + description: string; + sizeMb: number; + downloaded: boolean; + downloadInProgress: boolean; +}; + +type DownloadProgress = { + bytesDownloaded: number; + totalBytes: number; + progressPercent: number; + status: string; + error?: string | null; +}; + +const POLL_INTERVAL_MS = 750; + +interface LocalWhisperModelsProps { + selectedModelId: string; + onSelectModel: (modelId: string) => void | Promise; + onModelsChanged: () => void | Promise; +} + +export function LocalWhisperModels({ + selectedModelId, + onSelectModel, + onModelsChanged, +}: LocalWhisperModelsProps) { + const { t } = useTranslation(["settings", "common"]); + const [models, setModels] = useState([]); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [downloadingIds, setDownloadingIds] = useState>(new Set()); + const [progresses, setProgresses] = useState>( + new Map(), + ); + const onModelsChangedRef = useRef(onModelsChanged); + onModelsChangedRef.current = onModelsChanged; + + const refresh = useCallback(async () => { + try { + const list = + (await listDictationLocalModels()) as unknown as LocalModel[]; + setModels(list); + setDownloadingIds((prev) => { + const next = new Set(prev); + for (const m of list) { + if (m.downloadInProgress) next.add(m.id); + } + return next; + }); + } catch (err) { + setError( + err instanceof Error ? 
err.message : t("general.voiceInput.loadError"), + ); + } + }, [t]); + + useEffect(() => { + const load = async () => { + setLoading(true); + setError(null); + await refresh(); + setLoading(false); + }; + void load(); + }, [refresh]); + + useEffect(() => { + if (downloadingIds.size === 0) return; + let cancelled = false; + + const tick = async () => { + const next = new Map(); + const stillActive = new Set(); + const finishedIds: string[] = []; + + for (const id of downloadingIds) { + try { + const progress = (await getDictationLocalModelDownloadProgress( + id, + )) as unknown as DownloadProgress | null; + if (!progress) { + finishedIds.push(id); + continue; + } + next.set(id, progress); + if (progress.status === "downloading") { + stillActive.add(id); + } else { + finishedIds.push(id); + } + } catch { + stillActive.add(id); + } + } + if (cancelled) return; + setProgresses(next); + if (finishedIds.length > 0) { + await refresh(); + await onModelsChangedRef.current(); + } + setDownloadingIds(stillActive); + }; + + const interval = window.setInterval(() => { + void tick(); + }, POLL_INTERVAL_MS); + return () => { + cancelled = true; + window.clearInterval(interval); + }; + }, [downloadingIds, refresh]); + + const startDownload = useCallback( + async (modelId: string) => { + setError(null); + try { + await downloadDictationLocalModel(modelId); + setDownloadingIds((prev) => new Set(prev).add(modelId)); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.saveError"), + ); + } + }, + [t], + ); + + const cancelDownload = useCallback( + async (modelId: string) => { + setError(null); + try { + await cancelDictationLocalModelDownload(modelId); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.saveError"), + ); + } finally { + setProgresses((prev) => { + const next = new Map(prev); + next.delete(modelId); + return next; + }); + setDownloadingIds((prev) => { + const next = new Set(prev); + next.delete(modelId); + return next; + }); + await refresh(); + } + }, + [refresh, t], + ); + + const deleteModel = useCallback( + async (modelId: string) => { + setError(null); + try { + await deleteDictationLocalModel(modelId); + await refresh(); + await onModelsChanged(); + } catch (err) { + setError( + err instanceof Error + ? err.message + : t("general.voiceInput.deleteError"), + ); + } + }, + [onModelsChanged, refresh, t], + ); + + if (loading) { + return ( +
+      <div>
+        <p>{t("common:labels.loading")}</p>
+      </div>
+    );
+  }
+
+  if (models.length === 0) {
+    return (
+      <div>
+        <p>{t("general.voiceInput.noLocalModels")}</p>
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <div>
+        <p>{t("general.voiceInput.localModelLabel")}</p>
+        <p>{t("general.voiceInput.localModelDescription")}</p>
+      </div>
+      <ul>
+        {models.map((model) => {
+          const progress = progresses.get(model.id);
+          const isDownloading =
+            downloadingIds.has(model.id) ||
+            progress?.status === "downloading" ||
+            model.downloadInProgress;
+          const isSelected = model.downloaded && model.id === selectedModelId;
+          return (
+            <li key={model.id}>
+              <div>
+                <div>
+                  <span>{model.id}</span>
+                  <span>{model.sizeMb} MB</span>
+                  {isSelected ? (
+                    <span>{t("general.voiceInput.selectedModel")}</span>
+                  ) : null}
+                </div>
+                <p>{model.description}</p>
+                {isDownloading && progress ? (
+                  <div>
+                    <p>
+                      {t("general.voiceInput.downloadProgress", {
+                        percent: Math.round(progress.progressPercent),
+                      })}
+                    </p>
+                  </div>
+                ) : null}
+                {progress?.status === "failed" && progress.error ? (
+                  <p>{progress.error}</p>
+                ) : null}
+              </div>
+              <div>
+                {isDownloading ? (
+                  <Button
+                    type="button"
+                    onClick={() => void cancelDownload(model.id)}
+                  >
+                    {t("common:labels.cancel")}
+                  </Button>
+                ) : model.downloaded ? (
+                  <>
+                    {!isSelected ? (
+                      <Button
+                        type="button"
+                        onClick={() => void onSelectModel(model.id)}
+                      >
+                        {t("general.voiceInput.selectModel")}
+                      </Button>
+                    ) : null}
+                    <Button
+                      type="button"
+                      onClick={() => void deleteModel(model.id)}
+                    >
+                      {t("general.voiceInput.deleteModel")}
+                    </Button>
+                  </>
+                ) : (
+                  <Button
+                    type="button"
+                    onClick={() => void startDownload(model.id)}
+                  >
+                    {t("general.voiceInput.download")}
+                  </Button>
+                )}
+              </div>
+            </li>
+          );
+        })}
+      </ul>
+      {error ? <p>{error}</p> : null}
+    </div>
+ ); +} diff --git a/ui/goose2/src/features/settings/ui/SettingsModal.tsx b/ui/goose2/src/features/settings/ui/SettingsModal.tsx index 65ab6b6aff76..03400ccef214 100644 --- a/ui/goose2/src/features/settings/ui/SettingsModal.tsx +++ b/ui/goose2/src/features/settings/ui/SettingsModal.tsx @@ -21,6 +21,7 @@ import { SelectValue, } from "@/shared/ui/select"; import { + Mic, Palette, Settings2, FolderKanban, @@ -34,6 +35,7 @@ import { AppearanceSettings } from "./AppearanceSettings"; import { DoctorSettings } from "./DoctorSettings"; import { ProvidersSettings } from "./ProvidersSettings"; import { ExtensionsSettings } from "@/features/extensions/ui/ExtensionsSettings"; +import { VoiceInputSettings } from "./VoiceInputSettings"; import { listArchivedProjects, restoreProject, @@ -50,6 +52,7 @@ const NAV_ITEMS = [ { id: "appearance", labelKey: "nav.appearance", icon: Palette }, { id: "providers", labelKey: "nav.providers", icon: IconPlug }, { id: "extensions", labelKey: "nav.extensions", icon: IconPuzzle }, + { id: "voice", labelKey: "nav.voice", icon: Mic }, { id: "general", labelKey: "nav.general", icon: Settings2 }, { id: "projects", labelKey: "nav.projects", icon: FolderKanban }, { id: "chats", labelKey: "nav.chats", icon: MessageSquare }, @@ -241,6 +244,7 @@ export function SettingsModal({ {activeSection === "appearance" && } {activeSection === "providers" && } {activeSection === "extensions" && } + {activeSection === "voice" && } {activeSection === "doctor" && } {activeSection === "general" && (
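// Usage sketch for getDefaultDictationProvider ahead of the settings UI
// below (the `status` helper is illustrative; the import paths are the
// modules added in this patch). Configured providers win in API-first
// order; with nothing configured the chooser falls back local-first.
import { getDefaultDictationProvider } from "@/features/chat/lib/voiceInput";
import type { DictationProviderStatus } from "@/shared/types/dictation";

const status = (configured: boolean): DictationProviderStatus => ({
  configured,
  description: "example",
  usesProviderConfig: false,
  availableModels: [],
});

// groq is the first *configured* provider in priority order, so it wins
// even though local is configured too:
getDefaultDictationProvider({ groq: status(true), local: status(true) }); // "groq"

// with nothing configured, the fallback order prefers local, so a fresh
// install with only the bundled local provider still gets a default:
getDefaultDictationProvider({ openai: status(false), local: status(false) }); // "local"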
diff --git a/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx new file mode 100644 index 000000000000..3203608c1d8d --- /dev/null +++ b/ui/goose2/src/features/settings/ui/VoiceInputSettings.tsx @@ -0,0 +1,489 @@ +import { useCallback, useEffect, useMemo, useState } from "react"; +import { useTranslation } from "react-i18next"; +import { + deleteDictationProviderSecret, + getDictationConfig, + saveDictationModelSelection, + saveDictationProviderSecret, +} from "@/shared/api/dictation"; +import { + notifyVoiceDictationConfigChanged, + getDefaultDictationProvider, +} from "@/features/chat/lib/voiceInput"; +import { useVoiceInputPreferences } from "@/features/chat/hooks/useVoiceInputPreferences"; +import type { + DictationProvider, + DictationProviderStatus, +} from "@/shared/types/dictation"; +import { useAudioDevices } from "@/shared/ui/ai-elements/mic-selector"; +import { Button } from "@/shared/ui/button"; +import { LocalWhisperModels } from "./LocalWhisperModels"; +import { Input } from "@/shared/ui/input"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/shared/ui/select"; + +const DISABLED_PROVIDER = "__disabled__"; + +export function VoiceInputSettings() { + const { t } = useTranslation(["settings", "chat", "common"]); + const { + clearSelectedProvider, + hasStoredProviderPreference, + isHydrated: voicePrefsHydrated, + preferredMicrophoneId, + rawAutoSubmitPhrases, + selectedProvider, + setPreferredMicrophoneId, + setRawAutoSubmitPhrases, + setSelectedProvider, + } = useVoiceInputPreferences(); + const [providerStatuses, setProviderStatuses] = useState< + Record + >({} as Record); + const [apiKeyInput, setApiKeyInput] = useState(""); + const [isEditingApiKey, setIsEditingApiKey] = useState(false); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const { + devices, + error: devicesError, + hasPermission, + loadDevices, + loading: loadingDevices, + } = useAudioDevices(); + const isMicrophoneSupported = + typeof navigator !== "undefined" && !!navigator.mediaDevices; + const permissionStatus = hasPermission ? "authorized" : "not_determined"; + const requestPermission = loadDevices; + + const refreshConfig = useCallback(async () => { + const nextConfig = await getDictationConfig(); + setProviderStatuses(nextConfig); + + // Wait for useVoiceInputPreferences to finish loading the stored value + // from goose config before deciding whether to auto-select a default. + // Otherwise the initial mount sees hasStoredProviderPreference=false + // (pre-hydration default) and clobbers the user's saved choice. + if (!voicePrefsHydrated) { + return; + } + + if (!hasStoredProviderPreference) { + const defaultProvider = getDefaultDictationProvider(nextConfig); + if (defaultProvider) { + setSelectedProvider(defaultProvider); + } + return; + } + + if (!selectedProvider) { + return; + } + + // The stored provider is no longer in the fetched config (e.g. it was + // feature-flagged off or removed). Clear the preference entirely rather + // than writing `null`, which would persist the explicit "voice off" + // sentinel and leave the user opted out across future sessions even + // after valid providers reappear. 
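// How a config change propagates between the two sides of this patch,
// reduced to a sketch (the event name and functions are the ones defined
// in voiceInput.ts and useVoiceDictation.ts):
//
//   settings side                            chat side
//   -------------                            ---------
//   await saveDictationProviderSecret(...)   window.addEventListener(
//   notifyVoiceDictationConfigChanged()        VOICE_DICTATION_CONFIG_EVENT,
//                                              fetchDictationConfig)
//
// The guard just below performs the stale-preference cleanup the comment
// above describes.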
+ if (!nextConfig[selectedProvider]) { + clearSelectedProvider(); + } + }, [ + clearSelectedProvider, + hasStoredProviderPreference, + selectedProvider, + setSelectedProvider, + voicePrefsHydrated, + ]); + + useEffect(() => { + const load = async () => { + setLoading(true); + setError(null); + + try { + await refreshConfig(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.loadError"), + ); + } finally { + setLoading(false); + } + }; + + void load(); + }, [refreshConfig, t]); + + const selectedStatus = selectedProvider + ? providerStatuses[selectedProvider] + : null; + + const providerOptions = useMemo( + () => + Object.entries(providerStatuses) as Array< + [DictationProvider, DictationProviderStatus] + >, + [providerStatuses], + ); + + const currentModelValue = + selectedStatus?.selectedModel ?? selectedStatus?.defaultModel ?? ""; + + const saveApiKey = useCallback(async () => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await saveDictationProviderSecret( + selectedProvider, + apiKeyInput, + selectedStatus?.configKey ?? undefined, + ); + setApiKeyInput(""); + setIsEditingApiKey(false); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.saveError"), + ); + } + }, [apiKeyInput, refreshConfig, selectedProvider, selectedStatus, t]); + + const removeApiKey = useCallback(async () => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await deleteDictationProviderSecret( + selectedProvider, + selectedStatus?.configKey ?? undefined, + ); + setApiKeyInput(""); + setIsEditingApiKey(false); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.deleteError"), + ); + } + }, [refreshConfig, selectedProvider, selectedStatus, t]); + + const handleModelChange = useCallback( + async (modelId: string) => { + if (!selectedProvider) { + return; + } + + setError(null); + try { + await saveDictationModelSelection(selectedProvider, modelId); + await refreshConfig(); + notifyVoiceDictationConfigChanged(); + } catch (caughtError) { + setError( + caughtError instanceof Error + ? caughtError.message + : t("general.voiceInput.saveError"), + ); + } + }, + [refreshConfig, selectedProvider, t], + ); + + const selectedMicrophoneLabel = useMemo(() => { + if (!preferredMicrophoneId) { + return t("general.voiceInput.systemMicrophone"); + } + + return ( + devices.find((device) => device.deviceId === preferredMicrophoneId) + ?.label || t("general.voiceInput.systemMicrophone") + ); + }, [devices, preferredMicrophoneId, t]); + + if (loading) { + return ( +
+      <div>
+        <p>{t("general.voiceInput.label")}</p>
+        <p>{t("common:labels.loading")}</p>
+      </div>
+    );
+  }
+
+  return (
+    <div>
+      <div>
+        <p>{t("general.voiceInput.label")}</p>
+        <p>{t("general.voiceInput.description")}</p>
+      </div>
+
+      <div>
+        <p>{t("general.voiceInput.providerLabel")}</p>
+        <Select
+          value={selectedProvider ?? DISABLED_PROVIDER}
+          onValueChange={(value) =>
+            setSelectedProvider(
+              value === DISABLED_PROVIDER
+                ? null
+                : (value as DictationProvider),
+            )
+          }
+        >
+          <SelectTrigger>
+            <SelectValue placeholder={t("general.voiceInput.placeholder")} />
+          </SelectTrigger>
+          <SelectContent>
+            <SelectItem value={DISABLED_PROVIDER}>
+              {t("general.voiceInput.disabled")}
+            </SelectItem>
+            {providerOptions.map(([provider, providerStatus]) => (
+              <SelectItem key={provider} value={provider}>
+                {t(`general.voiceInput.providers.${provider}`)}
+                {providerStatus.configured
+                  ? ""
+                  : ` ${t("general.voiceInput.notConfiguredSuffix")}`}
+              </SelectItem>
+            ))}
+          </SelectContent>
+        </Select>
+      </div>
+
+      <div>
+        <div>
+          <p>{t("general.voiceInput.microphoneLabel")}</p>
+          <p>
+            {isMicrophoneSupported
+              ? t("general.voiceInput.microphoneDescription")
+              : t("general.voiceInput.microphoneUnavailable")}
+          </p>
+        </div>
+        {isMicrophoneSupported && !hasPermission ? (
+          <Button
+            type="button"
+            onClick={() => void requestPermission()}
+            disabled={loadingDevices}
+          >
+            {t("general.voiceInput.grantMicrophone")}
+          </Button>
+        ) : null}
+
+        {!devicesError &&
+        !hasPermission &&
+        permissionStatus === "not_determined" ? (
+          <p>{t("general.voiceInput.microphoneAccessPrompt")}</p>
+        ) : null}
+
+        {devicesError ? <p>{devicesError}</p> : null}
+
+        {isMicrophoneSupported && hasPermission ? (
+          <Select
+            value={preferredMicrophoneId ?? undefined}
+            onValueChange={(value) => setPreferredMicrophoneId(value)}
+          >
+            <SelectTrigger>
+              <SelectValue>{selectedMicrophoneLabel}</SelectValue>
+            </SelectTrigger>
+            <SelectContent>
+              {devices.map((device) => (
+                <SelectItem key={device.deviceId} value={device.deviceId}>
+                  {device.label || t("general.voiceInput.unknownMicrophone")}
+                </SelectItem>
+              ))}
+            </SelectContent>
+          </Select>
+        ) : null}
+      </div>
+
+      {selectedStatus ? (
+        <>
+          {!selectedStatus.usesProviderConfig &&
+          selectedProvider !== "local" ? (
+            <div>
+              {isEditingApiKey ? (
+                <>
+                  <div>
+                    <p>{t("general.voiceInput.apiKeyLabel")}</p>
+                    <p>{t("general.voiceInput.apiKeyDescription")}</p>
+                  </div>
+                  <div>
+                    <Input
+                      value={apiKeyInput}
+                      onChange={(event) => setApiKeyInput(event.target.value)}
+                      placeholder={t("general.voiceInput.apiKeyPlaceholder")}
+                      className="max-w-sm"
+                    />
+                    <div>
+                      <Button type="button" onClick={() => void saveApiKey()}>
+                        {selectedStatus.configured
+                          ? t("general.voiceInput.updateApiKey")
+                          : t("general.voiceInput.addApiKey")}
+                      </Button>
+                      <Button
+                        type="button"
+                        onClick={() => setIsEditingApiKey(false)}
+                      >
+                        {t("common:labels.cancel")}
+                      </Button>
+                    </div>
+                  </div>
+                </>
+              ) : (
+                <div>
+                  <div>
+                    <p>{t("general.voiceInput.apiKeyLabel")}</p>
+                    <p>
+                      {selectedStatus.configured
+                        ? t("general.voiceInput.apiKeyConfigured")
+                        : t("general.voiceInput.apiKeyDescription")}
+                    </p>
+                  </div>
+                  <div>
+                    <Button
+                      type="button"
+                      onClick={() => setIsEditingApiKey(true)}
+                    >
+                      {selectedStatus.configured
+                        ? t("general.voiceInput.updateApiKey")
+                        : t("general.voiceInput.addApiKey")}
+                    </Button>
+                    {selectedStatus.configured ? (
+                      <Button
+                        type="button"
+                        onClick={() => void removeApiKey()}
+                      >
+                        {t("general.voiceInput.removeApiKey")}
+                      </Button>
+                    ) : null}
+                  </div>
+                </div>
+              )}
+            </div>
+          ) : null}
+
+          {selectedProvider === "local" ? (
+            <LocalWhisperModels
+              selectedModelId={currentModelValue}
+              onSelectModel={(modelId) => handleModelChange(modelId)}
+              onModelsChanged={async () => {
+                await refreshConfig();
+                notifyVoiceDictationConfigChanged();
+              }}
+            />
+          ) : (selectedStatus.availableModels ?? []).length > 0 ? (
+            <div>
+              <p>{t("general.voiceInput.modelLabel")}</p>
+              <Select
+                value={currentModelValue}
+                onValueChange={(value) => void handleModelChange(value)}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  {(selectedStatus.availableModels ?? []).map((model) => (
+                    <SelectItem key={model.id} value={model.id}>
+                      {model.label}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+              <p>
+                {(selectedStatus.availableModels ?? []).find(
+                  (model) => model.id === currentModelValue,
+                )?.description ?? ""}
+              </p>
+            </div>
+          ) : null}
+        </>
+      ) : null}
+
+      <div>
+        <p>{t("general.voiceInput.autoSubmitLabel")}</p>
+        <p>{t("general.voiceInput.autoSubmitDescription")}</p>
+        <Input
+          value={rawAutoSubmitPhrases}
+          onChange={(event) => setRawAutoSubmitPhrases(event.target.value)}
+          placeholder={t("general.voiceInput.placeholder")}
+          className="max-w-sm"
+        />
+      </div>
+
+      {error ? <p>{error}</p> : null}
+    </div>
+ ); +} diff --git a/ui/goose2/src/features/settings/ui/__tests__/LocalWhisperModels.test.tsx b/ui/goose2/src/features/settings/ui/__tests__/LocalWhisperModels.test.tsx new file mode 100644 index 000000000000..b79c34577351 --- /dev/null +++ b/ui/goose2/src/features/settings/ui/__tests__/LocalWhisperModels.test.tsx @@ -0,0 +1,106 @@ +import { render, screen, waitFor } from "@testing-library/react"; +import userEvent from "@testing-library/user-event"; +import { beforeEach, describe, expect, it, vi } from "vitest"; +import { LocalWhisperModels } from "../LocalWhisperModels"; + +const mockListDictationLocalModels = vi.fn(); +const mockDownloadDictationLocalModel = vi.fn(); +const mockGetDictationLocalModelDownloadProgress = vi.fn(); +const mockCancelDictationLocalModelDownload = vi.fn(); +const mockDeleteDictationLocalModel = vi.fn(); + +vi.mock("@/shared/api/dictation", () => ({ + listDictationLocalModels: (...args: unknown[]) => + mockListDictationLocalModels(...args), + downloadDictationLocalModel: (...args: unknown[]) => + mockDownloadDictationLocalModel(...args), + getDictationLocalModelDownloadProgress: (...args: unknown[]) => + mockGetDictationLocalModelDownloadProgress(...args), + cancelDictationLocalModelDownload: (...args: unknown[]) => + mockCancelDictationLocalModelDownload(...args), + deleteDictationLocalModel: (...args: unknown[]) => + mockDeleteDictationLocalModel(...args), +})); + +describe("LocalWhisperModels", () => { + beforeEach(() => { + mockListDictationLocalModels.mockReset(); + mockDownloadDictationLocalModel.mockReset(); + mockGetDictationLocalModelDownloadProgress.mockReset(); + mockCancelDictationLocalModelDownload.mockReset(); + mockDeleteDictationLocalModel.mockReset(); + }); + + it("clears cached progress when cancelling a download", async () => { + const user = userEvent.setup(); + const onModelsChanged = vi.fn(); + + mockListDictationLocalModels + .mockResolvedValueOnce([ + { + id: "tiny", + description: "Tiny model", + sizeMb: 75, + downloaded: false, + downloadInProgress: true, + }, + ]) + .mockResolvedValueOnce([ + { + id: "tiny", + description: "Tiny model", + sizeMb: 75, + downloaded: false, + downloadInProgress: false, + }, + ]); + mockGetDictationLocalModelDownloadProgress.mockResolvedValue({ + bytesDownloaded: 100, + totalBytes: 1000, + progressPercent: 10, + status: "downloading", + error: null, + }); + mockCancelDictationLocalModelDownload.mockResolvedValue(undefined); + + render( + , + ); + + await waitFor(() => + expect( + screen.getByRole("button", { name: /cancel/i }), + ).toBeInTheDocument(), + ); + + await waitFor( + () => + expect(mockGetDictationLocalModelDownloadProgress).toHaveBeenCalledWith( + "tiny", + ), + { timeout: 2000 }, + ); + + await user.click(screen.getByRole("button", { name: /cancel/i })); + + await waitFor(() => + expect(mockCancelDictationLocalModelDownload).toHaveBeenCalledWith( + "tiny", + ), + ); + await waitFor(() => + expect( + screen.getByRole("button", { name: /download/i }), + ).toBeInTheDocument(), + ); + + expect( + screen.queryByRole("button", { name: /cancel/i }), + ).not.toBeInTheDocument(); + expect(onModelsChanged).not.toHaveBeenCalled(); + }); +}); diff --git a/ui/goose2/src/shared/api/__tests__/dictation.test.ts b/ui/goose2/src/shared/api/__tests__/dictation.test.ts new file mode 100644 index 000000000000..79831ca7cc92 --- /dev/null +++ b/ui/goose2/src/shared/api/__tests__/dictation.test.ts @@ -0,0 +1,140 @@ +import { describe, it, expect, vi, beforeEach } from "vitest"; +import { + 
cancelDictationLocalModelDownload, + deleteDictationLocalModel, + downloadDictationLocalModel, + getDictationConfig, + getDictationLocalModelDownloadProgress, + listDictationLocalModels, + saveDictationModelSelection, + transcribeDictation, +} from "../dictation"; +import { getClient } from "../acpConnection"; + +vi.mock("../acpConnection", () => ({ + getClient: vi.fn(), +})); + +describe("dictation SDK wiring", () => { + let client: any; + beforeEach(() => { + client = { + goose: { + GooseDictationConfig: vi.fn().mockResolvedValue({ + providers: { + openai: { + configured: true, + description: "OpenAI transcription", + usesProviderConfig: true, + availableModels: [], + }, + }, + }), + GooseDictationTranscribe: vi.fn().mockResolvedValue({ text: "hello" }), + }, + }; + vi.mocked(getClient).mockResolvedValue(client); + }); + + it("getDictationConfig calls GooseDictationConfig and returns providers map", async () => { + const result = await getDictationConfig(); + expect(client.goose.GooseDictationConfig).toHaveBeenCalledWith({}); + expect(result.openai.configured).toBe(true); + }); + + it("transcribeDictation forwards audio + mimeType + provider", async () => { + const result = await transcribeDictation({ + audio: "base64==", + mimeType: "audio/webm", + provider: "openai" as any, + }); + expect(client.goose.GooseDictationTranscribe).toHaveBeenCalledWith({ + audio: "base64==", + mimeType: "audio/webm", + provider: "openai", + }); + expect(result.text).toBe("hello"); + }); + + it("saveDictationModelSelection calls GooseDictationModelSelect", async () => { + client.goose.GooseDictationModelSelect = vi.fn().mockResolvedValue({}); + await saveDictationModelSelection("local" as any, "tiny"); + expect(client.goose.GooseDictationModelSelect).toHaveBeenCalledWith({ + provider: "local", + modelId: "tiny", + }); + }); + + it("listDictationLocalModels returns the models array", async () => { + client.goose.GooseDictationModelsList = vi.fn().mockResolvedValue({ + models: [ + { + id: "tiny", + description: "Tiny", + sizeMb: 75, + downloaded: true, + downloadInProgress: false, + }, + ], + }); + const result = await listDictationLocalModels(); + expect(client.goose.GooseDictationModelsList).toHaveBeenCalledWith({}); + expect(result).toHaveLength(1); + expect(result[0].id).toBe("tiny"); + }); + + it("downloadDictationLocalModel forwards modelId", async () => { + client.goose.GooseDictationModelsDownload = vi.fn().mockResolvedValue({}); + await downloadDictationLocalModel("tiny"); + expect(client.goose.GooseDictationModelsDownload).toHaveBeenCalledWith({ + modelId: "tiny", + }); + }); + + it("getDictationLocalModelDownloadProgress returns progress or null", async () => { + client.goose.GooseDictationModelsDownloadProgress = vi + .fn() + .mockResolvedValue({ + progress: { + bytesDownloaded: 100, + totalBytes: 1000, + progressPercent: 10, + status: "downloading", + error: null, + }, + }); + const result = await getDictationLocalModelDownloadProgress("tiny"); + expect(result?.bytesDownloaded).toBe(100); + expect( + client.goose.GooseDictationModelsDownloadProgress, + ).toHaveBeenCalledWith({ + modelId: "tiny", + }); + }); + + it("getDictationLocalModelDownloadProgress returns null when no download", async () => { + client.goose.GooseDictationModelsDownloadProgress = vi + .fn() + .mockResolvedValue({ + progress: undefined, + }); + const result = await getDictationLocalModelDownloadProgress("tiny"); + expect(result).toBeNull(); + }); + + it("cancelDictationLocalModelDownload forwards modelId", async () => { + 
client.goose.GooseDictationModelsCancel = vi.fn().mockResolvedValue({}); + await cancelDictationLocalModelDownload("tiny"); + expect(client.goose.GooseDictationModelsCancel).toHaveBeenCalledWith({ + modelId: "tiny", + }); + }); + + it("deleteDictationLocalModel forwards modelId", async () => { + client.goose.GooseDictationModelsDelete = vi.fn().mockResolvedValue({}); + await deleteDictationLocalModel("tiny"); + expect(client.goose.GooseDictationModelsDelete).toHaveBeenCalledWith({ + modelId: "tiny", + }); + }); +}); diff --git a/ui/goose2/src/shared/api/dictation.ts b/ui/goose2/src/shared/api/dictation.ts new file mode 100644 index 000000000000..4c3b42c6e140 --- /dev/null +++ b/ui/goose2/src/shared/api/dictation.ts @@ -0,0 +1,106 @@ +import { invoke } from "@tauri-apps/api/core"; +import type { + DictationDownloadProgress, + DictationProvider, + DictationProviderStatus, + DictationTranscribeResponse, + WhisperModelStatus, +} from "@/shared/types/dictation"; +import { getClient } from "./acpConnection"; + +export async function getDictationConfig(): Promise< + Record +> { + const client = await getClient(); + const response = await client.goose.GooseDictationConfig({}); + return response.providers as Record< + DictationProvider, + DictationProviderStatus + >; +} + +export async function transcribeDictation(request: { + audio: string; + mimeType: string; + provider: DictationProvider; +}): Promise { + const client = await getClient(); + return client.goose.GooseDictationTranscribe({ + audio: request.audio, + mimeType: request.mimeType, + provider: request.provider, + }); +} + +export async function saveDictationModelSelection( + provider: DictationProvider, + modelId: string, +): Promise { + const client = await getClient(); + await client.goose.GooseDictationModelSelect({ provider, modelId }); +} + +export async function saveDictationProviderSecret( + _provider: DictationProvider, + value: string, + configKey?: string, +): Promise { + if (!configKey) { + throw new Error("No config key for this provider"); + } + return invoke("save_provider_field", { key: configKey, value }); +} + +export async function deleteDictationProviderSecret( + provider: DictationProvider, + _configKey?: string, +): Promise { + const providerIdMap: Record = { + groq: "dictation_groq", + elevenlabs: "dictation_elevenlabs", + }; + const providerId = providerIdMap[provider]; + if (!providerId) { + throw new Error("Cannot delete secrets for this provider"); + } + return invoke("delete_provider_config", { providerId }); +} + +export async function listDictationLocalModels(): Promise< + WhisperModelStatus[] +> { + const client = await getClient(); + const response = await client.goose.GooseDictationModelsList({}); + return response.models as unknown as WhisperModelStatus[]; +} + +export async function downloadDictationLocalModel( + modelId: string, +): Promise { + const client = await getClient(); + await client.goose.GooseDictationModelsDownload({ modelId }); +} + +export async function getDictationLocalModelDownloadProgress( + modelId: string, +): Promise { + const client = await getClient(); + const response = await client.goose.GooseDictationModelsDownloadProgress({ + modelId, + }); + return (response.progress ?? 
null) as DictationDownloadProgress | null; +} + +export async function cancelDictationLocalModelDownload( + modelId: string, +): Promise { + const client = await getClient(); + await client.goose.GooseDictationModelsCancel({ modelId }); +} + +export async function deleteDictationLocalModel( + modelId: string, +): Promise { + const client = await getClient(); + await client.goose.GooseDictationModelsDelete({ modelId }); +} diff --git a/ui/goose2/src/shared/i18n/locales/en/chat.json b/ui/goose2/src/shared/i18n/locales/en/chat.json index efe6776e3d87..424007cc8c5c 100644 --- a/ui/goose2/src/shared/i18n/locales/en/chat.json +++ b/ui/goose2/src/shared/i18n/locales/en/chat.json @@ -169,7 +169,11 @@ "selectProject": "Select project", "sendMessage": "Send message", "stopGeneration": "Stop generation", - "voiceInputSoon": "Voice input (coming soon)" + "voiceInput": "Voice dictation", + "voiceInputDisabled": "Configure a voice provider in Settings to enable dictation", + "voiceInputRecording": "Listening...", + "voiceInputTranscribing": "Transcribing...", + "voiceInputAutoSubmitHint": "Say \"submit\" to send" }, "tools": { "fileNotFound": "File not found: {{path}}", diff --git a/ui/goose2/src/shared/i18n/locales/en/settings.json b/ui/goose2/src/shared/i18n/locales/en/settings.json index be55f4766a1d..d9733f3800b5 100644 --- a/ui/goose2/src/shared/i18n/locales/en/settings.json +++ b/ui/goose2/src/shared/i18n/locales/en/settings.json @@ -124,7 +124,49 @@ "spanish": "Spanish", "system": "System default ({{language}})" }, - "title": "General" + "title": "General", + "voiceInput": { + "label": "Voice Input", + "description": "Configure voice dictation for hands-free input.", + "providerLabel": "Transcription Provider", + "disabled": "Disabled", + "notConfiguredSuffix": "(not configured)", + "placeholder": "Select a provider", + "modelLabel": "Model", + "apiKeyLabel": "API Key", + "apiKeyDescription": "Enter your API key for this provider.", + "apiKeyPlaceholder": "sk-...", + "apiKeyConfigured": "API key configured", + "addApiKey": "Add API key", + "updateApiKey": "Update API key", + "removeApiKey": "Remove API key", + "localModelLabel": "Local Whisper Model", + "localModelDescription": "Download a Whisper model to run transcription locally. Selecting a model sets it as your active local transcription model.", + "noLocalModels": "No local Whisper models available.", + "download": "Download", + "selectModel": "Select", + "selectedModel": "Selected", + "deleteModel": "Delete", + "microphoneLabel": "Microphone", + "microphoneDescription": "Choose which microphone to use for voice input.", + "microphoneUnavailable": "Microphone access is not available in this environment.", + "microphoneAccessPrompt": "Click \"Grant access\" to allow microphone use.", + "grantMicrophone": "Grant access", + "systemMicrophone": "System default", + "unknownMicrophone": "Unknown microphone", + "autoSubmitLabel": "Auto-submit Phrases", + "autoSubmitDescription": "Comma-separated words that trigger automatic send (e.g. \"submit\").", + "providers": { + "openai": "OpenAI Whisper", + "groq": "Groq", + "elevenlabs": "ElevenLabs", + "local": "Local Whisper" + }, + "downloadProgress": "Downloading... {{percent}}%", + "loadError": "Failed to load voice settings.", + "saveError": "Failed to save.", + "deleteError": "Failed to delete." 
diff --git a/ui/goose2/src/shared/i18n/locales/es/chat.json b/ui/goose2/src/shared/i18n/locales/es/chat.json
index 3a5760189e23..5bd93d8a560d 100644
--- a/ui/goose2/src/shared/i18n/locales/es/chat.json
+++ b/ui/goose2/src/shared/i18n/locales/es/chat.json
@@ -169,7 +169,11 @@
     "selectProject": "Seleccionar proyecto",
     "sendMessage": "Enviar mensaje",
     "stopGeneration": "Detener generación",
-    "voiceInputSoon": "Entrada de voz (pronto)"
+    "voiceInput": "Dictado por voz",
+    "voiceInputDisabled": "Configura un proveedor de voz en Ajustes para activar el dictado",
+    "voiceInputRecording": "Escuchando...",
+    "voiceInputTranscribing": "Transcribiendo...",
+    "voiceInputAutoSubmitHint": "Di \"enviar\" para enviar"
   },
   "tools": {
     "fileNotFound": "Archivo no encontrado: {{path}}",
diff --git a/ui/goose2/src/shared/i18n/locales/es/settings.json b/ui/goose2/src/shared/i18n/locales/es/settings.json
index 8b2b85236ece..16e33a960aa6 100644
--- a/ui/goose2/src/shared/i18n/locales/es/settings.json
+++ b/ui/goose2/src/shared/i18n/locales/es/settings.json
@@ -124,7 +124,49 @@
       "spanish": "Español",
       "system": "Predeterminado del sistema ({{language}})"
     },
-    "title": "General"
+    "title": "General",
+    "voiceInput": {
+      "label": "Entrada de voz",
+      "description": "Configura el dictado por voz para entrada manos libres.",
+      "providerLabel": "Proveedor de transcripción",
+      "disabled": "Desactivado",
+      "notConfiguredSuffix": "(no configurado)",
+      "placeholder": "Selecciona un proveedor",
+      "modelLabel": "Modelo",
+      "apiKeyLabel": "Clave API",
+      "apiKeyDescription": "Ingresa tu clave API para este proveedor.",
+      "apiKeyPlaceholder": "sk-...",
+      "apiKeyConfigured": "Clave API configurada",
+      "addApiKey": "Agregar clave API",
+      "updateApiKey": "Actualizar clave API",
+      "removeApiKey": "Eliminar clave API",
+      "localModelLabel": "Modelo Whisper local",
+      "localModelDescription": "Descarga un modelo Whisper para transcribir localmente. Seleccionar un modelo lo establece como tu modelo de transcripción local activo.",
+      "noLocalModels": "No hay modelos Whisper locales disponibles.",
+      "download": "Descargar",
+      "selectModel": "Seleccionar",
+      "selectedModel": "Seleccionado",
+      "deleteModel": "Eliminar",
+      "microphoneLabel": "Micrófono",
+      "microphoneDescription": "Elige qué micrófono usar para la entrada de voz.",
+      "microphoneUnavailable": "El acceso al micrófono no está disponible en este entorno.",
+      "microphoneAccessPrompt": "Haz clic en \"Permitir acceso\" para usar el micrófono.",
+      "grantMicrophone": "Permitir acceso",
+      "systemMicrophone": "Predeterminado del sistema",
+      "unknownMicrophone": "Micrófono desconocido",
+      "autoSubmitLabel": "Frases de envío automático",
+      "autoSubmitDescription": "Palabras separadas por coma que activan el envío automático (ej. \"enviar\").",
+      "providers": {
+        "openai": "OpenAI Whisper",
+        "groq": "Groq",
+        "elevenlabs": "ElevenLabs",
+        "local": "Whisper local"
+      },
+      "downloadProgress": "Descargando... {{percent}}%",
+      "loadError": "Error al cargar ajustes de voz.",
+      "saveError": "Error al guardar.",
+      "deleteError": "Error al eliminar."
+    }
   },
   "nav": {
     "about": "Acerca de",
@@ -134,7 +176,8 @@
     "general": "General",
     "projects": "Proyectos",
     "extensions": "Extensiones",
-    "providers": "Proveedores"
+    "providers": "Proveedores",
+    "voice": "Voz"
   },
   "projects": {
     "description": "Administra tus proyectos.",
diff --git a/ui/goose2/src/shared/types/dictation.ts b/ui/goose2/src/shared/types/dictation.ts
new file mode 100644
index 000000000000..f27593506772
--- /dev/null
+++ b/ui/goose2/src/shared/types/dictation.ts
@@ -0,0 +1,47 @@
+export type DictationProvider = "openai" | "groq" | "elevenlabs" | "local";
+
+export interface DictationModelOption {
+  id: string;
+  label: string;
+  description: string;
+}
+
+export interface DictationProviderStatus {
+  configured: boolean;
+  host?: string | null;
+  description: string;
+  usesProviderConfig: boolean;
+  settingsPath?: string | null;
+  configKey?: string | null;
+  modelConfigKey?: string | null;
+  defaultModel?: string | null;
+  selectedModel?: string | null;
+  availableModels: DictationModelOption[];
+}
+
+export interface DictationTranscribeResponse {
+  text: string;
+}
+
+export type MicrophonePermissionStatus =
+  | "not_determined"
+  | "authorized"
+  | "denied"
+  | "restricted"
+  | "unsupported";
+
+export interface WhisperModelStatus {
+  id: string;
+  sizeMb: number;
+  description: string;
+  downloaded: boolean;
+  downloadInProgress: boolean;
+}
+
+export interface DictationDownloadProgress {
+  bytesDownloaded: number;
+  totalBytes: number;
+  progressPercent: number;
+  status: string;
+  error?: string | null;
+}
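The optional and nullable fields on DictationProviderStatus leave model resolution to the consumer. A hypothetical helper showing one plausible fallback order (user selection, then provider default, then first advertised model); the function is not part of the diff:

```ts
import type { DictationProviderStatus } from "@/shared/types/dictation";

// Illustrative sketch: pick the model id a settings screen might display.
// The fallback order is an assumption, not something the diff prescribes.
export function resolveActiveModel(
  status: DictationProviderStatus,
): string | null {
  return (
    status.selectedModel ??
    status.defaultModel ??
    status.availableModels[0]?.id ??
    null
  );
}
```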
diff --git a/ui/goose2/src/shared/ui/ai-elements/mic-selector.tsx b/ui/goose2/src/shared/ui/ai-elements/mic-selector.tsx
index 9e0369e135e3..0a05fc705b59 100644
--- a/ui/goose2/src/shared/ui/ai-elements/mic-selector.tsx
+++ b/ui/goose2/src/shared/ui/ai-elements/mic-selector.tsx
@@ -74,10 +74,6 @@ export const useAudioDevices = () => {
   }, []);
 
   const loadDevicesWithPermission = useCallback(async () => {
-    if (loading) {
-      return;
-    }
-
     try {
       setLoading(true);
       setError(null);
@@ -108,11 +104,57 @@ export const useAudioDevices = () => {
     } finally {
       setLoading(false);
     }
-  }, [loading]);
+  }, []);
 
   useEffect(() => {
-    loadDevicesWithoutPermission();
-  }, [loadDevicesWithoutPermission]);
+    let cancelled = false;
+    let status: PermissionStatus | null = null;
+    const onChange = () => {
+      if (cancelled || !status) return;
+      const granted = status.state === "granted";
+      setHasPermission(granted);
+      // When permission flips to granted mid-session (e.g. the user enabled
+      // mic access via OS settings), re-enumerate devices so we pick up the
+      // real deviceIds/labels — the prior enumeration may have returned
+      // empty-string entries that VoiceInputSettings filters out.
+      if (granted) {
+        void loadDevicesWithPermission();
+      }
+    };
+
+    const init = async () => {
+      let alreadyGranted = false;
+      try {
+        status = await navigator.permissions.query({
+          name: "microphone" as PermissionName,
+        });
+        if (cancelled) return;
+        alreadyGranted = status.state === "granted";
+        setHasPermission(alreadyGranted);
+        status.addEventListener("change", onChange);
+      } catch {
+        // Permissions API not available for microphone; fall back silently.
+      }
+      if (cancelled) return;
+      // If OS-level permission is already granted, enumerate through the
+      // permission-ful path — otherwise enumerateDevices() may return
+      // entries with empty deviceId/label, which Radix Select rejects.
+      if (alreadyGranted) {
+        await loadDevicesWithPermission();
+      } else {
+        await loadDevicesWithoutPermission();
+      }
+    };
+
+    void init();
+
+    return () => {
+      cancelled = true;
+      if (status) {
+        status.removeEventListener("change", onChange);
+      }
+    };
+  }, [loadDevicesWithPermission, loadDevicesWithoutPermission]);
 
   useEffect(() => {
     const handleDeviceChange = () => {
diff --git a/ui/sdk/src/generated/client.gen.ts b/ui/sdk/src/generated/client.gen.ts
index 3bd1e0d13f89..a1eeeee569d3 100644
--- a/ui/sdk/src/generated/client.gen.ts
+++ b/ui/sdk/src/generated/client.gen.ts
@@ -13,6 +13,18 @@ import type {
   CheckSecretRequest,
   CheckSecretResponse,
   DeleteSessionRequest,
+  DictationConfigRequest,
+  DictationConfigResponse,
+  DictationModelCancelRequest,
+  DictationModelDeleteRequest,
+  DictationModelDownloadProgressRequest,
+  DictationModelDownloadProgressResponse,
+  DictationModelDownloadRequest,
+  DictationModelSelectRequest,
+  DictationModelsListRequest,
+  DictationModelsListResponse,
+  DictationTranscribeRequest,
+  DictationTranscribeResponse,
   ExportSessionRequest,
   ExportSessionResponse,
   GetExtensionsRequest,
@@ -43,6 +55,10 @@ import type {
 } from './types.gen.js';
 import {
   zCheckSecretResponse,
+  zDictationConfigResponse,
+  zDictationModelDownloadProgressResponse,
+  zDictationModelsListResponse,
+  zDictationTranscribeResponse,
   zExportSessionResponse,
   zGetExtensionsResponse,
   zGetProviderDetailsResponse,
@@ -174,4 +190,71 @@ export class GooseExtClient {
   async GooseSessionUnarchive(params: UnarchiveSessionRequest): Promise<void> {
     await this.conn.extMethod("_goose/session/unarchive", params);
   }
+
+  async GooseDictationTranscribe(
+    params: DictationTranscribeRequest,
+  ): Promise<DictationTranscribeResponse> {
+    const raw = await this.conn.extMethod(
+      "_goose/dictation/transcribe",
+      params,
+    );
+    return zDictationTranscribeResponse.parse(
+      raw,
+    ) as DictationTranscribeResponse;
+  }
+
+  async GooseDictationConfig(
+    params: DictationConfigRequest,
+  ): Promise<DictationConfigResponse> {
+    const raw = await this.conn.extMethod("_goose/dictation/config", params);
+    return zDictationConfigResponse.parse(raw) as DictationConfigResponse;
+  }
+
+  async GooseDictationModelsList(
+    params: DictationModelsListRequest,
+  ): Promise<DictationModelsListResponse> {
+    const raw = await this.conn.extMethod(
+      "_goose/dictation/models/list",
+      params,
+    );
+    return zDictationModelsListResponse.parse(
+      raw,
+    ) as DictationModelsListResponse;
+  }
+
+  async GooseDictationModelsDownload(
+    params: DictationModelDownloadRequest,
+  ): Promise<void> {
+    await this.conn.extMethod("_goose/dictation/models/download", params);
+  }
+
+  async GooseDictationModelsDownloadProgress(
+    params: DictationModelDownloadProgressRequest,
+  ): Promise<DictationModelDownloadProgressResponse> {
+    const raw = await this.conn.extMethod(
+      "_goose/dictation/models/download/progress",
+      params,
+    );
+    return zDictationModelDownloadProgressResponse.parse(
+      raw,
+    ) as DictationModelDownloadProgressResponse;
+  }
+
+  async GooseDictationModelsCancel(
+    params: DictationModelCancelRequest,
+  ): Promise<void> {
+    await this.conn.extMethod("_goose/dictation/models/cancel", params);
+  }
+
+  async GooseDictationModelsDelete(
+    params: DictationModelDeleteRequest,
+  ): Promise<void> {
+    await this.conn.extMethod("_goose/dictation/models/delete", params);
+  }
+
+  async GooseDictationModelSelect(
+    params: DictationModelSelectRequest,
+  ): Promise<void> {
+    await this.conn.extMethod("_goose/dictation/model/select", params);
+  }
 }
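For the transcribe path, a caller has to turn captured audio into the base64 payload the schema expects. A hypothetical sketch, assuming a MediaRecorder-produced Blob; the chunked encoder, the hard-coded provider, and the webm fallback are illustrative:

```ts
import { transcribeDictation } from "@/shared/api/dictation";

// Hypothetical glue code: encode recorded audio and request a transcript.
// The chunked btoa loop avoids String.fromCharCode(...hugeArray) exceeding
// the engine's argument limit on large recordings.
export async function transcribeBlob(blob: Blob): Promise<string> {
  const bytes = new Uint8Array(await blob.arrayBuffer());
  let binary = "";
  const chunkSize = 0x8000;
  for (let i = 0; i < bytes.length; i += chunkSize) {
    binary += String.fromCharCode(...bytes.subarray(i, i + chunkSize));
  }
  const { text } = await transcribeDictation({
    audio: btoa(binary), // matches the schema's "Base64-encoded audio data"
    mimeType: blob.type || "audio/webm", // assumption: MediaRecorder default
    provider: "openai", // any configured DictationProvider would do
  });
  return text;
}
```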
diff --git a/ui/sdk/src/generated/index.ts b/ui/sdk/src/generated/index.ts
index aa103a439a31..d1886b07d767 100644
--- a/ui/sdk/src/generated/index.ts
+++ b/ui/sdk/src/generated/index.ts
@@ -1,6 +1,6 @@
 // This file is auto-generated by @hey-api/openapi-ts
 
-export type { AddExtensionRequest, ArchiveSessionRequest, CheckSecretRequest, CheckSecretResponse, DeleteSessionRequest, EmptyResponse, ExportSessionRequest, ExportSessionResponse, ExtRequest, ExtResponse, GetExtensionsRequest, GetExtensionsResponse, GetProviderDetailsRequest, GetProviderDetailsResponse, GetProviderModelsRequest, GetProviderModelsResponse, GetSessionExtensionsRequest, GetSessionExtensionsResponse, GetToolsRequest, GetToolsResponse, ImportSessionRequest, ImportSessionResponse, ListProvidersRequest, ListProvidersResponse, ModelEntry, ProviderConfigKey, ProviderDetailEntry, ProviderListEntry, ReadConfigRequest, ReadConfigResponse, ReadResourceRequest, ReadResourceResponse, RemoveConfigRequest, RemoveExtensionRequest, RemoveSecretRequest, UnarchiveSessionRequest, UpdateWorkingDirRequest, UpsertConfigRequest, UpsertSecretRequest } from './types.gen.js';
+export type { AddExtensionRequest, ArchiveSessionRequest, CheckSecretRequest, CheckSecretResponse, DeleteSessionRequest, DictationConfigRequest, DictationConfigResponse, DictationDownloadProgress, DictationLocalModelStatus, DictationModelCancelRequest, DictationModelDeleteRequest, DictationModelDownloadProgressRequest, DictationModelDownloadProgressResponse, DictationModelDownloadRequest, DictationModelOption, DictationModelSelectRequest, DictationModelsListRequest, DictationModelsListResponse, DictationProviderStatusEntry, DictationTranscribeRequest, DictationTranscribeResponse, EmptyResponse, ExportSessionRequest, ExportSessionResponse, ExtRequest, ExtResponse, GetExtensionsRequest, GetExtensionsResponse, GetProviderDetailsRequest, GetProviderDetailsResponse, GetProviderModelsRequest, GetProviderModelsResponse, GetSessionExtensionsRequest, GetSessionExtensionsResponse, GetToolsRequest, GetToolsResponse, ImportSessionRequest, ImportSessionResponse, ListProvidersRequest, ListProvidersResponse, ModelEntry, ProviderConfigKey, ProviderDetailEntry, ProviderListEntry, ReadConfigRequest, ReadConfigResponse, ReadResourceRequest, ReadResourceResponse, RemoveConfigRequest, RemoveExtensionRequest, RemoveSecretRequest, UnarchiveSessionRequest, UpdateWorkingDirRequest, UpsertConfigRequest, UpsertSecretRequest } from './types.gen.js';
 
 export const GOOSE_EXT_METHODS = [
   {
@@ -108,6 +108,46 @@ export const GOOSE_EXT_METHODS = [
     requestType: "UnarchiveSessionRequest",
     responseType: "EmptyResponse",
   },
+  {
+    method: "_goose/dictation/transcribe",
+    requestType: "DictationTranscribeRequest",
+    responseType: "DictationTranscribeResponse",
+  },
+  {
+    method: "_goose/dictation/config",
+    requestType: "DictationConfigRequest",
+    responseType: "DictationConfigResponse",
+  },
+  {
+    method: "_goose/dictation/models/list",
+    requestType: "DictationModelsListRequest",
+    responseType: "DictationModelsListResponse",
+  },
+  {
+    method: "_goose/dictation/models/download",
+    requestType: "DictationModelDownloadRequest",
+    responseType: "EmptyResponse",
+  },
+  {
+    method: "_goose/dictation/models/download/progress",
+    requestType: "DictationModelDownloadProgressRequest",
+    responseType: "DictationModelDownloadProgressResponse",
+  },
+  {
+    method: "_goose/dictation/models/cancel",
+    requestType: "DictationModelCancelRequest",
+    responseType: "EmptyResponse",
+  },
+  {
+    method: "_goose/dictation/models/delete",
+    requestType: "DictationModelDeleteRequest",
+    responseType: "EmptyResponse",
+  },
+  {
+    method: "_goose/dictation/model/select",
+    requestType: "DictationModelSelectRequest",
+    responseType: "EmptyResponse",
+  },
 ] as const;
 
 export type GooseExtMethod = (typeof GOOSE_EXT_METHODS)[number];
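Because GOOSE_EXT_METHODS is exported `as const`, its entries carry string-literal types, so consumers can derive the method-name union without maintaining a parallel list. A sketch; the alias, the helper, and the import path are assumptions:

```ts
// Path is illustrative; adjust to wherever the generated index is published.
import { GOOSE_EXT_METHODS, type GooseExtMethod } from "./generated/index.js";

// Hypothetical alias: the union of wire method names, e.g.
// "_goose/dictation/transcribe" | "_goose/session/unarchive" | ...
type GooseExtMethodName = GooseExtMethod["method"];

// Illustrative lookup: the declared response type name for a given method.
function responseTypeOf(method: GooseExtMethodName): string {
  const entry = GOOSE_EXT_METHODS.find((m) => m.method === method);
  return entry?.responseType ?? "EmptyResponse";
}
```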
"DictationModelSelectRequest", + responseType: "EmptyResponse", + }, ] as const; export type GooseExtMethod = (typeof GOOSE_EXT_METHODS)[number]; diff --git a/ui/sdk/src/generated/types.gen.ts b/ui/sdk/src/generated/types.gen.ts index e27160830133..15cf78ea75a7 100644 --- a/ui/sdk/src/generated/types.gen.ts +++ b/ui/sdk/src/generated/types.gen.ts @@ -281,17 +281,154 @@ export type UnarchiveSessionRequest = { sessionId: string; }; +/** + * Transcribe audio via a dictation provider. + */ +export type DictationTranscribeRequest = { + /** + * Base64-encoded audio data + */ + audio: string; + /** + * MIME type (e.g. "audio/wav", "audio/webm") + */ + mimeType: string; + /** + * Provider to use: "openai", "groq", "elevenlabs", or "local" + */ + provider: string; +}; + +/** + * Transcription result. + */ +export type DictationTranscribeResponse = { + text: string; +}; + +/** + * Get the configuration status of all dictation providers. + */ +export type DictationConfigRequest = { + [key: string]: unknown; +}; + +/** + * Dictation config response — map of provider name to status. + */ +export type DictationConfigResponse = { + providers: { + [key: string]: DictationProviderStatusEntry; + }; +}; + +/** + * Per-provider configuration status. + */ +export type DictationProviderStatusEntry = { + configured: boolean; + host?: string | null; + description: string; + usesProviderConfig: boolean; + settingsPath?: string | null; + configKey?: string | null; + modelConfigKey?: string | null; + defaultModel?: string | null; + selectedModel?: string | null; + availableModels?: Array; +}; + +export type DictationModelOption = { + id: string; + label: string; + description: string; +}; + +/** + * List available local Whisper models with their download status. + */ +export type DictationModelsListRequest = { + [key: string]: unknown; +}; + +export type DictationModelsListResponse = { + models: Array; +}; + +export type DictationLocalModelStatus = { + id: string; + label: string; + description: string; + sizeMb: number; + downloaded: boolean; + downloadInProgress: boolean; +}; + +/** + * Kick off a background download of a local Whisper model. + */ +export type DictationModelDownloadRequest = { + modelId: string; +}; + +/** + * Poll the progress of an in-flight download. + */ +export type DictationModelDownloadProgressRequest = { + modelId: string; +}; + +export type DictationModelDownloadProgressResponse = { + /** + * None when no download is active for this model id. + */ + progress?: DictationDownloadProgress | null; +}; + +export type DictationDownloadProgress = { + bytesDownloaded: number; + totalBytes: number; + progressPercent: number; + /** + * serde lowercase of DownloadStatus: "downloading" | "completed" | "failed" | "cancelled" + */ + status: string; + error?: string | null; +}; + +/** + * Cancel an in-flight download. + */ +export type DictationModelCancelRequest = { + modelId: string; +}; + +/** + * Delete a downloaded local Whisper model from disk. + */ +export type DictationModelDeleteRequest = { + modelId: string; +}; + +/** + * Persist the user's model selection for a given provider. 
 export type ExtRequest = {
   id: string;
   method: string;
-  params?: AddExtensionRequest | RemoveExtensionRequest | GetToolsRequest | ReadResourceRequest | UpdateWorkingDirRequest | DeleteSessionRequest | GetExtensionsRequest | GetSessionExtensionsRequest | ListProvidersRequest | GetProviderDetailsRequest | GetProviderModelsRequest | ReadConfigRequest | UpsertConfigRequest | RemoveConfigRequest | CheckSecretRequest | UpsertSecretRequest | RemoveSecretRequest | ExportSessionRequest | ImportSessionRequest | ArchiveSessionRequest | UnarchiveSessionRequest | {
+  params?: AddExtensionRequest | RemoveExtensionRequest | GetToolsRequest | ReadResourceRequest | UpdateWorkingDirRequest | DeleteSessionRequest | GetExtensionsRequest | GetSessionExtensionsRequest | ListProvidersRequest | GetProviderDetailsRequest | GetProviderModelsRequest | ReadConfigRequest | UpsertConfigRequest | RemoveConfigRequest | CheckSecretRequest | UpsertSecretRequest | RemoveSecretRequest | ExportSessionRequest | ImportSessionRequest | ArchiveSessionRequest | UnarchiveSessionRequest | DictationTranscribeRequest | DictationConfigRequest | DictationModelsListRequest | DictationModelDownloadRequest | DictationModelDownloadProgressRequest | DictationModelCancelRequest | DictationModelDeleteRequest | DictationModelSelectRequest | {
     [key: string]: unknown;
   } | null;
 };
 
 export type ExtResponse = {
   id: string;
-  result?: EmptyResponse | GetToolsResponse | ReadResourceResponse | GetExtensionsResponse | GetSessionExtensionsResponse | ListProvidersResponse | GetProviderDetailsResponse | GetProviderModelsResponse | ReadConfigResponse | CheckSecretResponse | ExportSessionResponse | ImportSessionResponse | unknown;
+  result?: EmptyResponse | GetToolsResponse | ReadResourceResponse | GetExtensionsResponse | GetSessionExtensionsResponse | ListProvidersResponse | GetProviderDetailsResponse | GetProviderModelsResponse | ReadConfigResponse | CheckSecretResponse | ExportSessionResponse | ImportSessionResponse | DictationTranscribeResponse | DictationConfigResponse | DictationModelsListResponse | DictationModelDownloadProgressResponse | unknown;
 } | {
   error: {
     code: number;
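On the wire, DictationDownloadProgress.status stays a plain string; the generated doc comment above records the serde-lowercased DownloadStatus values. A hypothetical consumer-side narrowing helper (the union alias and import path are assumptions, not part of the generated file):

```ts
// Path is illustrative.
import type { DictationDownloadProgress } from "./generated/types.gen.js";

// Values per the generated doc comment; the alias itself is an assumption
// layered on top of the wire type's plain `string`.
type DownloadStatus = "downloading" | "completed" | "failed" | "cancelled";

function isTerminal(p: DictationDownloadProgress): boolean {
  const s = p.status as DownloadStatus;
  return s === "completed" || s === "failed" || s === "cancelled";
}
```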
diff --git a/ui/sdk/src/generated/zod.gen.ts b/ui/sdk/src/generated/zod.gen.ts
index 1679d0ae1585..b48935fb2d5d 100644
--- a/ui/sdk/src/generated/zod.gen.ts
+++ b/ui/sdk/src/generated/zod.gen.ts
@@ -271,6 +271,146 @@ export const zUnarchiveSessionRequest = z.object({
   sessionId: z.string()
 });
 
+/**
+ * Transcribe audio via a dictation provider.
+ */
+export const zDictationTranscribeRequest = z.object({
+  audio: z.string(),
+  mimeType: z.string(),
+  provider: z.string()
+});
+
+/**
+ * Transcription result.
+ */
+export const zDictationTranscribeResponse = z.object({
+  text: z.string()
+});
+
+/**
+ * Get the configuration status of all dictation providers.
+ */
+export const zDictationConfigRequest = z.record(z.unknown());
+
+export const zDictationModelOption = z.object({
+  id: z.string(),
+  label: z.string(),
+  description: z.string()
+});
+
+/**
+ * Per-provider configuration status.
+ */
+export const zDictationProviderStatusEntry = z.object({
+  configured: z.boolean(),
+  host: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  description: z.string(),
+  usesProviderConfig: z.boolean(),
+  settingsPath: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  configKey: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  modelConfigKey: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  defaultModel: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  selectedModel: z.union([
+    z.string(),
+    z.null()
+  ]).optional(),
+  availableModels: z.array(zDictationModelOption).optional().default([])
+});
+
+/**
+ * Dictation config response — map of provider name to status.
+ */
+export const zDictationConfigResponse = z.object({
+  providers: z.record(zDictationProviderStatusEntry)
+});
+
+/**
+ * List available local Whisper models with their download status.
+ */
+export const zDictationModelsListRequest = z.record(z.unknown());
+
+export const zDictationLocalModelStatus = z.object({
+  id: z.string(),
+  label: z.string(),
+  description: z.string(),
+  sizeMb: z.number().int().gte(0),
+  downloaded: z.boolean(),
+  downloadInProgress: z.boolean()
+});
+
+export const zDictationModelsListResponse = z.object({
+  models: z.array(zDictationLocalModelStatus)
+});
+
+/**
+ * Kick off a background download of a local Whisper model.
+ */
+export const zDictationModelDownloadRequest = z.object({
+  modelId: z.string()
+});
+
+/**
+ * Poll the progress of an in-flight download.
+ */
+export const zDictationModelDownloadProgressRequest = z.object({
+  modelId: z.string()
+});
+
+export const zDictationDownloadProgress = z.object({
+  bytesDownloaded: z.number().int().gte(0),
+  totalBytes: z.number().int().gte(0),
+  progressPercent: z.number(),
+  status: z.string(),
+  error: z.union([
+    z.string(),
+    z.null()
+  ]).optional()
+});
+
+export const zDictationModelDownloadProgressResponse = z.object({
+  progress: z.union([
+    zDictationDownloadProgress,
+    z.null()
+  ]).optional()
+});
+
+/**
+ * Cancel an in-flight download.
+ */
+export const zDictationModelCancelRequest = z.object({
+  modelId: z.string()
+});
+
+/**
+ * Delete a downloaded local Whisper model from disk.
+ */
+export const zDictationModelDeleteRequest = z.object({
+  modelId: z.string()
+});
+
+/**
+ * Persist the user's model selection for a given provider.
+ */
+export const zDictationModelSelectRequest = z.object({
+  provider: z.string(),
+  modelId: z.string()
+});
+
 export const zExtRequest = z.object({
   id: z.string(),
   method: z.string(),
@@ -296,7 +436,15 @@ export const zExtRequest = z.object({
     zExportSessionRequest,
     zImportSessionRequest,
     zArchiveSessionRequest,
-    zUnarchiveSessionRequest
+    zUnarchiveSessionRequest,
+    zDictationTranscribeRequest,
+    zDictationConfigRequest,
+    zDictationModelsListRequest,
+    zDictationModelDownloadRequest,
+    zDictationModelDownloadProgressRequest,
+    zDictationModelCancelRequest,
+    zDictationModelDeleteRequest,
+    zDictationModelSelectRequest
   ]),
   z.union([
     z.record(z.unknown()),
@@ -321,7 +469,11 @@ export const zExtResponse = z.union([
     zReadConfigResponse,
     zCheckSecretResponse,
     zExportSessionResponse,
-    zImportSessionResponse
+    zImportSessionResponse,
+    zDictationTranscribeResponse,
+    zDictationConfigResponse,
+    zDictationModelsListResponse,
+    zDictationModelDownloadProgressResponse
  ]),
   z.unknown()
 ]).optional()
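One runtime consequence of the schemas above worth noting: zDictationProviderStatusEntry declares `.optional().default([])` on availableModels, so parsed entries always expose an array even when the agent omits the field. A quick illustrative check (the input literal and the import path are made up):

```ts
// Path is illustrative.
import { zDictationProviderStatusEntry } from "./generated/zod.gen.js";

// Minimal valid input: only the three required fields.
const entry = zDictationProviderStatusEntry.parse({
  configured: false,
  description: "Local Whisper",
  usesProviderConfig: false,
});
console.log(entry.availableModels); // [] — the declared .default([]) applies
console.log(entry.selectedModel); // undefined — optional nullable field omitted
```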