From df517d3ec4194ad01cff9d9b74dc5bf3f6abc3c1 Mon Sep 17 00:00:00 2001 From: jack <> Date: Wed, 25 Jun 2025 22:24:55 +0100 Subject: [PATCH 1/3] feat: add voice dictation with OpenAI Whisper - Add microphone button that appears when OpenAI is configured - Implement real-time waveform visualization during recording - Add backend /audio/transcribe endpoint with security measures: - 25MB file size limit with 413 status code - 30-second timeout for API calls - Proper authentication via X-Secret-Key - Add visual feedback during transcription - Show recording duration and estimated file size - Warn users when approaching 25MB limit - Auto-stop recording at 10 minutes or 25MB - Add comprehensive integration tests - Fix ESLint configuration and MessageCopyLink warning Security: API keys remain backend-only, no frontend exposure --- Cargo.lock | 1 + crates/goose-server/Cargo.toml | 2 +- crates/goose-server/src/routes/audio.rs | 273 ++++++++++++++++++ crates/goose-server/src/routes/mod.rs | 2 + ui/desktop/.eslintrc.json | 3 +- ui/desktop/eslint.config.js | 8 + ui/desktop/src/components/ChatInput.tsx | 185 +++++++++--- ui/desktop/src/components/MessageCopyLink.tsx | 2 +- .../src/components/WaveformVisualizer.tsx | 113 ++++++++ .../src/components/icons/Microphone.tsx | 48 +++ ui/desktop/src/components/icons/index.tsx | 2 + ui/desktop/src/hooks/useWhisper.ts | 235 +++++++++++++++ 12 files changed, 834 insertions(+), 40 deletions(-) create mode 100644 crates/goose-server/src/routes/audio.rs create mode 100644 ui/desktop/src/components/WaveformVisualizer.tsx create mode 100644 ui/desktop/src/components/icons/Microphone.tsx create mode 100644 ui/desktop/src/hooks/useWhisper.ts diff --git a/Cargo.lock b/Cargo.lock index 66eecd729f60..33c5a83cdff6 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7020,6 +7020,7 @@ dependencies = [ "js-sys", "log", "mime", + "mime_guess", "once_cell", "percent-encoding", "pin-project-lite", diff --git a/crates/goose-server/Cargo.toml 
b/crates/goose-server/Cargo.toml index a343ce5a1af5..2d58367d1378 100644 --- a/crates/goose-server/Cargo.toml +++ b/crates/goose-server/Cargo.toml @@ -37,7 +37,7 @@ serde_yaml = "0.9.34" axum-extra = "0.10.0" utoipa = { version = "4.1", features = ["axum_extras", "chrono"] } dirs = "6.0.0" -reqwest = { version = "0.12.9", features = ["json", "rustls-tls", "blocking"], default-features = false } +reqwest = { version = "0.12.9", features = ["json", "rustls-tls", "blocking", "multipart"], default-features = false } [[bin]] name = "goosed" diff --git a/crates/goose-server/src/routes/audio.rs b/crates/goose-server/src/routes/audio.rs new file mode 100644 index 000000000000..ed358b8ad257 --- /dev/null +++ b/crates/goose-server/src/routes/audio.rs @@ -0,0 +1,273 @@ +/// Audio transcription route handler +/// +/// This module provides endpoints for audio transcription using OpenAI's Whisper API. +/// The OpenAI API key must be configured in the backend for this to work. +use super::utils::verify_secret_key; +use crate::state::AppState; +use axum::{ + extract::State, + http::{HeaderMap, StatusCode}, + routing::post, + Json, Router, +}; +use base64::{engine::general_purpose::STANDARD as BASE64, Engine}; +use reqwest::Client; +use serde::{Deserialize, Serialize}; +use std::sync::Arc; +use std::time::Duration; + +// Constants +const MAX_AUDIO_SIZE_BYTES: usize = 25 * 1024 * 1024; // 25MB +const OPENAI_TIMEOUT_SECONDS: u64 = 30; + +#[derive(Debug, Deserialize)] +struct TranscribeRequest { + audio: String, // Base64 encoded audio data + mime_type: String, +} + +#[derive(Debug, Serialize)] +struct TranscribeResponse { + text: String, +} + +#[derive(Debug, Deserialize)] +struct WhisperResponse { + text: String, +} + +/// Transcribe audio using OpenAI's Whisper API +/// +/// # Request +/// - `audio`: Base64 encoded audio data +/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav") +/// +/// # Response +/// - `text`: Transcribed text from the audio +/// +/// # 
Errors +/// - 401: Unauthorized (missing or invalid X-Secret-Key header) +/// - 412: Precondition Failed (OpenAI API key not configured) +/// - 400: Bad Request (invalid base64 audio data) +/// - 413: Payload Too Large (audio file exceeds 25MB limit) +/// - 415: Unsupported Media Type (unsupported audio format) +/// - 502: Bad Gateway (OpenAI API returned an error status); 500: Internal Server Error (HTTP client setup or OpenAI response parsing failed) +/// - 503: Service Unavailable (network error); 504: Gateway Timeout (OpenAI request exceeded the 30-second timeout) +async fn transcribe_handler( + State(state): State>, + headers: HeaderMap, + Json(request): Json, +) -> Result, StatusCode> { + verify_secret_key(&headers, &state)?; + + // Get the OpenAI API key from config + let config = goose::config::Config::global(); + let api_key: String = config + .get_secret("OPENAI_API_KEY") + .map_err(|_| StatusCode::PRECONDITION_FAILED)?; + + // Decode the base64 audio data + let audio_bytes = BASE64 + .decode(&request.audio) + .map_err(|_| StatusCode::BAD_REQUEST)?; + + // Check file size + if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES { + tracing::warn!( + "Audio file too large: {} bytes (max: {} bytes)", + audio_bytes.len(), + MAX_AUDIO_SIZE_BYTES + ); + return Err(StatusCode::PAYLOAD_TOO_LARGE); + } + + // Determine file extension based on MIME type + let file_extension = match request.mime_type.as_str() { + "audio/webm" => "webm", + "audio/mp4" => "mp4", + "audio/mpeg" => "mp3", + "audio/mpga" => "mpga", + "audio/m4a" => "m4a", + "audio/wav" => "wav", + "audio/x-wav" => "wav", + _ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE), + }; + + // Create a multipart form with the audio file + let part = reqwest::multipart::Part::bytes(audio_bytes) + .file_name(format!("audio.{}", file_extension)) + .mime_str(&request.mime_type) + .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; + + let form = reqwest::multipart::Form::new() + .part("file", part) + .text("model", "whisper-1") + .text("response_format", "json"); + + // Make request to OpenAI Whisper API + let client = Client::builder() + .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS)) +
.build() + .map_err(|e| { + tracing::error!("Failed to create HTTP client: {}", e); + StatusCode::INTERNAL_SERVER_ERROR + })?; + + let response = client + .post("https://api.openai.com/v1/audio/transcriptions") + .header("Authorization", format!("Bearer {}", api_key)) + .multipart(form) + .send() + .await + .map_err(|e| { + if e.is_timeout() { + tracing::error!( + "OpenAI API request timed out after {}s", + OPENAI_TIMEOUT_SECONDS + ); + StatusCode::GATEWAY_TIMEOUT + } else { + tracing::error!("Failed to send request to OpenAI: {}", e); + StatusCode::SERVICE_UNAVAILABLE + } + })?; + + if !response.status().is_success() { + let error_text = response.text().await.unwrap_or_default(); + tracing::error!("OpenAI API error: {}", error_text); + return Err(StatusCode::BAD_GATEWAY); + } + + let whisper_response: WhisperResponse = response.json().await.map_err(|e| { + tracing::error!("Failed to parse OpenAI response: {}", e); + StatusCode::INTERNAL_SERVER_ERROR + })?; + + Ok(Json(TranscribeResponse { + text: whisper_response.text, + })) +} + +pub fn routes(state: Arc) -> Router { + Router::new() + .route("/audio/transcribe", post(transcribe_handler)) + .with_state(state) +} + +#[cfg(test)] +mod tests { + use super::*; + use axum::{body::Body, http::Request}; + use tower::ServiceExt; + + #[tokio::test] + async fn test_transcribe_endpoint_requires_auth() { + let state = AppState::new( + Arc::new(goose::agents::Agent::new()), + "test-secret".to_string(), + ) + .await; + let app = routes(state); + + // Test without auth header + let request = Request::builder() + .uri("/audio/transcribe") + .method("POST") + .header("content-type", "application/json") + .body(Body::from( + serde_json::to_string(&serde_json::json!({ + "audio": "dGVzdA==", + "mime_type": "audio/webm" + })) + .unwrap(), + )) + .unwrap(); + + let response = app.oneshot(request).await.unwrap(); + assert_eq!(response.status(), StatusCode::UNAUTHORIZED); + } + + #[tokio::test] + async fn 
test_transcribe_endpoint_validates_size() { + let state = AppState::new( + Arc::new(goose::agents::Agent::new()), + "test-secret".to_string(), + ) + .await; + let app = routes(state); + + // Create a large base64 string (simulating > 25MB audio) + let large_audio = BASE64.encode(vec![0u8; MAX_AUDIO_SIZE_BYTES + 1]); + + let request = Request::builder() + .uri("/audio/transcribe") + .method("POST") + .header("content-type", "application/json") + .header("x-secret-key", "test-secret") + .body(Body::from( + serde_json::to_string(&serde_json::json!({ + "audio": large_audio, + "mime_type": "audio/webm" + })) + .unwrap(), + )) + .unwrap(); + + let response = app.oneshot(request).await.unwrap(); + assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE); + } + + #[tokio::test] + async fn test_transcribe_endpoint_validates_mime_type() { + let state = AppState::new( + Arc::new(goose::agents::Agent::new()), + "test-secret".to_string(), + ) + .await; + let app = routes(state); + + let request = Request::builder() + .uri("/audio/transcribe") + .method("POST") + .header("content-type", "application/json") + .header("x-secret-key", "test-secret") + .body(Body::from( + serde_json::to_string(&serde_json::json!({ + "audio": "dGVzdA==", + "mime_type": "application/pdf" // Invalid MIME type + })) + .unwrap(), + )) + .unwrap(); + + let response = app.oneshot(request).await.unwrap(); + assert_eq!(response.status(), StatusCode::UNSUPPORTED_MEDIA_TYPE); + } + + #[tokio::test] + async fn test_transcribe_endpoint_handles_invalid_base64() { + let state = AppState::new( + Arc::new(goose::agents::Agent::new()), + "test-secret".to_string(), + ) + .await; + let app = routes(state); + + let request = Request::builder() + .uri("/audio/transcribe") + .method("POST") + .header("content-type", "application/json") + .header("x-secret-key", "test-secret") + .body(Body::from( + serde_json::to_string(&serde_json::json!({ + "audio": "invalid-base64-!@#$%", + "mime_type": "audio/webm" + })) + 
.unwrap(), + )) + .unwrap(); + + let response = app.oneshot(request).await.unwrap(); + assert_eq!(response.status(), StatusCode::BAD_REQUEST); + } +} diff --git a/crates/goose-server/src/routes/mod.rs b/crates/goose-server/src/routes/mod.rs index 89e46f23c7d2..c5e662ec2c16 100644 --- a/crates/goose-server/src/routes/mod.rs +++ b/crates/goose-server/src/routes/mod.rs @@ -1,5 +1,6 @@ // Export route modules pub mod agent; +pub mod audio; pub mod config_management; pub mod context; pub mod extension; @@ -19,6 +20,7 @@ pub fn configure(state: Arc) -> Router { .merge(health::routes()) .merge(reply::routes(state.clone())) .merge(agent::routes(state.clone())) + .merge(audio::routes(state.clone())) .merge(context::routes(state.clone())) .merge(extension::routes(state.clone())) .merge(config_management::routes(state.clone())) diff --git a/ui/desktop/.eslintrc.json b/ui/desktop/.eslintrc.json index 5603e4fdf217..0be9c263a561 100644 --- a/ui/desktop/.eslintrc.json +++ b/ui/desktop/.eslintrc.json @@ -2,7 +2,8 @@ "root": true, "env": { "browser": true, - "es2020": true + "es2020": true, + "node": true }, "extends": [ "eslint:recommended", diff --git a/ui/desktop/eslint.config.js b/ui/desktop/eslint.config.js index 42b7e3f7f3cf..38b447c453e9 100644 --- a/ui/desktop/eslint.config.js +++ b/ui/desktop/eslint.config.js @@ -70,6 +70,7 @@ module.exports = [ HTMLTextAreaElement: 'readonly', HTMLButtonElement: 'readonly', HTMLDivElement: 'readonly', + HTMLCanvasElement: 'readonly', File: 'readonly', FileList: 'readonly', FileReader: 'readonly', @@ -87,10 +88,17 @@ module.exports = [ React: 'readonly', handleAction: 'readonly', requestAnimationFrame: 'readonly', + cancelAnimationFrame: 'readonly', ResizeObserver: 'readonly', MutationObserver: 'readonly', NodeFilter: 'readonly', Text: 'readonly', + AudioContext: 'readonly', + AnalyserNode: 'readonly', + MediaRecorder: 'readonly', + MediaStream: 'readonly', + Blob: 'readonly', + FormData: 'readonly', }, }, plugins: { diff --git 
a/ui/desktop/src/components/ChatInput.tsx b/ui/desktop/src/components/ChatInput.tsx index 09cb2adeda31..50d670e59028 100644 --- a/ui/desktop/src/components/ChatInput.tsx +++ b/ui/desktop/src/components/ChatInput.tsx @@ -2,11 +2,14 @@ import React, { useRef, useState, useEffect, useMemo } from 'react'; import { Button } from './ui/button'; import type { View } from '../App'; import Stop from './ui/Stop'; -import { Attach, Send, Close } from './icons'; +import { Attach, Send, Close, Microphone } from './icons'; import { debounce } from 'lodash'; import BottomMenu from './bottom_menu/BottomMenu'; import { LocalMessageStorage } from '../utils/localMessageStorage'; import { Message } from '../types/message'; +import { useWhisper } from '../hooks/useWhisper'; +import { WaveformVisualizer } from './WaveformVisualizer'; +import { toastError } from '../toasts'; interface PastedImage { id: string; @@ -51,6 +54,39 @@ export default function ChatInput({ const [isFocused, setIsFocused] = useState(false); const [pastedImages, setPastedImages] = useState([]); + // Whisper hook for voice dictation + const { + isRecording, + isTranscribing, + hasOpenAIKey, + audioContext, + analyser, + startRecording, + stopRecording, + recordingDuration, + estimatedSize, + } = useWhisper({ + onTranscription: (text) => { + // Append transcribed text to the current input + const newValue = displayValue.trim() ? `${displayValue.trim()} ${text}` : text; + setDisplayValue(newValue); + setValue(newValue); + textAreaRef.current?.focus(); + }, + onError: (error) => { + toastError({ + title: 'Dictation Error', + msg: error.message, + }); + }, + onSizeWarning: (sizeMB) => { + toastError({ + title: 'Recording Size Warning', + msg: `Recording is ${sizeMB.toFixed(1)}MB. Maximum size is 25MB.`, + }); + }, + }); + // Update internal value when initialValue changes useEffect(() => { setValue(initialValue); @@ -439,28 +475,40 @@ export default function ChatInput({ } bg-bgApp z-10`} >
-