diff --git a/Cargo.lock b/Cargo.lock
index f77b5dbff72..79694835e48 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2545,7 +2545,7 @@ dependencies = [
 
 [[package]]
 name = "goose"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "ahash",
  "anyhow",
@@ -2619,7 +2619,7 @@ dependencies = [
 
 [[package]]
 name = "goose-bench"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -2642,7 +2642,7 @@ dependencies = [
 
 [[package]]
 name = "goose-cli"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "anstream",
  "anyhow",
@@ -2694,7 +2694,7 @@ dependencies = [
 
 [[package]]
 name = "goose-mcp"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -2743,7 +2743,7 @@ dependencies = [
 
 [[package]]
 name = "goose-server"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "anyhow",
  "async-trait",
@@ -2781,7 +2781,7 @@ dependencies = [
 
 [[package]]
 name = "goose-test"
-version = "1.6.0"
+version = "1.7.0"
 dependencies = [
  "clap",
  "serde_json",
diff --git a/Cargo.toml b/Cargo.toml
index 4f4356575d3..c6c6e55e46e 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -4,7 +4,7 @@ resolver = "2"
 
 [workspace.package]
 edition = "2021"
-version = "1.6.0"
+version = "1.7.0"
 authors = ["Block "]
 license = "Apache-2.0"
 repository = "https://github.com/block/goose"
diff --git a/crates/goose-server/src/routes/audio.rs b/crates/goose-server/src/routes/audio.rs
index e0071249b79..8b2046f2af5 100644
--- a/crates/goose-server/src/routes/audio.rs
+++ b/crates/goose-server/src/routes/audio.rs
@@ -42,35 +42,13 @@ struct WhisperResponse {
     text: String,
 }
 
-/// Transcribe audio using OpenAI's Whisper API
-///
-/// # Request
-/// - `audio`: Base64 encoded audio data
-/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
-///
-/// # Response
-/// - `text`: Transcribed text from the audio
-///
-/// # Errors
-/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
-/// - 412: Precondition Failed (OpenAI API key not configured)
-/// - 400: Bad Request (invalid base64 audio data)
-/// - 413: Payload Too Large (audio file exceeds 25MB limit)
-/// - 415: Unsupported Media Type (unsupported audio format)
-/// - 502: Bad Gateway (OpenAI API error)
-/// - 503: Service Unavailable (network error)
-async fn transcribe_handler(
-    State(state): State<Arc<AppState>>,
-    headers: HeaderMap,
-    Json(request): Json<TranscribeRequest>,
-) -> Result<Json<TranscribeResponse>, StatusCode> {
-    verify_secret_key(&headers, &state)?;
-
-    // Validate input first before checking API key configuration
+/// Validate audio input and return decoded bytes and file extension
+fn validate_audio_input(
+    audio: &str,
+    mime_type: &str,
+) -> Result<(Vec<u8>, &'static str), StatusCode> {
     // Decode the base64 audio data
-    let audio_bytes = BASE64
-        .decode(&request.audio)
-        .map_err(|_| StatusCode::BAD_REQUEST)?;
+    let audio_bytes = BASE64.decode(audio).map_err(|_| StatusCode::BAD_REQUEST)?;
 
     // Check file size
     if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
@@ -83,8 +61,9 @@ async fn transcribe_handler(
     }
 
     // Determine file extension based on MIME type
-    let file_extension = match request.mime_type.as_str() {
+    let file_extension = match mime_type {
         "audio/webm" => "webm",
+        "audio/webm;codecs=opus" => "webm",
         "audio/mp4" => "mp4",
         "audio/mpeg" => "mp3",
         "audio/mpga" => "mpga",
@@ -94,13 +73,18 @@ async fn transcribe_handler(
         _ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
     };
 
-    // Get the OpenAI API key from config (after input validation)
+    Ok((audio_bytes, file_extension))
+}
+
+/// Get OpenAI configuration (API key and host)
+fn get_openai_config() -> Result<(String, String), StatusCode> {
     let config = goose::config::Config::global();
-    let api_key: String = config
-        .get_secret("OPENAI_API_KEY")
-        .map_err(|_| StatusCode::PRECONDITION_FAILED)?;
+    let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| {
+        tracing::error!("Failed to get OpenAI API key: {:?}", e);
+        StatusCode::PRECONDITION_FAILED
+    })?;
 
-    // Get the OpenAI host from config (with default)
     let openai_host = match config.get("OPENAI_HOST", false) {
         Ok(value) => value
             .as_str()
@@ -109,19 +93,41 @@
         Err(_) => "https://api.openai.com".to_string(),
     };
 
-    tracing::debug!("Using OpenAI host: {}", openai_host);
+    Ok((api_key, openai_host))
+}
+
+/// Send transcription request to OpenAI Whisper API
+async fn send_openai_request(
+    audio_bytes: Vec<u8>,
+    file_extension: &str,
+    mime_type: &str,
+    api_key: &str,
+    openai_host: &str,
+) -> Result<WhisperResponse, StatusCode> {
+    tracing::info!("Using OpenAI host: {}", openai_host);
+    tracing::info!(
+        "Audio file size: {} bytes, extension: {}, mime_type: {}",
+        audio_bytes.len(),
+        file_extension,
+        mime_type
+    );
 
     // Create a multipart form with the audio file
     let part = reqwest::multipart::Part::bytes(audio_bytes)
         .file_name(format!("audio.{}", file_extension))
-        .mime_str(&request.mime_type)
-        .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
+        .mime_str(mime_type)
+        .map_err(|e| {
+            tracing::error!("Failed to create multipart part: {:?}", e);
+            StatusCode::INTERNAL_SERVER_ERROR
+        })?;
 
     let form = reqwest::multipart::Form::new()
         .part("file", part)
         .text("model", "whisper-1")
         .text("response_format", "json");
 
+    tracing::info!("Created multipart form for OpenAI Whisper API");
+
     // Make request to OpenAI Whisper API
     let client = Client::builder()
         .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
@@ -131,6 +137,11 @@
             StatusCode::INTERNAL_SERVER_ERROR
         })?;
 
+    tracing::info!(
+        "Sending request to OpenAI: {}/v1/audio/transcriptions",
+        openai_host
+    );
+
     let response = client
         .post(format!("{}/v1/audio/transcriptions", openai_host))
         .header("Authorization", format!("Bearer {}", api_key))
@@ -150,9 +161,25 @@
         }
     })?;
 
+    tracing::info!(
+        "Received response from OpenAI with status: {}",
+        response.status()
+    );
+
     if !response.status().is_success() {
+        let status = response.status();
         let error_text = response.text().await.unwrap_or_default();
-        tracing::error!("OpenAI API error: {}", error_text);
+        tracing::error!("OpenAI API error (status: {}): {}", status, error_text);
+
+        // Check for specific error codes
+        if status == 401 {
+            tracing::error!("OpenAI API key appears to be invalid or unauthorized");
+            return Err(StatusCode::UNAUTHORIZED);
+        } else if status == 429 {
+            tracing::error!("OpenAI API quota or rate limit exceeded");
+            return Err(StatusCode::TOO_MANY_REQUESTS);
+        }
+
         return Err(StatusCode::BAD_GATEWAY);
     }
 
@@ -161,6 +188,45 @@
         StatusCode::INTERNAL_SERVER_ERROR
     })?;
 
+    Ok(whisper_response)
+}
+
+/// Transcribe audio using OpenAI's Whisper API
+///
+/// # Request
+/// - `audio`: Base64 encoded audio data
+/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
+///
+/// # Response
+/// - `text`: Transcribed text from the audio
+///
+/// # Errors
+/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
+/// - 412: Precondition Failed (OpenAI API key not configured)
+/// - 400: Bad Request (invalid base64 audio data)
+/// - 413: Payload Too Large (audio file exceeds 25MB limit)
+/// - 415: Unsupported Media Type (unsupported audio format)
+/// - 502: Bad Gateway (OpenAI API error)
+/// - 503: Service Unavailable (network error)
+async fn transcribe_handler(
+    State(state): State<Arc<AppState>>,
+    headers: HeaderMap,
+    Json(request): Json<TranscribeRequest>,
+) -> Result<Json<TranscribeResponse>, StatusCode> {
+    verify_secret_key(&headers, &state)?;
+
+    let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
+    let (api_key, openai_host) = get_openai_config()?;
+
+    let whisper_response = send_openai_request(
+        audio_bytes,
+        file_extension,
+        &request.mime_type,
+        &api_key,
+        &openai_host,
+    )
+    .await?;
+
     Ok(Json(TranscribeResponse {
         text: whisper_response.text,
     }))
@@ -177,39 +243,13 @@ async fn transcribe_elevenlabs_handler(
 ) -> Result<Json<TranscribeResponse>, StatusCode> {
     verify_secret_key(&headers, &state)?;
 
-    // Validate input first before checking API key configuration
-    // Decode the base64 audio data
-    let audio_bytes = BASE64
-        .decode(&request.audio)
-        .map_err(|_| StatusCode::BAD_REQUEST)?;
-
-    // Check file size
-    if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
-        tracing::warn!(
-            "Audio file too large: {} bytes (max: {} bytes)",
-            audio_bytes.len(),
-            MAX_AUDIO_SIZE_BYTES
-        );
-        return Err(StatusCode::PAYLOAD_TOO_LARGE);
-    }
-
-    // Determine file extension and content type based on MIME type
-    let (file_extension, content_type) = match request.mime_type.as_str() {
-        "audio/webm" => ("webm", "audio/webm"),
-        "audio/mp4" => ("mp4", "audio/mp4"),
-        "audio/mpeg" => ("mp3", "audio/mpeg"),
-        "audio/mpga" => ("mp3", "audio/mpeg"),
-        "audio/m4a" => ("m4a", "audio/m4a"),
-        "audio/wav" => ("wav", "audio/wav"),
-        "audio/x-wav" => ("wav", "audio/wav"),
-        _ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
-    };
+    let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
 
     // Get the ElevenLabs API key from config (after input validation)
     let config = goose::config::Config::global();
 
     // First try to get it as a secret
-    let api_key: String = match config.get_secret("ELEVENLABS_API_KEY") {
+    let api_key: String = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
         Ok(key) => key,
         Err(_) => {
             // Try to get it as non-secret (for backward compatibility)
@@ -217,7 +257,6 @@
                 Ok(value) => {
                     match value.as_str() {
                         Some(key_str) => {
-                            tracing::info!("Migrating ElevenLabs API key to secret storage");
                             let key = key_str.to_string();
                             // Migrate to secret storage
                             if let Err(e) = config.set(
@@ -228,17 +267,25 @@
                                 tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e);
                             }
                             // Delete the non-secret version
-                            let _ = config.delete("ELEVENLABS_API_KEY");
+                            if let Err(e) = config.delete("ELEVENLABS_API_KEY") {
+                                tracing::warn!(
+                                    "Failed to delete non-secret ElevenLabs API key: {:?}",
+                                    e
+                                );
+                            }
                             key
                         }
                         None => {
-                            tracing::error!("ElevenLabs API key is not a string");
+                            tracing::error!(
+                                "ElevenLabs API key is not a string, found: {:?}",
+                                value
+                            );
                             return Err(StatusCode::PRECONDITION_FAILED);
                         }
                     }
                 }
-                Err(e) => {
-                    tracing::error!("Failed to get ElevenLabs API key from config: {:?}", e);
+                Err(_) => {
+                    tracing::error!("No ElevenLabs API key found in configuration");
                     return Err(StatusCode::PRECONDITION_FAILED);
                 }
             }
@@ -248,7 +295,7 @@
     // Create multipart form for ElevenLabs API
     let part = reqwest::multipart::Part::bytes(audio_bytes)
         .file_name(format!("audio.{}", file_extension))
-        .mime_str(content_type)
+        .mime_str(&request.mime_type)
         .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
 
     let form = reqwest::multipart::Form::new()
@@ -286,8 +333,9 @@
     })?;
 
     if !response.status().is_success() {
+        let status = response.status();
         let error_text = response.text().await.unwrap_or_default();
-        tracing::error!("ElevenLabs API error: {}", error_text);
+        tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text);
 
         // Check for specific error codes
         if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") {
@@ -330,16 +378,13 @@ async fn check_dictation_config(
     let config = goose::config::Config::global();
 
     // Check if ElevenLabs API key is configured
-    let has_elevenlabs = config
-        .get_secret::<String>("ELEVENLABS_API_KEY")
-        .map(|_| true)
-        .unwrap_or_else(|_| {
+    let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
+        Ok(_) => true,
+        Err(_) => {
             // Check non-secret for backward compatibility
-            config
-                .get("ELEVENLABS_API_KEY", false)
-                .map(|_| true)
-                .unwrap_or(false)
-        });
+            config.get("ELEVENLABS_API_KEY", false).is_ok()
+        }
+    };
 
     Ok(Json(serde_json::json!({
         "elevenlabs": has_elevenlabs
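For orientation while reviewing the refactor above: the split into `validate_audio_input`, `get_openai_config`, and `send_openai_request` leaves the wire contract unchanged. The sketch below restates that contract in TypeScript, as implied by the handler doc comments and the `TranscribeRequest`/`TranscribeResponse` types in this diff; it is illustrative only (not the generated `openapi.json` client), and the error map simply mirrors the documented status codes plus the new 401/429 pass-throughs.

```typescript
// Sketch of the transcription route contract implied by audio.rs above.
interface TranscribeRequest {
  /** Base64-encoded audio; decoded server-side, rejected over the 25MB limit (413). */
  audio: string;
  /** MIME type, e.g. 'audio/webm' or the newly accepted 'audio/webm;codecs=opus'. */
  mime_type: string;
}

interface TranscribeResponse {
  /** Transcribed text returned by Whisper (or ElevenLabs on the sibling route). */
  text: string;
}

// Documented error statuses, including the 401/429 cases that
// send_openai_request now forwards from OpenAI's responses.
const transcribeErrors: Record<number, string> = {
  400: 'invalid base64 audio data',
  401: 'missing/invalid X-Secret-Key, or OpenAI rejected the API key',
  412: 'OpenAI API key not configured',
  413: 'audio file exceeds the 25MB limit',
  415: 'unsupported audio format',
  429: 'OpenAI quota or rate limit exceeded',
  502: 'OpenAI API error',
  503: 'network error reaching OpenAI',
};
```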
diff --git a/crates/goose-server/src/routes/config_management.rs b/crates/goose-server/src/routes/config_management.rs
index 0224cc79565..570ae5bf349 100644
--- a/crates/goose-server/src/routes/config_management.rs
+++ b/crates/goose-server/src/routes/config_management.rs
@@ -167,6 +167,7 @@ pub async fn read_config(
     }
 
     let config = Config::global();
+
     let response_value = match config.get(&query.key, query.is_secret) {
         Ok(value) => {
             if query.is_secret {
@@ -182,7 +183,9 @@
                 Value::Null
             }
         }
-        Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR),
+        Err(_) => {
+            return Err(StatusCode::INTERNAL_SERVER_ERROR);
+        }
     };
     Ok(Json(response_value))
 }
diff --git a/ui/desktop/openapi.json b/ui/desktop/openapi.json
index 61542bf46a3..a64e713defb 100644
--- a/ui/desktop/openapi.json
+++ b/ui/desktop/openapi.json
@@ -10,7 +10,7 @@
     "license": {
       "name": "Apache-2.0"
     },
-    "version": "1.6.0"
+    "version": "1.7.0"
   },
   "paths": {
     "/agent/add_sub_recipes": {
diff --git a/ui/desktop/package-lock.json b/ui/desktop/package-lock.json
index 4f34a9948aa..bdf4cd2308e 100644
--- a/ui/desktop/package-lock.json
+++ b/ui/desktop/package-lock.json
@@ -1,12 +1,12 @@
 {
   "name": "goose-app",
-  "version": "1.6.0",
+  "version": "1.7.0",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "goose-app",
-      "version": "1.6.0",
+      "version": "1.7.0",
       "license": "Apache-2.0",
       "dependencies": {
         "@ai-sdk/openai": "^2.0.14",
diff --git a/ui/desktop/package.json b/ui/desktop/package.json
index 0b59e03d532..93f45f1ef55 100644
--- a/ui/desktop/package.json
+++ b/ui/desktop/package.json
@@ -1,7 +1,7 @@
 {
   "name": "goose-app",
   "productName": "Goose",
-  "version": "1.6.0",
+  "version": "1.7.0",
   "description": "Goose App",
   "engines": {
     "node": "^22.17.1"
diff --git a/ui/desktop/src/components/ChatInput.tsx b/ui/desktop/src/components/ChatInput.tsx
index f1b69b155bf..dba2f008004 100644
--- a/ui/desktop/src/components/ChatInput.tsx
+++ b/ui/desktop/src/components/ChatInput.tsx
@@ -1259,8 +1259,8 @@ export default function ChatInput({
         {/* Inline action buttons on the right */}
         <div className="flex items-center gap-1">
-          {/* Microphone button - show if dictation is enabled, disable if not configured */}
-          {(dictationSettings?.enabled || dictationSettings?.provider === null) && (
+          {/* Microphone button - show only if dictation is enabled */}
+          {dictationSettings?.enabled && (
             <>
               {!canUseDictation ? (
diff --git a/ui/desktop/src/components/settings/dictation/DictationSection.tsx b/ui/desktop/src/components/settings/dictation/DictationSection.tsx
index 23f8e16b5fb..f28baa79f3e 100644
--- a/ui/desktop/src/components/settings/dictation/DictationSection.tsx
+++ b/ui/desktop/src/components/settings/dictation/DictationSection.tsx
@@ -4,9 +4,11 @@ import { ChevronDown } from 'lucide-react';
 import { Input } from '../../ui/input';
 import { useConfig } from '../../ConfigContext';
 import { DictationProvider, DictationSettings } from '../../../hooks/useDictationSettings';
-
-const DICTATION_SETTINGS_KEY = 'dictation_settings';
-const ELEVENLABS_API_KEY = 'ELEVENLABS_API_KEY';
+import {
+  DICTATION_SETTINGS_KEY,
+  ELEVENLABS_API_KEY,
+  getDefaultDictationSettings,
+} from '../../../hooks/dictationConstants';
 
 export default function DictationSection() {
   const [settings, setSettings] = useState<DictationSettings>({
@@ -27,20 +29,19 @@
   useEffect(() => {
     const loadSettings = async () => {
       const savedSettings = localStorage.getItem(DICTATION_SETTINGS_KEY);
+
+      let loadedSettings: DictationSettings;
+
       if (savedSettings) {
         const parsed = JSON.parse(savedSettings);
-        setSettings(parsed);
-        setShowElevenLabsKey(parsed.provider === 'elevenlabs');
+        loadedSettings = parsed;
       } else {
-        // Default settings
-        const defaultSettings: DictationSettings = {
-          enabled: true,
-          provider: 'openai',
-        };
-        setSettings(defaultSettings);
-        localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(defaultSettings));
+        loadedSettings = await getDefaultDictationSettings(getProviders);
       }
 
+      setSettings(loadedSettings);
+      setShowElevenLabsKey(loadedSettings.provider === 'elevenlabs');
+
       // Load ElevenLabs API key from storage
       setIsLoadingKey(true);
       try {
@@ -58,7 +59,7 @@
     };
 
     loadSettings();
-  }, [read]);
+  }, [read, getProviders]);
 
   // Save ElevenLabs key on unmount if it has changed
   useEffect(() => {
@@ -109,6 +110,7 @@
   };
 
   const saveSettings = (newSettings: DictationSettings) => {
+    console.log('Saving dictation settings to localStorage:', newSettings);
     setSettings(newSettings);
     localStorage.setItem(DICTATION_SETTINGS_KEY, JSON.stringify(newSettings));
   };
@@ -130,18 +132,26 @@
   const handleElevenLabsKeyChange = (key: string) => {
     setElevenLabsApiKey(key);
     elevenLabsApiKeyRef.current = key;
+    // If user starts typing, they're updating the key
+    if (key.length > 0) {
+      setHasElevenLabsKey(false); // Hide "configured" while typing
+    }
   };
 
   const saveElevenLabsKey = async () => {
     // Save to secure storage
     try {
       if (elevenLabsApiKey.trim()) {
+        console.log('Saving ElevenLabs API key to secure storage...');
         await upsert(ELEVENLABS_API_KEY, elevenLabsApiKey, true);
         setHasElevenLabsKey(true);
+        console.log('ElevenLabs API key saved successfully');
       } else {
         // If key is empty, remove it from storage
+        console.log('Removing ElevenLabs API key from secure storage...');
         await upsert(ELEVENLABS_API_KEY, null, true);
         setHasElevenLabsKey(false);
+        console.log('ElevenLabs API key removed successfully');
       }
     } catch (error) {
       console.error('Error saving ElevenLabs API key:', error);
diff --git a/ui/desktop/src/hooks/dictationConstants.ts b/ui/desktop/src/hooks/dictationConstants.ts
new file mode 100644
index 00000000000..972ba08b6cf
--- /dev/null
+++ b/ui/desktop/src/hooks/dictationConstants.ts
@@ -0,0 +1,25 @@
+import { DictationSettings, DictationProvider } from './useDictationSettings';
+
+export const DICTATION_SETTINGS_KEY = 'dictation_settings';
+export const ELEVENLABS_API_KEY = 'ELEVENLABS_API_KEY';
+
+export const getDefaultDictationSettings = async (
+  getProviders: (refresh: boolean) => Promise<Array<{ name: string; is_configured: boolean }>>
+): Promise<DictationSettings> => {
+  const providers = await getProviders(false);
+
+  // Check if we have an OpenAI API key as primary default
+  const openAIProvider = providers.find((p) => p.name === 'openai');
+
+  if (openAIProvider && openAIProvider.is_configured) {
+    return {
+      enabled: true,
+      provider: 'openai' as DictationProvider,
+    };
+  } else {
+    return {
+      enabled: false,
+      provider: null as DictationProvider,
+    };
+  }
+};
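The new `getDefaultDictationSettings` helper is now the single source of the default dictation behavior for both the settings panel above and the `useDictationSettings` hook below. A minimal way to exercise both of its branches is to stub `getProviders`; this sketch assumes a Jest-style `test`/`expect` harness, which is not part of this change:

```typescript
import { getDefaultDictationSettings } from './dictationConstants';

// Hypothetical stub for ConfigContext's getProviders, shaped after the two
// fields the resolver actually reads: name and is_configured.
const stubProviders =
  (isConfigured: boolean) =>
  async (_refresh: boolean) => [{ name: 'openai', is_configured: isConfigured }];

test('defaults to OpenAI dictation when an OpenAI provider is configured', async () => {
  await expect(getDefaultDictationSettings(stubProviders(true))).resolves.toEqual({
    enabled: true,
    provider: 'openai',
  });
});

test('disables dictation when OpenAI is not configured', async () => {
  await expect(getDefaultDictationSettings(stubProviders(false))).resolves.toEqual({
    enabled: false,
    provider: null,
  });
});
```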
diff --git a/ui/desktop/src/hooks/useDictationSettings.ts b/ui/desktop/src/hooks/useDictationSettings.ts
index 57ab0abb890..d6fe046b5a4 100644
--- a/ui/desktop/src/hooks/useDictationSettings.ts
+++ b/ui/desktop/src/hooks/useDictationSettings.ts
@@ -1,5 +1,10 @@
 import { useState, useEffect } from 'react';
 import { useConfig } from '../components/ConfigContext';
+import {
+  DICTATION_SETTINGS_KEY,
+  ELEVENLABS_API_KEY,
+  getDefaultDictationSettings,
+} from './dictationConstants';
 
 export type DictationProvider = 'openai' | 'elevenlabs' | null;
 
@@ -8,9 +13,6 @@ export interface DictationSettings {
   provider: DictationProvider;
 }
 
-const DICTATION_SETTINGS_KEY = 'dictation_settings';
-const ELEVENLABS_API_KEY = 'ELEVENLABS_API_KEY';
-
 export const useDictationSettings = () => {
   const [settings, setSettings] = useState<DictationSettings | null>(null);
   const [hasElevenLabsKey, setHasElevenLabsKey] = useState(false);
@@ -20,23 +22,13 @@
     const loadSettings = async () => {
       // Load settings from localStorage
       const saved = localStorage.getItem(DICTATION_SETTINGS_KEY);
+
       if (saved) {
-        setSettings(JSON.parse(saved));
+        const parsedSettings = JSON.parse(saved);
+        setSettings(parsedSettings);
       } else {
-        const providers = await getProviders(false);
-        // Check if we have an OpenAI API key as primary default
-        const openAIProvider = providers.find((p) => p.name === 'openai');
-        if (openAIProvider && openAIProvider.is_configured) {
-          setSettings({
-            enabled: true,
-            provider: 'openai',
-          });
-        } else {
-          setSettings({
-            enabled: false,
-            provider: null,
-          });
-        }
+        const defaultSettings = await getDefaultDictationSettings(getProviders);
+        setSettings(defaultSettings);
       }
 
       // Load ElevenLabs API key from storage (non-secret for frontend access)
diff --git a/ui/desktop/src/hooks/useWhisper.ts b/ui/desktop/src/hooks/useWhisper.ts
index 24c9847a0a4..fe0ddb62d8c 100644
--- a/ui/desktop/src/hooks/useWhisper.ts
+++ b/ui/desktop/src/hooks/useWhisper.ts
@@ -87,7 +87,7 @@ export const useWhisper = ({ onTranscription, onError, onSizeWarning }: UseWhisp
 
   // Define stopRecording before startRecording to avoid circular dependency
   const stopRecording = useCallback(() => {
-    setIsRecording(false); // Always update the visual state
+    setIsRecording(false);
 
     if (mediaRecorderRef.current && mediaRecorderRef.current.state !== 'inactive') {
       mediaRecorderRef.current.stop();
@@ -159,14 +159,20 @@ export const useWhisper = ({ onTranscription, onError, onSizeWarning }: UseWhisp
       reader.readAsDataURL(audioBlob);
     });
 
+    const mimeType = audioBlob.type;
+    if (!mimeType) {
+      throw new Error('Unable to determine audio format. Please try again.');
+    }
+
     let endpoint = '';
     let headers: Record<string, string> = {
       'Content-Type': 'application/json',
       'X-Secret-Key': await window.electron.getSecretKey(),
     };
+
     let body: Record<string, string> = {
       audio: base64Audio,
-      mime_type: 'audio/webm',
+      mime_type: mimeType,
     };
 
     // Choose endpoint based on provider
@@ -234,23 +240,32 @@
 
     try {
       // Request microphone permission
-      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      const stream = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          echoCancellation: true,
+          noiseSuppression: true,
+          autoGainControl: true,
+          sampleRate: 44100,
+        },
+      });
       streamRef.current = stream;
 
-      // Create audio context and analyser for visualization
-      const context = new AudioContext();
-      const source = context.createMediaStreamSource(stream);
-      const analyserNode = context.createAnalyser();
-      analyserNode.fftSize = 2048;
-      source.connect(analyserNode);
+      // Verify we have valid audio tracks
+      const audioTracks = stream.getAudioTracks();
+      if (audioTracks.length === 0) {
+        throw new Error('No audio tracks available in the microphone stream');
+      }
+
+      // AudioContext creation is disabled to prevent MediaRecorder conflicts
+      setAudioContext(null);
+      setAnalyser(null);
+
+      // Determine best supported MIME type
+      const supportedTypes = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4', 'audio/wav'];
 
-      setAudioContext(context);
-      setAnalyser(analyserNode);
+      const mimeType = supportedTypes.find((type) => MediaRecorder.isTypeSupported(type)) || '';
 
-      // Create MediaRecorder
-      const mediaRecorder = new MediaRecorder(stream, {
-        mimeType: 'audio/webm',
-      });
+      const mediaRecorder = new MediaRecorder(stream, mimeType ? { mimeType } : {});
       mediaRecorderRef.current = mediaRecorder;
       audioChunksRef.current = [];
 
@@ -297,12 +312,49 @@
       };
 
       mediaRecorder.onstop = async () => {
-        const audioBlob = new Blob(audioChunksRef.current, { type: 'audio/webm' });
+        const audioBlob = new Blob(audioChunksRef.current, { type: mimeType || 'audio/webm' });
+
+        // Check if the blob is empty
+        if (audioBlob.size === 0) {
+          onError?.(
+            new Error(
+              'No audio data was recorded. Please check your microphone permissions and try again.'
+            )
+          );
+          return;
+        }
+
         await transcribeAudio(audioBlob);
       };
 
-      mediaRecorder.start(1000); // Collect data every second for size monitoring
-      setIsRecording(true);
+      // Add error handler for MediaRecorder
+      mediaRecorder.onerror = (event) => {
+        console.error('MediaRecorder error:', event);
+        onError?.(new Error('Recording failed: Unknown error'));
+      };
+
+      if (!stream.active) {
+        throw new Error('Audio stream became inactive before recording could start');
+      }
+
+      // Check audio tracks again before starting recording
+      if (audioTracks.length === 0) {
+        throw new Error('No audio tracks available in the stream');
+      }
+
+      const activeAudioTracks = audioTracks.filter((track) => track.readyState === 'live');
+      if (activeAudioTracks.length === 0) {
+        throw new Error('No live audio tracks available');
+      }
+
+      try {
+        mediaRecorder.start(100);
+        setIsRecording(true);
+      } catch (startError) {
+        console.error('Error calling mediaRecorder.start():', startError);
+        const errorMessage = startError instanceof Error ? startError.message : String(startError);
+        throw new Error(`Failed to start recording: ${errorMessage}`);
+      }
     } catch (error) {
       console.error('Error starting recording:', error);
       stopRecording();