-
Notifications
You must be signed in to change notification settings - Fork 2.6k
Fix ElevenLabs audio transcription and add more logging #4358
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
5ef5d7f
16fd892
d012a77
5d50a53
b95c46a
ac2fcb4
e440bd0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,35 +42,13 @@ struct WhisperResponse { | |
| text: String, | ||
| } | ||
|
|
||
| /// Transcribe audio using OpenAI's Whisper API | ||
| /// | ||
| /// # Request | ||
| /// - `audio`: Base64 encoded audio data | ||
| /// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav") | ||
| /// | ||
| /// # Response | ||
| /// - `text`: Transcribed text from the audio | ||
| /// | ||
| /// # Errors | ||
| /// - 401: Unauthorized (missing or invalid X-Secret-Key header) | ||
| /// - 412: Precondition Failed (OpenAI API key not configured) | ||
| /// - 400: Bad Request (invalid base64 audio data) | ||
| /// - 413: Payload Too Large (audio file exceeds 25MB limit) | ||
| /// - 415: Unsupported Media Type (unsupported audio format) | ||
| /// - 502: Bad Gateway (OpenAI API error) | ||
| /// - 503: Service Unavailable (network error) | ||
| async fn transcribe_handler( | ||
| State(state): State<Arc<AppState>>, | ||
| headers: HeaderMap, | ||
| Json(request): Json<TranscribeRequest>, | ||
| ) -> Result<Json<TranscribeResponse>, StatusCode> { | ||
| verify_secret_key(&headers, &state)?; | ||
|
|
||
| // Validate input first before checking API key configuration | ||
| /// Validate audio input and return decoded bytes and file extension | ||
| fn validate_audio_input( | ||
| audio: &str, | ||
| mime_type: &str, | ||
| ) -> Result<(Vec<u8>, &'static str), StatusCode> { | ||
| // Decode the base64 audio data | ||
| let audio_bytes = BASE64 | ||
| .decode(&request.audio) | ||
| .map_err(|_| StatusCode::BAD_REQUEST)?; | ||
| let audio_bytes = BASE64.decode(audio).map_err(|_| StatusCode::BAD_REQUEST)?; | ||
|
|
||
| // Check file size | ||
| if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES { | ||
|
|
@@ -83,8 +61,9 @@ async fn transcribe_handler( | |
| } | ||
|
|
||
| // Determine file extension based on MIME type | ||
| let file_extension = match request.mime_type.as_str() { | ||
| let file_extension = match mime_type { | ||
| "audio/webm" => "webm", | ||
| "audio/webm;codecs=opus" => "webm", | ||
| "audio/mp4" => "mp4", | ||
| "audio/mpeg" => "mp3", | ||
| "audio/mpga" => "mpga", | ||
|
|
@@ -94,13 +73,18 @@ async fn transcribe_handler( | |
| _ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE), | ||
| }; | ||
|
|
||
| // Get the OpenAI API key from config (after input validation) | ||
| Ok((audio_bytes, file_extension)) | ||
| } | ||
|
|
||
| /// Get OpenAI configuration (API key and host) | ||
| fn get_openai_config() -> Result<(String, String), StatusCode> { | ||
| let config = goose::config::Config::global(); | ||
| let api_key: String = config | ||
| .get_secret("OPENAI_API_KEY") | ||
| .map_err(|_| StatusCode::PRECONDITION_FAILED)?; | ||
|
|
||
| // Get the OpenAI host from config (with default) | ||
| let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| { | ||
| tracing::error!("Failed to get OpenAI API key: {:?}", e); | ||
| StatusCode::PRECONDITION_FAILED | ||
| })?; | ||
|
|
||
| let openai_host = match config.get("OPENAI_HOST", false) { | ||
| Ok(value) => value | ||
| .as_str() | ||
|
|
@@ -109,19 +93,41 @@ async fn transcribe_handler( | |
| Err(_) => "https://api.openai.com".to_string(), | ||
| }; | ||
|
|
||
| tracing::debug!("Using OpenAI host: {}", openai_host); | ||
| Ok((api_key, openai_host)) | ||
| } | ||
|
|
||
| /// Send transcription request to OpenAI Whisper API | ||
| async fn send_openai_request( | ||
| audio_bytes: Vec<u8>, | ||
| file_extension: &str, | ||
| mime_type: &str, | ||
| api_key: &str, | ||
| openai_host: &str, | ||
| ) -> Result<WhisperResponse, StatusCode> { | ||
| tracing::info!("Using OpenAI host: {}", openai_host); | ||
| tracing::info!( | ||
| "Audio file size: {} bytes, extension: {}, mime_type: {}", | ||
| audio_bytes.len(), | ||
| file_extension, | ||
| mime_type | ||
| ); | ||
|
|
||
| // Create a multipart form with the audio file | ||
| let part = reqwest::multipart::Part::bytes(audio_bytes) | ||
| .file_name(format!("audio.{}", file_extension)) | ||
| .mime_str(&request.mime_type) | ||
| .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; | ||
| .mime_str(mime_type) | ||
| .map_err(|e| { | ||
| tracing::error!("Failed to create multipart part: {:?}", e); | ||
| StatusCode::INTERNAL_SERVER_ERROR | ||
| })?; | ||
|
|
||
| let form = reqwest::multipart::Form::new() | ||
| .part("file", part) | ||
| .text("model", "whisper-1") | ||
| .text("response_format", "json"); | ||
|
|
||
| tracing::info!("Created multipart form for OpenAI Whisper API"); | ||
|
|
||
| // Make request to OpenAI Whisper API | ||
| let client = Client::builder() | ||
| .timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS)) | ||
|
|
@@ -131,6 +137,11 @@ async fn transcribe_handler( | |
| StatusCode::INTERNAL_SERVER_ERROR | ||
| })?; | ||
|
|
||
| tracing::info!( | ||
| "Sending request to OpenAI: {}/v1/audio/transcriptions", | ||
| openai_host | ||
| ); | ||
|
|
||
| let response = client | ||
| .post(format!("{}/v1/audio/transcriptions", openai_host)) | ||
| .header("Authorization", format!("Bearer {}", api_key)) | ||
|
|
@@ -150,9 +161,25 @@ async fn transcribe_handler( | |
| } | ||
| })?; | ||
|
|
||
| tracing::info!( | ||
| "Received response from OpenAI with status: {}", | ||
| response.status() | ||
| ); | ||
|
|
||
| if !response.status().is_success() { | ||
| let status = response.status(); | ||
| let error_text = response.text().await.unwrap_or_default(); | ||
| tracing::error!("OpenAI API error: {}", error_text); | ||
| tracing::error!("OpenAI API error (status: {}): {}", status, error_text); | ||
|
|
||
| // Check for specific error codes | ||
| if status == 401 { | ||
| tracing::error!("OpenAI API key appears to be invalid or unauthorized"); | ||
| return Err(StatusCode::UNAUTHORIZED); | ||
| } else if status == 429 { | ||
| tracing::error!("OpenAI API quota or rate limit exceeded"); | ||
| return Err(StatusCode::TOO_MANY_REQUESTS); | ||
| } | ||
|
|
||
| return Err(StatusCode::BAD_GATEWAY); | ||
| } | ||
|
|
||
|
|
@@ -161,6 +188,45 @@ async fn transcribe_handler( | |
| StatusCode::INTERNAL_SERVER_ERROR | ||
| })?; | ||
|
|
||
| Ok(whisper_response) | ||
| } | ||
|
|
||
| /// Transcribe audio using OpenAI's Whisper API | ||
| /// | ||
| /// # Request | ||
| /// - `audio`: Base64 encoded audio data | ||
| /// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav") | ||
| /// | ||
| /// # Response | ||
| /// - `text`: Transcribed text from the audio | ||
| /// | ||
| /// # Errors | ||
| /// - 401: Unauthorized (missing or invalid X-Secret-Key header) | ||
| /// - 412: Precondition Failed (OpenAI API key not configured) | ||
| /// - 400: Bad Request (invalid base64 audio data) | ||
| /// - 413: Payload Too Large (audio file exceeds 25MB limit) | ||
| /// - 415: Unsupported Media Type (unsupported audio format) | ||
| /// - 502: Bad Gateway (OpenAI API error) | ||
| /// - 503: Service Unavailable (network error) | ||
| async fn transcribe_handler( | ||
| State(state): State<Arc<AppState>>, | ||
| headers: HeaderMap, | ||
| Json(request): Json<TranscribeRequest>, | ||
| ) -> Result<Json<TranscribeResponse>, StatusCode> { | ||
| verify_secret_key(&headers, &state)?; | ||
|
|
||
| let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?; | ||
| let (api_key, openai_host) = get_openai_config()?; | ||
|
|
||
| let whisper_response = send_openai_request( | ||
| audio_bytes, | ||
| file_extension, | ||
| &request.mime_type, | ||
| &api_key, | ||
| &openai_host, | ||
| ) | ||
| .await?; | ||
|
|
||
| Ok(Json(TranscribeResponse { | ||
| text: whisper_response.text, | ||
| })) | ||
|
|
@@ -177,47 +243,20 @@ async fn transcribe_elevenlabs_handler( | |
| ) -> Result<Json<TranscribeResponse>, StatusCode> { | ||
| verify_secret_key(&headers, &state)?; | ||
|
|
||
| // Validate input first before checking API key configuration | ||
| // Decode the base64 audio data | ||
| let audio_bytes = BASE64 | ||
| .decode(&request.audio) | ||
| .map_err(|_| StatusCode::BAD_REQUEST)?; | ||
|
|
||
| // Check file size | ||
| if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES { | ||
| tracing::warn!( | ||
| "Audio file too large: {} bytes (max: {} bytes)", | ||
| audio_bytes.len(), | ||
| MAX_AUDIO_SIZE_BYTES | ||
| ); | ||
| return Err(StatusCode::PAYLOAD_TOO_LARGE); | ||
| } | ||
|
|
||
| // Determine file extension and content type based on MIME type | ||
| let (file_extension, content_type) = match request.mime_type.as_str() { | ||
| "audio/webm" => ("webm", "audio/webm"), | ||
| "audio/mp4" => ("mp4", "audio/mp4"), | ||
| "audio/mpeg" => ("mp3", "audio/mpeg"), | ||
| "audio/mpga" => ("mp3", "audio/mpeg"), | ||
| "audio/m4a" => ("m4a", "audio/m4a"), | ||
| "audio/wav" => ("wav", "audio/wav"), | ||
| "audio/x-wav" => ("wav", "audio/wav"), | ||
| _ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE), | ||
| }; | ||
| let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?; | ||
|
|
||
| // Get the ElevenLabs API key from config (after input validation) | ||
| let config = goose::config::Config::global(); | ||
|
|
||
| // First try to get it as a secret | ||
| let api_key: String = match config.get_secret("ELEVENLABS_API_KEY") { | ||
| let api_key: String = match config.get_secret::<String>("ELEVENLABS_API_KEY") { | ||
| Ok(key) => key, | ||
| Err(_) => { | ||
| // Try to get it as non-secret (for backward compatibility) | ||
| match config.get("ELEVENLABS_API_KEY", false) { | ||
| Ok(value) => { | ||
| match value.as_str() { | ||
| Some(key_str) => { | ||
| tracing::info!("Migrating ElevenLabs API key to secret storage"); | ||
| let key = key_str.to_string(); | ||
| // Migrate to secret storage | ||
| if let Err(e) = config.set( | ||
|
|
@@ -228,17 +267,25 @@ async fn transcribe_elevenlabs_handler( | |
| tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e); | ||
| } | ||
| // Delete the non-secret version | ||
| let _ = config.delete("ELEVENLABS_API_KEY"); | ||
| if let Err(e) = config.delete("ELEVENLABS_API_KEY") { | ||
| tracing::warn!( | ||
| "Failed to delete non-secret ElevenLabs API key: {:?}", | ||
| e | ||
| ); | ||
| } | ||
| key | ||
| } | ||
| None => { | ||
| tracing::error!("ElevenLabs API key is not a string"); | ||
| tracing::error!( | ||
| "ElevenLabs API key is not a string, found: {:?}", | ||
| value | ||
| ); | ||
| return Err(StatusCode::PRECONDITION_FAILED); | ||
| } | ||
| } | ||
| } | ||
| Err(e) => { | ||
| tracing::error!("Failed to get ElevenLabs API key from config: {:?}", e); | ||
| Err(_) => { | ||
| tracing::error!("No ElevenLabs API key found in configuration"); | ||
| return Err(StatusCode::PRECONDITION_FAILED); | ||
| } | ||
| } | ||
|
|
@@ -248,7 +295,7 @@ async fn transcribe_elevenlabs_handler( | |
| // Create multipart form for ElevenLabs API | ||
| let part = reqwest::multipart::Part::bytes(audio_bytes) | ||
| .file_name(format!("audio.{}", file_extension)) | ||
| .mime_str(content_type) | ||
| .mime_str(&request.mime_type) | ||
| .map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?; | ||
|
|
||
| let form = reqwest::multipart::Form::new() | ||
|
|
@@ -286,8 +333,9 @@ async fn transcribe_elevenlabs_handler( | |
| })?; | ||
|
|
||
| if !response.status().is_success() { | ||
| let status = response.status(); | ||
| let error_text = response.text().await.unwrap_or_default(); | ||
| tracing::error!("ElevenLabs API error: {}", error_text); | ||
| tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text); | ||
|
|
||
| // Check for specific error codes | ||
| if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") { | ||
|
|
@@ -330,16 +378,13 @@ async fn check_dictation_config( | |
| let config = goose::config::Config::global(); | ||
|
|
||
| // Check if ElevenLabs API key is configured | ||
| let has_elevenlabs = config | ||
| .get_secret::<String>("ELEVENLABS_API_KEY") | ||
| .map(|_| true) | ||
| .unwrap_or_else(|_| { | ||
| let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") { | ||
| Ok(_) => true, | ||
| Err(_) => { | ||
| // Check non-secret for backward compatibility | ||
| config | ||
| .get("ELEVENLABS_API_KEY", false) | ||
| .map(|_| true) | ||
| .unwrap_or(false) | ||
| }); | ||
| config.get("ELEVENLABS_API_KEY", false).is_ok() | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What backward compatibility is this?
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think it used to be stored in plain text in the config, so it's trying that first before secrets. We might be able to remove that now. |
||
| } | ||
| }; | ||
|
|
||
| Ok(Json(serde_json::json!({ | ||
| "elevenlabs": has_elevenlabs | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would be better to move this into the OpenAI provider class and then just instantiate the OpenAI provider; that way it would use whatever the user has configured for OpenAI (proxy, etc.). We'd still have to check whether it works, of course, but it would be cleaner.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, good call. I'll follow up with that in a separate change so we can get this fix out sooner.