Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
209 changes: 127 additions & 82 deletions crates/goose-server/src/routes/audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,35 +42,13 @@ struct WhisperResponse {
text: String,
}

/// Transcribe audio using OpenAI's Whisper API
///
/// # Request
/// - `audio`: Base64 encoded audio data
/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
///
/// # Response
/// - `text`: Transcribed text from the audio
///
/// # Errors
/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
/// - 412: Precondition Failed (OpenAI API key not configured)
/// - 400: Bad Request (invalid base64 audio data)
/// - 413: Payload Too Large (audio file exceeds 25MB limit)
/// - 415: Unsupported Media Type (unsupported audio format)
/// - 502: Bad Gateway (OpenAI API error)
/// - 503: Service Unavailable (network error)
async fn transcribe_handler(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
Json(request): Json<TranscribeRequest>,
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

// Validate input first before checking API key configuration
/// Validate audio input and return decoded bytes and file extension
fn validate_audio_input(
audio: &str,
mime_type: &str,
) -> Result<(Vec<u8>, &'static str), StatusCode> {
// Decode the base64 audio data
let audio_bytes = BASE64
.decode(&request.audio)
.map_err(|_| StatusCode::BAD_REQUEST)?;
let audio_bytes = BASE64.decode(audio).map_err(|_| StatusCode::BAD_REQUEST)?;

// Check file size
if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
Expand All @@ -83,8 +61,9 @@ async fn transcribe_handler(
}

// Determine file extension based on MIME type
let file_extension = match request.mime_type.as_str() {
let file_extension = match mime_type {
"audio/webm" => "webm",
"audio/webm;codecs=opus" => "webm",
"audio/mp4" => "mp4",
"audio/mpeg" => "mp3",
"audio/mpga" => "mpga",
Expand All @@ -94,13 +73,18 @@ async fn transcribe_handler(
_ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
};

// Get the OpenAI API key from config (after input validation)
Ok((audio_bytes, file_extension))
}

/// Get OpenAI configuration (API key and host)
fn get_openai_config() -> Result<(String, String), StatusCode> {
let config = goose::config::Config::global();
let api_key: String = config
.get_secret("OPENAI_API_KEY")
.map_err(|_| StatusCode::PRECONDITION_FAILED)?;

// Get the OpenAI host from config (with default)
let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| {
tracing::error!("Failed to get OpenAI API key: {:?}", e);
StatusCode::PRECONDITION_FAILED
})?;

let openai_host = match config.get("OPENAI_HOST", false) {
Ok(value) => value
.as_str()
Expand All @@ -109,19 +93,41 @@ async fn transcribe_handler(
Err(_) => "https://api.openai.com".to_string(),
};

tracing::debug!("Using OpenAI host: {}", openai_host);
Ok((api_key, openai_host))
}

/// Send transcription request to OpenAI Whisper API
async fn send_openai_request(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be better to move this into the OpenAI provider class and then just instantiate the OpenAI provider; that way it would use whatever the user has configured for OpenAI (proxy, etc.). We'd still have to check whether it works, of course, but it would be cleaner.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, good call. I'll follow up on this in a separate change so we can get this fix out sooner.

audio_bytes: Vec<u8>,
file_extension: &str,
mime_type: &str,
api_key: &str,
openai_host: &str,
) -> Result<WhisperResponse, StatusCode> {
tracing::info!("Using OpenAI host: {}", openai_host);
tracing::info!(
"Audio file size: {} bytes, extension: {}, mime_type: {}",
audio_bytes.len(),
file_extension,
mime_type
);

// Create a multipart form with the audio file
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", file_extension))
.mime_str(&request.mime_type)
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
.mime_str(mime_type)
.map_err(|e| {
tracing::error!("Failed to create multipart part: {:?}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;

let form = reqwest::multipart::Form::new()
.part("file", part)
.text("model", "whisper-1")
.text("response_format", "json");

tracing::info!("Created multipart form for OpenAI Whisper API");

// Make request to OpenAI Whisper API
let client = Client::builder()
.timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
Expand All @@ -131,6 +137,11 @@ async fn transcribe_handler(
StatusCode::INTERNAL_SERVER_ERROR
})?;

tracing::info!(
"Sending request to OpenAI: {}/v1/audio/transcriptions",
openai_host
);

let response = client
.post(format!("{}/v1/audio/transcriptions", openai_host))
.header("Authorization", format!("Bearer {}", api_key))
Expand All @@ -150,9 +161,25 @@ async fn transcribe_handler(
}
})?;

tracing::info!(
"Received response from OpenAI with status: {}",
response.status()
);

if !response.status().is_success() {
let status = response.status();
let error_text = response.text().await.unwrap_or_default();
tracing::error!("OpenAI API error: {}", error_text);
tracing::error!("OpenAI API error (status: {}): {}", status, error_text);

// Check for specific error codes
if status == 401 {
tracing::error!("OpenAI API key appears to be invalid or unauthorized");
return Err(StatusCode::UNAUTHORIZED);
} else if status == 429 {
tracing::error!("OpenAI API quota or rate limit exceeded");
return Err(StatusCode::TOO_MANY_REQUESTS);
}

return Err(StatusCode::BAD_GATEWAY);
}

Expand All @@ -161,6 +188,45 @@ async fn transcribe_handler(
StatusCode::INTERNAL_SERVER_ERROR
})?;

Ok(whisper_response)
}

/// Transcribe audio using OpenAI's Whisper API
///
/// # Request
/// - `audio`: Base64 encoded audio data
/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
///
/// # Response
/// - `text`: Transcribed text from the audio
///
/// # Errors
/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
/// - 412: Precondition Failed (OpenAI API key not configured)
/// - 400: Bad Request (invalid base64 audio data)
/// - 413: Payload Too Large (audio file exceeds 25MB limit)
/// - 415: Unsupported Media Type (unsupported audio format)
/// - 502: Bad Gateway (OpenAI API error)
/// - 503: Service Unavailable (network error)
async fn transcribe_handler(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
Json(request): Json<TranscribeRequest>,
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
let (api_key, openai_host) = get_openai_config()?;

let whisper_response = send_openai_request(
audio_bytes,
file_extension,
&request.mime_type,
&api_key,
&openai_host,
)
.await?;

Ok(Json(TranscribeResponse {
text: whisper_response.text,
}))
Expand All @@ -177,47 +243,20 @@ async fn transcribe_elevenlabs_handler(
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

// Validate input first before checking API key configuration
// Decode the base64 audio data
let audio_bytes = BASE64
.decode(&request.audio)
.map_err(|_| StatusCode::BAD_REQUEST)?;

// Check file size
if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
tracing::warn!(
"Audio file too large: {} bytes (max: {} bytes)",
audio_bytes.len(),
MAX_AUDIO_SIZE_BYTES
);
return Err(StatusCode::PAYLOAD_TOO_LARGE);
}

// Determine file extension and content type based on MIME type
let (file_extension, content_type) = match request.mime_type.as_str() {
"audio/webm" => ("webm", "audio/webm"),
"audio/mp4" => ("mp4", "audio/mp4"),
"audio/mpeg" => ("mp3", "audio/mpeg"),
"audio/mpga" => ("mp3", "audio/mpeg"),
"audio/m4a" => ("m4a", "audio/m4a"),
"audio/wav" => ("wav", "audio/wav"),
"audio/x-wav" => ("wav", "audio/wav"),
_ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
};
let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;

// Get the ElevenLabs API key from config (after input validation)
let config = goose::config::Config::global();

// First try to get it as a secret
let api_key: String = match config.get_secret("ELEVENLABS_API_KEY") {
let api_key: String = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
Ok(key) => key,
Err(_) => {
// Try to get it as non-secret (for backward compatibility)
match config.get("ELEVENLABS_API_KEY", false) {
Ok(value) => {
match value.as_str() {
Some(key_str) => {
tracing::info!("Migrating ElevenLabs API key to secret storage");
let key = key_str.to_string();
// Migrate to secret storage
if let Err(e) = config.set(
Expand All @@ -228,17 +267,25 @@ async fn transcribe_elevenlabs_handler(
tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e);
}
// Delete the non-secret version
let _ = config.delete("ELEVENLABS_API_KEY");
if let Err(e) = config.delete("ELEVENLABS_API_KEY") {
tracing::warn!(
"Failed to delete non-secret ElevenLabs API key: {:?}",
e
);
}
key
}
None => {
tracing::error!("ElevenLabs API key is not a string");
tracing::error!(
"ElevenLabs API key is not a string, found: {:?}",
value
);
return Err(StatusCode::PRECONDITION_FAILED);
}
}
}
Err(e) => {
tracing::error!("Failed to get ElevenLabs API key from config: {:?}", e);
Err(_) => {
tracing::error!("No ElevenLabs API key found in configuration");
return Err(StatusCode::PRECONDITION_FAILED);
}
}
Expand All @@ -248,7 +295,7 @@ async fn transcribe_elevenlabs_handler(
// Create multipart form for ElevenLabs API
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", file_extension))
.mime_str(content_type)
.mime_str(&request.mime_type)
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

let form = reqwest::multipart::Form::new()
Expand Down Expand Up @@ -286,8 +333,9 @@ async fn transcribe_elevenlabs_handler(
})?;

if !response.status().is_success() {
let status = response.status();
let error_text = response.text().await.unwrap_or_default();
tracing::error!("ElevenLabs API error: {}", error_text);
tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text);

// Check for specific error codes
if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") {
Expand Down Expand Up @@ -330,16 +378,13 @@ async fn check_dictation_config(
let config = goose::config::Config::global();

// Check if ElevenLabs API key is configured
let has_elevenlabs = config
.get_secret::<String>("ELEVENLABS_API_KEY")
.map(|_| true)
.unwrap_or_else(|_| {
let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
Ok(_) => true,
Err(_) => {
// Check non-secret for backward compatibility
config
.get("ELEVENLABS_API_KEY", false)
.map(|_| true)
.unwrap_or(false)
});
config.get("ELEVENLABS_API_KEY", false).is_ok()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what backwards compatibility is this?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the key used to be stored in plain text in the config, so when the secret lookup fails it falls back to the plain-text value. We might be able to remove that now.

}
};

Ok(Json(serde_json::json!({
"elevenlabs": has_elevenlabs
Expand Down
5 changes: 4 additions & 1 deletion crates/goose-server/src/routes/config_management.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ pub async fn read_config(
}

let config = Config::global();

let response_value = match config.get(&query.key, query.is_secret) {
Ok(value) => {
if query.is_secret {
Expand All @@ -182,7 +183,9 @@ pub async fn read_config(
Value::Null
}
}
Err(_) => return Err(StatusCode::INTERNAL_SERVER_ERROR),
Err(_) => {
return Err(StatusCode::INTERNAL_SERVER_ERROR);
}
};
Ok(Json(response_value))
}
Expand Down
4 changes: 2 additions & 2 deletions ui/desktop/src/components/ChatInput.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -1259,8 +1259,8 @@ export default function ChatInput({

{/* Inline action buttons on the right */}
<div className="flex items-center gap-1 px-2 relative">
{/* Microphone button - show if dictation is enabled, disable if not configured */}
{(dictationSettings?.enabled || dictationSettings?.provider === null) && (
{/* Microphone button - show only if dictation is enabled */}
{dictationSettings?.enabled && (
<>
{!canUseDictation ? (
<Tooltip>
Expand Down
Loading
Loading