Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ resolver = "2"

[workspace.package]
edition = "2021"
version = "1.6.0"
version = "1.7.0"
authors = ["Block <ai-oss-tools@block.xyz>"]
license = "Apache-2.0"
repository = "https://github.com/block/goose"
Expand Down
209 changes: 127 additions & 82 deletions crates/goose-server/src/routes/audio.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,35 +42,13 @@ struct WhisperResponse {
text: String,
}

/// Transcribe audio using OpenAI's Whisper API
///
/// # Request
/// - `audio`: Base64 encoded audio data
/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
///
/// # Response
/// - `text`: Transcribed text from the audio
///
/// # Errors
/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
/// - 412: Precondition Failed (OpenAI API key not configured)
/// - 400: Bad Request (invalid base64 audio data)
/// - 413: Payload Too Large (audio file exceeds 25MB limit)
/// - 415: Unsupported Media Type (unsupported audio format)
/// - 502: Bad Gateway (OpenAI API error)
/// - 503: Service Unavailable (network error)
async fn transcribe_handler(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
Json(request): Json<TranscribeRequest>,
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

// Validate input first before checking API key configuration
/// Validate audio input and return decoded bytes and file extension
fn validate_audio_input(
audio: &str,
mime_type: &str,
) -> Result<(Vec<u8>, &'static str), StatusCode> {
// Decode the base64 audio data
let audio_bytes = BASE64
.decode(&request.audio)
.map_err(|_| StatusCode::BAD_REQUEST)?;
let audio_bytes = BASE64.decode(audio).map_err(|_| StatusCode::BAD_REQUEST)?;

// Check file size
if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
Expand All @@ -83,8 +61,9 @@ async fn transcribe_handler(
}

// Determine file extension based on MIME type
let file_extension = match request.mime_type.as_str() {
let file_extension = match mime_type {
"audio/webm" => "webm",
"audio/webm;codecs=opus" => "webm",
"audio/mp4" => "mp4",
"audio/mpeg" => "mp3",
"audio/mpga" => "mpga",
Expand All @@ -94,13 +73,18 @@ async fn transcribe_handler(
_ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
};

// Get the OpenAI API key from config (after input validation)
Ok((audio_bytes, file_extension))
}

/// Get OpenAI configuration (API key and host)
fn get_openai_config() -> Result<(String, String), StatusCode> {
let config = goose::config::Config::global();
let api_key: String = config
.get_secret("OPENAI_API_KEY")
.map_err(|_| StatusCode::PRECONDITION_FAILED)?;

// Get the OpenAI host from config (with default)
let api_key: String = config.get_secret("OPENAI_API_KEY").map_err(|e| {
tracing::error!("Failed to get OpenAI API key: {:?}", e);
StatusCode::PRECONDITION_FAILED
})?;

let openai_host = match config.get("OPENAI_HOST", false) {
Ok(value) => value
.as_str()
Expand All @@ -109,19 +93,41 @@ async fn transcribe_handler(
Err(_) => "https://api.openai.com".to_string(),
};

tracing::debug!("Using OpenAI host: {}", openai_host);
Ok((api_key, openai_host))
}

/// Send transcription request to OpenAI Whisper API
async fn send_openai_request(
audio_bytes: Vec<u8>,
file_extension: &str,
mime_type: &str,
api_key: &str,
openai_host: &str,
) -> Result<WhisperResponse, StatusCode> {
tracing::info!("Using OpenAI host: {}", openai_host);
tracing::info!(
"Audio file size: {} bytes, extension: {}, mime_type: {}",
audio_bytes.len(),
file_extension,
mime_type
);

// Create a multipart form with the audio file
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", file_extension))
.mime_str(&request.mime_type)
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;
.mime_str(mime_type)
.map_err(|e| {
tracing::error!("Failed to create multipart part: {:?}", e);
StatusCode::INTERNAL_SERVER_ERROR
})?;

let form = reqwest::multipart::Form::new()
.part("file", part)
.text("model", "whisper-1")
.text("response_format", "json");

tracing::info!("Created multipart form for OpenAI Whisper API");

// Make request to OpenAI Whisper API
let client = Client::builder()
.timeout(Duration::from_secs(OPENAI_TIMEOUT_SECONDS))
Expand All @@ -131,6 +137,11 @@ async fn transcribe_handler(
StatusCode::INTERNAL_SERVER_ERROR
})?;

tracing::info!(
"Sending request to OpenAI: {}/v1/audio/transcriptions",
openai_host
);

let response = client
.post(format!("{}/v1/audio/transcriptions", openai_host))
.header("Authorization", format!("Bearer {}", api_key))
Expand All @@ -150,9 +161,25 @@ async fn transcribe_handler(
}
})?;

tracing::info!(
"Received response from OpenAI with status: {}",
response.status()
);

if !response.status().is_success() {
let status = response.status();
let error_text = response.text().await.unwrap_or_default();
tracing::error!("OpenAI API error: {}", error_text);
tracing::error!("OpenAI API error (status: {}): {}", status, error_text);

// Check for specific error codes
if status == 401 {
tracing::error!("OpenAI API key appears to be invalid or unauthorized");
return Err(StatusCode::UNAUTHORIZED);
} else if status == 429 {
tracing::error!("OpenAI API quota or rate limit exceeded");
return Err(StatusCode::TOO_MANY_REQUESTS);
}

return Err(StatusCode::BAD_GATEWAY);
}

Expand All @@ -161,6 +188,45 @@ async fn transcribe_handler(
StatusCode::INTERNAL_SERVER_ERROR
})?;

Ok(whisper_response)
}

/// Transcribe audio using OpenAI's Whisper API
///
/// # Request
/// - `audio`: Base64 encoded audio data
/// - `mime_type`: MIME type of the audio (e.g., "audio/webm", "audio/wav")
///
/// # Response
/// - `text`: Transcribed text from the audio
///
/// # Errors
/// - 401: Unauthorized (missing or invalid X-Secret-Key header)
/// - 412: Precondition Failed (OpenAI API key not configured)
/// - 400: Bad Request (invalid base64 audio data)
/// - 413: Payload Too Large (audio file exceeds 25MB limit)
/// - 415: Unsupported Media Type (unsupported audio format)
/// - 502: Bad Gateway (OpenAI API error)
/// - 503: Service Unavailable (network error)
async fn transcribe_handler(
State(state): State<Arc<AppState>>,
headers: HeaderMap,
Json(request): Json<TranscribeRequest>,
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;
let (api_key, openai_host) = get_openai_config()?;

let whisper_response = send_openai_request(
audio_bytes,
file_extension,
&request.mime_type,
&api_key,
&openai_host,
)
.await?;

Ok(Json(TranscribeResponse {
text: whisper_response.text,
}))
Expand All @@ -177,47 +243,20 @@ async fn transcribe_elevenlabs_handler(
) -> Result<Json<TranscribeResponse>, StatusCode> {
verify_secret_key(&headers, &state)?;

// Validate input first before checking API key configuration
// Decode the base64 audio data
let audio_bytes = BASE64
.decode(&request.audio)
.map_err(|_| StatusCode::BAD_REQUEST)?;

// Check file size
if audio_bytes.len() > MAX_AUDIO_SIZE_BYTES {
tracing::warn!(
"Audio file too large: {} bytes (max: {} bytes)",
audio_bytes.len(),
MAX_AUDIO_SIZE_BYTES
);
return Err(StatusCode::PAYLOAD_TOO_LARGE);
}

// Determine file extension and content type based on MIME type
let (file_extension, content_type) = match request.mime_type.as_str() {
"audio/webm" => ("webm", "audio/webm"),
"audio/mp4" => ("mp4", "audio/mp4"),
"audio/mpeg" => ("mp3", "audio/mpeg"),
"audio/mpga" => ("mp3", "audio/mpeg"),
"audio/m4a" => ("m4a", "audio/m4a"),
"audio/wav" => ("wav", "audio/wav"),
"audio/x-wav" => ("wav", "audio/wav"),
_ => return Err(StatusCode::UNSUPPORTED_MEDIA_TYPE),
};
let (audio_bytes, file_extension) = validate_audio_input(&request.audio, &request.mime_type)?;

// Get the ElevenLabs API key from config (after input validation)
let config = goose::config::Config::global();

// First try to get it as a secret
let api_key: String = match config.get_secret("ELEVENLABS_API_KEY") {
let api_key: String = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
Ok(key) => key,
Err(_) => {
// Try to get it as non-secret (for backward compatibility)
match config.get("ELEVENLABS_API_KEY", false) {
Ok(value) => {
match value.as_str() {
Some(key_str) => {
tracing::info!("Migrating ElevenLabs API key to secret storage");
let key = key_str.to_string();
// Migrate to secret storage
if let Err(e) = config.set(
Expand All @@ -228,17 +267,25 @@ async fn transcribe_elevenlabs_handler(
tracing::error!("Failed to migrate ElevenLabs API key: {:?}", e);
}
// Delete the non-secret version
let _ = config.delete("ELEVENLABS_API_KEY");
if let Err(e) = config.delete("ELEVENLABS_API_KEY") {
tracing::warn!(
"Failed to delete non-secret ElevenLabs API key: {:?}",
e
);
}
key
}
None => {
tracing::error!("ElevenLabs API key is not a string");
tracing::error!(
"ElevenLabs API key is not a string, found: {:?}",
value
);
return Err(StatusCode::PRECONDITION_FAILED);
}
}
}
Err(e) => {
tracing::error!("Failed to get ElevenLabs API key from config: {:?}", e);
Err(_) => {
tracing::error!("No ElevenLabs API key found in configuration");
return Err(StatusCode::PRECONDITION_FAILED);
}
}
Expand All @@ -248,7 +295,7 @@ async fn transcribe_elevenlabs_handler(
// Create multipart form for ElevenLabs API
let part = reqwest::multipart::Part::bytes(audio_bytes)
.file_name(format!("audio.{}", file_extension))
.mime_str(content_type)
.mime_str(&request.mime_type)
.map_err(|_| StatusCode::INTERNAL_SERVER_ERROR)?;

let form = reqwest::multipart::Form::new()
Expand Down Expand Up @@ -286,8 +333,9 @@ async fn transcribe_elevenlabs_handler(
})?;

if !response.status().is_success() {
let status = response.status();
let error_text = response.text().await.unwrap_or_default();
tracing::error!("ElevenLabs API error: {}", error_text);
tracing::error!("ElevenLabs API error (status: {}): {}", status, error_text);

// Check for specific error codes
if error_text.contains("Unauthorized") || error_text.contains("Invalid API key") {
Expand Down Expand Up @@ -330,16 +378,13 @@ async fn check_dictation_config(
let config = goose::config::Config::global();

// Check if ElevenLabs API key is configured
let has_elevenlabs = config
.get_secret::<String>("ELEVENLABS_API_KEY")
.map(|_| true)
.unwrap_or_else(|_| {
let has_elevenlabs = match config.get_secret::<String>("ELEVENLABS_API_KEY") {
Ok(_) => true,
Err(_) => {
// Check non-secret for backward compatibility
config
.get("ELEVENLABS_API_KEY", false)
.map(|_| true)
.unwrap_or(false)
});
config.get("ELEVENLABS_API_KEY", false).is_ok()
}
};

Ok(Json(serde_json::json!({
"elevenlabs": has_elevenlabs
Expand Down
Loading
Loading