4 changes: 3 additions & 1 deletion packages/app/server/package.json
@@ -51,21 +51,23 @@
"@opentelemetry/sdk-metrics": "2.0.1",
"@opentelemetry/semantic-conventions": "^1.36.0",
"@opentelemetry/winston-transport": "^0.14.1",
"@prisma/client": "6.16.0",
"@types/compression": "^1.7.5",
"@types/cors": "^2.8.19",
"@types/express": "^4.17.21",
"@types/form-data": "^2.5.2",
"@types/multer": "^2.0.0",
"@types/node": "^20.11.24",
"@types/node-fetch": "^2.6.11",
"compression": "^1.8.0",
"cors": "^2.8.5",
"dotenv": "^16.5.0",
"express": "^4.18.3",
"form-data": "^4.0.4",
"jose": "^6.0.11",
"multer": "^2.0.2",
"node-fetch": "^2.7.0",
"openai": "^4.97.0",
"@prisma/client": "6.16.0",
"prisma": "6.16.0",
"register": "link:@opentelemetry/auto-instrumentations-node/register",
"ts-node": "^10.9.2",
159 changes: 159 additions & 0 deletions packages/app/server/src/clients/openai-audio-client.ts
@@ -0,0 +1,159 @@
import fetch from 'node-fetch';
import { HttpError } from '../errors/http';
import logger from '../logger';
import FormData from 'form-data';

export interface TranscriptionOptions {
model: 'whisper-1' | 'whisper-large-v3';
language?: string;
prompt?: string;
response_format?: 'json' | 'text' | 'srt' | 'verbose_json' | 'vtt';
temperature?: number;
timestamp_granularities?: ('word' | 'segment')[];
}

export interface TranscriptionResponse {
text: string;
[key: string]: any;
}

export class OpenAIAudioClient {
private apiKey: string;
private baseUrl: string;

constructor(apiKey: string, baseUrl = 'https://api.openai.com') {
this.apiKey = apiKey;
this.baseUrl = baseUrl;
}

async transcribe(
audioBuffer: Buffer,
options: TranscriptionOptions
): Promise<TranscriptionResponse> {
const formData = new FormData();

// Add the audio file
formData.append('file', audioBuffer, {
filename: 'audio.mp3',
contentType: 'audio/mp3',
});

// Add other parameters
formData.append('model', options.model);

if (options.language) {
formData.append('language', options.language);
}

if (options.prompt) {
formData.append('prompt', options.prompt);
}

if (options.response_format) {
formData.append('response_format', options.response_format);
}

if (options.temperature !== undefined) {
formData.append('temperature', options.temperature.toString());
}

if (options.timestamp_granularities) {
options.timestamp_granularities.forEach(granularity => {
formData.append('timestamp_granularities[]', granularity);
});
}

try {
const response = await fetch(`${this.baseUrl}/v1/audio/transcriptions`, {
method: 'POST',
headers: {
'Authorization': `Bearer ${this.apiKey}`,
// FormData sets its own content-type with boundary
},
body: formData,
});

if (!response.ok) {
const errorData = await response.json();
logger.error('OpenAI Audio API Error:', errorData);
throw new HttpError(
response.status,
`OpenAI API Error: ${errorData.error?.message || response.statusText}`
);
}

const data = await response.json();
return data as TranscriptionResponse;
} catch (error) {
if (error instanceof HttpError) {
throw error;
}
logger.error('OpenAI Audio API Error:', error);
throw new HttpError(
500,
`Failed to transcribe audio: ${(error as Error).message}`
);
}
}

async translate(
audioBuffer: Buffer,
options: Omit<TranscriptionOptions, 'language'>
): Promise<TranscriptionResponse> {
const formData = new FormData();

// Add the audio file
formData.append('file', audioBuffer, {
filename: 'audio.mp3',
contentType: 'audio/mp3',
});

// Add other parameters
formData.append('model', options.model);

if (options.prompt) {
formData.append('prompt', options.prompt);
}

if (options.response_format) {
formData.append('response_format', options.response_format);
}

if (options.temperature !== undefined) {
formData.append('temperature', options.temperature.toString());
}

try {
const response = await fetch(`${this.baseUrl}/v1/audio/translations`, {
method: 'POST',
headers: {
'Authorization': `Bearer ${this.apiKey}`,
// FormData sets its own content-type with boundary
},
body: formData,
});

if (!response.ok) {
const errorData = await response.json();
logger.error('OpenAI Audio API Error:', errorData);
throw new HttpError(
response.status,
`OpenAI API Error: ${errorData.error?.message || response.statusText}`
);
}

const data = await response.json();
return data as TranscriptionResponse;
} catch (error) {
if (error instanceof HttpError) {
throw error;
}
logger.error('OpenAI Audio API Error:', error);
throw new HttpError(
500,
`Failed to translate audio: ${(error as Error).message}`
);
}
}
}
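
For reviewers: a minimal usage sketch of the new client. The import path, file name, and API-key wiring below are assumptions for illustration only; the options mirror TranscriptionOptions as defined above.

// Hypothetical call site — not part of this PR
import { readFileSync } from 'fs';
import { OpenAIAudioClient } from './clients/openai-audio-client';

async function demo(): Promise<void> {
  const client = new OpenAIAudioClient(process.env.OPENAI_API_KEY ?? '');
  const audio = readFileSync('sample.mp3'); // assumed local test file

  const result = await client.transcribe(audio, {
    model: 'whisper-1',
    response_format: 'verbose_json',
    timestamp_granularities: ['segment'],
  });
  console.log(result.text);
}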
138 changes: 138 additions & 0 deletions packages/app/server/src/providers/OpenAIAudioProvider.ts
@@ -0,0 +1,138 @@
import { BaseProvider } from './BaseProvider';
import { ProviderType } from './ProviderType';
import { LlmTransactionMetadata } from '../types';
import logger from '../logger';
import { Decimal } from '@prisma/client/runtime/library';
import { OpenAIAudioClient, TranscriptionOptions, TranscriptionResponse } from '../clients/openai-audio-client';
import { HttpError } from '../errors/http';
import { EchoControlService } from '../services/EchoControlService';

export class OpenAIAudioProvider extends BaseProvider {
private audioClient: OpenAIAudioClient;

constructor(
echoControlService: EchoControlService,
stream: boolean,
model: string
) {
super(echoControlService, stream, model);
const apiKey = this.getApiKey();
if (!apiKey) {
throw new Error('OpenAI API key is required for audio transcription');
}
this.audioClient = new OpenAIAudioClient(apiKey, this.OPENAI_BASE_URL);
}

getType(): ProviderType {
return ProviderType.OPENAI_AUDIO;
}

getBaseUrl(reqPath?: string): string {
return this.OPENAI_BASE_URL;
}

getApiKey(): string | undefined {
return process.env.OPENAI_API_KEY;
}

override formatAuthHeaders(headers: Record<string, string>): Record<string, string> {
return {
...headers,
Authorization: `Bearer ${this.getApiKey()}`,
};
}

override ensureStreamUsage(
reqBody: Record<string, unknown>,
reqPath: string
): Record<string, unknown> {
// Audio transcription doesn't use streaming
return reqBody;
}

/**
* Transcribe audio using the OpenAI Whisper API
*
* @param audioBuffer - The audio buffer to transcribe
* @param options - Transcription options
* @returns The transcription result
*/
async transcribeAudio(audioBuffer: Buffer, options: TranscriptionOptions): Promise<TranscriptionResponse> {
try {
logger.info(`Transcribing audio with model: ${options.model}`);
return await this.audioClient.transcribe(audioBuffer, options);
} catch (error) {
logger.error('OpenAI Audio transcription error:', error);
if (error instanceof HttpError) {
throw error;
}
throw new HttpError(500, `Failed to transcribe audio: ${(error as Error).message}`);
}
}

/**
* Translate audio directly to English using the OpenAI Whisper API
*
* @param audioBuffer - The audio buffer to translate
* @param options - Translation options
* @returns The translation result
*/
async translateAudio(audioBuffer: Buffer, options: Omit<TranscriptionOptions, 'language'>): Promise<TranscriptionResponse> {
try {
logger.info(`Translating audio with model: ${options.model}`);
return await this.audioClient.translate(audioBuffer, options);
} catch (error) {
logger.error('OpenAI Audio translation error:', error);
if (error instanceof HttpError) {
throw error;
}
throw new HttpError(500, `Failed to translate audio: ${(error as Error).message}`);
}
}

async handleBody(data: string): Promise<{
metadata: LlmTransactionMetadata;
rawTransactionCost: Decimal;
status: string;
}> {
try {
const parsed = JSON.parse(data);

// Calculate cost based on duration (Whisper charges per minute)
// Default to 1 second if duration is not available
const durationSeconds = parsed.duration || 1;
const durationMinutes = durationSeconds / 60;

// Apply the Whisper cost of $0.006 per minute
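// e.g. a 90-second clip: (90 / 60) minutes * $0.006/minute = $0.009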
const cost = new Decimal(0.006).mul(durationMinutes);

// Generate a unique provider ID
const providerId = `openai-audio-${Date.now()}`;

// Use seconds as a proxy for tokens since audio doesn't use tokens
const outputTokens = Math.ceil(durationSeconds);

return {
metadata: {
providerId,
provider: this.getType(),
model: 'whisper-1',
inputTokens: 0, // Audio doesn't use input tokens
outputTokens,
totalTokens: outputTokens,
// Include additional metadata as custom properties
audioData: {
durationSeconds,
responseFormat: parsed.format || 'json',
characterCount: parsed.text?.length || 0,
},
},
rawTransactionCost: cost,
status: 'success',
};
} catch (error) {
logger.error('Error processing audio response data:', error);
throw error;
}
}
}
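
A minimal end-to-end sketch of the provider for reviewers, assuming it runs inside an async handler; echoControlService and audioBuffer are assumed to come from the existing request pipeline and are illustrative names, not part of this PR.

// Hypothetical call site — not part of this PR
const provider = new OpenAIAudioProvider(echoControlService, false, 'whisper-1');

const transcription = await provider.transcribeAudio(audioBuffer, {
  model: 'whisper-1',
  response_format: 'verbose_json',
});

// handleBody turns the raw JSON body into transaction metadata plus a Decimal cost
const { metadata, rawTransactionCost, status } = await provider.handleBody(
  JSON.stringify(transcription)
);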
27 changes: 27 additions & 0 deletions packages/app/server/src/providers/ProviderFactory.ts
@@ -10,7 +10,9 @@ import { GeminiGPTProvider } from './GeminiGPTProvider';
import { OpenAIResponsesProvider } from './OpenAIResponsesProvider';
import { OpenRouterProvider } from './OpenRouterProvider';
import { OpenAIImageProvider } from './OpenAIImageProvider';
import { OpenAIAudioProvider } from './OpenAIAudioProvider';
import {
ALL_SUPPORTED_AUDIO_MODELS,
ALL_SUPPORTED_IMAGE_MODELS,
ALL_SUPPORTED_MODELS,
} from '../services/AccountingService';
@@ -60,6 +62,16 @@ const createImageModelToProviderMapping = (): Record<string, ProviderType> => {
return mapping;
};

// Create mapping for audio models
const createAudioModelToProviderMapping = (): Record<string, ProviderType> => {
const mapping: Record<string, ProviderType> = {};

// Hard-code whisper-1 for now until AccountingService is updated to include audio models
mapping['whisper-1'] = ProviderType.OPENAI_AUDIO;

return mapping;
};

/**
* Model-to-provider mapping loaded from model_prices_and_context_window.json
* This replaces the previous hardcoded mapping and automatically includes all
@@ -70,6 +82,9 @@ export const MODEL_TO_PROVIDER: Record<string, ProviderType> =

export const IMAGE_MODEL_TO_PROVIDER: Record<string, ProviderType> =
createImageModelToProviderMapping();

export const AUDIO_MODEL_TO_PROVIDER: Record<string, ProviderType> =
createAudioModelToProviderMapping();

export const getProvider = (
model: string,
@@ -84,6 +99,11 @@ export const getProvider = (
if (imageType) {
type = imageType;
}

const audioType = AUDIO_MODEL_TO_PROVIDER[model];
if (audioType) {
type = audioType;
}

// If the model is not in the model to provider mapping, throw an error
if (type === undefined) {
@@ -98,6 +118,11 @@ export const getProvider = (
if (completionPath.includes('images/generations')) {
type = ProviderType.OPENAI_IMAGES;
}

// Check if this is an audio transcription or translation endpoint
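// e.g. a request to /v1/audio/transcriptions or /v1/audio/translations resolves to OPENAI_AUDIO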
if (completionPath.includes('audio/transcriptions') || completionPath.includes('audio/translations')) {
type = ProviderType.OPENAI_AUDIO;
}

// We select for Anthropic Native if the completionPath includes "messages"
// The OpenAI Format does not hit /v1/messages, it hits /v1/chat/completions
@@ -130,6 +155,8 @@ export const getProvider = (
return new OpenRouterProvider(echoControlService, stream, model);
case ProviderType.OPENAI_IMAGES:
return new OpenAIImageProvider(echoControlService, stream, model);
case ProviderType.OPENAI_AUDIO:
return new OpenAIAudioProvider(echoControlService, stream, model);
default:
throw new Error(`Unknown provider type: ${type}`);
}