Skip to content
2 changes: 1 addition & 1 deletion packages/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ Run an **OpenAI-compatible HTTP server** backed by locally configured QVAC model
qvac serve openai [options]
```

See **[docs/serve-openai.md](./docs/serve-openai.md)** for supported `/v1/...` routes, multipart request shapes, and how to register models — including **`whispercpp-audio-translation`** for `POST /v1/audio/translations` (Whisper translate-to-English).
See **[docs/serve-openai.md](./docs/serve-openai.md)** for supported `/v1/...` routes, multipart request shapes, and how to register models — including **`whispercpp-audio-translation`** for `POST /v1/audio/translations` (Whisper translate-to-English) and the volatile **`POST /v1/responses`** Responses API with `previous_response_id` chaining.

## Configuration

Expand Down
41 changes: 41 additions & 0 deletions packages/cli/docs/serve-openai.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,53 @@ This document describes the supported routes and how to configure `serve.models`
| `GET` | `/v1/models/{id}` | Model metadata |
| `DELETE` | `/v1/models/{id}` | Unload |
| `POST` | `/v1/chat/completions` | Chat |
| `POST` | `/v1/responses` | Responses API (blocking + SSE streaming); volatile, see below |
| `GET` | `/v1/responses/{id}` | Retrieve a stored response |
| `DELETE` | `/v1/responses/{id}` | Delete a stored response |
| `GET` | `/v1/responses/{id}/input_items` | Paginate the original input items |
| `POST` | `/v1/embeddings` | Embeddings |
| `POST` | `/v1/audio/transcriptions` | Speech-to-text (source language) |
| `POST` | `/v1/audio/translations` | Speech-to-text **into English** (Whisper translate task) |

Other OpenAI routes may be added over time; this file is updated when they ship.

## `POST /v1/responses`

OpenAI-compatible Responses API: blocking, SSE streaming, retrieval by id,
and `previous_response_id` chaining. Backed by the same chat models registered
under `serve.models` (any alias whose endpoint category is `chat`).

> **Volatile state.** All responses are kept in process memory only — there is
> no disk or P2P persistence. Stored ids expire on server restart, after the
> per-entry TTL (1h by default), or once the LRU cap (256 entries) evicts
> them. Each response is also tagged with `X-QVAC-Stub: responses-volatile`
> and a one-line warning is logged at startup so operators know the surface is
> not durable. Pass `store: false` in the request body to skip persistence
> entirely.

Intentionally rejected with `400`: `conversation`, `background: true`, and
built-in tools (`web_search`, `file_search`, `code_interpreter`).
`function`-typed tools work normally.

### Examples

```bash
# Blocking
curl -sS http://127.0.0.1:11434/v1/responses \
-H "Content-Type: application/json" \
-d '{"model":"<alias>","input":"ping","store":true}'

# Streaming (SSE)
curl -sN http://127.0.0.1:11434/v1/responses \
-H "Content-Type: application/json" \
-d '{"model":"<alias>","input":"ping","stream":true}'

# Multi-turn via previous_response_id
curl -sS http://127.0.0.1:11434/v1/responses \
-H "Content-Type: application/json" \
-d '{"model":"<alias>","input":"and now?","previous_response_id":"resp_..."}'
```

## `POST /v1/audio/translations`

OpenAI’s **translations** endpoint always returns **English text**. It maps to Whisper’s **translate** task (not “transcribe then run a text translator”).
Expand Down
12 changes: 12 additions & 0 deletions packages/cli/src/serve/adapters/openai/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,18 @@ export function createOpenAIAdapter (): APIAdapter {
return true
}

if (method === 'POST' && path === '/v1/responses') {
const { handlePostResponses } = await import('./routes/responses.js')
await handlePostResponses(req, res, ctx)
return true
}

if (path.startsWith('/v1/responses/')) {
const { routeResponsesId } = await import('./routes/responses-id.js')
const handled = await routeResponsesId(req, res, ctx)
if (handled) return true
}

if (method === 'POST' && path === '/v1/chat/completions') {
const { handleChatCompletions } = await import('./routes/chat.js')
await handleChatCompletions(req, res, ctx)
Expand Down
127 changes: 127 additions & 0 deletions packages/cli/src/serve/adapters/openai/responses-shape.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
import crypto from 'node:crypto'
import type { SDKToolCall, CompletionRunStats } from '../../core/sdk.js'
import { sdkToolCallsToOpenai } from './translate.js'

/** New response identifier: the `resp_` prefix followed by a random UUID. */
export function responseId (): string {
  return 'resp_' + randomId()
}

/** New assistant message item identifier: the `msg_` prefix followed by a random UUID. */
export function messageId (): string {
  return 'msg_' + randomId()
}

/** New function-call output item identifier: the `fc_` prefix followed by a random UUID. */
export function functionCallOutputItemId (): string {
  return 'fc_' + randomId()
}

/** Shared random suffix for the id helpers above (a UUID from `node:crypto`). */
function randomId (): string {
  const uuid = crypto.randomUUID()
  return uuid
}

/**
 * Inputs consumed by `buildResponseObject` to assemble one `response` payload.
 */
export interface BuildResponseObjectParams {
/** Emitted verbatim as the response `id`. */
id: string
/** Emitted as the response `model`. */
modelAlias: string
/** Assistant text; used for the message item's `output_text` content and the top-level `output_text` (empty string when falsy). */
text: string
/** SDK tool calls; a non-empty list switches `status` to `requires_action` and adds `function_call` output items plus `required_action`. */
toolCalls: SDKToolCall[] | null | undefined
/** Emitted as `created_at`. */
createdAtSec: number
/** Copied to `metadata` only when neither `null` nor `undefined`. */
metadata: Record<string, unknown> | null | undefined
/** Echoed as `temperature` when defined. */
temperature: number | undefined
/** Echoed as `top_p` when defined. */
topP: number | undefined
/** Echoed as `max_output_tokens` when defined. */
maxOutputTokens: number | undefined
/** Emitted as `parallel_tool_calls`. */
parallelToolCalls: boolean
/** Emitted as `previous_response_id` only when truthy. */
previousResponseId: string | null | undefined
/** Emitted as `store`. */
store: boolean
/** When set (e.g. streaming), must match SSE item ids so finalized response matches the stream. */
messageItemId?: string
/** When set, must align with `toolCalls` length; same ids as streamed function_call items. */
functionCallItemIds?: string[]
/** From SDK completion stats; `generatedTokens` maps to `usage.output_tokens`. */
stats?: CompletionRunStats
}

/**
 * Rough size estimate for `text`: the number of whitespace-separated words.
 * Used as the output token count when the SDK reports no usable figure.
 */
function wordCountFallback (text: string): number {
  if (!text) return 0
  const words = text.match(/\S+/g)
  return words === null ? 0 : words.length
}

/**
 * Assemble the JSON-serialisable `response` object returned by the Responses routes.
 *
 * The `output` array always starts with one assistant `message` item carrying the
 * text (possibly empty). When `params.toolCalls` is non-empty, one `function_call`
 * item per translated tool call follows, `status` becomes `requires_action`, and a
 * `required_action.submit_tool_outputs` section lists the same calls.
 *
 * @param params - Request echo fields, generated text, tool calls and optional
 *   pre-allocated item ids (see `BuildResponseObjectParams`).
 * @returns A plain object; optional fields (`metadata`, `temperature`, `top_p`,
 *   `max_output_tokens`, `previous_response_id`) are present only when supplied.
 */
export function buildResponseObject (params: BuildResponseObjectParams): Record<string, unknown> {
  const rawCalls = params.toolCalls
  const hasToolCalls = rawCalls !== null && rawCalls !== undefined && rawCalls.length > 0
  const finalText = params.text || ''

  // The assistant message item is always emitted first, even when the text is empty.
  const output: unknown[] = [
    {
      type: 'message',
      id: params.messageItemId ?? messageId(),
      status: 'completed',
      role: 'assistant',
      content: [{ type: 'output_text', text: finalText, annotations: [] }]
    }
  ]

  const translatedCalls = sdkToolCallsToOpenai(params.toolCalls) ?? []

  if (hasToolCalls) {
    const presetIds = params.functionCallItemIds
    for (const [index, call] of translatedCalls.entries()) {
      // Reuse the caller-supplied item id for this position when there is one,
      // otherwise mint a fresh id.
      const preset = presetIds === undefined ? undefined : presetIds[index]
      output.push({
        type: 'function_call',
        id: preset !== undefined ? preset : functionCallOutputItemId(),
        call_id: call.id,
        name: call.function.name,
        arguments: call.function.arguments,
        status: 'completed'
      })
    }
  }

  // Prefer the SDK's generated-token count; otherwise estimate from the text.
  const reportedTokens = params.stats?.generatedTokens
  const outputTokens =
    typeof reportedTokens === 'number' && Number.isFinite(reportedTokens)
      ? reportedTokens
      : wordCountFallback(finalText)
  // SDK does not expose prompt token count today; `cacheTokens` is KV-cache hit count, not full prompt size.
  const inputTokens = 0

  const response: Record<string, unknown> = {
    id: params.id,
    object: 'response',
    created_at: params.createdAtSec,
    status: hasToolCalls ? 'requires_action' : 'completed',
    model: params.modelAlias,
    output,
    output_text: finalText,
    usage: {
      input_tokens: inputTokens,
      output_tokens: outputTokens,
      total_tokens: inputTokens + outputTokens
    },
    parallel_tool_calls: params.parallelToolCalls,
    store: params.store
  }

  if (hasToolCalls) {
    const pendingCalls = translatedCalls.map((call) => ({
      id: call.id,
      type: 'function',
      function: {
        name: call.function.name,
        arguments: call.function.arguments
      }
    }))
    response['required_action'] = {
      type: 'submit_tool_outputs',
      submit_tool_outputs: { tool_calls: pendingCalls }
    }
  }

  // Optional echo fields are added only when the caller supplied them.
  if (params.metadata !== undefined && params.metadata !== null) {
    response['metadata'] = params.metadata
  }
  if (params.temperature !== undefined) {
    response['temperature'] = params.temperature
  }
  if (params.topP !== undefined) {
    response['top_p'] = params.topP
  }
  if (params.maxOutputTokens !== undefined) {
    response['max_output_tokens'] = params.maxOutputTokens
  }
  if (params.previousResponseId) {
    response['previous_response_id'] = params.previousResponseId
  }

  return response
}
142 changes: 142 additions & 0 deletions packages/cli/src/serve/adapters/openai/responses-store.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/**
 * Marker string for the in-memory (non-durable) Responses surface.
 * NOTE(review): presumably the value of the `X-QVAC-Stub` tag described in
 * docs/serve-openai.md — the usage is not in this file; confirm against the routes.
 */
export const RESPONSES_VOLATILE_STUB = 'responses-volatile'

/** One entry held by the responses store, keyed by `id`. */
export interface StoredResponse {
/** Key under which the record is stored and looked up. */
id: string
createdAtSec: number
/** Expiry instant in seconds; the entry is dropped once this is <= the store clock (`now()` / 1000). */
expiresAtSec: number
responseObject: Record<string, unknown>
/** Items paged by `listInputItems`; the cursor matches on each item's `id` property. */
inputItems: unknown[]
/** Not read by the store itself; kept alongside the record for callers. */
modelAlias: string
}

/** Optional tuning knobs for `createResponsesStore`. */
export interface ResponsesStoreOptions {
/** Capacity; on `put`, oldest entries are evicted while the store holds more than this. Defaults to 256. */
maxEntries?: number
/**
 * TTL in milliseconds (default one hour). Within the store this value is only
 * reported by `bannerLine()`; actual expiry follows each record's `expiresAtSec`.
 */
ttlMs?: number
/** Clock returning milliseconds; defaults to `Date.now()`. */
now?: () => number
}

/** Paging options for `ResponsesStore.listInputItems`. */
export interface ListInputItemsOptions {
/** Page size; positive numbers are capped at 100, anything else falls back to 20. */
limit?: number
/** Cursor: `id` of the item after which the page starts. An id that matches no item yields an empty page. */
after?: string | undefined
}

/** Operations exposed by the store returned from `createResponsesStore`. */
export interface ResponsesStore {
/** Insert or replace a record under `record.id`, then evict oldest entries while over capacity. */
put: (record: StoredResponse) => void
/** Return the unexpired record for `id` and mark it most recently used; `undefined` when missing or expired. */
get: (id: string) => StoredResponse | undefined
/** Remove the record for `id`; returns whether an entry was removed. */
delete: (id: string) => boolean
/** Return one page of the record's input items as a list envelope, or `null` when the record is missing or expired. */
listInputItems: (id: string, opts?: ListInputItemsOptions) => {
object: string
data: unknown[]
first_id: string | null
last_id: string | null
has_more: boolean
} | null
/** Number of entries remaining after expired ones are pruned. */
size: () => number
/** One-line human-readable summary of the store's limits. */
bannerLine: () => string
}

// Default capacity used when `maxEntries` is not supplied.
const DEFAULT_MAX = 256
// Default TTL used when `ttlMs` is not supplied: one hour, in milliseconds.
const DEFAULT_TTL_MS = 60 * 60 * 1000

/** The default TTL expressed in whole seconds. */
export const RESPONSES_DEFAULT_TTL_SEC = Math.floor(DEFAULT_TTL_MS / 1000)

/**
 * Create an in-memory responses store with expiry and least-recently-used eviction.
 *
 * Entries live in a `Map` whose insertion order doubles as recency order: `put`
 * and `get` re-insert the entry at the end, and `put` evicts from the front while
 * the store holds more than `maxEntries`. An entry is expired once its
 * `expiresAtSec` is <= the current clock in seconds.
 *
 * @param options - Optional capacity, TTL (reported by `bannerLine`) and clock.
 * @returns The store operations bound to a fresh, empty map.
 */
export function createResponsesStore (options: ResponsesStoreOptions = {}): ResponsesStore {
  const maxEntries = options.maxEntries ?? DEFAULT_MAX
  const ttlMs = options.ttlMs ?? DEFAULT_TTL_MS
  const nowMs = options.now ?? ((): number => Date.now())

  const entries = new Map<string, StoredResponse>()

  const nowSec = (): number => nowMs() / 1000

  /** Drop every entry whose expiry instant has been reached. */
  function pruneExpired (): void {
    const cutoff = nowSec()
    entries.forEach((rec, key) => {
      if (rec.expiresAtSec <= cutoff) entries.delete(key)
    })
  }

  /** Move (or add) `rec` to the most-recently-used end of the map. */
  function bump (id: string, rec: StoredResponse): void {
    entries.delete(id)
    entries.set(id, rec)
  }

  /** Prune, then return the still-valid record for `id`, if any. */
  function fetchLive (id: string): StoredResponse | undefined {
    pruneExpired()
    const rec = entries.get(id)
    if (!rec) return undefined
    if (rec.expiresAtSec <= nowSec()) {
      entries.delete(id)
      return undefined
    }
    return rec
  }

  /** The string `id` of an item, or `null` when the item has none. */
  function stringIdOf (item: unknown): string | null {
    if (!item || typeof item !== 'object') return null
    const candidate = (item as { id?: unknown }).id
    return typeof candidate === 'string' ? candidate : null
  }

  return {
    put (record: StoredResponse): void {
      pruneExpired()
      bump(record.id, record)
      // Evict from the least-recently-used end until back within capacity.
      while (entries.size > maxEntries) {
        const oldest = entries.keys().next()
        if (oldest.done === true || oldest.value === undefined) break
        entries.delete(oldest.value)
      }
    },

    get (id: string): StoredResponse | undefined {
      const rec = fetchLive(id)
      if (!rec) return undefined
      bump(id, rec)
      return rec
    },

    delete (id: string): boolean {
      return entries.delete(id)
    },

    listInputItems (id: string, opts?: ListInputItemsOptions): ReturnType<ResponsesStore['listInputItems']> {
      const rec = fetchLive(id)
      if (!rec) return null

      const requested = opts?.limit
      const pageSize = typeof requested === 'number' && requested > 0 ? Math.min(requested, 100) : 20

      const items = rec.inputItems as Array<{ id?: string }>
      let offset = 0
      if (opts?.after) {
        const cursor = opts.after
        const at = items.findIndex((item) => {
          if (!item || typeof item !== 'object') return false
          return (item as Record<string, unknown>)['id'] === cursor
        })
        // An unknown cursor positions the page past the end (empty result).
        offset = at >= 0 ? at + 1 : items.length
      }

      const page = items.slice(offset, offset + pageSize)
      return {
        object: 'list',
        data: page,
        first_id: stringIdOf(page[0]),
        last_id: stringIdOf(page[page.length - 1]),
        has_more: offset + page.length < items.length
      }
    },

    size (): number {
      pruneExpired()
      return entries.size
    },

    bannerLine (): string {
      const ttlMin = Math.round(ttlMs / 60000)
      return `responses: in-memory only — IDs expire on restart, max ${maxEntries} entries, ${ttlMin}m TTL`
    }
  }
}
Loading
Loading