tetherto · lauripiisang · May 14, 2026 · May 13, 2026 · May 13, 2026 · May 13, 2026
@@ -306,7 +306,7 @@ Run an **OpenAI-compatible HTTP server** backed by locally configured QVAC model
 qvac serve openai [options]
 ```
 
-See **[docs/serve-openai.md](./docs/serve-openai.md)** for supported `/v1/...` routes, multipart request shapes, and how to register models — including **`whispercpp-audio-translation`** for `POST /v1/audio/translations` (Whisper translate-to-English).
+See **[docs/serve-openai.md](./docs/serve-openai.md)** for supported `/v1/...` routes, multipart request shapes, and how to register models — including **`whispercpp-audio-translation`** for `POST /v1/audio/translations` (Whisper translate-to-English) and the volatile **`POST /v1/responses`** Responses API with `previous_response_id` chaining.
 
 ## Configuration
 

@@ -12,12 +12,53 @@ This document describes the supported routes and how to configure `serve.models`
 | `GET` | `/v1/models/{id}` | Model metadata |
 | `DELETE` | `/v1/models/{id}` | Unload |
 | `POST` | `/v1/chat/completions` | Chat |
+| `POST` | `/v1/responses` | Responses API (blocking + SSE streaming); volatile, see below |
+| `GET` | `/v1/responses/{id}` | Retrieve a stored response |
+| `DELETE` | `/v1/responses/{id}` | Delete a stored response |
+| `GET` | `/v1/responses/{id}/input_items` | Paginate the original input items |
 | `POST` | `/v1/embeddings` | Embeddings |
 | `POST` | `/v1/audio/transcriptions` | Speech-to-text (source language) |
 | `POST` | `/v1/audio/translations` | Speech-to-text **into English** (Whisper translate task) |
 
 Other OpenAI routes may be added over time; this file is updated when they ship.
 
+## `POST /v1/responses`
+
+OpenAI-compatible Responses API: blocking, SSE streaming, retrieval by id,
+and `previous_response_id` chaining. Backed by the same chat models registered
+under `serve.models` (any alias whose endpoint category is `chat`).
+
+> **Volatile state.** All responses are kept in process memory only — there is
+> no disk or P2P persistence. Stored responses are lost on server restart, and
+> are dropped once the per-entry TTL (1h by default) elapses or the LRU cap
+> (256 entries) evicts them. Each HTTP response also carries an
+> `X-QVAC-Stub: responses-volatile` header, and a one-line warning is logged
+> at startup so operators know the surface is not durable. Pass
+> `store: false` in the request body to skip persistence entirely.
+
+Requests that use any of the following are intentionally rejected with HTTP
+`400`: the `conversation` parameter, `background: true`, and built-in tools
+(`web_search`, `file_search`, `code_interpreter`). `function`-typed tools
+work normally.
+
+### Examples
+
+```bash
+# Blocking
+curl -sS http://127.0.0.1:11434/v1/responses \
+  -H "Content-Type: application/json" \
+  -d '{"model":"<alias>","input":"ping","store":true}'
+
+# Streaming (SSE)
+curl -sN http://127.0.0.1:11434/v1/responses \
+  -H "Content-Type: application/json" \
+  -d '{"model":"<alias>","input":"ping","stream":true}'
+
+# Multi-turn via previous_response_id
+curl -sS http://127.0.0.1:11434/v1/responses \
+  -H "Content-Type: application/json" \
+  -d '{"model":"<alias>","input":"and now?","previous_response_id":"resp_..."}'
+```
+
 ## `POST /v1/audio/translations`
 
 OpenAI’s **translations** endpoint always returns **English text**. It maps to Whisper’s **translate** task (not “transcribe then run a text translator”).

@@ -32,6 +32,18 @@ export function createOpenAIAdapter (): APIAdapter {
         return true
       }
 
+      if (method === 'POST' && path === '/v1/responses') {
+        const { handlePostResponses } = await import('./routes/responses.js')
+        await handlePostResponses(req, res, ctx)
+        return true
+      }
+
+      if (path.startsWith('/v1/responses/')) {
+        const { routeResponsesId } = await import('./routes/responses-id.js')
+        const handled = await routeResponsesId(req, res, ctx)
+        if (handled) return true
+      }
+
       if (method === 'POST' && path === '/v1/chat/completions') {
         const { handleChatCompletions } = await import('./routes/chat.js')
         await handleChatCompletions(req, res, ctx)

@@ -0,0 +1,127 @@
+import crypto from 'node:crypto'
+import type { SDKToolCall, CompletionRunStats } from '../../core/sdk.js'
+import { sdkToolCallsToOpenai } from './translate.js'
+
+/** Mints an identifier for a response object; the value always starts with `resp_`. */
+export function responseId (): string {
+  return 'resp_' + randomId()
+}
+
+/** Mints an identifier for an assistant `message` output item (`msg_` prefix). */
+export function messageId (): string {
+  return 'msg_' + randomId()
+}
+
+/** Mints an identifier for a `function_call` output item (`fc_` prefix). */
+export function functionCallOutputItemId (): string {
+  return 'fc_' + randomId()
+}
+
+/** Random suffix shared by the id helpers above: a UUID obtained from `node:crypto`. */
+function randomId (): string {
+  const uuid: string = crypto.randomUUID()
+  return uuid
+}
+
+/** Inputs consumed by `buildResponseObject` to assemble a `response` payload. */
+export interface BuildResponseObjectParams {
+  /** Response identifier; emitted as `id`. */
+  id: string
+  /** Model alias; emitted as `model`. */
+  modelAlias: string
+  /** Assistant text; emitted as `output_text` and as the message item's `output_text` content. */
+  text: string
+  /** SDK tool calls; when non-empty, each converted call becomes a `function_call` output item. */
+  toolCalls: SDKToolCall[] | null | undefined
+  /** Emitted as `created_at`; the name indicates seconds — presumably Unix epoch, confirm with callers. */
+  createdAtSec: number
+  /** Emitted as `metadata` only when neither `null` nor `undefined`. */
+  metadata: Record<string, unknown> | null | undefined
+  /** Echoed as `temperature` only when defined. */
+  temperature: number | undefined
+  /** Echoed as `top_p` only when defined. */
+  topP: number | undefined
+  /** Echoed as `max_output_tokens` only when defined. */
+  maxOutputTokens: number | undefined
+  /** Echoed as `parallel_tool_calls`. */
+  parallelToolCalls: boolean
+  /** Echoed as `previous_response_id` only when truthy. */
+  previousResponseId: string | null | undefined
+  /** Echoed as `store`. */
+  store: boolean
+  /** When set (e.g. streaming), must match SSE item ids so finalized response matches the stream. */
+  messageItemId?: string
+  /** When set, must align with `toolCalls` length; same ids as streamed function_call items. */
+  functionCallItemIds?: string[]
+  /**
+   * From SDK completion stats; `generatedTokens` maps to `usage.output_tokens`.
+   * A whitespace word count of `text` is used when it is absent or not finite.
+   */
+  stats?: CompletionRunStats
+}
+
+/**
+ * Rough output-token estimate: the number of whitespace-separated words in
+ * `text`. Empty (or otherwise falsy) text counts as zero.
+ */
+function wordCountFallback (text: string): number {
+  if (!text) return 0
+  let words = 0
+  for (const piece of text.split(/\s+/)) {
+    // Leading/trailing whitespace produces empty pieces; skip them.
+    if (piece) words += 1
+  }
+  return words
+}
+
+/**
+ * Assembles the JSON-serialisable `response` object for a finished completion.
+ *
+ * The `output` array always starts with one assistant `message` item holding
+ * the text (possibly empty), followed by one `function_call` item per
+ * converted tool call when `params.toolCalls` is non-empty. Optional request
+ * echoes (`metadata`, `temperature`, `top_p`, `max_output_tokens`,
+ * `previous_response_id`) are only added when provided.
+ *
+ * NOTE(review): tool calls switch `status` to `requires_action` and add a
+ * `required_action` envelope. Confirm against the upstream Responses API
+ * reference that clients expect this shape rather than a `completed` status
+ * with plain `function_call` items.
+ *
+ * @param params - Completion result plus the request fields to echo back.
+ * @returns The response object, with keys in a stable insertion order.
+ */
+export function buildResponseObject (params: BuildResponseObjectParams): Record<string, unknown> {
+  const text = params.text || ''
+  const hasToolCalls = (params.toolCalls?.length ?? 0) > 0
+
+  // Reuse the streamed message id when one is supplied so both views agree.
+  const output: unknown[] = [
+    {
+      type: 'message',
+      id: params.messageItemId ?? messageId(),
+      status: 'completed',
+      role: 'assistant',
+      content: [{ type: 'output_text', text, annotations: [] }]
+    }
+  ]
+
+  // Converted unconditionally, exactly once; a nullish result is treated as "no calls".
+  const openaiCalls = sdkToolCallsToOpenai(params.toolCalls) ?? []
+
+  if (hasToolCalls) {
+    const presetIds = params.functionCallItemIds
+    openaiCalls.forEach((call, index) => {
+      // Prefer the id already announced for this position; mint one otherwise.
+      const preset = presetIds === undefined ? undefined : presetIds[index]
+      output.push({
+        type: 'function_call',
+        id: preset !== undefined ? preset : functionCallOutputItemId(),
+        call_id: call.id,
+        name: call.function.name,
+        arguments: call.function.arguments,
+        status: 'completed'
+      })
+    })
+  }
+
+  const generated = params.stats?.generatedTokens
+  const outputTokens = typeof generated === 'number' && Number.isFinite(generated)
+    ? generated
+    : wordCountFallback(text)
+  // SDK does not expose prompt token count today; `cacheTokens` is KV-cache hit count, not full prompt size.
+  const inputTokens = 0
+
+  const response: Record<string, unknown> = {
+    id: params.id,
+    object: 'response',
+    created_at: params.createdAtSec,
+    status: hasToolCalls ? 'requires_action' : 'completed',
+    model: params.modelAlias,
+    output,
+    output_text: text,
+    usage: {
+      input_tokens: inputTokens,
+      output_tokens: outputTokens,
+      total_tokens: inputTokens + outputTokens
+    },
+    parallel_tool_calls: params.parallelToolCalls,
+    store: params.store
+  }
+
+  if (hasToolCalls) {
+    const toolCalls = openaiCalls.map((call) => ({
+      id: call.id,
+      type: 'function',
+      function: {
+        name: call.function.name,
+        arguments: call.function.arguments
+      }
+    }))
+    response['required_action'] = {
+      type: 'submit_tool_outputs',
+      submit_tool_outputs: { tool_calls: toolCalls }
+    }
+  }
+
+  const metadata = params.metadata ?? null
+  if (metadata !== null) {
+    response['metadata'] = metadata
+  }
+  if (params.temperature !== undefined) {
+    response['temperature'] = params.temperature
+  }
+  if (params.topP !== undefined) {
+    response['top_p'] = params.topP
+  }
+  if (params.maxOutputTokens !== undefined) {
+    response['max_output_tokens'] = params.maxOutputTokens
+  }
+  if (params.previousResponseId) {
+    response['previous_response_id'] = params.previousResponseId
+  }
+
+  return response
+}
@@ -0,0 +1,142 @@
+/**
+ * Marker string for the volatile Responses surface. It matches the
+ * `X-QVAC-Stub: responses-volatile` header value described in
+ * docs/serve-openai.md; where the header is set is not visible in this file.
+ */
+export const RESPONSES_VOLATILE_STUB = 'responses-volatile'
+
+/** One entry held by the in-memory responses store. */
+export interface StoredResponse {
+  /** Key under which the record is stored and looked up. */
+  id: string
+  /** Creation time in seconds; not read by the store itself. */
+  createdAtSec: number
+  /** Expiry time in seconds; the record is treated as gone once `now() / 1000` reaches it. */
+  expiresAtSec: number
+  /** The response payload; opaque to the store. */
+  responseObject: Record<string, unknown>
+  /** Input items paginated by `listInputItems`; cursors match on each item's string `id`. */
+  inputItems: unknown[]
+  /** Model alias associated with the response; not read by the store itself. */
+  modelAlias: string
+}
+
+/** Tuning knobs for `createResponsesStore`; every field is optional. */
+export interface ResponsesStoreOptions {
+  /** Maximum number of entries kept before the least recently used is evicted (default 256). */
+  maxEntries?: number
+  /**
+   * TTL in milliseconds (default one hour). Within the store it only feeds
+   * `bannerLine()`; actual expiry is driven by each record's `expiresAtSec`.
+   */
+  ttlMs?: number
+  /** Clock returning milliseconds; defaults to `Date.now()`. */
+  now?: () => number
+}
+
+/** Pagination options accepted by `ResponsesStore.listInputItems`. */
+export interface ListInputItemsOptions {
+  /** Page size; positive values are capped at 100, anything else falls back to 20. */
+  limit?: number
+  /** Cursor: `id` of the item after which the page starts; an unknown id yields an empty page. */
+  after?: string | undefined
+}
+
+/** Operations exposed by the in-memory responses store. */
+export interface ResponsesStore {
+  /** Stores `record` as the most recently used entry, then evicts the oldest entries beyond the cap. */
+  put: (record: StoredResponse) => void
+  /** Returns the unexpired record for `id` and marks it most recently used; `undefined` if missing or expired. */
+  get: (id: string) => StoredResponse | undefined
+  /** Removes the entry for `id`; returns `true` when an entry was removed. */
+  delete: (id: string) => boolean
+  /** Returns one page of the record's input items in list form, or `null` if the record is missing or expired. */
+  listInputItems: (id: string, opts?: ListInputItemsOptions) => {
+    object: string
+    data: unknown[]
+    first_id: string | null
+    last_id: string | null
+    has_more: boolean
+  } | null
+  /** Number of entries left after expired ones are pruned. */
+  size: () => number
+  /** One-line human-readable summary of the store's volatility limits (entry cap and TTL). */
+  bannerLine: () => string
+}
+
+/** Entry cap used when `maxEntries` is not supplied. */
+const DEFAULT_MAX = 256
+/** TTL in milliseconds (one hour) used when `ttlMs` is not supplied. */
+const DEFAULT_TTL_MS = 60 * 60 * 1000
+
+/** `DEFAULT_TTL_MS` expressed in whole seconds. */
+export const RESPONSES_DEFAULT_TTL_SEC = Math.floor(DEFAULT_TTL_MS / 1000)
+
+/**
+ * Creates an in-memory, LRU-bounded store for response records.
+ *
+ * Entries live in a `Map`, whose insertion order doubles as recency order:
+ * `put` and `get` re-insert the entry, so the first key is always the least
+ * recently used one. Expiry is driven by each record's `expiresAtSec`
+ * (compared against `now() / 1000`); `ttlMs` is only reported by
+ * `bannerLine()`.
+ *
+ * @param options - Optional entry cap, TTL (banner only) and clock override.
+ * @returns The store operations; all state is private to the returned object.
+ */
+export function createResponsesStore (options: ResponsesStoreOptions = {}): ResponsesStore {
+  const maxEntries = options.maxEntries ?? DEFAULT_MAX
+  const ttlMs = options.ttlMs ?? DEFAULT_TTL_MS
+  const nowMs = options.now ?? ((): number => Date.now())
+
+  const map = new Map<string, StoredResponse>()
+
+  /** Drops every entry whose expiry time has been reached. */
+  function pruneExpired (): void {
+    const t = nowMs() / 1000
+    for (const [k, v] of map) {
+      if (v.expiresAtSec <= t) map.delete(k)
+    }
+  }
+
+  /** Moves (or inserts) `id` to the most-recently-used end of the map. */
+  function bump (id: string, rec: StoredResponse): void {
+    map.delete(id)
+    map.set(id, rec)
+  }
+
+  /** Returns the unexpired record for `id`, removing it if it has expired. */
+  function findLive (id: string): StoredResponse | undefined {
+    pruneExpired()
+    const rec = map.get(id)
+    if (rec === undefined) return undefined
+    // Re-checked because the clock may have advanced since the prune above.
+    if (rec.expiresAtSec <= nowMs() / 1000) {
+      map.delete(id)
+      return undefined
+    }
+    return rec
+  }
+
+  /** Returns the string `id` property of an input item, or `null` when it has none. */
+  function itemId (item: unknown): string | null {
+    if (item === null || typeof item !== 'object') return null
+    const value = (item as Record<string, unknown>)['id']
+    return typeof value === 'string' ? value : null
+  }
+
+  return {
+    put (record: StoredResponse): void {
+      pruneExpired()
+      bump(record.id, record)
+      // Evict from the least-recently-used end until the cap is respected.
+      while (map.size > maxEntries) {
+        const first = map.keys().next().value
+        if (first === undefined) break
+        map.delete(first)
+      }
+    },
+
+    get (id: string): StoredResponse | undefined {
+      const rec = findLive(id)
+      if (rec === undefined) return undefined
+      bump(id, rec)
+      return rec
+    },
+
+    delete (id: string): boolean {
+      // Prune first so an expired id reports "not found", consistent with get().
+      pruneExpired()
+      return map.delete(id)
+    },
+
+    listInputItems (id: string, opts?: ListInputItemsOptions): {
+      object: string
+      data: unknown[]
+      first_id: string | null
+      last_id: string | null
+      has_more: boolean
+    } | null {
+      // Listing does not refresh the record's LRU position.
+      const rec = findLive(id)
+      if (rec === undefined) return null
+
+      // Integer page size in [1, 100]; a fractional value below 1 would
+      // otherwise yield an empty page that still reports has_more.
+      const requested = opts?.limit
+      const limit = typeof requested === 'number' && requested > 0
+        ? Math.max(1, Math.min(Math.floor(requested), 100))
+        : 20
+
+      const items = rec.inputItems
+      const after = opts?.after
+      let start = 0
+      if (after) {
+        const idx = items.findIndex((it) => itemId(it) === after)
+        // An unknown cursor yields an empty page rather than restarting from the top.
+        start = idx >= 0 ? idx + 1 : items.length
+      }
+
+      const slice = items.slice(start, start + limit)
+      return {
+        object: 'list',
+        data: slice,
+        first_id: itemId(slice[0]),
+        last_id: itemId(slice[slice.length - 1]),
+        has_more: start + slice.length < items.length
+      }
+    },
+
+    size (): number {
+      pruneExpired()
+      return map.size
+    },
+
+    bannerLine (): string {
+      const ttlMin = Math.round(ttlMs / 60000)
+      return `responses: in-memory only — IDs expire on restart, max ${maxEntries} entries, ${ttlMin}m TTL`
+    }
+  }
+}