tetherto · NamelsKing · Apr 15, 2026 · Apr 7, 2026 · Apr 10, 2026 · Apr 10, 2026
@@ -2,6 +2,7 @@ import { send } from "@/client/rpc/rpc-client";
 import {
   type EmbedParams,
   type EmbedRequest,
+  type EmbedStats,
   type RPCOptions,
 } from "@/schemas";
 import { InvalidResponseError } from "@/utils/errors-client";
@@ -18,7 +19,7 @@ import { InvalidResponseError } from "@/utils/errors-client";
 export async function embed(
   params: { modelId: string; text: string },
   options?: RPCOptions,
-): Promise<number[]>;
+): Promise<{ embedding: number[]; stats?: EmbedStats }>;
 
 /**
  * Generates embeddings for multiple texts using a specified model.
@@ -32,12 +33,12 @@ export async function embed(
 export async function embed(
   params: { modelId: string; text: string[] },
   options?: RPCOptions,
-): Promise<number[][]>;
+): Promise<{ embedding: number[][]; stats?: EmbedStats }>;
 
 export async function embed(
   params: EmbedParams,
   options?: RPCOptions,
-): Promise<number[] | number[][]> {
+): Promise<{ embedding: number[] | number[][]; stats?: EmbedStats }> {
   const request: EmbedRequest = {
     type: "embed",
     ...params,
@@ -48,5 +49,8 @@ export async function embed(
     throw new InvalidResponseError("embed");
   }
 
-  return response.embedding;
+  return {
+    embedding: response.embedding,
+    ...(response.stats !== undefined && { stats: response.stats }),
+  };
 }
@@ -206,7 +206,7 @@ export async function ragIngest(
  * ```typescript
  * // Segregated flow
  * const chunks = await ragChunk({ documents: ["text1", "text2"] });
- * const embeddings = await embed({ modelId, text: chunks.map(c => c.content) });
+ * const { embedding: embeddings } = await embed({ modelId, text: chunks.map(c => c.content) });
  * const embeddedDocs = chunks.map((chunk, i) => ({
  *   ...chunk,
  *   embedding: embeddings[i],

@@ -24,7 +24,7 @@ try {
   console.log("\n📝 Example 1: Single Text Embedding");
   console.log("=".repeat(50));
 
-  const singleEmbedding = await embed({ modelId, text: "Hello, world!" });
+  const { embedding: singleEmbedding } = await embed({ modelId, text: "Hello, world!" });
 
   console.log("Input: 'Hello, world!'");
   console.log("Embedding dimensions:", singleEmbedding.length);
@@ -39,7 +39,7 @@ try {
     "Python is a programming language",
   ];
 
-  const batchEmbeddings = await embed({ modelId, text: texts });
+  const { embedding: batchEmbeddings } = await embed({ modelId, text: texts });
 
   console.log("Input: Array of", texts.length, "texts");
   console.log("Output: Array of", batchEmbeddings.length, "embeddings");

@@ -76,7 +76,7 @@ try {
     history: messages,
     stream: true,
   });
-  const embedding = await embed({
+  const { embedding } = await embed({
     modelId: embedModelId,
     text: messages[0]?.content ?? "Hello, world!",
   });

@@ -18,21 +18,21 @@ try {
   console.log("Model loaded:", modelId);
 
   console.log("\n=== Embed with per-call profiling ===");
-  const embedding1 = await embed(
+  const { embedding: embedding1 } = await embed(
     { modelId, text: "Profile this specific call" },
     { profiling: { enabled: true, includeServerBreakdown: true } },
   );
   console.log("Embedding dimensions:", embedding1.length);
 
   console.log("\n=== Embed without profiling ===");
-  const embedding2 = await embed({
+  const { embedding: embedding2 } = await embed({
     modelId,
     text: "This call is not profiled",
   });
   console.log("Embedding dimensions:", embedding2.length);
 
   console.log("\n=== Embed with profiling explicitly disabled ===");
-  const embedding3 = await embed(
+  const { embedding: embedding3 } = await embed(
     { modelId, text: "Profiling explicitly disabled for this call" },
     { profiling: { enabled: false } },
   );

@@ -105,7 +105,7 @@ try {
   for (const s of samples) {
     ids.push(String(s.id));
     documents.push(s.text);
-    embeddings.push(await embed({ modelId, text: s.text }));
+    embeddings.push((await embed({ modelId, text: s.text })).embedding);
   }
 
   await collection.add({
@@ -115,7 +115,7 @@ try {
   });
 
   console.log("🔎 Searching for similar documents...");
-  const queryEmbedding = await embed({ modelId, text: query });
+  const { embedding: queryEmbedding } = await embed({ modelId, text: query });
 
   // Query top 3 by vector similarity and include distances
   const res = await collection.query({

@@ -49,7 +49,7 @@ try {
 
   console.log("\n🧠 Step 2: Generating embeddings (batch)...");
   const texts = chunks.map((chunk) => chunk.content);
-  const embeddings = await embed({ modelId, text: texts });
+  const { embedding: embeddings } = await embed({ modelId, text: texts });
 
   const embeddedDocs = chunks.map((chunk, i) => ({
     id: chunk.id,

@@ -73,7 +73,7 @@ try {
   console.log("📚 Embedding documents...");
   const documents = [];
   for (const sample of samples) {
-    const embedding = await embed({ modelId, text: sample.text });
+    const { embedding } = await embed({ modelId, text: sample.text });
     const record = {
       id: sample.id,
       text: sample.text,
@@ -85,7 +85,7 @@ try {
   await documentsTable.add(documents);
 
   console.log("🔎 Searching for similar documents...");
-  const queryEmbedding = await embed({ modelId, text: query });
+  const { embedding: queryEmbedding } = await embed({ modelId, text: query });
   const results = (await documentsTable
     .vectorSearch(queryEmbedding)
     .limit(1)

@@ -64,7 +64,7 @@ try {
 
   console.log("📚 Embedding documents...");
   for (const sample of samples) {
-    const embedding = await embed({ modelId, text: sample.text });
+    const { embedding } = await embed({ modelId, text: sample.text });
     db.exec({
       sql: "INSERT INTO documents VALUES (?, ?, vector_as_f32(?))",
       bind: [sample.id, sample.text, JSON.stringify(embedding)],
@@ -84,7 +84,7 @@ try {
 
   // Search for similar documents
   console.log("🔎 Searching for similar documents...");
-  const queryEmbedding = await embed({ modelId, text: query });
+  const { embedding: queryEmbedding } = await embed({ modelId, text: query });
 
   const results: Array<{
     id: number;

@@ -50,6 +50,7 @@ export {
   type ToolCallError,
   type ToolCallEvent,
   type CompletionStats,
+  type EmbedStats,
   VERBOSITY,
   type Attachment,
   type TranscribeStreamSession,

@@ -175,10 +175,10 @@
     "@qvac/decoder-audio": "^0.3.3",
     "@qvac/diffusion-cpp": "0.1.1",
     "@qvac/dl-filesystem": "^0.2.0",
-    "@qvac/embed-llamacpp": "^0.13.1",
+    "@qvac/embed-llamacpp": "^0.13.4",
     "@qvac/error": "^0.1.1",
     "@qvac/langdetect-text": "^0.1.0",
-    "@qvac/llm-llamacpp": "^0.14.0",
+    "@qvac/llm-llamacpp": "^0.14.4",
     "@qvac/logging": "^0.1.0",
     "@qvac/ocr-onnx": "^0.4.0",
     "@qvac/rag": "^0.4.4",

@@ -51,6 +51,7 @@ export const completionStatsSchema = z.object({
   timeToFirstToken: z.number().optional(),
   tokensPerSecond: z.number().optional(),
   cacheTokens: z.number().optional(),
+  backendDevice: z.enum(["cpu", "gpu"]).optional(),
 });
 
 export const completionStreamResponseSchema = z.object({

@@ -18,6 +18,7 @@ export const embedStatsSchema = z.object({
   totalTime: z.number().optional(),
   tokensPerSecond: z.number().optional(),
   totalTokens: z.number().optional(),
+  backendDevice: z.enum(["cpu", "gpu"]).optional(),
 });
 
 export const embedResponseSchema = z.object({

@@ -41,6 +41,13 @@ export const llmConfigBaseSchema = z.object({
   stop_sequences: z.array(z.string()).optional(),
   n_discarded: z.number().optional(),
   tools: z.boolean().optional(),
+  "cache-type-k": z.string().optional(),
+  "cache-type-v": z.string().optional(),
+  /**
+   * Writable directory for OpenCL kernel binary cache. Required on Android
+   * for fast GPU startup.
+   */
+  openclCacheDir: z.string().optional(),
   projectionModelSrc: modelSrcInputSchema.optional(),
 });
 
@@ -75,6 +82,11 @@ export const embedConfigBaseSchema = z.object({
     .union([z.number().int().min(0), z.enum(["integrated", "dedicated"])])
     .optional(),
   verbosity: verbositySchema.optional(),
+  /**
+   * Writable directory for OpenCL kernel binary cache. Required on Android
+   * for fast GPU startup.
+   */
+  openclCacheDir: z.string().optional(),
 });
 
 export type EmbedConfigInput = z.infer<typeof embedConfigBaseSchema>;

@@ -38,6 +38,7 @@ export async function embed(params: EmbedParams): Promise<EmbedResult> {
     ...(response.stats?.total_time_ms !== undefined && { totalTime: response.stats.total_time_ms }),
     ...(response.stats?.tokens_per_second !== undefined && { tokensPerSecond: response.stats.tokens_per_second }),
     ...(response.stats?.total_tokens !== undefined && { totalTokens: response.stats.total_tokens }),
+    ...(response.stats?.backendDevice !== undefined && { backendDevice: response.stats.backendDevice }),
   };
 
   const embeddingsArray = rawEmbeddings[0];

@@ -281,6 +281,9 @@ async function* processModelResponse(
     ...(responseWithStats.stats?.CacheTokens !== undefined && {
       cacheTokens: responseWithStats.stats.CacheTokens,
     }),
+    ...(responseWithStats.stats?.backendDevice !== undefined && {
+      backendDevice: responseWithStats.stats.backendDevice,
+    }),
   };
 
   return {

@@ -51,6 +51,11 @@ function transformLlmConfig(llmConfig: LlmConfig) {
     delete transformed["stop_sequences"];
   }
 
+  if ("opencl_cache_dir" in transformed) {
+    transformed["openclCacheDir"] = transformed["opencl_cache_dir"];
+    delete transformed["opencl_cache_dir"];
+  }
+
   return transformed;
 }
 

@@ -56,6 +56,10 @@ function transformEmbedConfig(embedConfig: EmbedConfig): GGMLConfig {
     config.verbosity = `${embedConfig.verbosity}`;
   }
 
+  if (embedConfig.openclCacheDir) {
+    config.openclCacheDir = embedConfig.openclCacheDir;
+  }
+
   return config;
 }
 

@@ -3,6 +3,7 @@ export interface LlmStats {
   TPS?: number;
   CacheTokens?: number;
   generatedTokens?: number;
+  backendDevice?: "cpu" | "gpu";
 }
 
 export interface LlmResponse {
@@ -38,6 +39,7 @@ export interface EmbedStats {
   total_time_ms?: number;
   tokens_per_second?: number;
   total_tokens?: number;
+  backendDevice?: "cpu" | "gpu";
 }
 
 export interface EmbedResponse {

@@ -0,0 +1,48 @@
+// @ts-expect-error brittle has no type declarations
+import test from "brittle";
+import {
+  completionStreamResponseSchema,
+  completionStatsSchema,
+} from "@/schemas/completion-stream";
+
+test("completionStatsSchema: accepts backendDevice 'cpu' and 'gpu'", (t) => {
+  t.is(
+    completionStatsSchema.safeParse({ backendDevice: "cpu" }).success,
+    true,
+  );
+  t.is(
+    completionStatsSchema.safeParse({ backendDevice: "gpu" }).success,
+    true,
+  );
+});
+
+test("completionStatsSchema: rejects unknown backendDevice values", (t) => {
+  const result = completionStatsSchema.safeParse({ backendDevice: "npu" });
+  t.is(result.success, false);
+});
+
+test("completionStatsSchema: backendDevice is optional", (t) => {
+  const result = completionStatsSchema.safeParse({
+    timeToFirstToken: 100,
+    tokensPerSecond: 50,
+  });
+  t.is(result.success, true);
+});
+
+test("completionStreamResponseSchema: round-trips backendDevice through stats", (t) => {
+  const result = completionStreamResponseSchema.safeParse({
+    type: "completionStream",
+    token: "",
+    done: true,
+    stats: {
+      timeToFirstToken: 80,
+      tokensPerSecond: 75,
+      cacheTokens: 12,
+      backendDevice: "cpu",
+    },
+  });
+  t.is(result.success, true);
+  if (result.success) {
+    t.is(result.data.stats?.backendDevice, "cpu");
+  }
+});