Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions packages/sdk/bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 8 additions & 4 deletions packages/sdk/client/api/embed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { send } from "@/client/rpc/rpc-client";
import {
type EmbedParams,
type EmbedRequest,
type EmbedStats,
type RPCOptions,
} from "@/schemas";
import { InvalidResponseError } from "@/utils/errors-client";
Expand All @@ -18,7 +19,7 @@ import { InvalidResponseError } from "@/utils/errors-client";
export async function embed(
params: { modelId: string; text: string },
options?: RPCOptions,
): Promise<number[]>;
): Promise<{ embedding: number[]; stats?: EmbedStats }>;

/**
* Generates embeddings for multiple texts using a specified model.
Expand All @@ -32,12 +33,12 @@ export async function embed(
export async function embed(
params: { modelId: string; text: string[] },
options?: RPCOptions,
): Promise<number[][]>;
): Promise<{ embedding: number[][]; stats?: EmbedStats }>;

export async function embed(
params: EmbedParams,
options?: RPCOptions,
): Promise<number[] | number[][]> {
): Promise<{ embedding: number[] | number[][]; stats?: EmbedStats }> {
const request: EmbedRequest = {
type: "embed",
...params,
Expand All @@ -48,5 +49,8 @@ export async function embed(
throw new InvalidResponseError("embed");
}

return response.embedding;
return {
embedding: response.embedding,
...(response.stats !== undefined && { stats: response.stats }),
};
}
2 changes: 1 addition & 1 deletion packages/sdk/client/api/rag.ts
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ export async function ragIngest(
* ```typescript
* // Segregated flow
* const chunks = await ragChunk({ documents: ["text1", "text2"] });
* const embeddings = await embed({ modelId, text: chunks.map(c => c.content) });
* const { embedding: embeddings } = await embed({ modelId, text: chunks.map(c => c.content) });
* const embeddedDocs = chunks.map((chunk, i) => ({
* ...chunk,
* embedding: embeddings[i],
Expand Down
4 changes: 2 additions & 2 deletions packages/sdk/examples/embed-p2p.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ try {
console.log("\nπŸ“ Example 1: Single Text Embedding");
console.log("=".repeat(50));

const singleEmbedding = await embed({ modelId, text: "Hello, world!" });
const { embedding: singleEmbedding } = await embed({ modelId, text: "Hello, world!" });

console.log("Input: 'Hello, world!'");
console.log("Embedding dimensions:", singleEmbedding.length);
Expand All @@ -39,7 +39,7 @@ try {
"Python is a programming language",
];

const batchEmbeddings = await embed({ modelId, text: texts });
const { embedding: batchEmbeddings } = await embed({ modelId, text: texts });

console.log("Input: Array of", texts.length, "texts");
console.log("Output: Array of", batchEmbeddings.length, "embeddings");
Expand Down
2 changes: 1 addition & 1 deletion packages/sdk/examples/logging-streaming.ts
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ try {
history: messages,
stream: true,
});
const embedding = await embed({
const { embedding } = await embed({
modelId: embedModelId,
text: messages[0]?.content ?? "Hello, world!",
});
Expand Down
6 changes: 3 additions & 3 deletions packages/sdk/examples/profiling/per-call.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,21 @@ try {
console.log("Model loaded:", modelId);

console.log("\n=== Embed with per-call profiling ===");
const embedding1 = await embed(
const { embedding: embedding1 } = await embed(
{ modelId, text: "Profile this specific call" },
{ profiling: { enabled: true, includeServerBreakdown: true } },
);
console.log("Embedding dimensions:", embedding1.length);

console.log("\n=== Embed without profiling ===");
const embedding2 = await embed({
const { embedding: embedding2 } = await embed({
modelId,
text: "This call is not profiled",
});
console.log("Embedding dimensions:", embedding2.length);

console.log("\n=== Embed with profiling explicitly disabled ===");
const embedding3 = await embed(
const { embedding: embedding3 } = await embed(
{ modelId, text: "Profiling explicitly disabled for this call" },
{ profiling: { enabled: false } },
);
Expand Down
4 changes: 2 additions & 2 deletions packages/sdk/examples/rag/rag-chromadb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ try {
for (const s of samples) {
ids.push(String(s.id));
documents.push(s.text);
embeddings.push(await embed({ modelId, text: s.text }));
embeddings.push((await embed({ modelId, text: s.text })).embedding);
}

await collection.add({
Expand All @@ -115,7 +115,7 @@ try {
});

console.log("πŸ”Ž Searching for similar documents...");
const queryEmbedding = await embed({ modelId, text: query });
const { embedding: queryEmbedding } = await embed({ modelId, text: query });

// Query top 3 by vector similarity and include distances
const res = await collection.query({
Expand Down
2 changes: 1 addition & 1 deletion packages/sdk/examples/rag/rag-hyperdb/pipeline.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ try {

console.log("\n🧠 Step 2: Generating embeddings (batch)...");
const texts = chunks.map((chunk) => chunk.content);
const embeddings = await embed({ modelId, text: texts });
const { embedding: embeddings } = await embed({ modelId, text: texts });

const embeddedDocs = chunks.map((chunk, i) => ({
id: chunk.id,
Expand Down
4 changes: 2 additions & 2 deletions packages/sdk/examples/rag/rag-lancedb.ts
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ try {
console.log("πŸ“š Embedding documents...");
const documents = [];
for (const sample of samples) {
const embedding = await embed({ modelId, text: sample.text });
const { embedding } = await embed({ modelId, text: sample.text });
const record = {
id: sample.id,
text: sample.text,
Expand All @@ -85,7 +85,7 @@ try {
await documentsTable.add(documents);

console.log("πŸ”Ž Searching for similar documents...");
const queryEmbedding = await embed({ modelId, text: query });
const { embedding: queryEmbedding } = await embed({ modelId, text: query });
const results = (await documentsTable
.vectorSearch(queryEmbedding)
.limit(1)
Expand Down
4 changes: 2 additions & 2 deletions packages/sdk/examples/rag/rag-sqlite.ts
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ try {

console.log("πŸ“š Embedding documents...");
for (const sample of samples) {
const embedding = await embed({ modelId, text: sample.text });
const { embedding } = await embed({ modelId, text: sample.text });
db.exec({
sql: "INSERT INTO documents VALUES (?, ?, vector_as_f32(?))",
bind: [sample.id, sample.text, JSON.stringify(embedding)],
Expand All @@ -84,7 +84,7 @@ try {

// Search for similar documents
console.log("πŸ”Ž Searching for similar documents...");
const queryEmbedding = await embed({ modelId, text: query });
const { embedding: queryEmbedding } = await embed({ modelId, text: query });

const results: Array<{
id: number;
Expand Down
1 change: 1 addition & 0 deletions packages/sdk/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ export {
type ToolCallError,
type ToolCallEvent,
type CompletionStats,
type EmbedStats,
VERBOSITY,
type Attachment,
type TranscribeStreamSession,
Expand Down
4 changes: 2 additions & 2 deletions packages/sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,10 @@
"@qvac/decoder-audio": "^0.3.3",
"@qvac/diffusion-cpp": "0.1.1",
"@qvac/dl-filesystem": "^0.2.0",
"@qvac/embed-llamacpp": "^0.13.1",
"@qvac/embed-llamacpp": "^0.13.4",
"@qvac/error": "^0.1.1",
"@qvac/langdetect-text": "^0.1.0",
"@qvac/llm-llamacpp": "^0.14.0",
"@qvac/llm-llamacpp": "^0.14.4",
"@qvac/logging": "^0.1.0",
"@qvac/ocr-onnx": "^0.4.0",
"@qvac/rag": "^0.4.4",
Expand Down
1 change: 1 addition & 0 deletions packages/sdk/schemas/completion-stream.ts
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ export const completionStatsSchema = z.object({
timeToFirstToken: z.number().optional(),
tokensPerSecond: z.number().optional(),
cacheTokens: z.number().optional(),
backendDevice: z.enum(["cpu", "gpu"]).optional(),
});

export const completionStreamResponseSchema = z.object({
Expand Down
1 change: 1 addition & 0 deletions packages/sdk/schemas/embed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ export const embedStatsSchema = z.object({
totalTime: z.number().optional(),
tokensPerSecond: z.number().optional(),
totalTokens: z.number().optional(),
backendDevice: z.enum(["cpu", "gpu"]).optional(),
});

export const embedResponseSchema = z.object({
Expand Down
12 changes: 12 additions & 0 deletions packages/sdk/schemas/llamacpp-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,13 @@ export const llmConfigBaseSchema = z.object({
stop_sequences: z.array(z.string()).optional(),
n_discarded: z.number().optional(),
tools: z.boolean().optional(),
"cache-type-k": z.string().optional(),
"cache-type-v": z.string().optional(),
/**
* Writable directory for OpenCL kernel binary cache. Required on Android
* for fast GPU startup.
*/
openclCacheDir: z.string().optional(),
projectionModelSrc: modelSrcInputSchema.optional(),
});

Expand Down Expand Up @@ -75,6 +82,11 @@ export const embedConfigBaseSchema = z.object({
.union([z.number().int().min(0), z.enum(["integrated", "dedicated"])])
.optional(),
verbosity: verbositySchema.optional(),
/**
* Writable directory for OpenCL kernel binary cache. Required on Android
* for fast GPU startup.
*/
openclCacheDir: z.string().optional(),
});

export type EmbedConfigInput = z.infer<typeof embedConfigBaseSchema>;
Expand Down
1 change: 1 addition & 0 deletions packages/sdk/server/bare/ops/embed.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ export async function embed(params: EmbedParams): Promise<EmbedResult> {
...(response.stats?.total_time_ms !== undefined && { totalTime: response.stats.total_time_ms }),
...(response.stats?.tokens_per_second !== undefined && { tokensPerSecond: response.stats.tokens_per_second }),
...(response.stats?.total_tokens !== undefined && { totalTokens: response.stats.total_tokens }),
...(response.stats?.backendDevice !== undefined && { backendDevice: response.stats.backendDevice }),
};

const embeddingsArray = rawEmbeddings[0];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,9 @@ async function* processModelResponse(
...(responseWithStats.stats?.CacheTokens !== undefined && {
cacheTokens: responseWithStats.stats.CacheTokens,
}),
...(responseWithStats.stats?.backendDevice !== undefined && {
backendDevice: responseWithStats.stats.backendDevice,
}),
};

return {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ function transformLlmConfig(llmConfig: LlmConfig) {
delete transformed["stop_sequences"];
}

if ("opencl_cache_dir" in transformed) {
transformed["openclCacheDir"] = transformed["opencl_cache_dir"];
delete transformed["opencl_cache_dir"];
}

return transformed;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@ function transformEmbedConfig(embedConfig: EmbedConfig): GGMLConfig {
config.verbosity = `${embedConfig.verbosity}`;
}

if (embedConfig.openclCacheDir) {
config.openclCacheDir = embedConfig.openclCacheDir;
}

return config;
}

Expand Down
2 changes: 2 additions & 0 deletions packages/sdk/server/bare/types/addon-responses.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ export interface LlmStats {
TPS?: number;
CacheTokens?: number;
generatedTokens?: number;
backendDevice?: "cpu" | "gpu";
}

export interface LlmResponse {
Expand Down Expand Up @@ -38,6 +39,7 @@ export interface EmbedStats {
total_time_ms?: number;
tokens_per_second?: number;
total_tokens?: number;
backendDevice?: "cpu" | "gpu";
}

export interface EmbedResponse {
Expand Down
48 changes: 48 additions & 0 deletions packages/sdk/test/unit/completion-stream-schemas.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// @ts-expect-error brittle has no type declarations
import test from "brittle";
import {
completionStreamResponseSchema,
completionStatsSchema,
} from "@/schemas/completion-stream";

// Every supported backend label must validate against the stats schema.
test("completionStatsSchema: accepts backendDevice 'cpu' and 'gpu'", (t) => {
  for (const device of ["cpu", "gpu"]) {
    const outcome = completionStatsSchema.safeParse({ backendDevice: device });
    t.is(outcome.success, true);
  }
});

// A label outside the "cpu" | "gpu" enum must fail validation.
test("completionStatsSchema: rejects unknown backendDevice values", (t) => {
  const { success } = completionStatsSchema.safeParse({
    backendDevice: "npu",
  });
  t.is(success, false);
});

// Stats that omit backendDevice entirely must still validate.
test("completionStatsSchema: backendDevice is optional", (t) => {
  const statsWithoutDevice = {
    timeToFirstToken: 100,
    tokensPerSecond: 50,
  };
  const { success } = completionStatsSchema.safeParse(statsWithoutDevice);
  t.is(success, true);
});

// A final stream message carrying stats must parse, and the parsed data
// must still expose the backendDevice value that was supplied.
test("completionStreamResponseSchema: round-trips backendDevice through stats", (t) => {
  const finalMessage = {
    type: "completionStream",
    token: "",
    done: true,
    stats: {
      timeToFirstToken: 80,
      tokensPerSecond: 75,
      cacheTokens: 12,
      backendDevice: "cpu",
    },
  };

  const parsed = completionStreamResponseSchema.safeParse(finalMessage);
  t.is(parsed.success, true);

  // Only inspect the parsed payload when validation succeeded.
  if (!parsed.success) {
    return;
  }
  t.is(parsed.data.stats?.backendDevice, "cpu");
});
Loading
Loading