Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
0494ff4
feat[api]: add Qwen3.5, Gemma4 tool-call dialects and reasoning_budge…
donriddo May 11, 2026
d6c7a39
fix: exclude system_prompt from C++ config transform; add reasoning_b…
donriddo May 11, 2026
319db09
fix: tighten dialect regexes, extract transformLlmConfig, add exclusi…
donriddo May 11, 2026
d9b82db
fix: extend qwen35 dialect to Qwen3.6; escape newlines in Gemma4 arg …
donriddo May 11, 2026
c6d1933
fix: update toolDialect docs to list all dialects; add qwen35/gemma4 …
donriddo May 11, 2026
3edc4fb
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
donriddo May 11, 2026
2310ace
fix: harden qwen35 coercion errors and gemma4 control-char escaping
donriddo May 12, 2026
85beb84
fix: align qwen35 coercion error handling with pythonic/hermes pattern
donriddo May 12, 2026
cfa3820
test: fix incorrect comments in dialect negative-case tests
donriddo May 12, 2026
35a9ebf
test: remove confusing dialect negative-case comments
donriddo May 12, 2026
b3a390d
fix: reject non-integer floats and malformed array/object params in q…
donriddo May 12, 2026
6ec9e74
fix: expose reasoning_budget as boolean in CLI, transform to -1|0 for…
donriddo May 12, 2026
509f48e
feat: wire toolDialect and resourceKey through ToolsExecutor and crea…
donriddo May 12, 2026
ff18842
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
donriddo May 12, 2026
1c9f085
fix: reject empty numeric params in qwen35, allow hyphens in gemma4 t…
donriddo May 12, 2026
85e083b
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
donriddo May 12, 2026
32791a3
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
gianni-cor May 12, 2026
b977f43
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
donriddo May 12, 2026
52fda71
Merge branch 'main' into feat/sdk-qwen35-gemma4-reasoning-budget
gianni-cor May 12, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions packages/cli/src/serve/adapters/openai/translate.ts
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,8 @@ export function extractGenerationParams (body: Record<string, unknown>): SDKGene
if (typeof body['max_tokens'] === 'number') params.predict = body['max_tokens']
if (typeof body['max_completion_tokens'] === 'number') params.predict = body['max_completion_tokens']

if (typeof body['reasoning_budget'] === 'boolean') params.reasoning_budget = body['reasoning_budget']

return Object.keys(params).length > 0 ? params : undefined
}

Expand Down
8 changes: 7 additions & 1 deletion packages/cli/src/serve/core/sdk.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ export interface SDKGenerationParams {
frequency_penalty?: number
presence_penalty?: number
repeat_penalty?: number
reasoning_budget?: boolean
}

export type SDKResponseFormat =
Expand Down Expand Up @@ -162,7 +163,12 @@ export async function sdkCompletion (opts: {
params['tools'] = opts.tools
}
if (opts.generationParams) {
params['generationParams'] = opts.generationParams
const { reasoning_budget, ...rest } = opts.generationParams
const sdkGenParams: Record<string, unknown> = { ...rest }
if (reasoning_budget !== undefined) {
sdkGenParams['reasoning_budget'] = reasoning_budget ? -1 : 0
}
params['generationParams'] = sdkGenParams
}
if (opts.responseFormat) {
params['responseFormat'] = opts.responseFormat
Expand Down
17 changes: 17 additions & 0 deletions packages/cli/test/translate.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -344,6 +344,23 @@ describe('extractGenerationParams', () => {
assert.equal(params.presence_penalty, 0.1)
})

it('extracts reasoning_budget true', () => {
  // A boolean `true` in the request body is copied through unchanged.
  const extracted = extractGenerationParams({ reasoning_budget: true })
  assert.ok(extracted)
  assert.equal(extracted.reasoning_budget, true)
})

it('extracts reasoning_budget false', () => {
  // `false` is a real value, not "unset": it must still produce a params object.
  const extracted = extractGenerationParams({ reasoning_budget: false })
  assert.ok(extracted)
  assert.equal(extracted.reasoning_budget, false)
})

it('ignores non-boolean reasoning_budget', () => {
  // The request body only accepts a boolean here; a numeric value is dropped,
  // and with no other recognised field the extractor returns undefined.
  const body = { reasoning_budget: -1 }
  assert.equal(extractGenerationParams(body), undefined)
})

it('ignores non-number values', () => {
const params = extractGenerationParams({ temperature: 'hot', max_tokens: '100' })
assert.equal(params, undefined)
Expand Down
2 changes: 1 addition & 1 deletion packages/sdk/client/api/completion-stream.ts
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ type CompletionParams = Omit<CompletionClientParams, "tools"> & {
* @param params.mcp - Optional array of MCP client inputs for tool integration
* @param params.captureThinking - Best-effort parsing of `<think>` blocks into `thinkingDelta` events; `final.raw.fullText` always preserves the original output
* @param params.emitRawDeltas - When true, every raw model token is also emitted as a `rawDelta` event
* @param params.toolDialect - Override the SDK's name-based dialect detection. Use when your model emits a known format (`"hermes"`, `"pythonic"`, or `"json"`) the auto-router doesn't recognise. Drives both streaming frame detection and finalization parsing.
* @param params.toolDialect - Override the SDK's name-based dialect detection. Supported values: `"hermes"`, `"pythonic"`, `"json"`, `"harmony"`, `"qwen35"` (Qwen3.5/3.6), `"gemma4"`. Use when the auto-router doesn't recognise your model name. Drives both streaming frame detection and finalization parsing.
* Common override case: Llama 3.x tool-calling fine-tunes that emit the native pythonic header (`<|start_header_id|>tool_call<|end_header_id|>...<|eot_id|>`).
* @param params.responseFormat - Optional structured-output constraint applied to the model's output:
* - `{ type: "text" }` — no constraint (default behavior)
Expand Down
94 changes: 94 additions & 0 deletions packages/sdk/examples/tools/llamacpp-tools-gemma4.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
/**
* Tool-calling example using the Gemma4 native dialect.
*
* Gemma4 emits tool calls in a JS-literal format with custom quote tokens:
* <|tool_call>call:NAME{key:<|"|>val<|"|>,...}<tool_call|>
*
* Reasoning output (thinking) is emitted inside <|channel>thought...<channel|>
* frames, which are stripped from contentDelta and forwarded as thinkingDelta
* when captureThinking is true.
*
* The dialect is auto-detected from the model name/path when the file name
* contains "gemma4" or "gemma-4". Pass toolDialect: "gemma4" explicitly to
* completion() if auto-detection does not pick it up for a given file name.
*
* Usage:
* bun run bare:example dist/examples/tools/llamacpp-tools-gemma4.js <model-url>
*/
import {
completion,
loadModel,
unloadModel,
type ToolCall,
} from "@qvac/sdk";
import { tools, mockExecute } from "./shared";

// Default model: bartowski's GGUF pack. It tags <eos> as the EOG token
// (matching the base tokenizer), ensuring generation stops correctly;
// unsloth's variant maps it differently.
const GEMMA4_HF =
  "https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF/resolve/main/google_gemma-4-E2B-it-Q4_K_M.gguf";

// The first CLI argument, when given, overrides the default model source.
const modelSrc = process.argv[2] ?? GEMMA4_HF;

// Declared outside the try block so the catch handler can unload a model that
// was already loaded when the failure happened.
let modelId: string | undefined;
try {
  const loadedId = await loadModel({
    modelSrc,
    modelType: "llm",
    modelConfig: { ctx_size: 4096, tools: true },
    onProgress: (progress) => {
      console.log(`Loading: ${progress.percentage.toFixed(1)}%`);
    },
  });
  modelId = loadedId;
  console.log(`Model loaded: ${loadedId}`);

  const messages = [
    {
      role: "system",
      content:
        "You are a helpful assistant that can call tools to look up weather and horoscopes.",
    },
    {
      role: "user",
      content: "What's the weather in Tokyo and my horoscope for Aquarius?",
    },
  ];

  const result = completion({
    modelId: loadedId,
    history: messages,
    stream: true,
    tools,
  });

  // Echoes every token from the token stream to stdout.
  const streamTokens = async () => {
    for await (const chunk of result.tokenStream) {
      process.stdout.write(chunk);
    }
  };

  // Prints every `toolCall` event from the tool-call stream; other event
  // types are ignored.
  const streamToolCalls = async () => {
    for await (const event of result.toolCallStream) {
      if (event.type !== "toolCall") continue;
      console.log(
        `\n-> ${event.call.name}(${JSON.stringify(event.call.arguments)})`,
      );
    }
  };

  // Both consumers are started before either one is awaited.
  await Promise.all([streamTokens(), streamToolCalls()]);

  const toolCalls: ToolCall[] = await result.toolCalls;

  console.log("\n\nFinal tool calls:");
  if (toolCalls.length === 0) {
    console.log(" (none)");
  } else {
    for (const call of toolCalls) {
      console.log(` - ${call.name}(${JSON.stringify(call.arguments)})`);
      const outcome = mockExecute(call.name, call.arguments);
      console.log(` result: ${outcome}`);
    }
  }

  await unloadModel({ modelId: loadedId, clearStorage: false });
} catch (error) {
  console.error("Error:", error);
  // Best-effort cleanup: a failure while unloading is deliberately ignored so
  // the original error remains the one that was reported.
  if (modelId) {
    await unloadModel({ modelId, clearStorage: false }).catch(() => {});
  }
  process.exit(1);
}
88 changes: 88 additions & 0 deletions packages/sdk/examples/tools/llamacpp-tools-qwen35.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
/**
* Tool-calling example using the Qwen3.5 dialect.
*
* Qwen3.5 emits tool calls in a Pythonic-XML format:
* <tool_call><function=NAME><parameter=KEY>VALUE</parameter></function></tool_call>
*
* The dialect is auto-detected from the model name/path when the model file
* contains "qwen3.5", "qwen3-5", "qwen3.6", or "qwen3-6". Pass
* toolDialect: "qwen35" explicitly if auto-detection does not pick it up.
*
* Usage:
* bun run bare:example dist/examples/tools/llamacpp-tools-qwen35.js <model-url>
*/
import {
completion,
loadModel,
unloadModel,
type ToolCall,
} from "@qvac/sdk";
import { tools, mockExecute } from "./shared";

// Default model used when no model URL is passed on the command line.
const QWEN35_HF =
  "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf";

// The first CLI argument, when given, overrides the default model source.
const modelSrc = process.argv[2] ?? QWEN35_HF;

// Declared outside the try block so the catch handler can unload a model that
// was already loaded when the failure happened.
let modelId: string | undefined;
try {
  const loadedId = await loadModel({
    modelSrc,
    modelType: "llm",
    modelConfig: { ctx_size: 4096, tools: true },
    onProgress: (progress) => {
      console.log(`Loading: ${progress.percentage.toFixed(1)}%`);
    },
  });
  modelId = loadedId;
  console.log(`Model loaded: ${loadedId}`);

  const messages = [
    {
      role: "system",
      content:
        "You are a helpful assistant that can call tools to look up weather and horoscopes.",
    },
    {
      role: "user",
      content: "What's the weather in Tokyo and my horoscope for Aquarius?",
    },
  ];

  const result = completion({
    modelId: loadedId,
    history: messages,
    stream: true,
    tools,
  });

  // Echoes every token from the token stream to stdout.
  const streamTokens = async () => {
    for await (const chunk of result.tokenStream) {
      process.stdout.write(chunk);
    }
  };

  // Prints every `toolCall` event from the tool-call stream; other event
  // types are ignored.
  const streamToolCalls = async () => {
    for await (const event of result.toolCallStream) {
      if (event.type !== "toolCall") continue;
      console.log(
        `\n-> ${event.call.name}(${JSON.stringify(event.call.arguments)})`,
      );
    }
  };

  // Both consumers are started before either one is awaited.
  await Promise.all([streamTokens(), streamToolCalls()]);

  const toolCalls: ToolCall[] = await result.toolCalls;

  console.log("\n\nFinal tool calls:");
  if (toolCalls.length === 0) {
    console.log(" (none)");
  } else {
    for (const call of toolCalls) {
      console.log(` - ${call.name}(${JSON.stringify(call.arguments)})`);
      const outcome = mockExecute(call.name, call.arguments);
      console.log(` result: ${outcome}`);
    }
  }

  await unloadModel({ modelId: loadedId, clearStorage: false });
} catch (error) {
  console.error("Error:", error);
  // Best-effort cleanup: a failure while unloading is deliberately ignored so
  // the original error remains the one that was reported.
  if (modelId) {
    await unloadModel({ modelId, clearStorage: false }).catch(() => {});
  }
  process.exit(1);
}
2 changes: 1 addition & 1 deletion packages/sdk/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@
"@qvac/embed-llamacpp": "^0.15.0",
"@qvac/error": "^0.1.1",
"@qvac/langdetect-text": "^0.1.2",
"@qvac/llm-llamacpp": "^0.18.0",
"@qvac/llm-llamacpp": "^0.20.0",
"@qvac/logging": "^0.1.0",
"@qvac/ocr-onnx": "^0.4.2",
"@qvac/rag": "^0.4.4",
Expand Down
10 changes: 10 additions & 0 deletions packages/sdk/schemas/completion-stream.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,16 @@ export {
* - `"pythonic"`: `[get_weather(city="Tokyo")]` (optionally `<|tool_call_start|>...<|tool_call_end|>`-wrapped)
* - `"json"`: `{"name":"get_weather","arguments":{"city":"Tokyo"}}` or `{"tool_calls":[{"name":"...","arguments":{...}}]}`
* - `"harmony"`: `<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"city":"Tokyo"}<|call|>`
* - `"qwen35"`: `<tool_call><function=NAME><parameter=KEY>VALUE</parameter></function></tool_call>`
* - `"gemma4"`: `<|tool_call>call:NAME{key:<|"|>val<|"|>,...}<tool_call|>`
*/
export const toolDialectSchema = z.enum([
  "hermes",
  "pythonic",
  "json",
  "harmony",
  // Model-native formats: Qwen3.5/3.6 and Gemma4.
  "qwen35",
  "gemma4",
]);

export const attachmentSchema = z.object({
Expand Down Expand Up @@ -70,6 +74,12 @@ export const generationParamsSchema = z
.number()
.optional()
.describe("Penalty applied to repeated tokens."),
reasoning_budget: z
.union([z.literal(-1), z.literal(0)])
.optional()
.describe(
"Per-request reasoning channel budget. `-1` keeps the model's reasoning channel on; `0` disables it for this request. Equivalent to the load-time `reasoning_budget` config but scoped to a single `run()` call; the prior value is restored afterwards.",
),
})
.strict();

Expand Down
5 changes: 5 additions & 0 deletions packages/sdk/schemas/llamacpp-config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ export const llmConfigBaseSchema = z.object({
z.number().int().min(1), // positive integer: fixed token count
])
.optional(),
/** JS-side only: seeds conversation history. Never forwarded to the C++ addon. */
system_prompt: z.string().optional(),
no_mmap: z.boolean().optional(),
verbosity: verbositySchema.optional(),
Expand Down Expand Up @@ -60,6 +61,10 @@ export const llmConfigBaseSchema = z.object({
* for fast GPU startup.
*/
openclCacheDir: z.string().optional(),
/**
* Reasoning channel token budget. `-1` = unrestricted, `0` = disabled.
*/
reasoning_budget: z.union([z.literal(-1), z.literal(0)]).optional(),
projectionModelSrc: modelSrcInputSchema.optional(),
});

Expand Down
40 changes: 1 addition & 39 deletions packages/sdk/server/bare/plugins/llamacpp-completion/plugin.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@ import {
ModelType,
llmConfigBaseSchema,
ADDON_LLM,
TOOLS_MODE,
type CompletionEvent,
type CreateModelParams,
type PluginCapabilities,
Expand All @@ -26,51 +25,14 @@ import { expandGGUFIntoShards } from "@/server/utils";
import { completion } from "@/server/bare/plugins/llamacpp-completion/ops/completion-stream";
import { finetune } from "@/server/bare/plugins/llamacpp-completion/ops/finetune";
import { translate } from "@/server/bare/ops/translate";
import { transformLlmConfig } from "@/server/bare/plugins/llamacpp-completion/transform";
import { attachModelExecutionMs } from "@/profiling/model-execution";
import { getModelConfig } from "@/server/bare/registry/model-registry";
import { createCompletionNormalizer } from "@/server/utils/completion-normalizer";
import { detectToolDialect } from "@/server/utils/tool-integration";
import { getRequestRegistry } from "@/server/bare/runtime";
import { generateServerRequestId } from "@/server/bare/runtime/request-id";

/**
 * Flattens an LLM config into a string-valued map with snake_case keys.
 *
 * Transformations applied:
 * - `modelType` and `system_prompt` are dropped. `system_prompt` is a JS-side
 *   setting only and must never be forwarded to the C++ addon.
 * - A `stop_sequences` array is joined with ", " and emitted as
 *   `reverse_prompt`.
 * - Numbers and booleans are converted to their string form.
 * - Purely alphabetic camelCase keys are rewritten to snake_case, except
 *   `openclCacheDir`, which is restored to its camelCase spelling.
 * - `tools_mode` is removed; when it equals `TOOLS_MODE.dynamic`,
 *   `tools_compact` is set to `"true"` in its place.
 *
 * @param llmConfig - The LLM configuration to transform.
 * @returns A new plain object; the input is not mutated.
 */
function transformLlmConfig(llmConfig: LlmConfig): Record<string, string> {
  const serialized = JSON.stringify(llmConfig, (key: string, v: unknown) => {
    // JS-side-only fields never reach the addon.
    if (key === "modelType" || key === "system_prompt") return undefined;
    if (key === "stop_sequences") return Array.isArray(v) ? v.join(", ") : v;
    return typeof v === "number" || typeof v === "boolean" ? String(v) : v;
  });

  // Rename keys on the serialized text: the pattern only matches a quoted,
  // letters-only token immediately followed by a colon, i.e. an object key.
  const snakeCased = serialized.replace(
    /"([a-z][A-Za-z]*)":/g,
    (_, key: string) =>
      `"${key.replace(/[A-Z]/g, (l: string) => `_${l.toLowerCase()}`)}":`,
  );

  const transformed = JSON.parse(snakeCased) as Record<string, string>;

  if ("stop_sequences" in transformed) {
    transformed["reverse_prompt"] = transformed["stop_sequences"];
    delete transformed["stop_sequences"];
  }

  // The generic snake_case pass also renamed this key; it keeps its camelCase
  // spelling in the output.
  if ("opencl_cache_dir" in transformed) {
    transformed["openclCacheDir"] = transformed["opencl_cache_dir"];
    delete transformed["opencl_cache_dir"];
  }

  if ("tools_mode" in transformed) {
    if (transformed["tools_mode"] === TOOLS_MODE.dynamic) {
      transformed["tools_compact"] = "true";
    }
    delete transformed["tools_mode"];
  }

  return transformed;
}

function createLlmModel(
modelId: string,
Expand Down
Loading
Loading