tetherto · gianni-cor · May 12, 2026 · May 11, 2026 · May 11, 2026 · May 11, 2026
@@ -146,6 +146,8 @@ export function extractGenerationParams (body: Record<string, unknown>): SDKGene
   if (typeof body['max_tokens'] === 'number') params.predict = body['max_tokens']
   if (typeof body['max_completion_tokens'] === 'number') params.predict = body['max_completion_tokens']
 
+  if (typeof body['reasoning_budget'] === 'boolean') params.reasoning_budget = body['reasoning_budget']
+
   return Object.keys(params).length > 0 ? params : undefined
 }
 

@@ -11,6 +11,7 @@ export interface SDKGenerationParams {
   frequency_penalty?: number
   presence_penalty?: number
   repeat_penalty?: number
+  reasoning_budget?: boolean
 }
 
 export type SDKResponseFormat =
@@ -162,7 +163,12 @@ export async function sdkCompletion (opts: {
     params['tools'] = opts.tools
   }
   if (opts.generationParams) {
-    params['generationParams'] = opts.generationParams
+    const { reasoning_budget, ...rest } = opts.generationParams
+    const sdkGenParams: Record<string, unknown> = { ...rest }
+    if (reasoning_budget !== undefined) {
+      sdkGenParams['reasoning_budget'] = reasoning_budget ? -1 : 0
+    }
+    params['generationParams'] = sdkGenParams
   }
   if (opts.responseFormat) {
     params['responseFormat'] = opts.responseFormat

@@ -344,6 +344,23 @@ describe('extractGenerationParams', () => {
     assert.equal(params.presence_penalty, 0.1)
   })
 
+  it('extracts reasoning_budget true', () => {
+    const params = extractGenerationParams({ reasoning_budget: true })
+    assert.ok(params)
+    assert.equal(params.reasoning_budget, true)
+  })
+
+  it('extracts reasoning_budget false', () => {
+    const params = extractGenerationParams({ reasoning_budget: false })
+    assert.ok(params)
+    assert.equal(params.reasoning_budget, false)
+  })
+
+  it('ignores non-boolean reasoning_budget', () => {
+    const params = extractGenerationParams({ reasoning_budget: -1 })
+    assert.equal(params, undefined)
+  })
+
   it('ignores non-number values', () => {
     const params = extractGenerationParams({ temperature: 'hot', max_tokens: '100' })
     assert.equal(params, undefined)

@@ -54,7 +54,7 @@ type CompletionParams = Omit<CompletionClientParams, "tools"> & {
  * @param params.mcp - Optional array of MCP client inputs for tool integration
  * @param params.captureThinking - Best-effort parsing of `<think>` blocks into `thinkingDelta` events; `final.raw.fullText` always preserves the original output
  * @param params.emitRawDeltas - When true, every raw model token is also emitted as a `rawDelta` event
- * @param params.toolDialect - Override the SDK's name-based dialect detection. Use when your model emits a known format (`"hermes"`, `"pythonic"`, or `"json"`) the auto-router doesn't recognise. Drives both streaming frame detection and finalization parsing.
+ * @param params.toolDialect - Override the SDK's name-based dialect detection. Supported values: `"hermes"`, `"pythonic"`, `"json"`, `"harmony"`, `"qwen35"` (Qwen3.5/3.6), `"gemma4"`. Use when the auto-router doesn't recognise your model name. Drives both streaming frame detection and finalization parsing.
  * Common override case: Llama 3.x tool-calling fine-tunes that emit the native pythonic header (`<|start_header_id|>tool_call<|end_header_id|>...<|eot_id|>`).
  * @param params.responseFormat - Optional structured-output constraint applied to the model's output:
  *   - `{ type: "text" }` — no constraint (default behavior)

@@ -0,0 +1,94 @@
+/**
+ * Tool-calling example using the Gemma4 native dialect.
+ *
+ * Gemma4 emits tool calls in a JS-literal format with custom quote tokens:
+ *   <|tool_call>call:NAME{key:<|"|>val<|"|>,...}<tool_call|>
+ *
+ * Reasoning output (thinking) is emitted inside <|channel>thought...<channel|>
+ * frames, which are stripped from contentDelta and forwarded as thinkingDelta
+ * when captureThinking is true (this example does not set captureThinking).
+ *
+ * The dialect is auto-detected from the model name/path when the file name
+ * contains "gemma4" or "gemma-4". Pass toolDialect: "gemma4" explicitly to
+ * completion() if auto-detection does not pick it up for a given file name.
+ *
+ * Usage (the model URL argument is optional; GEMMA4_HF is the default):
+ *   bun run bare:example dist/examples/tools/llamacpp-tools-gemma4.js <model-url>
+ */
+import {
+  completion,
+  loadModel,
+  unloadModel,
+  type ToolCall,
+} from "@qvac/sdk";
+import { tools, mockExecute } from "./shared";
+
+// bartowski's pack tags <eos> as the EOG token (matching the base tokenizer),
+// ensuring generation stops correctly; unsloth's variant maps it differently.
+const GEMMA4_HF =
+  "https://huggingface.co/bartowski/google_gemma-4-E2B-it-GGUF/resolve/main/google_gemma-4-E2B-it-Q4_K_M.gguf";
+
+const modelSrc = process.argv[2] ?? GEMMA4_HF; // first CLI argument overrides the default model URL
+
+let modelId: string | undefined; // declared outside the try so the catch block can unload it
+try {
+  modelId = await loadModel({
+    modelSrc,
+    modelType: "llm",
+    modelConfig: { ctx_size: 4096, tools: true },
+    onProgress: (progress) =>
+      console.log(`Loading: ${progress.percentage.toFixed(1)}%`),
+  });
+  console.log(`Model loaded: ${modelId}`);
+
+  const history = [
+    {
+      role: "system",
+      content:
+        "You are a helpful assistant that can call tools to look up weather and horoscopes.",
+    },
+    {
+      role: "user",
+      content: "What's the weather in Tokyo and my horoscope for Aquarius?",
+    },
+  ];
+
+  const result = completion({ modelId, history, stream: true, tools }); // no toolDialect passed; see the header note on auto-detection
+
+  const tokensTask = (async () => { // echoes each streamed token to stdout as it arrives
+    for await (const token of result.tokenStream) {
+      process.stdout.write(token);
+    }
+  })();
+
+  const toolsTask = (async () => { // logs every "toolCall" event seen on the tool-call stream
+    for await (const evt of result.toolCallStream) {
+      if (evt.type === "toolCall") {
+        console.log(
+          `\n-> ${evt.call.name}(${JSON.stringify(evt.call.arguments)})`,
+        );
+      }
+    }
+  })();
+
+  await Promise.all([tokensTask, toolsTask]); // both stream consumers run concurrently; wait for both to finish
+
+  const toolCalls: ToolCall[] = await result.toolCalls; // final list, read after both streams are drained
+
+  console.log("\n\nFinal tool calls:");
+  if (toolCalls.length > 0) {
+    for (const call of toolCalls) {
+      console.log(`  - ${call.name}(${JSON.stringify(call.arguments)})`);
+      const toolResult = mockExecute(call.name, call.arguments); // helper imported from ./shared
+      console.log(`    result: ${toolResult}`);
+    }
+  } else {
+    console.log("  (none)");
+  }
+
+  await unloadModel({ modelId, clearStorage: false });
+} catch (error) {
+  console.error("Error:", error);
+  if (modelId) await unloadModel({ modelId, clearStorage: false }).catch(() => {}); // best-effort cleanup; a failure here is deliberately ignored
+  process.exit(1); // signal failure to the caller
+}
@@ -0,0 +1,88 @@
+/**
+ * Tool-calling example using the Qwen3.5 dialect.
+ *
+ * Qwen3.5 emits tool calls in a Pythonic-XML format:
+ *   <tool_call><function=NAME><parameter=KEY>VALUE</parameter></function></tool_call>
+ *
+ * The dialect is auto-detected from the model name/path when the model file
+ * name contains "qwen3.5", "qwen3-5", "qwen3.6", or "qwen3-6". Pass
+ * toolDialect: "qwen35" explicitly if auto-detection does not pick it up.
+ *
+ * Usage (the model URL argument is optional; QWEN35_HF is the default):
+ *   bun run bare:example dist/examples/tools/llamacpp-tools-qwen35.js <model-url>
+ */
+import {
+  completion,
+  loadModel,
+  unloadModel,
+  type ToolCall,
+} from "@qvac/sdk";
+import { tools, mockExecute } from "./shared";
+
+const QWEN35_HF =
+  "https://huggingface.co/unsloth/Qwen3.5-0.8B-GGUF/resolve/main/Qwen3.5-0.8B-Q8_0.gguf";
+
+const modelSrc = process.argv[2] ?? QWEN35_HF; // first CLI argument overrides the default model URL
+
+let modelId: string | undefined; // declared outside the try so the catch block can unload it
+try {
+  modelId = await loadModel({
+    modelSrc,
+    modelType: "llm",
+    modelConfig: { ctx_size: 4096, tools: true },
+    onProgress: (progress) =>
+      console.log(`Loading: ${progress.percentage.toFixed(1)}%`),
+  });
+  console.log(`Model loaded: ${modelId}`);
+
+  const history = [
+    {
+      role: "system",
+      content:
+        "You are a helpful assistant that can call tools to look up weather and horoscopes.",
+    },
+    {
+      role: "user",
+      content: "What's the weather in Tokyo and my horoscope for Aquarius?",
+    },
+  ];
+
+  const result = completion({ modelId, history, stream: true, tools }); // no toolDialect passed; see the header note on auto-detection
+
+  const tokensTask = (async () => { // echoes each streamed token to stdout as it arrives
+    for await (const token of result.tokenStream) {
+      process.stdout.write(token);
+    }
+  })();
+
+  const toolsTask = (async () => { // logs every "toolCall" event seen on the tool-call stream
+    for await (const evt of result.toolCallStream) {
+      if (evt.type === "toolCall") {
+        console.log(
+          `\n-> ${evt.call.name}(${JSON.stringify(evt.call.arguments)})`,
+        );
+      }
+    }
+  })();
+
+  await Promise.all([tokensTask, toolsTask]); // both stream consumers run concurrently; wait for both to finish
+
+  const toolCalls: ToolCall[] = await result.toolCalls; // final list, read after both streams are drained
+
+  console.log("\n\nFinal tool calls:");
+  if (toolCalls.length > 0) {
+    for (const call of toolCalls) {
+      console.log(`  - ${call.name}(${JSON.stringify(call.arguments)})`);
+      const toolResult = mockExecute(call.name, call.arguments); // helper imported from ./shared
+      console.log(`    result: ${toolResult}`);
+    }
+  } else {
+    console.log("  (none)");
+  }
+
+  await unloadModel({ modelId, clearStorage: false });
+} catch (error) {
+  console.error("Error:", error);
+  if (modelId) await unloadModel({ modelId, clearStorage: false }).catch(() => {}); // best-effort cleanup; a failure here is deliberately ignored
+  process.exit(1); // signal failure to the caller
+}
@@ -177,7 +177,7 @@
     "@qvac/embed-llamacpp": "^0.15.0",
     "@qvac/error": "^0.1.1",
     "@qvac/langdetect-text": "^0.1.2",
-    "@qvac/llm-llamacpp": "^0.18.0",
+    "@qvac/llm-llamacpp": "^0.20.0",
     "@qvac/logging": "^0.1.0",
     "@qvac/ocr-onnx": "^0.4.2",
     "@qvac/rag": "^0.4.4",

@@ -16,12 +16,16 @@ export {
  * - `"pythonic"`: `[get_weather(city="Tokyo")]` (optionally `<|tool_call_start|>...<|tool_call_end|>`-wrapped)
  * - `"json"`:     `{"name":"get_weather","arguments":{"city":"Tokyo"}}` or `{"tool_calls":[{"name":"...","arguments":{...}}]}`
  * - `"harmony"`:  `<|channel|>commentary to=functions.get_weather <|constrain|>json<|message|>{"city":"Tokyo"}<|call|>`
+ * - `"qwen35"`:   `<tool_call><function=NAME><parameter=KEY>VALUE</parameter></function></tool_call>`
+ * - `"gemma4"`:   `<|tool_call>call:NAME{key:<|"|>val<|"|>,...}<tool_call|>`
  */
 export const toolDialectSchema = z.enum([
   "hermes",
   "pythonic",
   "json",
   "harmony",
+  "qwen35",
+  "gemma4",
 ]);
 
 export const attachmentSchema = z.object({
@@ -70,6 +74,12 @@ export const generationParamsSchema = z
       .number()
       .optional()
       .describe("Penalty applied to repeated tokens."),
+    reasoning_budget: z
+      .union([z.literal(-1), z.literal(0)])
+      .optional()
+      .describe(
+        "Per-request reasoning channel budget. `-1` keeps the model's reasoning channel on; `0` disables it for this request. Equivalent to the load-time `reasoning_budget` config but scoped to a single `run()` call; the prior value is restored afterwards.",
+      ),
   })
   .strict();
 

@@ -33,6 +33,7 @@ export const llmConfigBaseSchema = z.object({
       z.number().int().min(1), // positive integer: fixed token count
     ])
     .optional(),
+  /** JS-side only: seeds conversation history. Never forwarded to the C++ addon. */
   system_prompt: z.string().optional(),
   no_mmap: z.boolean().optional(),
   verbosity: verbositySchema.optional(),
@@ -60,6 +61,10 @@ export const llmConfigBaseSchema = z.object({
    * for fast GPU startup.
    */
   openclCacheDir: z.string().optional(),
+  /**
+   * Reasoning channel token budget. `-1` = unrestricted, `0` = disabled.
+   */
+  reasoning_budget: z.union([z.literal(-1), z.literal(0)]).optional(),
   projectionModelSrc: modelSrcInputSchema.optional(),
 });
 

@@ -12,7 +12,6 @@ import {
   ModelType,
   llmConfigBaseSchema,
   ADDON_LLM,
-  TOOLS_MODE,
   type CompletionEvent,
   type CreateModelParams,
   type PluginCapabilities,
@@ -26,51 +25,14 @@ import { expandGGUFIntoShards } from "@/server/utils";
 import { completion } from "@/server/bare/plugins/llamacpp-completion/ops/completion-stream";
 import { finetune } from "@/server/bare/plugins/llamacpp-completion/ops/finetune";
 import { translate } from "@/server/bare/ops/translate";
+import { transformLlmConfig } from "@/server/bare/plugins/llamacpp-completion/transform";
 import { attachModelExecutionMs } from "@/profiling/model-execution";
 import { getModelConfig } from "@/server/bare/registry/model-registry";
 import { createCompletionNormalizer } from "@/server/utils/completion-normalizer";
 import { detectToolDialect } from "@/server/utils/tool-integration";
 import { getRequestRegistry } from "@/server/bare/runtime";
 import { generateServerRequestId } from "@/server/bare/runtime/request-id";
 
-function transformLlmConfig(llmConfig: LlmConfig) {
-  const transformed = JSON.parse(
-    JSON.stringify(llmConfig, (key: string, v: unknown) =>
-      key === "modelType"
-        ? undefined
-        : key === "stop_sequences"
-          ? Array.isArray(v)
-            ? v.join(", ")
-            : v
-          : typeof v === "number" || typeof v === "boolean"
-            ? String(v)
-            : v,
-    ).replace(
-      /"([a-z][A-Za-z]*)":/g,
-      (_, key: string) =>
-        `"${key.replace(/[A-Z]/g, (l: string) => `_${l.toLowerCase()}`)}":`,
-    ),
-  ) as Record<string, string>;
-
-  if ("stop_sequences" in transformed) {
-    transformed["reverse_prompt"] = transformed["stop_sequences"];
-    delete transformed["stop_sequences"];
-  }
-
-  if ("opencl_cache_dir" in transformed) {
-    transformed["openclCacheDir"] = transformed["opencl_cache_dir"];
-    delete transformed["opencl_cache_dir"];
-  }
-
-  if ("tools_mode" in transformed) {
-    if (transformed["tools_mode"] === TOOLS_MODE.dynamic) {
-      transformed["tools_compact"] = "true";
-    }
-    delete transformed["tools_mode"];
-  }
-
-  return transformed;
-}
 
 function createLlmModel(
   modelId: string,