vellum-ai · siddseethepalli · Apr 17, 2026 · Apr 17, 2026
diff --git a/assistant/src/__tests__/agent-loop-callsite-precedence.test.ts b/assistant/src/__tests__/agent-loop-callsite-precedence.test.ts
@@ -33,8 +33,8 @@ mock.module("../config/loader.js", () => ({
   getConfig: () => ({ llm: mockLlmConfig }),
 }));
 
-import { AgentLoop } from "../agent/loop.js";
 import type { ResolvedSystemPrompt } from "../agent/loop.js";
+import { AgentLoop } from "../agent/loop.js";
 import { LLMSchema } from "../config/schemas/llm.js";
 import { RetryProvider } from "../providers/retry.js";
 import type {
@@ -94,7 +94,11 @@ function makePipeline(providerName: string): {
 describe("AgentLoop — call-site precedence", () => {
   test("call-site maxTokens wins over conversation default when callSite is set", async () => {
     setLlmConfig({
-      default: { provider: "anthropic", model: "claude-default", maxTokens: 64000 },
+      default: {
+        provider: "anthropic",
+        model: "claude-default",
+        maxTokens: 64000,
+      },
       callSites: { mainAgent: { maxTokens: 4096 } },
     });
 
@@ -115,7 +119,11 @@ describe("AgentLoop — call-site precedence", () => {
 
   test("call-site effort wins over conversation default when callSite is set", async () => {
     setLlmConfig({
-      default: { provider: "anthropic", model: "claude-default", effort: "high" },
+      default: {
+        provider: "anthropic",
+        model: "claude-default",
+        effort: "high",
+      },
       callSites: { mainAgent: { effort: "low" } },
     });
 
@@ -139,7 +147,11 @@ describe("AgentLoop — call-site precedence", () => {
 
   test("call-site speed wins over conversation default when callSite is set", async () => {
     setLlmConfig({
-      default: { provider: "anthropic", model: "claude-default", speed: "standard" },
+      default: {
+        provider: "anthropic",
+        model: "claude-default",
+        speed: "standard",
+      },
       callSites: { mainAgent: { speed: "fast" } },
     });
 
@@ -197,15 +209,41 @@ describe("AgentLoop — call-site precedence", () => {
       "mainAgent",
     );
 
-    const thinking = lastConfig()!.thinking as
-      | { enabled?: boolean; type?: string }
-      | undefined;
-    // The resolver fills the schema-shaped object, not the wire-format
-    // `{ type: "adaptive" }`. The important assertion is that the call-site
-    // value reached the provider intact.
-    expect(thinking).toBeDefined();
-    expect(thinking!.enabled).toBe(false);
-    expect(thinking!.type).not.toBe("adaptive");
+    // The call-site override resolves to `thinking.enabled: false`, so the
+    // RetryProvider normalizer must omit `thinking` entirely (matching the
+    // legacy non-callSite path, which only sets `providerConfig.thinking`
+    // when enabled). Without the fix in agent/loop.ts, the conversation
+    // default's `thinking: { type: "adaptive" }` would be pre-set and would
+    // mask the call-site override.
+    expect(lastConfig()!.thinking).toBeUndefined();
+  });
+
+  test("call-site thinking is converted to Anthropic wire-format when enabled", async () => {
+    setLlmConfig({
+      default: {
+        provider: "anthropic",
+        model: "claude-default",
+        thinking: { enabled: true, streamThinking: true },
+      },
+      callSites: { mainAgent: {} },
+    });
+
+    const { provider, lastConfig } = makePipeline("anthropic");
+    const loop = new AgentLoop(provider, "system", { maxTokens: 64000 });
+
+    await loop.run(
+      [userMessage],
+      () => {},
+      undefined,
+      undefined,
+      undefined,
+      "mainAgent",
+    );
+
+    // Must be the wire-format `{ type: "adaptive" }` so the Anthropic SDK's
+    // `ThinkingConfigParam` accepts it. Sending the schema shape `{ enabled,
+    // streamThinking }` instead would cause a runtime API error.
+    expect(lastConfig()!.thinking).toEqual({ type: "adaptive" });
   });
 
   test("conversation defaults still apply when callSite is absent", async () => {

diff --git a/assistant/src/__tests__/approval-routes-http.test.ts b/assistant/src/__tests__/approval-routes-http.test.ts
@@ -27,11 +27,39 @@ mock.module("../config/loader.js", () => ({
     rateLimit: { maxRequestsPerMinute: 0 },
     secretDetection: { enabled: false },
     contextWindow: { maxInputTokens: 200000 },
+    llm: {
+      default: {
+        provider: "anthropic",
+        model: "claude-opus-4-7",
+        maxTokens: 64000,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: true, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 200000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
+    },
     services: {
       inference: {
         mode: "your-own",
         provider: "anthropic",
-        model: "claude-opus-4-6",
+        model: "claude-opus-4-7",
       },
       "image-generation": {
         mode: "your-own",

diff --git a/assistant/src/__tests__/config-schema-cmd.test.ts b/assistant/src/__tests__/config-schema-cmd.test.ts
@@ -82,8 +82,11 @@ import { getSchemaAtPath } from "../config/schema-utils.js";
 // ---------------------------------------------------------------------------
 
 describe("getSchemaAtPath", () => {
-  test("returns full schema for a top-level key (maxTokens → number schema)", () => {
-    const result = getSchemaAtPath(AssistantConfigSchema, "maxTokens");
+  test("returns full schema for a leaf key (llm.default.maxTokens → number schema)", () => {
+    const result = getSchemaAtPath(
+      AssistantConfigSchema,
+      "llm.default.maxTokens",
+    );
     expect(result).not.toBeNull();
     // maxTokens has a default, so it should be parseable
     const parsed = (result as z.ZodType).parse(undefined);
@@ -184,7 +187,7 @@ describe("z.toJSONSchema integration", () => {
     expect(properties).toBeDefined();
     // Check that top-level keys are present
     expect(properties.services).toBeDefined();
-    expect(properties.maxTokens).toBeDefined();
+    expect(properties.llm).toBeDefined();
     expect(properties.calls).toBeDefined();
     expect(properties.memory).toBeDefined();
     expect(properties.timeouts).toBeDefined();
@@ -222,8 +225,11 @@ describe("z.toJSONSchema integration", () => {
     expect(properties!.safety).toBeDefined();
   });
 
-  test("sub-schema at a leaf like maxTokens produces integer schema", () => {
-    const maxTokensSchema = getSchemaAtPath(AssistantConfigSchema, "maxTokens");
+  test("sub-schema at a leaf like llm.default.maxTokens produces integer schema", () => {
+    const maxTokensSchema = getSchemaAtPath(
+      AssistantConfigSchema,
+      "llm.default.maxTokens",
+    );
     expect(maxTokensSchema).not.toBeNull();
     const jsonSchema = z.toJSONSchema(maxTokensSchema!, {
       unrepresentable: "any",

diff --git a/assistant/src/__tests__/conversation-routes-disk-view.test.ts b/assistant/src/__tests__/conversation-routes-disk-view.test.ts
@@ -49,11 +49,39 @@ mock.module("../config/loader.js", () => ({
     rateLimit: { maxRequestsPerMinute: 0 },
     secretDetection: { enabled: false },
     contextWindow: { maxInputTokens: 200000 },
+    llm: {
+      default: {
+        provider: "anthropic",
+        model: "claude-opus-4-7",
+        maxTokens: 64000,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: true, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 200000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
+    },
     services: {
       inference: {
         mode: "your-own",
         provider: "anthropic",
-        model: "claude-opus-4-6",
+        model: "claude-opus-4-7",
       },
       "image-generation": {
         mode: "your-own",

diff --git a/assistant/src/__tests__/conversation-routes-slash-commands.test.ts b/assistant/src/__tests__/conversation-routes-slash-commands.test.ts
@@ -26,17 +26,45 @@ mock.module("../daemon/conversation-slash.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-    model: "claude-opus-4-6",
+    model: "claude-opus-4-7",
     provider: "anthropic",
     memory: { enabled: false },
     rateLimit: { maxRequestsPerMinute: 0 },
     secretDetection: { enabled: false },
     contextWindow: { maxInputTokens: 200000 },
+    llm: {
+      default: {
+        provider: "anthropic",
+        model: "claude-opus-4-7",
+        maxTokens: 64000,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: true, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 200000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
+    },
     services: {
       inference: {
         mode: "your-own",
         provider: "anthropic",
-        model: "claude-opus-4-6",
+        model: "claude-opus-4-7",
       },
       "image-generation": {
         mode: "your-own",
@@ -347,7 +375,7 @@ describe("handleSendMessage slash command interception", () => {
       inputTokens: 1000,
       outputTokens: 500,
       estimatedCost: 0.05,
-      model: "claude-opus-4-6",
+      model: "claude-opus-4-7",
       provider: "anthropic",
       maxInputTokens: 200000,
     });

diff --git a/assistant/src/__tests__/conversation-usage.test.ts b/assistant/src/__tests__/conversation-usage.test.ts
@@ -16,7 +16,9 @@ mock.module("../util/logger.js", () => ({
 
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
-    pricingOverrides: [],
+    llm: {
+      pricingOverrides: [],
+    },
   }),
 }));
 

diff --git a/assistant/src/__tests__/http-user-message-parity.test.ts b/assistant/src/__tests__/http-user-message-parity.test.ts
@@ -130,11 +130,39 @@ mock.module("../config/loader.js", () => ({
     model: "test",
     provider: "test",
     contextWindow: { maxInputTokens: 200000 },
+    llm: {
+      default: {
+        provider: "anthropic",
+        model: "claude-opus-4-7",
+        maxTokens: 64000,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: true, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 200000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
+    },
     services: {
       inference: {
         mode: "your-own",
         provider: "anthropic",
-        model: "claude-opus-4-6",
+        model: "claude-opus-4-7",
       },
       "image-generation": {
         mode: "your-own",

diff --git a/assistant/src/__tests__/llm-schema.test.ts b/assistant/src/__tests__/llm-schema.test.ts
@@ -70,7 +70,7 @@ describe("LLMSchema", () => {
     const parsed = LLMSchema.parse({});
     expect(parsed.default).toEqual({
       provider: "anthropic",
-      model: "claude-opus-4-6",
+      model: "claude-opus-4-7",
       maxTokens: 64000,
       effort: "max",
       speed: "standard",

diff --git a/assistant/src/__tests__/secret-ingress-http.test.ts b/assistant/src/__tests__/secret-ingress-http.test.ts
@@ -7,6 +7,34 @@ import { beforeEach, describe, expect, mock, test } from "bun:test";
 const BASE_CONFIG = {
   contextWindow: { maxInputTokens: 100000 },
   services: { inference: { model: "test-model", provider: "test-provider" } },
+  llm: {
+    default: {
+      provider: "anthropic",
+      model: "claude-opus-4-7",
+      maxTokens: 64000,
+      effort: "max" as const,
+      speed: "standard" as const,
+      temperature: null,
+      thinking: { enabled: true, streamThinking: true },
+      contextWindow: {
+        enabled: true,
+        maxInputTokens: 200000,
+        targetBudgetRatio: 0.3,
+        compactThreshold: 0.8,
+        summaryBudgetRatio: 0.05,
+        overflowRecovery: {
+          enabled: true,
+          safetyMarginRatio: 0.05,
+          maxAttempts: 3,
+          interactiveLatestTurnCompression: "summarize",
+          nonInteractiveLatestTurnCompression: "truncate",
+        },
+      },
+    },
+    profiles: {},
+    callSites: {},
+    pricingOverrides: [],
+  },
 };
 
 let mockConfig: Record<string, unknown> = {