diff --git a/assistant/src/__tests__/approval-cascade.test.ts b/assistant/src/__tests__/approval-cascade.test.ts
index 3ade54c5e7f..04c71202cbf 100644
--- a/assistant/src/__tests__/approval-cascade.test.ts
+++ b/assistant/src/__tests__/approval-cascade.test.ts
@@ -56,16 +56,34 @@ mock.module("../providers/registry.js", () => ({
 
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
-    ui: {},
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
+    ui: {},
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     timeouts: { permissionTimeoutSec: 300 },
diff --git a/assistant/src/__tests__/compaction.benchmark.test.ts b/assistant/src/__tests__/compaction.benchmark.test.ts
index 4599c7b7a2a..6317fc06b65 100644
--- a/assistant/src/__tests__/compaction.benchmark.test.ts
+++ b/assistant/src/__tests__/compaction.benchmark.test.ts
@@ -71,7 +71,7 @@ function makeLongMessages(turns: number): Message[] {
 
 function makeConfig() {
   return {
-    ...DEFAULT_CONFIG.contextWindow,
+    ...DEFAULT_CONFIG.llm.default.contextWindow,
     maxInputTokens: 6000,
     targetBudgetRatio: 0.58,
     compactThreshold: 0.6,
diff --git a/assistant/src/__tests__/config-analysis.test.ts b/assistant/src/__tests__/config-analysis.test.ts
index ba09306fd67..3367c7105d5 100644
--- a/assistant/src/__tests__/config-analysis.test.ts
+++ b/assistant/src/__tests__/config-analysis.test.ts
@@ -10,30 +10,27 @@ describe("AnalysisConfigSchema", () => {
     const parsed = AnalysisConfigSchema.parse({});
     expect(parsed.batchSize).toBe(30);
     expect(parsed.idleTimeoutMs).toBe(600_000);
-    expect(parsed.modelIntent).toBeUndefined();
-    expect(parsed.modelOverride).toBeUndefined();
   });
 
-  test("custom values round-trip", () => {
+  test("custom batch/idle values round-trip", () => {
     const input = {
       batchSize: 50,
       idleTimeoutMs: 120_000,
-      modelIntent: "quality-optimized" as const,
-      modelOverride: "anthropic/claude-opus-4-6",
     };
     const parsed = AnalysisConfigSchema.parse(input);
     expect(parsed).toEqual(input);
   });
 
-  test("accepts each valid modelIntent value", () => {
-    for (const intent of [
-      "latency-optimized",
-      "quality-optimized",
-      "vision-optimized",
-    ] as const) {
-      const parsed = AnalysisConfigSchema.parse({ modelIntent: intent });
-      expect(parsed.modelIntent).toBe(intent);
-    }
+  test("legacy modelIntent/modelOverride are stripped after PR 19 cleanup", () => {
+    // Both fields moved to llm.callSites.analyzeConversation in PR 4 and
+    // were removed from the schema in PR 19. Zod silently strips unknown
+    // keys; migration 039 erases them from disk.
+    const parsed = AnalysisConfigSchema.parse({
+      modelIntent: "quality-optimized",
+      modelOverride: "anthropic/claude-opus-4-6",
+    });
+    expect((parsed as Record<string, unknown>).modelIntent).toBeUndefined();
+    expect((parsed as Record<string, unknown>).modelOverride).toBeUndefined();
   });
 
   test("rejects batchSize: 0 (must be positive)", () => {
@@ -60,18 +57,6 @@ describe("AnalysisConfigSchema", () => {
     const result = AnalysisConfigSchema.safeParse({ idleTimeoutMs: -1000 });
     expect(result.success).toBe(false);
   });
-
-  test("rejects invalid modelIntent value", () => {
-    const result = AnalysisConfigSchema.safeParse({
-      modelIntent: "bogus-intent",
-    });
-    expect(result.success).toBe(false);
-  });
-
-  test("rejects non-string modelOverride", () => {
-    const result = AnalysisConfigSchema.safeParse({ modelOverride: 42 });
-    expect(result.success).toBe(false);
-  });
 });
 
 describe("AssistantConfigSchema — analysis integration", () => {
@@ -88,13 +73,11 @@ describe("AssistantConfigSchema — analysis integration", () => {
       analysis: {
         batchSize: 15,
         idleTimeoutMs: 300_000,
-        modelIntent: "latency-optimized",
       },
     });
     expect(parsed.analysis).toEqual({
       batchSize: 15,
       idleTimeoutMs: 300_000,
-      modelIntent: "latency-optimized",
     });
   });
 });
diff --git a/assistant/src/__tests__/config-schema.test.ts b/assistant/src/__tests__/config-schema.test.ts
index a4c3b450abd..91a432045af 100644
--- a/assistant/src/__tests__/config-schema.test.ts
+++ b/assistant/src/__tests__/config-schema.test.ts
@@ -81,9 +81,11 @@ function writeConfig(obj: unknown): void {
 describe("AssistantConfigSchema", () => {
   test("parses empty object with full defaults", () => {
     const result = AssistantConfigSchema.parse({});
-    expect(result.services.inference.provider).toBe("anthropic");
-    expect(result.services.inference.model).toBe("claude-opus-4-6");
+    // services.inference now carries only `mode`; provider/model live under
+    // llm.default.{provider,model} (see PR 19 of unify-llm-callsites).
     expect(result.services.inference.mode).toBe("your-own");
+    expect(result.llm.default.provider).toBe("anthropic");
+    expect(result.llm.default.model).toBe("claude-opus-4-6");
     expect(result.services["image-generation"].provider).toBe("gemini");
     expect(result.services["image-generation"].model).toBe(
       "gemini-3.1-flash-image-preview",
@@ -93,12 +95,12 @@ describe("AssistantConfigSchema", () => {
       "inference-provider-native",
     );
     expect(result.services["web-search"].mode).toBe("your-own");
-    expect(result.maxTokens).toBe(64000);
-    expect(result.thinking).toEqual({
+    expect(result.llm.default.maxTokens).toBe(64000);
+    expect(result.llm.default.thinking).toEqual({
       enabled: true,
       streamThinking: true,
     });
-    expect(result.contextWindow).toEqual({
+    expect(result.llm.default.contextWindow).toEqual({
       enabled: true,
       maxInputTokens: 200000,
       targetBudgetRatio: 0.3,
@@ -134,11 +136,9 @@ describe("AssistantConfigSchema", () => {
 
   test("accepts valid complete config", () => {
     const input = {
-      services: {
-        inference: { provider: "openai", model: "gpt-4" },
+      llm: {
+        default: { provider: "openai" as const, model: "gpt-4", maxTokens: 4096 },
       },
-      maxTokens: 4096,
-      thinking: { enabled: true },
       timeouts: {
         shellDefaultTimeoutSec: 30,
         shellMaxTimeoutSec: 300,
@@ -154,10 +154,10 @@ describe("AssistantConfigSchema", () => {
       auditLog: { retentionDays: 30 },
     };
     const result = AssistantConfigSchema.parse(input);
-    expect(result.services.inference.provider).toBe("openai");
-    expect(result.services.inference.model).toBe("gpt-4");
-    expect(result.maxTokens).toBe(4096);
-    expect(result.thinking.enabled).toBe(true);
+    expect(result.llm.default.provider).toBe("openai");
+    expect(result.llm.default.model).toBe("gpt-4");
+    expect(result.llm.default.maxTokens).toBe(4096);
+    expect(result.llm.default.thinking.enabled).toBe(true);
     expect(result.secretDetection.action).toBe("block");
   });
 
@@ -273,27 +273,32 @@ describe("AssistantConfigSchema", () => {
     expect(() => AssistantConfigSchema.parse(input)).toThrow(/missing-profile/);
   });
 
-  test("legacy top-level inference keys still parse alongside the new llm block", () => {
-    // Backward compatibility: configs that set the legacy top-level keys
-    // (maxTokens, effort, speed, thinking, contextWindow, services.inference)
-    // continue to parse correctly. PR 19 removes these once adoption is done.
+  test("legacy top-level inference keys are ignored after PR 19 cleanup", () => {
+    // The legacy keys (top-level maxTokens, effort, speed, thinking,
+    // contextWindow, services.inference.{provider,model}) were removed in PR
+    // 19. Configs that still carry them parse cleanly because Zod strips
+    // unknown fields, and migration 039 erases them from the on-disk file
+    // entirely.
     const input = {
       services: {
-        inference: { provider: "openai" as const, model: "gpt-4" },
+        inference: { provider: "openai", model: "gpt-4" },
       },
       maxTokens: 8000,
-      effort: "medium" as const,
-      speed: "fast" as const,
+      effort: "medium",
+      speed: "fast",
       thinking: { enabled: false, streamThinking: false },
     };
     const result = AssistantConfigSchema.parse(input);
-    expect(result.services.inference.provider).toBe("openai");
-    expect(result.maxTokens).toBe(8000);
-    expect(result.effort).toBe("medium");
-    expect(result.speed).toBe("fast");
-    expect(result.thinking.enabled).toBe(false);
-    // The new llm block falls back to its own defaults (independent of the
-    // legacy top-level keys until the migration in PR 4 backfills it).
+    expect((result as Record<string, unknown>).maxTokens).toBeUndefined();
+    expect((result as Record<string, unknown>).effort).toBeUndefined();
+    expect((result as Record<string, unknown>).speed).toBeUndefined();
+    expect((result as Record<string, unknown>).thinking).toBeUndefined();
+    expect(
+      (result.services.inference as Record<string, unknown>).provider,
+    ).toBeUndefined();
+    expect(
+      (result.services.inference as Record<string, unknown>).model,
+    ).toBeUndefined();
     expect(result.llm.default.provider).toBe("anthropic");
     expect(result.llm.default.model).toBe("claude-opus-4-6");
   });
@@ -302,16 +307,15 @@ describe("AssistantConfigSchema", () => {
     // Regression guard: previously LLMConfigBase had no schema-level defaults,
     // so any `llm: {}` block would fail validation and the loader's recovery
     // path would fall through to `cloneDefaultConfig()`, discarding unrelated
-    // valid settings (like a custom `maxTokens`). With leaf-level defaults,
-    // `llm: {}` parses cleanly and the user's other settings are preserved.
+    // valid settings (like a custom `llm.default.maxTokens`). With leaf-level
+    // defaults, a partial `llm.default` block parses cleanly: the fields the
+    // user set are preserved and the rest are filled in from defaults.
     const result = AssistantConfigSchema.parse({
-      maxTokens: 32000,
-      llm: {},
+      llm: { default: { maxTokens: 32000 } },
     });
-    expect(result.maxTokens).toBe(32000);
+    expect(result.llm.default.maxTokens).toBe(32000);
     expect(result.llm.default.provider).toBe("anthropic");
     expect(result.llm.default.model).toBe("claude-opus-4-6");
-    expect(result.llm.default.maxTokens).toBe(64000);
   });
 
   test("llm.default with one missing field still parses (defaults applied)", () => {
@@ -416,13 +420,15 @@ describe("AssistantConfigSchema", () => {
 
   test("rejects invalid provider", () => {
     const result = AssistantConfigSchema.safeParse({
-      services: { inference: { provider: "invalid" } },
+      llm: { default: { provider: "invalid" } },
     });
     expect(result.success).toBe(false);
   });
 
-  test("rejects negative maxTokens", () => {
-    const result = AssistantConfigSchema.safeParse({ maxTokens: -100 });
+  test("rejects negative llm.default.maxTokens", () => {
+    const result = AssistantConfigSchema.safeParse({
+      llm: { default: { maxTokens: -100 } },
+    });
     expect(result.success).toBe(false);
     if (!result.success) {
       expect(
@@ -431,8 +437,10 @@ describe("AssistantConfigSchema", () => {
     }
   });
 
-  test("rejects non-integer maxTokens", () => {
-    const result = AssistantConfigSchema.safeParse({ maxTokens: 3.14 });
+  test("rejects non-integer llm.default.maxTokens", () => {
+    const result = AssistantConfigSchema.safeParse({
+      llm: { default: { maxTokens: 3.14 } },
+    });
     expect(result.success).toBe(false);
     if (!result.success) {
       expect(
@@ -441,9 +449,9 @@ describe("AssistantConfigSchema", () => {
     }
   });
 
-  test("rejects string maxTokens", () => {
+  test("rejects string llm.default.maxTokens", () => {
     const result = AssistantConfigSchema.safeParse({
-      maxTokens: "not-a-number",
+      llm: { default: { maxTokens: "not-a-number" } },
     });
     expect(result.success).toBe(false);
     if (!result.success) {
@@ -469,7 +477,7 @@ describe("AssistantConfigSchema", () => {
 
   test("rejects invalid thinking config", () => {
     const result = AssistantConfigSchema.safeParse({
-      thinking: { enabled: "yes" },
+      llm: { default: { thinking: { enabled: "yes" } } },
     });
     expect(result.success).toBe(false);
     if (!result.success) {
@@ -479,16 +487,21 @@ describe("AssistantConfigSchema", () => {
 
   test("rejects contextWindow targetBudgetRatio >= compactThreshold", () => {
     const result = AssistantConfigSchema.safeParse({
-      contextWindow: { targetBudgetRatio: 0.8, compactThreshold: 0.8 },
+      llm: {
+        default: {
+          contextWindow: { targetBudgetRatio: 0.8, compactThreshold: 0.8 },
+        },
+      },
     });
     expect(result.success).toBe(false);
     if (!result.success) {
       expect(
         result.error.issues.some(
           (issue) =>
-            issue.path.join(".") === "contextWindow.targetBudgetRatio" &&
+            issue.path.join(".") ===
+              "llm.default.contextWindow.targetBudgetRatio" &&
             issue.message.includes(
-              "must be less than contextWindow.compactThreshold",
+              "must be less than llm.default.contextWindow.compactThreshold",
             ),
         ),
       ).toBe(true);
@@ -498,7 +511,11 @@ describe("AssistantConfigSchema", () => {
   test("rejects overflowRecovery safetyMarginRatio out of (0,1) range", () => {
     for (const bad of [0, 1, -0.1, 1.5]) {
       const result = AssistantConfigSchema.safeParse({
-        contextWindow: { overflowRecovery: { safetyMarginRatio: bad } },
+        llm: {
+          default: {
+            contextWindow: { overflowRecovery: { safetyMarginRatio: bad } },
+          },
+        },
       });
       expect(result.success).toBe(false);
       if (!result.success) {
@@ -513,8 +530,12 @@ describe("AssistantConfigSchema", () => {
 
   test("rejects invalid overflowRecovery interactiveLatestTurnCompression", () => {
     const result = AssistantConfigSchema.safeParse({
-      contextWindow: {
-        overflowRecovery: { interactiveLatestTurnCompression: "explode" },
+      llm: {
+        default: {
+          contextWindow: {
+            overflowRecovery: { interactiveLatestTurnCompression: "explode" },
+          },
+        },
       },
     });
     expect(result.success).toBe(false);
@@ -529,8 +550,12 @@ describe("AssistantConfigSchema", () => {
 
   test("rejects invalid overflowRecovery nonInteractiveLatestTurnCompression", () => {
     const result = AssistantConfigSchema.safeParse({
-      contextWindow: {
-        overflowRecovery: { nonInteractiveLatestTurnCompression: "nope" },
+      llm: {
+        default: {
+          contextWindow: {
+            overflowRecovery: { nonInteractiveLatestTurnCompression: "nope" },
+          },
+        },
       },
     });
     expect(result.success).toBe(false);
@@ -601,7 +626,7 @@ describe("AssistantConfigSchema", () => {
       "ollama",
     ] as const) {
       const result = AssistantConfigSchema.safeParse({
-        services: { inference: { provider } },
+        llm: { default: { provider } },
       });
       expect(result.success).toBe(true);
     }
@@ -618,13 +643,19 @@ describe("AssistantConfigSchema", () => {
 
   test("provides helpful error messages", () => {
     const result = AssistantConfigSchema.safeParse({
-      maxTokens: -1,
+      llm: { default: { maxTokens: -1 } },
       secretDetection: { action: "explode" },
     });
     expect(result.success).toBe(false);
     if (!result.success) {
       const messages = result.error.issues.map((i) => i.message);
-      expect(messages.some((m) => m.includes("positive"))).toBe(true);
+      // -1 is rejected with either a custom "positive" message or Zod's
+      // default issue text ("Too small" / "expected number to be >0").
+      expect(
+        messages.some(
+          (m) => m.includes("positive") || /expected number to be >0/i.test(m),
+        ),
+      ).toBe(true);
       expect(
         messages.some(
           (m) =>
@@ -688,11 +719,7 @@ describe("AssistantConfigSchema", () => {
       enrichmentMaxRetries: 2,
       commitMessageLLM: {
         enabled: false,
-        useConfiguredProvider: true,
-        providerFastModelOverrides: {},
         timeoutMs: 600,
-        maxTokens: 120,
-        temperature: 0.2,
         maxFilesInPrompt: 30,
         maxDiffBytes: 12000,
         minRemainingTurnBudgetMs: 1000,
@@ -746,11 +773,7 @@ describe("AssistantConfigSchema", () => {
     const result = AssistantConfigSchema.parse({});
     const llm = result.workspaceGit.commitMessageLLM;
     expect(llm.enabled).toBe(false);
-    expect(llm.useConfiguredProvider).toBe(true);
-    expect(llm.providerFastModelOverrides).toEqual({});
     expect(llm.timeoutMs).toBe(600);
-    expect(llm.maxTokens).toBe(120);
-    expect(llm.temperature).toBe(0.2);
     expect(llm.maxFilesInPrompt).toBe(30);
     expect(llm.maxDiffBytes).toBe(12000);
     expect(llm.minRemainingTurnBudgetMs).toBe(1000);
@@ -763,13 +786,6 @@ describe("AssistantConfigSchema", () => {
     expect(result.success).toBe(false);
   });
 
-  test("rejects commitMessageLLM.temperature > 2", () => {
-    const result = AssistantConfigSchema.safeParse({
-      workspaceGit: { commitMessageLLM: { temperature: 2.5 } },
-    });
-    expect(result.success).toBe(false);
-  });
-
   test("breaker settings have correct defaults", () => {
     const result = AssistantConfigSchema.parse({});
     const breaker = result.workspaceGit.commitMessageLLM.breaker;
@@ -784,14 +800,12 @@ describe("AssistantConfigSchema", () => {
         commitMessageLLM: {
           enabled: true,
           timeoutMs: 1000,
-          temperature: 0.5,
           breaker: { openAfterFailures: 5 },
         },
       },
     });
     expect(result.workspaceGit.commitMessageLLM.enabled).toBe(true);
     expect(result.workspaceGit.commitMessageLLM.timeoutMs).toBe(1000);
-    expect(result.workspaceGit.commitMessageLLM.temperature).toBe(0.5);
     expect(result.workspaceGit.commitMessageLLM.breaker.openAfterFailures).toBe(
       5,
     );
@@ -801,18 +815,18 @@ describe("AssistantConfigSchema", () => {
     );
   });
 
-  test("rejects commitMessageLLM.temperature < 0", () => {
-    const result = AssistantConfigSchema.safeParse({
-      workspaceGit: { commitMessageLLM: { temperature: -0.1 } },
-    });
-    expect(result.success).toBe(false);
-  });
-
-  test("rejects non-integer commitMessageLLM.maxTokens", () => {
-    const result = AssistantConfigSchema.safeParse({
-      workspaceGit: { commitMessageLLM: { maxTokens: 3.5 } },
+  test("ignores legacy commitMessageLLM.{maxTokens,temperature} keys", () => {
+    // PR 19 removed maxTokens/temperature from the schema; Zod silently
+    // strips them on parse. Migration 039 erases them from disk so they
+    // don't accumulate over time.
+    const result = AssistantConfigSchema.parse({
+      workspaceGit: {
+        commitMessageLLM: { maxTokens: 200, temperature: 0.5 },
+      },
     });
-    expect(result.success).toBe(false);
+    const cm = result.workspaceGit.commitMessageLLM as Record<string, unknown>;
+    expect(cm.maxTokens).toBeUndefined();
+    expect(cm.temperature).toBeUndefined();
   });
 
   // ── Calls config ────────────────────────────────────────────────────
@@ -971,16 +985,13 @@ describe("AssistantConfigSchema", () => {
     ).toBeUndefined();
   });
 
-  test("accepts optional calls.model", () => {
+  test("legacy calls.model key is stripped after PR 19 cleanup", () => {
+    // calls.model moved to llm.callSites.callAgent.model in PR 4 and the
+    // legacy field was removed in PR 19. Zod silently strips unknown keys.
     const result = AssistantConfigSchema.parse({
       calls: { model: "claude-haiku-4-5-20251001" },
     });
-    expect(result.calls.model).toBe("claude-haiku-4-5-20251001");
-  });
-
-  test("calls.model is undefined by default", () => {
-    const result = AssistantConfigSchema.parse({});
-    expect(result.calls.model).toBeUndefined();
+    expect((result.calls as Record<string, unknown>).model).toBeUndefined();
   });
 
   // ── Caller identity config ────────────────────────────────────────
@@ -2106,28 +2117,27 @@ describe("loadConfig with schema validation", () => {
   // intermittently trigger unhandled ENOENT in CI if the directory is removed.
   test("loads valid config", () => {
     writeConfig({
-      services: {
-        inference: { provider: "openai", model: "gpt-4" },
+      llm: {
+        default: { provider: "openai", model: "gpt-4", maxTokens: 4096 },
       },
-      maxTokens: 4096,
     });
     const config = loadConfig();
-    expect(config.services.inference.provider).toBe("openai");
-    expect(config.services.inference.model).toBe("gpt-4");
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.provider).toBe("openai");
+    expect(config.llm.default.model).toBe("gpt-4");
+    expect(config.llm.default.maxTokens).toBe(4096);
   });
 
   test("applies defaults for missing fields", () => {
     writeConfig({});
     const config = loadConfig();
-    expect(config.services.inference.provider).toBe("anthropic");
-    expect(config.services.inference.model).toBe("claude-opus-4-6");
-    expect(config.maxTokens).toBe(64000);
-    expect(config.thinking).toEqual({
+    expect(config.llm.default.provider).toBe("anthropic");
+    expect(config.llm.default.model).toBe("claude-opus-4-6");
+    expect(config.llm.default.maxTokens).toBe(64000);
+    expect(config.llm.default.thinking).toEqual({
       enabled: true,
       streamThinking: true,
     });
-    expect(config.contextWindow).toEqual({
+    expect(config.llm.default.contextWindow).toEqual({
       enabled: true,
       maxInputTokens: 200000,
       targetBudgetRatio: 0.3,
@@ -2145,16 +2155,16 @@ describe("loadConfig with schema validation", () => {
 
   test("falls back to default for invalid provider", () => {
     writeConfig({
-      services: { inference: { provider: "invalid-provider" } },
+      llm: { default: { provider: "invalid-provider" } },
     });
     const config = loadConfig();
-    expect(config.services.inference.provider).toBe("anthropic");
+    expect(config.llm.default.provider).toBe("anthropic");
   });
 
   test("falls back to default for invalid maxTokens", () => {
-    writeConfig({ maxTokens: -100 });
+    writeConfig({ llm: { default: { maxTokens: -100 } } });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(64000);
+    expect(config.llm.default.maxTokens).toBe(64000);
   });
 
   test("falls back to defaults for invalid nested values", () => {
@@ -2169,23 +2179,26 @@ describe("loadConfig with schema validation", () => {
 
   test("preserves valid fields when other fields are invalid", () => {
     writeConfig({
-      services: {
-        inference: { provider: "openai", model: "gpt-4" },
+      llm: {
+        default: {
+          provider: "openai",
+          model: "gpt-4",
+          maxTokens: -1,
+          thinking: { enabled: true },
+        },
       },
-      maxTokens: -1,
-      thinking: { enabled: true },
     });
     const config = loadConfig();
-    expect(config.services.inference.provider).toBe("openai");
-    expect(config.services.inference.model).toBe("gpt-4");
-    expect(config.thinking.enabled).toBe(true);
-    expect(config.maxTokens).toBe(64000);
+    expect(config.llm.default.provider).toBe("openai");
+    expect(config.llm.default.model).toBe("gpt-4");
+    expect(config.llm.default.thinking.enabled).toBe(true);
+    expect(config.llm.default.maxTokens).toBe(64000);
   });
 
   test("handles no config file", () => {
     const config = loadConfig();
-    expect(config.services.inference.provider).toBe("anthropic");
-    expect(config.maxTokens).toBe(64000);
+    expect(config.llm.default.provider).toBe("anthropic");
+    expect(config.llm.default.maxTokens).toBe(64000);
   });
 
   test("partial nested objects get defaults for missing fields", () => {
@@ -2206,11 +2219,15 @@ describe("loadConfig with schema validation", () => {
 
   test("falls back for invalid contextWindow relationship", () => {
     writeConfig({
-      contextWindow: { targetBudgetRatio: 0.8, compactThreshold: 0.8 },
+      llm: {
+        default: {
+          contextWindow: { targetBudgetRatio: 0.8, compactThreshold: 0.8 },
+        },
+      },
     });
     const config = loadConfig();
-    expect(config.contextWindow.targetBudgetRatio).toBe(0.3);
-    expect(config.contextWindow.compactThreshold).toBe(0.8);
+    expect(config.llm.default.contextWindow.targetBudgetRatio).toBe(0.3);
+    expect(config.llm.default.contextWindow.compactThreshold).toBe(0.8);
   });
 
   test("falls back for invalid rateLimit values", () => {
@@ -2271,13 +2288,13 @@ describe("loadConfig with schema validation", () => {
     // Only activeHoursStart is set. The superRefine must emit the issue so
     // the loader's delete-and-retry can strip the set field; otherwise the
     // mismatch persists and the config falls back to full defaults (which
-    // would reset maxTokens below to 64000).
+    // would reset llm.default.maxTokens below to 64000).
     writeConfig({
-      maxTokens: 4096,
+      llm: { default: { maxTokens: 4096 } },
       filing: { activeHoursStart: 8 },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.maxTokens).toBe(4096);
     expect(config.filing.activeHoursStart).toBeNull();
     expect(config.filing.activeHoursEnd).toBeNull();
   });
@@ -2285,13 +2302,13 @@ describe("loadConfig with schema validation", () => {
   test("recovers from partial heartbeat.activeHours without wiping unrelated fields", () => {
     // activeHoursStart is explicitly nulled while activeHoursEnd defaults to
     // 22 — a mismatch. Dual-emit strips both sides; both defaults restore
-    // (8, 22). maxTokens is unaffected.
+    // (8, 22). llm.default.maxTokens is unaffected.
     writeConfig({
-      maxTokens: 4096,
+      llm: { default: { maxTokens: 4096 } },
       heartbeat: { activeHoursStart: null },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.maxTokens).toBe(4096);
     expect(config.heartbeat.activeHoursStart).toBe(8);
     expect(config.heartbeat.activeHoursEnd).toBe(22);
   });
@@ -2299,14 +2316,14 @@ describe("loadConfig with schema validation", () => {
   test("recovers from heartbeat.activeHours null-mismatch where explicit value equals opposite default", () => {
     // { start: null, end: 8 } — single-emit on the null side would strip
     // start, the default 8 would restore it, and the equal-hours check would
-    // fire, cascading to a full defaults reset that wipes maxTokens.
+    // fire, cascading to a full defaults reset that wipes llm.default.maxTokens.
     // Dual-emit strips both sides in one pass.
     writeConfig({
-      maxTokens: 4096,
+      llm: { default: { maxTokens: 4096 } },
       heartbeat: { activeHoursStart: null, activeHoursEnd: 8 },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.maxTokens).toBe(4096);
     expect(config.heartbeat.activeHoursStart).toBe(8);
     expect(config.heartbeat.activeHoursEnd).toBe(22);
   });
@@ -2314,11 +2331,11 @@ describe("loadConfig with schema validation", () => {
   test("recovers from heartbeat.activeHours null-mismatch on the end side", () => {
     // { start: 22, end: null } — same cascade class as above, mirrored.
     writeConfig({
-      maxTokens: 4096,
+      llm: { default: { maxTokens: 4096 } },
       heartbeat: { activeHoursStart: 22, activeHoursEnd: null },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.maxTokens).toBe(4096);
     expect(config.heartbeat.activeHoursStart).toBe(8);
     expect(config.heartbeat.activeHoursEnd).toBe(22);
   });
@@ -2327,13 +2344,13 @@ describe("loadConfig with schema validation", () => {
     // { start: 22, end: 22 } — both equal to the default for end. Single-emit
     // on one path would strip one side, the default would recreate the
     // equal-hours mismatch, and the loader would fall back to full defaults,
-    // wiping maxTokens. Dual-emit strips both sides at once.
+    // wiping llm.default.maxTokens. Dual-emit strips both sides at once.
     writeConfig({
-      maxTokens: 4096,
+      llm: { default: { maxTokens: 4096 } },
       heartbeat: { activeHoursStart: 22, activeHoursEnd: 22 },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(4096);
+    expect(config.llm.default.maxTokens).toBe(4096);
     expect(config.heartbeat.activeHoursStart).toBe(8);
     expect(config.heartbeat.activeHoursEnd).toBe(22);
   });
@@ -2342,14 +2359,14 @@ describe("loadConfig with schema validation", () => {
     // activeHoursStart === activeHoursEnd is invalid (empty window). Filing's
     // defaults are null/null, so single-emit on one path would strip one side
     // and the null default would recreate a mismatch — cascading to a full
-    // defaults reset that wipes maxTokens. Dual-emit strips both sides so
-    // both defaults restore to null.
+    // defaults reset that wipes llm.default.maxTokens. Dual-emit strips both
+    // sides so both defaults restore to null.
     writeConfig({
-      maxTokens: 1234,
+      llm: { default: { maxTokens: 1234 } },
       filing: { activeHoursStart: 5, activeHoursEnd: 5 },
     });
     const config = loadConfig();
-    expect(config.maxTokens).toBe(1234);
+    expect(config.llm.default.maxTokens).toBe(1234);
     expect(config.filing.activeHoursStart).toBeNull();
     expect(config.filing.activeHoursEnd).toBeNull();
   });
@@ -2369,7 +2386,7 @@ describe("loadConfig with schema validation", () => {
     expect(
       (config.calls.voice as Record<string, unknown>).ttsProvider,
     ).toBeUndefined();
-    expect(config.calls.model).toBeUndefined();
+    expect((config.calls as Record<string, unknown>).model).toBeUndefined();
     expect(config.calls.callerIdentity).toEqual({
       allowPerCallOverride: true,
     });
diff --git a/assistant/src/__tests__/conversation-abort-tool-results.test.ts b/assistant/src/__tests__/conversation-abort-tool-results.test.ts
index 792f7cd3185..441cad2d800 100644
--- a/assistant/src/__tests__/conversation-abort-tool-results.test.ts
+++ b/assistant/src/__tests__/conversation-abort-tool-results.test.ts
@@ -25,23 +25,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     daemon: {
diff --git a/assistant/src/__tests__/conversation-agent-loop-overflow.test.ts b/assistant/src/__tests__/conversation-agent-loop-overflow.test.ts
index 03bf2945568..f60ceeae546 100644
--- a/assistant/src/__tests__/conversation-agent-loop-overflow.test.ts
+++ b/assistant/src/__tests__/conversation-agent-loop-overflow.test.ts
@@ -28,23 +28,34 @@ mock.module("../util/logger.js", () => ({
 }));
 
 mock.module("../config/loader.js", () => ({
-  getConfig: () => ({
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 200_000,
-      thresholdTokens: 160_000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+  getConfig: () => ({    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     workspaceGit: { turnCommitMaxWaitMs: 10 },
diff --git a/assistant/src/__tests__/conversation-agent-loop.test.ts b/assistant/src/__tests__/conversation-agent-loop.test.ts
index 767d315e89c..e606cd0ed39 100644
--- a/assistant/src/__tests__/conversation-agent-loop.test.ts
+++ b/assistant/src/__tests__/conversation-agent-loop.test.ts
@@ -16,23 +16,34 @@ mock.module("../util/logger.js", () => ({
 }));
 
 mock.module("../config/loader.js", () => ({
-  getConfig: () => ({
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+  getConfig: () => ({    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     workspaceGit: { turnCommitMaxWaitMs: 10 },
diff --git a/assistant/src/__tests__/conversation-confirmation-signals.test.ts b/assistant/src/__tests__/conversation-confirmation-signals.test.ts
index d740f7afc75..ecd2a8b155c 100644
--- a/assistant/src/__tests__/conversation-confirmation-signals.test.ts
+++ b/assistant/src/__tests__/conversation-confirmation-signals.test.ts
@@ -57,15 +57,33 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     timeouts: { permissionTimeoutSec: 1 },
diff --git a/assistant/src/__tests__/conversation-load-history-repair.test.ts b/assistant/src/__tests__/conversation-load-history-repair.test.ts
index 5184f2584fb..51006ee2f82 100644
--- a/assistant/src/__tests__/conversation-load-history-repair.test.ts
+++ b/assistant/src/__tests__/conversation-load-history-repair.test.ts
@@ -16,16 +16,33 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
   }),
diff --git a/assistant/src/__tests__/conversation-pre-run-repair.test.ts b/assistant/src/__tests__/conversation-pre-run-repair.test.ts
index 62be27d840c..cbd76cccdcc 100644
--- a/assistant/src/__tests__/conversation-pre-run-repair.test.ts
+++ b/assistant/src/__tests__/conversation-pre-run-repair.test.ts
@@ -23,23 +23,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     daemon: {
diff --git a/assistant/src/__tests__/conversation-process-callsite.test.ts b/assistant/src/__tests__/conversation-process-callsite.test.ts
index 3a330afd247..7325afcd107 100644
--- a/assistant/src/__tests__/conversation-process-callsite.test.ts
+++ b/assistant/src/__tests__/conversation-process-callsite.test.ts
@@ -46,22 +46,33 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    llm: {
+      default: {
+        provider: "anthropic",
+        model: "claude-opus-4-6",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     daemon: {
@@ -74,8 +85,6 @@ mock.module("../config/loader.js", () => ({
     services: {
       inference: {
         mode: "your-own",
-        provider: "anthropic",
-        model: "claude-opus-4-6",
       },
       "image-generation": {
         mode: "your-own",
diff --git a/assistant/src/__tests__/conversation-provider-retry-repair.test.ts b/assistant/src/__tests__/conversation-provider-retry-repair.test.ts
index 41e60d1d822..62677a4553e 100644
--- a/assistant/src/__tests__/conversation-provider-retry-repair.test.ts
+++ b/assistant/src/__tests__/conversation-provider-retry-repair.test.ts
@@ -26,23 +26,34 @@ mock.module("../config/loader.js", () => ({
     daemon: {
       titleGenerationMaxTokens: 30,
     },
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      enabled: true,
-      maxInputTokens: 100000,
-      targetBudgetRatio: 0.3,
-      compactThreshold: 0.8,
-      summaryBudgetRatio: 0.05,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     services: {
diff --git a/assistant/src/__tests__/conversation-queue.test.ts b/assistant/src/__tests__/conversation-queue.test.ts
index 92c1ba2d562..9b0f867ae76 100644
--- a/assistant/src/__tests__/conversation-queue.test.ts
+++ b/assistant/src/__tests__/conversation-queue.test.ts
@@ -55,23 +55,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     timeouts: { permissionTimeoutSec: 1 },
diff --git a/assistant/src/__tests__/conversation-slash-queue.test.ts b/assistant/src/__tests__/conversation-slash-queue.test.ts
index 3106f807afe..778f896d9a0 100644
--- a/assistant/src/__tests__/conversation-slash-queue.test.ts
+++ b/assistant/src/__tests__/conversation-slash-queue.test.ts
@@ -30,23 +30,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     daemon: {
diff --git a/assistant/src/__tests__/conversation-slash-unknown.test.ts b/assistant/src/__tests__/conversation-slash-unknown.test.ts
index 47e416ecbbe..66606d33f45 100644
--- a/assistant/src/__tests__/conversation-slash-unknown.test.ts
+++ b/assistant/src/__tests__/conversation-slash-unknown.test.ts
@@ -30,23 +30,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     daemon: {
diff --git a/assistant/src/__tests__/conversation-speed-override.test.ts b/assistant/src/__tests__/conversation-speed-override.test.ts
index 3dd28ee6cd6..dc32eea84c7 100644
--- a/assistant/src/__tests__/conversation-speed-override.test.ts
+++ b/assistant/src/__tests__/conversation-speed-override.test.ts
@@ -57,17 +57,33 @@ let mockConfigSpeed: "standard" | "fast" = "fast";
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    speed: mockConfigSpeed,
-    effort: "high",
-    contextWindow: {
-      maxInputTokens: 100000,
-      thresholdTokens: 80000,
-      preserveRecentMessages: 6,
-      summaryModel: "mock-model",
-      maxSummaryTokens: 512,
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "high" as const,
+        speed: mockConfigSpeed,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     timeouts: { permissionTimeoutSec: 1 },
diff --git a/assistant/src/__tests__/conversation-workspace-cache-state.test.ts b/assistant/src/__tests__/conversation-workspace-cache-state.test.ts
index 4efe3c756da..ab0824082db 100644
--- a/assistant/src/__tests__/conversation-workspace-cache-state.test.ts
+++ b/assistant/src/__tests__/conversation-workspace-cache-state.test.ts
@@ -24,16 +24,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      enabled: true,
-      maxInputTokens: 100000,
-      targetBudgetRatio: 0.3,
-      compactThreshold: 0.8,
-      summaryBudgetRatio: 0.05,
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
+      },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     memory: { enabled: false },
diff --git a/assistant/src/__tests__/conversation-workspace-injection.test.ts b/assistant/src/__tests__/conversation-workspace-injection.test.ts
index 8256edef1d6..f0220ad622b 100644
--- a/assistant/src/__tests__/conversation-workspace-injection.test.ts
+++ b/assistant/src/__tests__/conversation-workspace-injection.test.ts
@@ -33,23 +33,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      enabled: true,
-      maxInputTokens: 100000,
-      targetBudgetRatio: 0.3,
-      compactThreshold: 0.8,
-      summaryBudgetRatio: 0.05,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     memory: { enabled: false },
diff --git a/assistant/src/__tests__/conversation-workspace-tool-tracking.test.ts b/assistant/src/__tests__/conversation-workspace-tool-tracking.test.ts
index cd5d311652b..babebc3ceb8 100644
--- a/assistant/src/__tests__/conversation-workspace-tool-tracking.test.ts
+++ b/assistant/src/__tests__/conversation-workspace-tool-tracking.test.ts
@@ -31,23 +31,34 @@ mock.module("../providers/registry.js", () => ({
 mock.module("../config/loader.js", () => ({
   getConfig: () => ({
     ui: {},
-
-    provider: "mock-provider",
-    maxTokens: 4096,
-    thinking: false,
-    contextWindow: {
-      enabled: true,
-      maxInputTokens: 100000,
-      targetBudgetRatio: 0.3,
-      compactThreshold: 0.8,
-      summaryBudgetRatio: 0.05,
-      overflowRecovery: {
-        enabled: true,
-        safetyMarginRatio: 0.05,
-        maxAttempts: 3,
-        interactiveLatestTurnCompression: "summarize",
-        nonInteractiveLatestTurnCompression: "truncate",
+    
+    llm: {
+      default: {
+        provider: "mock-provider",
+        model: "mock-model",
+        maxTokens: 4096,
+        effort: "max" as const,
+        speed: "standard" as const,
+        temperature: null,
+        thinking: { enabled: false, streamThinking: true },
+        contextWindow: {
+          enabled: true,
+          maxInputTokens: 100000,
+          targetBudgetRatio: 0.3,
+          compactThreshold: 0.8,
+          summaryBudgetRatio: 0.05,
+          overflowRecovery: {
+            enabled: true,
+            safetyMarginRatio: 0.05,
+            maxAttempts: 3,
+            interactiveLatestTurnCompression: "summarize",
+            nonInteractiveLatestTurnCompression: "truncate",
+          },
+        },
       },
+      profiles: {},
+      callSites: {},
+      pricingOverrides: [],
     },
     rateLimit: { maxRequestsPerMinute: 0 },
     memory: { enabled: false },
diff --git a/assistant/src/__tests__/model-intents.test.ts b/assistant/src/__tests__/model-intents.test.ts
index e245d8c0b0c..267acd55b1a 100644
--- a/assistant/src/__tests__/model-intents.test.ts
+++ b/assistant/src/__tests__/model-intents.test.ts
@@ -5,45 +5,6 @@ import {
   isModelIntent,
   resolveModelIntent,
 } from "../providers/model-intents.js";
-import { RetryProvider } from "../providers/retry.js";
-import type {
-  Message,
-  Provider,
-  ProviderResponse,
-  SendMessageOptions,
-} from "../providers/types.js";
-
-const DUMMY_MESSAGES: Message[] = [
-  { role: "user", content: [{ type: "text", text: "hello" }] },
-];
-
-function makeResponse(model: string): ProviderResponse {
-  return {
-    content: [{ type: "text", text: "ok" }],
-    model,
-    usage: {
-      inputTokens: 1,
-      outputTokens: 1,
-    },
-    stopReason: "end_turn",
-  };
-}
-
-function makeProvider(
-  name: string,
-  onCall: (options: SendMessageOptions | undefined) => void,
-): Provider {
-  return {
-    name,
-    async sendMessage(_messages, _tools, _systemPrompt, options) {
-      onCall(options);
-      const config = options?.config as Record<string, unknown> | undefined;
-      return makeResponse(
-        (config?.model as string | undefined) ?? "default-model",
-      );
-    },
-  };
-}
 
 describe("model intents", () => {
   test("validates model intent strings", () => {
@@ -77,45 +38,8 @@ describe("model intents", () => {
   });
 });
 
-describe("RetryProvider model intent normalization", () => {
-  test("translates modelIntent into concrete model and strips modelIntent key", async () => {
-    let seen: SendMessageOptions | undefined;
-    const wrapped = new RetryProvider(
-      makeProvider("anthropic", (options) => {
-        seen = options;
-      }),
-    );
-
-    await wrapped.sendMessage(DUMMY_MESSAGES, undefined, undefined, {
-      config: {
-        modelIntent: "quality-optimized",
-        max_tokens: 123,
-      },
-    });
-
-    const config = seen?.config as Record<string, unknown>;
-    expect(config.model).toBe("claude-opus-4-7");
-    expect(config.modelIntent).toBeUndefined();
-    expect(config.max_tokens).toBe(123);
-  });
-
-  test("explicit model override wins over modelIntent", async () => {
-    let seen: SendMessageOptions | undefined;
-    const wrapped = new RetryProvider(
-      makeProvider("openai", (options) => {
-        seen = options;
-      }),
-    );
-
-    await wrapped.sendMessage(DUMMY_MESSAGES, undefined, undefined, {
-      config: {
-        model: "custom-model-v1",
-        modelIntent: "latency-optimized",
-      },
-    });
-
-    const config = seen?.config as Record<string, unknown>;
-    expect(config.model).toBe("custom-model-v1");
-    expect(config.modelIntent).toBeUndefined();
-  });
-});
+// `RetryProvider`'s legacy `modelIntent` normalization path was removed in
+// PR 19 of the unify-llm-callsites plan. The remaining `resolveModelIntent`
+// helper lives in `providers/model-intents.ts` for use by the workspace
+// migration's snapshot table; see
+// `workspace/migrations/038-unify-llm-callsite-configs.ts`.
diff --git a/assistant/src/__tests__/provider-commit-message-generator.test.ts b/assistant/src/__tests__/provider-commit-message-generator.test.ts
index 504e1d4a0ef..daa0bdd39f3 100644
--- a/assistant/src/__tests__/provider-commit-message-generator.test.ts
+++ b/assistant/src/__tests__/provider-commit-message-generator.test.ts
@@ -20,15 +20,11 @@ mock.module("../security/secure-keys.js", () => ({
 // ---------------------------------------------------------------------------
 function cloneConfig(): AssistantConfig {
   const cfg = structuredClone(DEFAULT_CONFIG);
-  cfg.services.inference.provider = "anthropic";
+  cfg.llm.default.provider = "anthropic";
   cfg.workspaceGit.commitMessageLLM = {
     ...cfg.workspaceGit.commitMessageLLM,
     enabled: true,
-    useConfiguredProvider: true,
-    providerFastModelOverrides: {},
     timeoutMs: 5000,
-    maxTokens: 120,
-    temperature: 0.2,
     maxFilesInPrompt: 30,
     maxDiffBytes: 12000,
     minRemainingTurnBudgetMs: 1000,
@@ -140,17 +136,6 @@ describe("ProviderCommitMessageGenerator", () => {
     expect(result.reason).toBe("disabled");
   });
 
-  // 2. useConfiguredProvider false
-  test('useConfiguredProvider false → returns deterministic, reason "disabled"', async () => {
-    currentConfig.workspaceGit.commitMessageLLM.useConfiguredProvider = false;
-    const gen = getCommitMessageGenerator();
-    const result = await gen.generateCommitMessage(baseContext, {
-      changedFiles: baseContext.changedFiles,
-    });
-    expect(result.source).toBe("deterministic");
-    expect(result.reason).toBe("disabled");
-  });
-
   // 3. missing API key
   test('missing API key → returns deterministic, reason "missing_provider_api_key"', async () => {
     mockSecureKeys = {};
@@ -221,7 +206,7 @@ describe("ProviderCommitMessageGenerator", () => {
   });
 
   // 6. LLM success
-  test('LLM success → returns LLM message, source "llm", fast model + callSite passed', async () => {
+  test('LLM success → returns LLM message, source "llm", callSite passed', async () => {
     const commitMsg = "feat: add new feature";
     mockSendMessage.mockResolvedValueOnce(makeSuccessResponse(commitMsg));
     const gen = getCommitMessageGenerator();
@@ -232,37 +217,12 @@ describe("ProviderCommitMessageGenerator", () => {
     expect(result.message).toBe(commitMsg);
     expect(result.reason).toBeUndefined();
 
-    // Verify the fast model and callSite were passed in the config so the
-    // provider's RetryProvider routes through `resolveCallSiteConfig` for
-    // max_tokens/temperature while preserving the explicit fast-model
-    // override.
+    // Verify the callSite was passed so the provider's RetryProvider routes
+    // through `resolveCallSiteConfig` for model/max_tokens/temperature.
     const callArgs = mockSendMessage.mock.calls[0];
     const options = callArgs[3] as {
-      config: { model: string; callSite: string };
+      config: { callSite: string };
     };
-    expect(options.config.model).toBe("claude-haiku-4-5-20251001");
-    expect(options.config.callSite).toBe("commitMessage");
-  });
-
-  // 7. fast-model override
-  test("fast-model override → uses override instead of default", async () => {
-    currentConfig.workspaceGit.commitMessageLLM.providerFastModelOverrides = {
-      anthropic: "claude-sonnet-4-20250514",
-    };
-    const commitMsg = "fix: resolve issue";
-    mockSendMessage.mockResolvedValueOnce(makeSuccessResponse(commitMsg));
-    const gen = getCommitMessageGenerator();
-    const result = await gen.generateCommitMessage(baseContext, {
-      changedFiles: baseContext.changedFiles,
-    });
-    expect(result.source).toBe("llm");
-    expect(result.message).toBe(commitMsg);
-
-    const callArgs = mockSendMessage.mock.calls[0];
-    const options = callArgs[3] as {
-      config: { model: string; callSite: string };
-    };
-    expect(options.config.model).toBe("claude-sonnet-4-20250514");
     expect(options.config.callSite).toBe("commitMessage");
   });
 
@@ -339,53 +299,16 @@ describe("ProviderCommitMessageGenerator", () => {
     expect(result.message).toBe("b".repeat(72) + body);
   });
 
-  // 12. Keyless provider (Ollama) without fast model → missing_fast_model (skips API key check)
-  test('Ollama without API key or fast model → returns deterministic, reason "missing_fast_model"', async () => {
-    currentConfig.services.inference.provider = "ollama";
+  // 12. Ollama (keyless provider) — passes the API-key preflight even without
+  // a stored secret, then succeeds because the call-site resolver supplies
+  // the model from `llm.default`/`llm.callSites.commitMessage`.
+  test("Ollama (keyless) — succeeds because call-site resolver supplies the model", async () => {
+    currentConfig.llm.default.provider = "ollama";
     mockSecureKeys = {};
     resolvedProvider = {
       provider: mockProvider,
       configuredProviderName: "ollama",
     };
-    const gen = getCommitMessageGenerator();
-    const result = await gen.generateCommitMessage(baseContext, {
-      changedFiles: baseContext.changedFiles,
-    });
-    expect(result.source).toBe("deterministic");
-    expect(result.reason).toBe("missing_fast_model");
-    expect(result.reason).not.toBe("missing_provider_api_key");
-    expect(mockSendMessage).not.toHaveBeenCalled();
-  });
-
-  // 13. Unknown provider without fast model default → missing_fast_model, no provider call
-  test('Unknown provider without fast model default → returns deterministic, reason "missing_fast_model"', async () => {
-    (currentConfig.services.inference as Record<string, unknown>).provider =
-      "exotic-provider";
-    mockSecureKeys = { "exotic-provider": "sk-exotic" };
-    resolvedProvider = {
-      provider: mockProvider,
-      configuredProviderName: "exotic-provider",
-    };
-    const gen = getCommitMessageGenerator();
-    const result = await gen.generateCommitMessage(baseContext, {
-      changedFiles: baseContext.changedFiles,
-    });
-    expect(result.source).toBe("deterministic");
-    expect(result.reason).toBe("missing_fast_model");
-    expect(mockSendMessage).not.toHaveBeenCalled();
-  });
-
-  // 14. Fast-model override enables LLM path for provider without built-in default
-  test("fast-model override enables LLM path for provider without built-in default", async () => {
-    currentConfig.services.inference.provider = "ollama";
-    mockSecureKeys = {}; // Ollama is keyless
-    resolvedProvider = {
-      provider: mockProvider,
-      configuredProviderName: "ollama",
-    };
-    currentConfig.workspaceGit.commitMessageLLM.providerFastModelOverrides = {
-      ollama: "llama3.2:3b",
-    };
     const commitMsg = "fix: local model commit";
     mockSendMessage.mockResolvedValueOnce(makeSuccessResponse(commitMsg));
     const gen = getCommitMessageGenerator();
@@ -394,10 +317,8 @@ describe("ProviderCommitMessageGenerator", () => {
     });
     expect(result.source).toBe("llm");
     expect(result.message).toBe(commitMsg);
-
-    // Verify the override model was passed
     const callArgs = mockSendMessage.mock.calls[0];
-    const options = callArgs[3] as { config: { model: string } };
-    expect(options.config.model).toBe("llama3.2:3b");
+    const options = callArgs[3] as { config: { callSite: string } };
+    expect(options.config.callSite).toBe("commitMessage");
   });
 });
diff --git a/assistant/src/__tests__/provider-managed-proxy-integration.test.ts b/assistant/src/__tests__/provider-managed-proxy-integration.test.ts
index 2581be09cd3..5b4a9ac2fd7 100644
--- a/assistant/src/__tests__/provider-managed-proxy-integration.test.ts
+++ b/assistant/src/__tests__/provider-managed-proxy-integration.test.ts
@@ -65,7 +65,7 @@ import {
 function makeProvidersConfig(provider: string, model: string): ProvidersConfig {
   return {
     services: {
-      inference: { mode: "your-own", provider, model },
+      inference: { mode: "your-own" },
       "image-generation": {
         mode: "your-own",
         provider: "gemini",
@@ -73,6 +73,7 @@ function makeProvidersConfig(provider: string, model: string): ProvidersConfig {
       },
       "web-search": { mode: "your-own", provider: "inference-provider-native" },
     },
+    llm: { default: { provider, model } },
   };
 }
 
diff --git a/assistant/src/__tests__/provider-registry-ollama.test.ts b/assistant/src/__tests__/provider-registry-ollama.test.ts
index 4f93b44b715..bb78a427572 100644
--- a/assistant/src/__tests__/provider-registry-ollama.test.ts
+++ b/assistant/src/__tests__/provider-registry-ollama.test.ts
@@ -19,8 +19,6 @@ describe("provider registry (ollama)", () => {
       services: {
         inference: {
           mode: "your-own",
-          provider: "ollama",
-          model: "claude-opus-4-6",
         },
         "image-generation": {
           mode: "your-own",
@@ -32,6 +30,7 @@ describe("provider registry (ollama)", () => {
           provider: "inference-provider-native",
         },
       },
+      llm: { default: { provider: "ollama", model: "claude-opus-4-6" } },
     });
 
     const provider = getProvider("ollama");
diff --git a/assistant/src/__tests__/secret-routes-managed-proxy.test.ts b/assistant/src/__tests__/secret-routes-managed-proxy.test.ts
index 327cb00925f..b608128a6ad 100644
--- a/assistant/src/__tests__/secret-routes-managed-proxy.test.ts
+++ b/assistant/src/__tests__/secret-routes-managed-proxy.test.ts
@@ -19,8 +19,6 @@ const mockConfig = {
   services: {
     inference: {
       mode: "your-own" as const,
-      provider: "anthropic",
-      model: "test-model",
     },
     "image-generation": {
       mode: "your-own" as const,
@@ -32,6 +30,7 @@ const mockConfig = {
       provider: "inference-provider-native",
     },
   },
+  llm: { default: { provider: "anthropic", model: "test-model" } },
 };
 
 mock.module("@google/genai", () => ({
diff --git a/assistant/src/__tests__/suggestion-routes.test.ts b/assistant/src/__tests__/suggestion-routes.test.ts
index e0f67ece6c7..d2904641a4f 100644
--- a/assistant/src/__tests__/suggestion-routes.test.ts
+++ b/assistant/src/__tests__/suggestion-routes.test.ts
@@ -383,7 +383,7 @@ describe("GET /v1/suggestion", () => {
     expect(body.suggestion).toBeNull();
   });
 
-  test("uses latency-optimized model intent", async () => {
+  test("uses conversationStarters call site", async () => {
     const provider = makeMockProvider("Quick reply");
     mockGetConfiguredProvider.mockImplementation(async () => provider);
     mockGetConversationByKey.mockImplementation(() => ({
@@ -407,8 +407,8 @@ describe("GET /v1/suggestion", () => {
     expect(provider.sendMessage).toHaveBeenCalledTimes(1);
     const callArgs = provider.sendMessage.mock.calls[0] as unknown[];
     const options = callArgs[3] as
-      | { config?: { modelIntent?: string } }
+      | { config?: { callSite?: string } }
       | undefined;
-    expect(options?.config?.modelIntent).toBe("latency-optimized");
+    expect(options?.config?.callSite).toBe("conversationStarters");
   });
 });
diff --git a/assistant/src/__tests__/workspace-migration-unify-llm-callsite-configs.test.ts b/assistant/src/__tests__/workspace-migration-unify-llm-callsite-configs.test.ts
index bbe1e3dc5a9..8a213f3dd57 100644
--- a/assistant/src/__tests__/workspace-migration-unify-llm-callsite-configs.test.ts
+++ b/assistant/src/__tests__/workspace-migration-unify-llm-callsite-configs.test.ts
@@ -692,57 +692,31 @@ describe("038-unify-llm-callsite-configs migration", () => {
     expect("pricingOverrides" in llm).toBe(false);
   });
 
-  // ─── down() rollback ───────────────────────────────────────────────────
-
-  test("down() reverses a migrated config to original shape", () => {
+  // ─── down() — documented no-op since PR 19 ──────────────────────────
+
+  test("down() is a no-op since PR 19 cleanup", () => {
+    // PR 19 of the unify-llm-callsites plan removed the legacy keys from
+    // `AssistantConfigSchema`, so re-creating them in `down()` would have
+    // no effect on the running daemon. The migration's `down()` is now a
+    // documented no-op — it leaves the config exactly as it found it,
+    // whether the `llm` block is present or absent.
     const original = {
-      services: { inference: { provider: "openai", model: "gpt-5.4" } },
+      services: { inference: { mode: "your-own", provider: "openai", model: "gpt-5.4" } },
       maxTokens: 32000,
-      effort: "high",
-      speed: "standard",
-      thinking: { enabled: true, streamThinking: true },
-      contextWindow: { maxInputTokens: 150000 },
-      heartbeat: { speed: "fast" },
-      filing: { speed: "fast" },
-      analysis: { modelOverride: "anthropic/claude-opus-4-6" },
-      workspaceGit: {
-        commitMessageLLM: { maxTokens: 200, temperature: 0.4 },
-      },
-      calls: { model: "gpt-5.4-nano" },
-      pricingOverrides: [
-        {
+      llm: {
+        default: {
           provider: "openai",
-          modelPattern: "gpt-5.4",
-          inputPer1M: 1,
-          outputPer1M: 2,
+          model: "gpt-5.4",
+          maxTokens: 32000,
         },
-      ],
+      },
     };
     writeConfig(original);
 
-    unifyLlmCallSiteConfigsMigration.run(workspaceDir);
-    // Sanity: llm block exists after run()
-    expect((readConfig() as { llm?: unknown }).llm).toBeDefined();
-
     unifyLlmCallSiteConfigsMigration.down(workspaceDir);
 
     const config = readConfig();
-    // The llm block must be removed.
-    expect("llm" in config).toBe(false);
-    // Every original scalar/object key that had a reverse mapping must be
-    // restored to its original value.
-    expect(config.services).toEqual(original.services);
-    expect(config.maxTokens).toBe(original.maxTokens);
-    expect(config.effort).toBe(original.effort);
-    expect(config.speed).toBe(original.speed);
-    expect(config.thinking).toEqual(original.thinking);
-    expect(config.contextWindow).toEqual(original.contextWindow);
-    expect(config.heartbeat).toEqual(original.heartbeat);
-    expect(config.filing).toEqual(original.filing);
-    expect(config.analysis).toEqual(original.analysis);
-    expect(config.workspaceGit).toEqual(original.workspaceGit);
-    expect(config.calls).toEqual(original.calls);
-    expect(config.pricingOverrides).toEqual(original.pricingOverrides);
+    expect(config).toEqual(original);
   });
 
   test("down() is a no-op when llm block is absent", () => {
diff --git a/assistant/src/calls/guardian-question-copy.ts b/assistant/src/calls/guardian-question-copy.ts
index a69a47bf20a..46b80b406b4 100644
--- a/assistant/src/calls/guardian-question-copy.ts
+++ b/assistant/src/calls/guardian-question-copy.ts
@@ -52,7 +52,7 @@ export async function generateGuardianCopy(
   const fallback = buildFallbackCopy(questionText);
 
   // If no provider is configured, return fallback immediately
-  const resolved = await resolveConfiguredProvider();
+  const resolved = await resolveConfiguredProvider("guardianQuestionCopy");
   if (!resolved) {
     log.debug(
       "No provider available for guardian copy generation, using fallback",
diff --git a/assistant/src/cli/commands/config.ts b/assistant/src/cli/commands/config.ts
index f9a4922ec48..a7b91e2e8e1 100644
--- a/assistant/src/cli/commands/config.ts
+++ b/assistant/src/cli/commands/config.ts
@@ -54,9 +54,9 @@ and "assistant keys set <provider> <key>" to view and manage API keys.
 
 Examples:
   $ assistant config list
-  $ assistant config get services.inference.provider
+  $ assistant config get llm.default.provider
   $ assistant config schema services
-  $ assistant config set services.inference.provider anthropic
+  $ assistant config set llm.default.provider anthropic
   $ assistant config set calls.enabled true`,
   );
 
@@ -69,7 +69,7 @@ Examples:
       "after",
       `
 Arguments:
-  key     Dotted path to the config key (e.g. services.inference.provider,
+  key     Dotted path to the config key (e.g. llm.default.provider,
           calls.enabled, twilio.accountSid). Intermediate objects are created
           automatically.
   value   The value to store. Parsed as JSON first (so "true" becomes boolean
@@ -81,7 +81,7 @@ After writing the value to config.json, the change takes effect immediately.
 To manage API keys, use "assistant keys set <provider> <key>" instead.
 
 Examples:
-  $ assistant config set services.inference.provider anthropic
+  $ assistant config set llm.default.provider anthropic
   $ assistant config set calls.enabled true`,
     )
     .action(
@@ -114,7 +114,7 @@ Examples:
       "after",
       `
 Arguments:
-  key   Dotted path to the config key (e.g. services.inference.provider,
+  key   Dotted path to the config key (e.g. llm.default.provider,
         calls.enabled)
 
 Prints the value at the given key path. If the key is not set, prints
@@ -123,7 +123,7 @@ Prints the value at the given key path. If the key is not set, prints
 To view API keys, use "assistant keys list" instead.
 
 Examples:
-  $ assistant config get services.inference.provider
+  $ assistant config get llm.default.provider
   $ assistant config get calls.enabled`,
     )
     .action((key: string) => {
diff --git a/assistant/src/config/bundled-skills/media-processing/services/reduce.ts b/assistant/src/config/bundled-skills/media-processing/services/reduce.ts
index 5991e62d433..d896009a645 100644
--- a/assistant/src/config/bundled-skills/media-processing/services/reduce.ts
+++ b/assistant/src/config/bundled-skills/media-processing/services/reduce.ts
@@ -179,7 +179,7 @@ async function sendToClaude(
   model?: string,
   onProgress?: (msg: string) => void,
 ): Promise<ReduceResult> {
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("mainAgent");
   if (!provider) {
     throw new Error("No LLM provider available. Please configure an API key.");
   }
diff --git a/assistant/src/config/raw-config-utils.ts b/assistant/src/config/raw-config-utils.ts
index 1724108d70b..d36e9255e68 100644
--- a/assistant/src/config/raw-config-utils.ts
+++ b/assistant/src/config/raw-config-utils.ts
@@ -1,11 +1,39 @@
+/**
+ * Set `raw.llm.default[field] = value` on a raw config object, in place.
+ *
+ * Reuses `raw.llm` / `raw.llm.default` when they are non-array objects;
+ * anything else there (missing, null, array, primitive) is replaced by `{}`.
+ *
+ * Example: `setLlmDefaultField(raw, "model", "claude-sonnet-4-6")`
+ * produces `raw.llm.default.model = "claude-sonnet-4-6"`.
+ */
+export function setLlmDefaultField(
+  raw: Record<string, unknown>,
+  field: string,
+  value: unknown,
+): void {
+  const llm: Record<string, unknown> =
+    raw.llm != null && typeof raw.llm === "object" && !Array.isArray(raw.llm)
+      ? (raw.llm as Record<string, unknown>)
+      : {};
+  const existing = llm.default;
+  const defaultBlock: Record<string, unknown> =
+    existing != null && typeof existing === "object" && !Array.isArray(existing)
+      ? (existing as Record<string, unknown>)
+      : {};
+  defaultBlock[field] = value;
+  llm.default = defaultBlock;
+  raw.llm = llm;
+}
+
 /**
  * Safely set a nested field on a raw config object's `services` map.
  *
  * Ensures the `services` and service-level objects exist before writing,
  * so callers don't need to guard against undefined intermediate keys.
  *
- * Example: `setServiceField(raw, "inference", "model", "claude-sonnet-4-6")`
- * produces `raw.services.inference.model = "claude-sonnet-4-6"`.
+ * Example: `setServiceField(raw, "inference", "mode", "managed")`
+ * produces `raw.services.inference.mode = "managed"`.
  */
 export function setServiceField(
   raw: Record<string, unknown>,
diff --git a/assistant/src/config/schema.ts b/assistant/src/config/schema.ts
index e69c4714984..fc6e12250dc 100644
--- a/assistant/src/config/schema.ts
+++ b/assistant/src/config/schema.ts
@@ -239,13 +239,6 @@ import {
 import { FilingConfigSchema } from "./schemas/filing.js";
 import { HeartbeatConfigSchema } from "./schemas/heartbeat.js";
 import { HostBrowserConfigSchema } from "./schemas/host-browser.js";
-import {
-  ContextWindowConfigSchema,
-  EffortSchema,
-  ModelPricingOverrideSchema,
-  SpeedSchema,
-  ThinkingConfigSchema,
-} from "./schemas/inference.js";
 import { IngressConfigSchema } from "./schemas/ingress.js";
 import { JournalConfigSchema } from "./schemas/journal.js";
 import { LLMSchema } from "./schemas/llm.js";
@@ -277,18 +270,6 @@ import { WorkspaceGitConfigSchema } from "./schemas/workspace-git.js";
 export const AssistantConfigSchema = z
   .object({
     services: ServicesSchema.default(ServicesSchema.parse({})),
-    maxTokens: z
-      .number({ error: "maxTokens must be a number" })
-      .int("maxTokens must be an integer")
-      .positive("maxTokens must be a positive integer")
-      .default(64000)
-      .describe("Maximum number of output tokens per LLM response"),
-    effort: EffortSchema,
-    speed: SpeedSchema,
-    thinking: ThinkingConfigSchema.default(ThinkingConfigSchema.parse({})),
-    contextWindow: ContextWindowConfigSchema.default(
-      ContextWindowConfigSchema.parse({}),
-    ),
     memory: MemoryConfigSchema.default(MemoryConfigSchema.parse({})),
     dataDir: z
       .string({ error: "dataDir must be a string" })
@@ -306,18 +287,9 @@ export const AssistantConfigSchema = z
     logFile: LogFileConfigSchema.default(
       LogFileConfigSchema.parse({ dir: getDataDir() + "/logs" }),
     ),
-    pricingOverrides: z
-      .array(ModelPricingOverrideSchema)
-      .default([])
-      .describe(
-        "Custom pricing overrides for specific provider/model combinations",
-      ),
-    // Unified LLM configuration block. Defaults mirror the legacy top-level
-    // inference settings (services.inference, maxTokens, effort, speed,
-    // thinking, contextWindow) so existing configs without an `llm` block
-    // continue to behave identically. No callers consume this yet — PRs 5+
-    // migrate call sites to read through the resolver. PR 19 removes the
-    // legacy keys once adoption is complete.
+    // Unified LLM configuration block. The single source of truth for
+    // provider/model/maxTokens/effort/speed/temperature/thinking/contextWindow
+    // and pricing overrides for every call site in the assistant.
     //
     // Default values live on each leaf inside `LLMSchema` (see
     // `schemas/llm.ts`), so `LLMSchema.parse({})` returns a fully-populated
@@ -378,30 +350,29 @@ export const AssistantConfigSchema = z
       ),
   })
   .superRefine((config, ctx) => {
+    const llmContextWindow = config.llm?.default?.contextWindow;
     if (
-      config.contextWindow?.targetBudgetRatio != null &&
-      config.contextWindow?.compactThreshold != null &&
-      config.contextWindow.targetBudgetRatio >=
-        config.contextWindow.compactThreshold
+      llmContextWindow?.targetBudgetRatio != null &&
+      llmContextWindow?.compactThreshold != null &&
+      llmContextWindow.targetBudgetRatio >= llmContextWindow.compactThreshold
     ) {
       ctx.addIssue({
         code: z.ZodIssueCode.custom,
-        path: ["contextWindow", "targetBudgetRatio"],
+        path: ["llm", "default", "contextWindow", "targetBudgetRatio"],
         message:
-          "contextWindow.targetBudgetRatio must be less than contextWindow.compactThreshold",
+          "llm.default.contextWindow.targetBudgetRatio must be less than llm.default.contextWindow.compactThreshold",
       });
     }
     if (
-      config.contextWindow?.targetBudgetRatio != null &&
-      config.contextWindow?.summaryBudgetRatio != null &&
-      config.contextWindow.targetBudgetRatio <=
-        config.contextWindow.summaryBudgetRatio
+      llmContextWindow?.targetBudgetRatio != null &&
+      llmContextWindow?.summaryBudgetRatio != null &&
+      llmContextWindow.targetBudgetRatio <= llmContextWindow.summaryBudgetRatio
     ) {
       ctx.addIssue({
         code: z.ZodIssueCode.custom,
-        path: ["contextWindow", "targetBudgetRatio"],
+        path: ["llm", "default", "contextWindow", "targetBudgetRatio"],
         message:
-          "contextWindow.targetBudgetRatio must be greater than contextWindow.summaryBudgetRatio",
+          "llm.default.contextWindow.targetBudgetRatio must be greater than llm.default.contextWindow.summaryBudgetRatio",
       });
     }
     const segmentation = config.memory?.segmentation;
diff --git a/assistant/src/config/schemas/analysis.ts b/assistant/src/config/schemas/analysis.ts
index c93614b50b5..a9dc28e7a93 100644
--- a/assistant/src/config/schemas/analysis.ts
+++ b/assistant/src/config/schemas/analysis.ts
@@ -24,28 +24,9 @@ export const AnalysisConfigSchema = z
       .describe(
         "Milliseconds of idle time after the last message before the debounced analysis job fires",
       ),
-
-    // Optional model intent for the analysis agent loop. When omitted,
-    // the analysis agent uses the same model as the main agent.
-    // Accepted values match the main agent's model-intent vocabulary.
-    modelIntent: z
-      .enum(["latency-optimized", "quality-optimized", "vision-optimized"], {
-        error: "analysis.modelIntent must be a valid model intent",
-      })
-      .optional()
-      .describe(
-        "Model selection strategy for the analysis agent loop — falls back to the main agent's model when omitted",
-      ),
-
-    // Optional explicit model override (provider/model string). Takes
-    // precedence over modelIntent when both are set.
-    modelOverride: z
-      .string({ error: "analysis.modelOverride must be a string" })
-      .optional()
-      .describe(
-        "Explicit model override (provider/model string) for the analysis agent loop — takes precedence over modelIntent when both are set",
-      ),
   })
-  .describe("Controls the auto-analyze agent loop triggered by conversation activity");
+  .describe(
+    "Controls the auto-analyze agent loop triggered by conversation activity. Model selection lives under llm.callSites.analyzeConversation.",
+  );
 
 export type AnalysisConfig = z.infer<typeof AnalysisConfigSchema>;
diff --git a/assistant/src/config/schemas/calls.ts b/assistant/src/config/schemas/calls.ts
index b24b94459cc..d5c30b0d0d6 100644
--- a/assistant/src/config/schemas/calls.ts
+++ b/assistant/src/config/schemas/calls.ts
@@ -222,10 +222,6 @@ export const CallsConfigSchema = z
     ),
     safety: CallsSafetyConfigSchema.default(CallsSafetyConfigSchema.parse({})),
     voice: CallsVoiceConfigSchema.default(CallsVoiceConfigSchema.parse({})),
-    model: z
-      .string({ error: "calls.model must be a string" })
-      .optional()
-      .describe("Override the default model for phone call conversations"),
     callerIdentity: CallerIdentityConfigSchema.default(
       CallerIdentityConfigSchema.parse({}),
     ),
diff --git a/assistant/src/config/schemas/filing.ts b/assistant/src/config/schemas/filing.ts
index f47516ce022..0ae3a3effaf 100644
--- a/assistant/src/config/schemas/filing.ts
+++ b/assistant/src/config/schemas/filing.ts
@@ -1,7 +1,5 @@
 import { z } from "zod";
 
-import { SpeedSchema } from "./inference.js";
-
 export const FilingConfigSchema = z
   .object({
     enabled: z
@@ -16,9 +14,6 @@ export const FilingConfigSchema = z
       .positive("filing.intervalMs must be a positive integer")
       .default(4 * 3_600_000)
       .describe("Time between filing runs in milliseconds"),
-    speed: SpeedSchema.default("standard").describe(
-      "Inference speed mode for filing conversations",
-    ),
     activeHoursStart: z
       .number({ error: "filing.activeHoursStart must be a number" })
       .int("filing.activeHoursStart must be an integer")
diff --git a/assistant/src/config/schemas/heartbeat.ts b/assistant/src/config/schemas/heartbeat.ts
index 91ac0003c2e..a5a3b3e6040 100644
--- a/assistant/src/config/schemas/heartbeat.ts
+++ b/assistant/src/config/schemas/heartbeat.ts
@@ -1,7 +1,5 @@
 import { z } from "zod";
 
-import { SpeedSchema } from "./inference.js";
-
 export const HeartbeatConfigSchema = z
   .object({
     enabled: z
@@ -14,9 +12,6 @@ export const HeartbeatConfigSchema = z
       .positive("heartbeat.intervalMs must be a positive integer")
       .default(6 * 3_600_000)
       .describe("Time between heartbeat checks in milliseconds"),
-    speed: SpeedSchema.default("standard").describe(
-      "Inference speed mode for heartbeat conversations — defaults to standard to avoid inheriting the global fast mode multiplier",
-    ),
     activeHoursStart: z
       .number({ error: "heartbeat.activeHoursStart must be a number" })
       .int("heartbeat.activeHoursStart must be an integer")
diff --git a/assistant/src/config/schemas/memory-processing.ts b/assistant/src/config/schemas/memory-processing.ts
index 6f9e4376089..575454e90f8 100644
--- a/assistant/src/config/schemas/memory-processing.ts
+++ b/assistant/src/config/schemas/memory-processing.ts
@@ -43,17 +43,9 @@ export const MemorySummarizationConfigSchema = z
       .describe(
         "Whether to use an LLM for summarizing and consolidating memory items",
       ),
-    modelIntent: z
-      .enum(["latency-optimized", "quality-optimized", "vision-optimized"], {
-        error: "memory.summarization.modelIntent must be a valid model intent",
-      })
-      .default("quality-optimized")
-      .describe(
-        "Model selection strategy for summarization — trade off speed vs quality",
-      ),
   })
   .describe(
-    "Controls how memory items are summarized and consolidated over time",
+    "Controls how memory items are summarized and consolidated over time. Model selection lives under llm.callSites.conversationSummarization.",
   );
 
 export type MemoryExtractionConfig = z.infer<
diff --git a/assistant/src/config/schemas/notifications.ts b/assistant/src/config/schemas/notifications.ts
index 718b4a9e0db..1a119d95257 100644
--- a/assistant/src/config/schemas/notifications.ts
+++ b/assistant/src/config/schemas/notifications.ts
@@ -1,16 +1,9 @@
 import { z } from "zod";
 
 export const NotificationsConfigSchema = z
-  .object({
-    decisionModelIntent: z
-      .enum(["latency-optimized", "quality-optimized", "vision-optimized"], {
-        error: "notifications.decisionModelIntent must be a valid model intent",
-      })
-      .default("latency-optimized")
-      .describe(
-        "Model selection strategy for deciding whether to send a notification",
-      ),
-  })
-  .describe("Notification delivery configuration");
+  .object({})
+  .describe(
+    "Notification delivery configuration. Model selection lives under llm.callSites.notificationDecision and llm.callSites.preferenceExtraction.",
+  );
 
 export type NotificationsConfig = z.infer<typeof NotificationsConfigSchema>;
diff --git a/assistant/src/config/schemas/platform.ts b/assistant/src/config/schemas/platform.ts
index eb92c3efc46..8221fddce2c 100644
--- a/assistant/src/config/schemas/platform.ts
+++ b/assistant/src/config/schemas/platform.ts
@@ -56,16 +56,10 @@ export const UiConfigSchema = z
       .describe(
         "IANA timezone identifier for displaying dates and times (e.g. 'America/New_York')",
       ),
-    greetingModelIntent: z
-      .enum(["latency-optimized", "quality-optimized"], {
-        error: "ui.greetingModelIntent must be 'latency-optimized' or 'quality-optimized'",
-      })
-      .default("latency-optimized")
-      .describe(
-        "Model intent for empty-state greeting generation (latency-optimized = fast/small model, quality-optimized = primary model)",
-      ),
   })
-  .describe("User interface display settings");
+  .describe(
+    "User interface display settings. Empty-state greeting model selection lives under llm.callSites.emptyStateGreeting.",
+  );
 
 export type DaemonConfig = z.infer<typeof DaemonConfigSchema>;
 export type UiConfig = z.infer<typeof UiConfigSchema>;
diff --git a/assistant/src/config/schemas/services.ts b/assistant/src/config/schemas/services.ts
index 2457a566d92..7a3812f93b5 100644
--- a/assistant/src/config/schemas/services.ts
+++ b/assistant/src/config/schemas/services.ts
@@ -28,10 +28,15 @@ export const BaseServiceSchema = z.object({
 });
 export type BaseService = z.infer<typeof BaseServiceSchema>;
 
-export const InferenceServiceSchema = BaseServiceSchema.extend({
-  provider: z.enum(VALID_INFERENCE_PROVIDERS).default("anthropic"),
-  model: z.string().default("claude-opus-4-6"),
-});
+/**
+ * Inference service entry. Carries only the routing `mode`
+ * (`managed` vs `your-own`) — the provider and model live under
+ * `llm.default.{provider, model}` (see `schemas/llm.ts`). PR 19 of the
+ * unify-llm-callsites plan removed the `provider` and `model` fields here;
+ * legacy configs that still carry them have those keys stripped by
+ * workspace migration `039-drop-legacy-llm-keys`.
+ */
+export const InferenceServiceSchema = BaseServiceSchema;
 export type InferenceService = z.infer<typeof InferenceServiceSchema>;
 
 export const ImageGenerationServiceSchema = BaseServiceSchema.extend({
diff --git a/assistant/src/config/schemas/workspace-git.ts b/assistant/src/config/schemas/workspace-git.ts
index 59e383fe701..516c834dbaf 100644
--- a/assistant/src/config/schemas/workspace-git.ts
+++ b/assistant/src/config/schemas/workspace-git.ts
@@ -74,21 +74,6 @@ export const WorkspaceGitConfigSchema = z
           })
           .default(false)
           .describe("Whether to use an LLM to generate commit messages"),
-        useConfiguredProvider: z
-          .boolean({
-            error:
-              "workspaceGit.commitMessageLLM.useConfiguredProvider must be a boolean",
-          })
-          .default(true)
-          .describe(
-            "Whether to use the globally configured LLM provider for commit messages",
-          ),
-        providerFastModelOverrides: z
-          .record(z.string(), z.string())
-          .default({} as Record<string, string>)
-          .describe(
-            "Map of provider names to fast model overrides for commit message generation",
-          ),
         timeoutMs: z
           .number({
             error: "workspaceGit.commitMessageLLM.timeoutMs must be a number",
@@ -99,26 +84,6 @@ export const WorkspaceGitConfigSchema = z
           )
           .default(600)
           .describe("Timeout for LLM commit message generation (ms)"),
-        maxTokens: z
-          .number({
-            error: "workspaceGit.commitMessageLLM.maxTokens must be a number",
-          })
-          .int("workspaceGit.commitMessageLLM.maxTokens must be an integer")
-          .positive(
-            "workspaceGit.commitMessageLLM.maxTokens must be a positive integer",
-          )
-          .default(120)
-          .describe("Maximum number of tokens in the generated commit message"),
-        temperature: z
-          .number({
-            error: "workspaceGit.commitMessageLLM.temperature must be a number",
-          })
-          .min(0, "workspaceGit.commitMessageLLM.temperature must be >= 0")
-          .max(2, "workspaceGit.commitMessageLLM.temperature must be <= 2")
-          .default(0.2)
-          .describe(
-            "LLM sampling temperature for commit message generation (lower = more deterministic)",
-          ),
         maxFilesInPrompt: z
           .number({
             error:
@@ -203,11 +168,7 @@ export const WorkspaceGitConfigSchema = z
       })
       .default({
         enabled: false,
-        useConfiguredProvider: true,
-        providerFastModelOverrides: {},
         timeoutMs: 600,
-        maxTokens: 120,
-        temperature: 0.2,
         maxFilesInPrompt: 30,
         maxDiffBytes: 12000,
         minRemainingTurnBudgetMs: 1000,
@@ -217,7 +178,9 @@ export const WorkspaceGitConfigSchema = z
           backoffMaxMs: 60000,
         },
       })
-      .describe("LLM-powered commit message generation settings"),
+      .describe(
+        "LLM-powered commit message generation operational settings. Provider/model/maxTokens/temperature live under llm.callSites.commitMessage.",
+      ),
   })
   .describe(
     "Workspace git integration — auto-commits, enrichment, and LLM-generated commit messages",
diff --git a/assistant/src/config/skills.ts b/assistant/src/config/skills.ts
index 201ecc21d88..3f0840b3505 100644
--- a/assistant/src/config/skills.ts
+++ b/assistant/src/config/skills.ts
@@ -1148,7 +1148,7 @@ async function generateSkillIcon(
   name: string,
   description: string,
 ): Promise<string> {
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("skillCategoryInference");
   if (!provider) {
     throw new Error("Configured provider unavailable for icon generation");
   }
@@ -1163,7 +1163,7 @@ async function generateSkillIcon(
     'You are a pixel art icon designer. When asked, return ONLY a single <svg> element — no explanation, no markdown, no code fences. The SVG must be a 16x16 grid pixel art icon using <rect> elements. Use a limited palette (3-5 colors). Keep it under 2KB. The viewBox should be "0 0 16 16" with each pixel being a 1x1 rect.',
     {
       config: {
-        modelIntent: "latency-optimized",
+        callSite: "skillCategoryInference",
         max_tokens: 1024,
       },
     },
diff --git a/assistant/src/daemon/approval-generators.ts b/assistant/src/daemon/approval-generators.ts
index f6f51ca7cca..95ee00601db 100644
--- a/assistant/src/daemon/approval-generators.ts
+++ b/assistant/src/daemon/approval-generators.ts
@@ -91,7 +91,7 @@ export function createApprovalCopyGenerator(): ApprovalCopyGenerator {
     const config = loadConfig();
     let provider;
     try {
-      provider = getProvider(config.services.inference.provider);
+      provider = getProvider(config.llm.default.provider);
     } catch {
       return null;
     }
@@ -142,10 +142,10 @@ export function createApprovalCopyGenerator(): ApprovalCopyGenerator {
 export function createApprovalConversationGenerator(): ApprovalConversationGenerator {
   return async (context) => {
     const config = loadConfig();
-    if (!listProviders().includes(config.services.inference.provider)) {
+    if (!listProviders().includes(config.llm.default.provider)) {
       throw new Error("No provider available for approval conversation");
     }
-    const provider = getProvider(config.services.inference.provider);
+    const provider = getProvider(config.llm.default.provider);
 
     const pendingDescription = context.pendingApprovals
       .map((p) => `- Request ${p.requestId}: tool "${p.toolName}"`)
diff --git a/assistant/src/daemon/classifier.ts b/assistant/src/daemon/classifier.ts
index 69bd455303d..a476fea2e28 100644
--- a/assistant/src/daemon/classifier.ts
+++ b/assistant/src/daemon/classifier.ts
@@ -28,7 +28,7 @@ export async function classifyInteraction(
     return "text_qa";
   }
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("interactionClassifier");
   if (!provider) {
     log.warn(
       "No configured provider available, falling back to heuristic classification",
diff --git a/assistant/src/daemon/conversation-agent-loop.ts b/assistant/src/daemon/conversation-agent-loop.ts
index 2a4d8b143d9..e1cbc13ed64 100644
--- a/assistant/src/daemon/conversation-agent-loop.ts
+++ b/assistant/src/daemon/conversation-agent-loop.ts
@@ -907,8 +907,8 @@ export async function runAgentLoopImpl(
     // and proactively invoke the reducer if already above budget. This avoids
     // a wasted provider round-trip that would just fail with context_too_large.
     const config = getConfig();
-    const overflowRecovery = config.contextWindow.overflowRecovery;
-    const providerMaxTokens = config.contextWindow.maxInputTokens;
+    const overflowRecovery = config.llm.default.contextWindow.overflowRecovery;
+    const providerMaxTokens = config.llm.default.contextWindow.maxInputTokens;
     // Widen safety margin for large conversations where estimation error
     // compounds across many messages with tool results.
     const baseSafetyMargin = overflowRecovery.safetyMarginRatio;
@@ -954,7 +954,7 @@ export async function runAgentLoopImpl(
           {
             providerName: ctx.provider.name,
             systemPrompt: ctx.systemPrompt,
-            contextWindow: config.contextWindow,
+            contextWindow: config.llm.default.contextWindow,
             targetTokens: preflightBudget,
             toolTokenBudget,
           },
@@ -1388,7 +1388,7 @@ export async function runAgentLoopImpl(
           {
             providerName: ctx.provider.name,
             systemPrompt: ctx.systemPrompt,
-            contextWindow: config.contextWindow,
+            contextWindow: config.llm.default.contextWindow,
             targetTokens: correctedTarget,
             toolTokenBudget,
           },
@@ -1899,7 +1899,7 @@ export async function runAgentLoopImpl(
       state.exchangeLlmCallCount,
       {
         tokens: state.lastCallInputTokens,
-        maxTokens: config.contextWindow.maxInputTokens,
+        maxTokens: config.llm.default.contextWindow.maxInputTokens,
       },
     );
 
diff --git a/assistant/src/daemon/conversation-process.ts b/assistant/src/daemon/conversation-process.ts
index caad7648796..886b6fa770d 100644
--- a/assistant/src/daemon/conversation-process.ts
+++ b/assistant/src/daemon/conversation-process.ts
@@ -261,9 +261,9 @@ function buildSlashContext(
     messageCount: conversation.messages.length,
     inputTokens: conversation.usageStats.inputTokens,
     outputTokens: conversation.usageStats.outputTokens,
-    maxInputTokens: config.contextWindow.maxInputTokens,
-    model: config.services.inference.model,
-    provider: config.services.inference.provider,
+    maxInputTokens: config.llm.default.contextWindow.maxInputTokens,
+    model: config.llm.default.model,
+    provider: config.llm.default.provider,
     estimatedCost: conversation.usageStats.estimatedCost,
     userMessageInterface: turnInterface?.userMessageInterface,
   };
diff --git a/assistant/src/daemon/conversation-slash.ts b/assistant/src/daemon/conversation-slash.ts
index dd5e78bb41e..121898a6635 100644
--- a/assistant/src/daemon/conversation-slash.ts
+++ b/assistant/src/daemon/conversation-slash.ts
@@ -78,8 +78,8 @@ async function resolveModelList(): Promise<SlashResolution> {
     lines.push(`**${providerName}** ${status}`);
     for (const { id, displayName } of models) {
       const isCurrent =
-        config.services.inference.provider === provider &&
-        config.services.inference.model === id;
+        config.llm.default.provider === provider &&
+        config.llm.default.model === id;
       const current = isCurrent ? " **[current]**" : "";
       lines.push(`  - ${displayName} (\`${id}\`)${current}`);
     }
diff --git a/assistant/src/daemon/conversation-usage.ts b/assistant/src/daemon/conversation-usage.ts
index b37bfc8be79..cda6def5e34 100644
--- a/assistant/src/daemon/conversation-usage.ts
+++ b/assistant/src/daemon/conversation-usage.ts
@@ -109,7 +109,7 @@ function resolveStructuredPricing(
       providerName,
       model,
       usage,
-      config.pricingOverrides,
+      config.llm.pricingOverrides,
     );
   } catch (err) {
     log.warn({ err, model, providerName }, "Failed to resolve usage pricing");
diff --git a/assistant/src/daemon/conversation.ts b/assistant/src/daemon/conversation.ts
index 1a03daca8c2..1c3fa9af48e 100644
--- a/assistant/src/daemon/conversation.ts
+++ b/assistant/src/daemon/conversation.ts
@@ -63,8 +63,7 @@ import {
 } from "../permissions/v2-consent-policy.js";
 import { resolvePersonaContext } from "../prompts/persona-resolver.js";
 import { buildSystemPrompt } from "../prompts/system-prompt.js";
-import { resolveModelIntent } from "../providers/model-intents.js";
-import type { Message, ModelIntent } from "../providers/types.js";
+import type { Message } from "../providers/types.js";
 import type { Provider } from "../providers/types.js";
 import type { TrustClass } from "../runtime/actor-trust-resolver.js";
 import type { AuthContext } from "../runtime/auth/types.js";
@@ -339,7 +338,6 @@ export class Conversation {
     sharedCesClient?: CesClient,
     speedOverride?: Speed,
     cacheTtl?: "5m" | "1h",
-    modelIntent?: ModelIntent,
     modelOverride?: string,
   ) {
     this.conversationId = conversationId;
@@ -425,7 +423,7 @@ export class Conversation {
     );
 
     const config = getConfig();
-    this.streamThinking = config.thinking.streamThinking ?? false;
+    this.streamThinking = config.llm.default.thinking.streamThinking ?? false;
 
     // CES (Credential Execution Service) — use the shared server-level client.
     // The CES sidecar accepts exactly one bootstrap connection, so the
@@ -442,13 +440,10 @@ export class Conversation {
     const hasSystemPromptOverride = systemPrompt !== buildSystemPrompt();
     this.hasSystemPromptOverride = hasSystemPromptOverride;
 
-    // If an explicit modelOverride is supplied, use it verbatim. Otherwise,
-    // if modelIntent is set, resolve it against the active provider's
-    // intent → model mapping. The AgentLoop passes the resulting string
-    // through to `providerConfig.model` on every turn.
-    const resolvedModel: string | undefined =
-      modelOverride ??
-      (modelIntent ? resolveModelIntent(provider.name, modelIntent) : undefined);
+    // If an explicit modelOverride is supplied, use it verbatim. Otherwise
+    // leave the model unset and let `RetryProvider`'s call-site resolver pick
+    // it up from `llm.default` / `llm.callSites.<id>` on every turn.
+    const resolvedModel: string | undefined = modelOverride;
 
     const resolveSystemPromptCallback = (
       _history: import("../providers/types.js").Message[],
@@ -478,16 +473,17 @@ export class Conversation {
     };
 
     const fastModeEnabled = isAssistantFeatureFlagEnabled("fast-mode", config);
-    const resolvedSpeed = speedOverride ?? config.speed;
+    const resolvedSpeed = speedOverride ?? config.llm.default.speed;
+    const llmDefault = config.llm.default;
 
     this.agentLoop = new AgentLoop(
       provider,
       systemPrompt,
       {
         maxTokens,
-        maxInputTokens: config.contextWindow.maxInputTokens,
-        thinking: config.thinking,
-        effort: config.effort,
+        maxInputTokens: llmDefault.contextWindow.maxInputTokens,
+        thinking: llmDefault.thinking,
+        effort: llmDefault.effort,
         ...(fastModeEnabled && resolvedSpeed === "fast"
           ? { speed: resolvedSpeed }
           : {}),
@@ -501,7 +497,7 @@ export class Conversation {
     this.contextWindowManager = new ContextWindowManager({
       provider,
       systemPrompt: () => resolveSystemPromptCallback([]).systemPrompt,
-      config: config.contextWindow,
+      config: llmDefault.contextWindow,
       toolTokenBudget: this.agentLoop.getToolTokenBudget(),
     });
 
diff --git a/assistant/src/daemon/guardian-action-generators.ts b/assistant/src/daemon/guardian-action-generators.ts
index 474e8b20aa0..cb5b33ccd0a 100644
--- a/assistant/src/daemon/guardian-action-generators.ts
+++ b/assistant/src/daemon/guardian-action-generators.ts
@@ -18,8 +18,8 @@ import type {
 /**
  * Create the daemon-owned guardian action copy generator that resolves
  * providers and calls `provider.sendMessage` to generate guardian action
- * copy text. Uses `latency-optimized` model intent since these are
- * time-sensitive voice responses.
+ * copy text. Uses the `guardianQuestionCopy` call site so model selection
+ * tracks the unified `llm.callSites` configuration.
  *
  * This keeps all provider awareness in the daemon lifecycle, away from
  * the runtime composer.
@@ -29,7 +29,7 @@ export function createGuardianActionCopyGenerator(): GuardianActionCopyGenerator
     const config = loadConfig();
     let provider;
     try {
-      provider = getProvider(config.services.inference.provider);
+      provider = getProvider(config.llm.default.provider);
     } catch {
       return null;
     }
@@ -52,7 +52,7 @@ export function createGuardianActionCopyGenerator(): GuardianActionCopyGenerator
       {
         config: {
           max_tokens: options.maxTokens ?? GUARDIAN_ACTION_COPY_MAX_TOKENS,
-          modelIntent: "latency-optimized",
+          callSite: "guardianQuestionCopy",
         },
         signal: AbortSignal.timeout(
           options.timeoutMs ?? GUARDIAN_ACTION_COPY_TIMEOUT_MS,
@@ -131,7 +131,7 @@ const VALID_FOLLOWUP_DISPOSITIONS: ReadonlySet<string> = new Set([
 export function createGuardianFollowUpConversationGenerator(): GuardianFollowUpConversationGenerator {
   return async (context) => {
     const config = loadConfig();
-    const provider = getProvider(config.services.inference.provider);
+    const provider = getProvider(config.llm.default.provider);
 
     const userPrompt = [
       `Original question from the voice call: "${context.questionText}"`,
@@ -146,7 +146,7 @@ export function createGuardianFollowUpConversationGenerator(): GuardianFollowUpC
       {
         config: {
           max_tokens: FOLLOWUP_CONVERSATION_MAX_TOKENS,
-          modelIntent: "latency-optimized",
+          callSite: "guardianQuestionCopy",
         },
         signal: AbortSignal.timeout(FOLLOWUP_CONVERSATION_TIMEOUT_MS),
       },
diff --git a/assistant/src/daemon/handlers/config-model.ts b/assistant/src/daemon/handlers/config-model.ts
index 5cdf5be6e4c..1aa8cdeab24 100644
--- a/assistant/src/daemon/handlers/config-model.ts
+++ b/assistant/src/daemon/handlers/config-model.ts
@@ -3,7 +3,10 @@ import {
   loadRawConfig,
   saveRawConfig,
 } from "../../config/loader.js";
-import { setServiceField } from "../../config/raw-config-utils.js";
+import {
+  setLlmDefaultField,
+  setServiceField,
+} from "../../config/raw-config-utils.js";
 import { VALID_INFERENCE_PROVIDERS } from "../../config/schemas/services.js";
 import type { ProviderCatalogEntry } from "../../providers/model-catalog.js";
 import {
@@ -48,10 +51,10 @@ export interface ModelInfo {
 /** Return current model configuration. */
 export async function getModelInfo(): Promise<ModelInfo> {
   const config = getConfig();
-  const provider = config.services.inference.provider;
+  const provider = config.llm.default.provider;
 
   return {
-    model: config.services.inference.model,
+    model: config.llm.default.model,
     provider,
     configuredProviders: await getConfiguredProviders(),
     availableModels: PROVIDER_CATALOG.find((p) => p.id === provider)?.models,
@@ -102,12 +105,12 @@ export async function setModel(
   const resolvedProvider =
     explicitProvider ??
     MODEL_TO_PROVIDER[modelId] ??
-    current.services.inference.provider;
+    current.llm.default.provider;
 
   // Auto-reset model when provider changes and current modelId doesn't
   // belong to the new provider's catalog.
   if (
-    resolvedProvider !== current.services.inference.provider &&
+    resolvedProvider !== current.llm.default.provider &&
     !isModelInCatalog(resolvedProvider, modelId)
   ) {
     modelId = getProviderDefaultModel(resolvedProvider);
@@ -115,8 +118,8 @@ export async function setModel(
 
   // No-op guard: skip expensive reinitialization when nothing changed
   if (
-    modelId === current.services.inference.model &&
-    resolvedProvider === current.services.inference.provider
+    modelId === current.llm.default.model &&
+    resolvedProvider === current.llm.default.provider
   ) {
     return await getModelInfo();
   }
@@ -129,8 +132,8 @@ export async function setModel(
 
   // Use raw config to avoid persisting env-var API keys to disk
   const raw = loadRawConfig();
-  setServiceField(raw, "inference", "model", modelId);
-  setServiceField(raw, "inference", "provider", resolvedProvider);
+  setLlmDefaultField(raw, "model", modelId);
+  setLlmDefaultField(raw, "provider", resolvedProvider);
 
   // Suppress the file watcher callback — setModel already does
   // the full reload sequence; a redundant watcher-triggered reload
diff --git a/assistant/src/daemon/handlers/conversations.ts b/assistant/src/daemon/handlers/conversations.ts
index 35f754b3b52..af138b7aab4 100644
--- a/assistant/src/daemon/handlers/conversations.ts
+++ b/assistant/src/daemon/handlers/conversations.ts
@@ -435,7 +435,7 @@ export function handleUsageRequest(
     totalInputTokens: conversation.totalInputTokens,
     totalOutputTokens: conversation.totalOutputTokens,
     estimatedCost: conversation.totalEstimatedCost,
-    model: config.services.inference.model,
+    model: config.llm.default.model,
   });
 }
 
diff --git a/assistant/src/daemon/handlers/shared.ts b/assistant/src/daemon/handlers/shared.ts
index 5212d13838f..3ce19037d57 100644
--- a/assistant/src/daemon/handlers/shared.ts
+++ b/assistant/src/daemon/handlers/shared.ts
@@ -5,7 +5,6 @@ import type { Speed } from "../../config/schemas/inference.js";
 import type { LLMCallSite } from "../../config/schemas/llm.js";
 import type { HeartbeatService } from "../../heartbeat/heartbeat-service.js";
 import type { SecretPromptResult } from "../../permissions/secret-prompter.js";
-import type { ModelIntent } from "../../providers/types.js";
 import type { AuthContext } from "../../runtime/auth/types.js";
 import type { DebouncerMap } from "../../util/debounce.js";
 import { getLogger } from "../../util/logger.js";
@@ -129,24 +128,17 @@ export interface ConversationCreateOptions {
   commandIntent?: { type: string; payload?: string; languageCode?: string };
   /** Optional callback to receive real-time agent loop events (text deltas, tool starts, etc.). */
   onEvent?: (msg: ServerMessage) => void;
-  /**
-   * Optional model selection strategy for this conversation's agent loop.
-   * When set, overrides the provider's default model per-turn. Used by the
-   * auto-analyze loop to route the analysis agent to a dedicated model.
-   */
-  modelIntent?: ModelIntent;
   /**
    * Optional explicit model override (provider/model string) for this
-   * conversation's agent loop. Takes precedence over `modelIntent` when
-   * both are set. Used by the auto-analyze loop to pin the analysis agent
-   * to a specific model.
+   * conversation's agent loop. Used by the auto-analyze loop to pin the
+   * analysis agent to a specific model.
    */
   modelOverride?: string;
   /**
    * Optional LLM call-site identifier threaded through to the per-call
    * provider config. Adapter callers (heartbeat, filing, schedule, etc.)
-   * pass their call-site here so PRs 7-11 can route those flows through
-   * `resolveCallSiteConfig` instead of the legacy `speed`/`modelIntent` paths.
+   * pass their call-site here so the agent loop routes through
+   * `resolveCallSiteConfig` instead of the global default.
    */
   callSite?: LLMCallSite;
 }
diff --git a/assistant/src/daemon/handlers/skills.ts b/assistant/src/daemon/handlers/skills.ts
index 0463ac213ad..8abf0227cc3 100644
--- a/assistant/src/daemon/handlers/skills.ts
+++ b/assistant/src/daemon/handlers/skills.ts
@@ -1517,7 +1517,7 @@ export async function draftSkill(
     if (missing.length > 0) {
       let llmGenerated = false;
       try {
-        const provider = await getConfiguredProvider();
+        const provider = await getConfiguredProvider("skillCategoryInference");
         if (provider) {
           const { signal, cleanup } = createTimeout(LLM_DRAFT_TIMEOUT_MS);
           try {
diff --git a/assistant/src/daemon/server.ts b/assistant/src/daemon/server.ts
index cd4d15d1b59..54f8d56538a 100644
--- a/assistant/src/daemon/server.ts
+++ b/assistant/src/daemon/server.ts
@@ -1033,7 +1033,7 @@ export class DaemonServer {
 
       const createPromise = (async () => {
         const config = getConfig();
-        let provider = getProvider(config.services.inference.provider);
+        let provider = getProvider(config.llm.default.provider);
         const { rateLimit } = config;
         if (rateLimit.maxRequestsPerMinute > 0) {
           provider = new RateLimitProvider(
@@ -1046,7 +1046,8 @@ export class DaemonServer {
 
         const systemPrompt =
           storedOptions?.systemPromptOverride ?? buildSystemPrompt();
-        const maxTokens = storedOptions?.maxResponseTokens ?? config.maxTokens;
+        const maxTokens =
+          storedOptions?.maxResponseTokens ?? config.llm.default.maxTokens;
 
         const memoryPolicy = this.deriveMemoryPolicy(conversationId);
         // Resolve the shared CES client (may still be initializing).
@@ -1065,7 +1066,6 @@ export class DaemonServer {
           sharedCesClient,
           storedOptions?.speed,
           undefined,
-          storedOptions?.modelIntent,
           storedOptions?.modelOverride,
         );
         newConversation.updateClient(sendToClient, true);
@@ -1438,9 +1438,9 @@ export class DaemonServer {
       messageCount: conversation.getMessages().length,
       inputTokens: conversation.usageStats.inputTokens,
       outputTokens: conversation.usageStats.outputTokens,
-      maxInputTokens: config.contextWindow.maxInputTokens,
-      model: config.services.inference.model,
-      provider: config.services.inference.provider,
+      maxInputTokens: config.llm.default.contextWindow.maxInputTokens,
+      model: config.llm.default.model,
+      provider: config.llm.default.provider,
       estimatedCost: conversation.usageStats.estimatedCost,
       userMessageInterface: serverInterfaceCtx?.userMessageInterface,
     };
diff --git a/assistant/src/daemon/watch-handler.ts b/assistant/src/daemon/watch-handler.ts
index 1f5478db494..74ff29f6274 100644
--- a/assistant/src/daemon/watch-handler.ts
+++ b/assistant/src/daemon/watch-handler.ts
@@ -113,7 +113,7 @@ export async function handleWatchObservation(
 
 async function generateCommentary(session: WatchSession): Promise<void> {
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("watchCommentary");
     if (!provider) {
       log.warn(
         { watchId: session.watchId },
@@ -225,7 +225,7 @@ export async function generateSummary(session: WatchSession): Promise<void> {
       },
       "generateSummary starting — calling LLM",
     );
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("watchSummary");
     if (!provider) {
       log.warn(
         { watchId: session.watchId },
diff --git a/assistant/src/home/rollup-producer.ts b/assistant/src/home/rollup-producer.ts
index ca5e790c239..58ad0b6df86 100644
--- a/assistant/src/home/rollup-producer.ts
+++ b/assistant/src/home/rollup-producer.ts
@@ -261,10 +261,10 @@ export async function runRollupProducer(
 
 function resolveDefaultProvider(): ReturnType<typeof getProvider> | null {
   const config = loadConfig();
-  if (!listProviders().includes(config.services.inference.provider)) {
+  if (!listProviders().includes(config.llm.default.provider)) {
     return null;
   }
-  return getProvider(config.services.inference.provider);
+  return getProvider(config.llm.default.provider);
 }
 
 /**
diff --git a/assistant/src/memory/conversation-title-service.ts b/assistant/src/memory/conversation-title-service.ts
index 4f5bc700761..6a1b19ed565 100644
--- a/assistant/src/memory/conversation-title-service.ts
+++ b/assistant/src/memory/conversation-title-service.ts
@@ -118,7 +118,8 @@ export async function generateAndPersistConversationTitle(
     return { title: conversation.title!, updated: false };
   }
 
-  const provider = params.provider ?? (await getConfiguredProvider());
+  const provider =
+    params.provider ?? (await getConfiguredProvider("conversationTitle"));
   if (!provider) {
     // No provider available — fall back to context-derived title or untitled
     const fallback = deriveFallbackTitle(context) ?? UNTITLED_FALLBACK;
@@ -219,7 +220,8 @@ export async function regenerateConversationTitle(
     return { title: conversation?.title ?? UNTITLED_FALLBACK, updated: false };
   }
 
-  const provider = params.provider ?? (await getConfiguredProvider());
+  const provider =
+    params.provider ?? (await getConfiguredProvider("conversationTitle"));
   if (!provider) {
     return { title: conversation.title ?? UNTITLED_FALLBACK, updated: false };
   }
diff --git a/assistant/src/memory/embedding-backend.ts b/assistant/src/memory/embedding-backend.ts
index 2815a48b460..4b4a1f01bba 100644
--- a/assistant/src/memory/embedding-backend.ts
+++ b/assistant/src/memory/embedding-backend.ts
@@ -795,7 +795,7 @@ export async function selectedBackendSupportsMultimodal(
 
 async function isOllamaConfigured(config: AssistantConfig): Promise<boolean> {
   return (
-    config.services.inference.provider === "ollama" ||
+    config.llm.default.provider === "ollama" ||
     Boolean(await getProviderKeyAsync("ollama")) ||
     Boolean(getOllamaBaseUrlEnv())
   );
diff --git a/assistant/src/memory/graph/consolidation.ts b/assistant/src/memory/graph/consolidation.ts
index 6ff0226f720..cfa490f87a8 100644
--- a/assistant/src/memory/graph/consolidation.ts
+++ b/assistant/src/memory/graph/consolidation.ts
@@ -262,7 +262,7 @@ async function identifyDuplicateGroups(
 ): Promise<MemoryNode[][]> {
   if (nodes.length < 2) return [];
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("memoryConsolidation");
   if (!provider) return [];
 
   // Compact listing: ID + first 100 chars of content
@@ -428,7 +428,7 @@ async function consolidateChunk(
     return true;
   });
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("memoryConsolidation");
   if (!provider) {
     throw new BackendUnavailableError("Provider unavailable for consolidation");
   }
diff --git a/assistant/src/memory/graph/extraction.ts b/assistant/src/memory/graph/extraction.ts
index 8b37ee4500c..108e3266958 100644
--- a/assistant/src/memory/graph/extraction.ts
+++ b/assistant/src/memory/graph/extraction.ts
@@ -844,7 +844,7 @@ export async function runGraphExtraction(
   }
 
   // 2. Get provider
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("memoryExtraction");
   if (!provider) {
     throw new BackendUnavailableError(
       "Provider unavailable for graph extraction",
diff --git a/assistant/src/memory/graph/narrative.ts b/assistant/src/memory/graph/narrative.ts
index 7fccfd583ef..6bcce2f2e98 100644
--- a/assistant/src/memory/graph/narrative.ts
+++ b/assistant/src/memory/graph/narrative.ts
@@ -165,7 +165,7 @@ export async function runNarrativeRefinement(
     .sort((a, b) => b.significance - a.significance)
     .slice(0, 150);
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("narrativeRefinement");
   if (!provider) {
     throw new BackendUnavailableError(
       "Provider unavailable for narrative refinement",
diff --git a/assistant/src/memory/graph/pattern-scan.ts b/assistant/src/memory/graph/pattern-scan.ts
index d2f71a470a4..0aafee7cc2f 100644
--- a/assistant/src/memory/graph/pattern-scan.ts
+++ b/assistant/src/memory/graph/pattern-scan.ts
@@ -141,7 +141,7 @@ export async function runPatternScan(
     return result;
   }
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("patternScan");
   if (!provider) {
     throw new BackendUnavailableError("Provider unavailable for pattern scan");
   }
diff --git a/assistant/src/memory/graph/retriever.ts b/assistant/src/memory/graph/retriever.ts
index d15818e7cd0..73d1a557dbb 100644
--- a/assistant/src/memory/graph/retriever.ts
+++ b/assistant/src/memory/graph/retriever.ts
@@ -81,7 +81,7 @@ async function rerankAndDedup(
   if (candidates.length <= maxNodes) return candidates;
 
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("memoryRetrieval");
     if (!provider) return candidates.slice(0, maxNodes);
 
     // Numbered listing for the LLM: index + age + full content
@@ -180,7 +180,7 @@ async function dedupForTurn(
   query: string,
 ): Promise<{ nodes: ScoredNode[]; llmApplied: boolean }> {
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("memoryRetrieval");
     if (!provider)
       return { nodes: candidates.slice(0, maxNodes), llmApplied: false };
 
@@ -273,7 +273,7 @@ async function dedupCrossCategory(
   maxNodes: number,
 ): Promise<ScoredNode[]> {
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("memoryRetrieval");
     if (!provider) return candidates.slice(0, maxNodes);
 
     const now = Date.now();
diff --git a/assistant/src/memory/job-handlers/conversation-starters.ts b/assistant/src/memory/job-handlers/conversation-starters.ts
index 50d7e9586c2..f066ae5acc3 100644
--- a/assistant/src/memory/job-handlers/conversation-starters.ts
+++ b/assistant/src/memory/job-handlers/conversation-starters.ts
@@ -174,7 +174,7 @@ interface GeneratedStarter {
 }
 
 async function generateStarters(scopeId: string): Promise<GeneratedStarter[]> {
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("conversationStarters");
   if (!provider) {
     log.info("No configured provider for conversation starters generation");
     return [];
diff --git a/assistant/src/memory/job-handlers/summarization.ts b/assistant/src/memory/job-handlers/summarization.ts
index fa4cd232c36..5af05dd9b03 100644
--- a/assistant/src/memory/job-handlers/summarization.ts
+++ b/assistant/src/memory/job-handlers/summarization.ts
@@ -161,7 +161,7 @@ async function summarizeWithLLM(
     return buildFallbackSummary(existingSummary, newContent, label);
   }
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("conversationSummarization");
   if (!provider) {
     log.debug(
       { label },
diff --git a/assistant/src/memory/migrations/140-backfill-usage-cache-accounting.ts b/assistant/src/memory/migrations/140-backfill-usage-cache-accounting.ts
index b1b98ba3d56..cc2321cf6e7 100644
--- a/assistant/src/memory/migrations/140-backfill-usage-cache-accounting.ts
+++ b/assistant/src/memory/migrations/140-backfill-usage-cache-accounting.ts
@@ -196,7 +196,7 @@ export function migrateBackfillUsageCacheAccounting(database: DrizzleDb): void {
     const requestLogsByConversation = buildRequestLogMap(requestLogRows);
     const requestOffsets = new Map<string, number>();
     const previousUsageEventCreatedAt = new Map<string, number>();
-    const pricingOverrides = getConfig().pricingOverrides;
+    const pricingOverrides = getConfig().llm.pricingOverrides;
 
     let scannedAnthropicRows = 0;
     let updatedRows = 0;
diff --git a/assistant/src/messaging/style-analyzer.ts b/assistant/src/messaging/style-analyzer.ts
index 0cd3fb738db..3e2451b7bab 100644
--- a/assistant/src/messaging/style-analyzer.ts
+++ b/assistant/src/messaging/style-analyzer.ts
@@ -127,7 +127,7 @@ export async function extractStylePatterns(
     .map((e, i) => `--- Message ${i + 1} ---\n${e}`)
     .join("\n\n");
 
-  const provider = await getConfiguredProvider();
+  const provider = await getConfiguredProvider("styleAnalyzer");
   if (!provider) {
     return { stylePatterns: [], contactObservations: [] };
   }
diff --git a/assistant/src/providers/__tests__/retry-callsite.test.ts b/assistant/src/providers/__tests__/retry-callsite.test.ts
index f9b2ffe4a4b..84e6e0d69ef 100644
--- a/assistant/src/providers/__tests__/retry-callsite.test.ts
+++ b/assistant/src/providers/__tests__/retry-callsite.test.ts
@@ -9,16 +9,15 @@ mock.module("../../util/logger.js", () => ({
 }));
 
 // Mutable test fixtures for `getConfig()`. Each test rebuilds the relevant
-// pieces via `setLlmConfig(...)` / `setInferenceProvider(...)` before
-// exercising the path. The mock is registered once and reads from these
-// closures so subsequent tests don't need to remock the module.
+// pieces via `setLlmConfig(...)` before exercising the path. The mock is
+// registered once and reads from these closures so subsequent tests don't
+// need to remock the module.
 let mockLlmConfig: Record<string, unknown> = {};
-let mockInferenceProvider = "anthropic";
 
 mock.module("../../config/loader.js", () => ({
   getConfig: () => ({
     llm: mockLlmConfig,
-    services: { inference: { provider: mockInferenceProvider } },
+    services: { inference: { mode: "your-own" } },
   }),
 }));
 
@@ -39,10 +38,7 @@ mock.module("../registry.js", () => ({
 // ── Imports (after mocks) ───────────────────────────────────────────────────
 
 import { LLMSchema } from "../../config/schemas/llm.js";
-import {
-  getConfiguredProvider,
-  resolveConfiguredProvider,
-} from "../provider-send-message.js";
+import { getConfiguredProvider } from "../provider-send-message.js";
 import { RetryProvider } from "../retry.js";
 import type {
   Message,
@@ -90,7 +86,6 @@ function setLlmConfig(raw: unknown): void {
 
 beforeEach(() => {
   mockLlmConfig = LLMSchema.parse({}) as Record<string, unknown>;
-  mockInferenceProvider = "anthropic";
   mockProviders.clear();
 });
 
@@ -244,12 +239,13 @@ describe("RetryProvider — callSite resolution", () => {
   });
 });
 
-// ── RetryProvider — legacy modelIntent path is preserved ────────────────────
+// ── RetryProvider — pre-resolved model fast-path ────────────────────────────
 
-describe("RetryProvider — legacy modelIntent path (no callSite)", () => {
-  test("passing only modelIntent does not consult llm.* config", async () => {
-    // Seed the llm config with a value that, if accidentally consulted, would
-    // produce a clearly-wrong model. The legacy path must ignore it entirely.
+describe("RetryProvider — no callSite (pre-resolved config passes through)", () => {
+  test("config without callSite is forwarded untouched (no llm.* lookup)", async () => {
+    // Seed the llm config with a value that, if accidentally consulted,
+    // would clobber the explicit model. The pre-resolved fast-path must
+    // ignore it entirely.
     setLlmConfig({
       default: { provider: "anthropic", model: "MUST-NOT-LEAK" },
       callSites: {
@@ -264,28 +260,6 @@ describe("RetryProvider — legacy modelIntent path (no callSite)", () => {
       }),
     );
 
-    await wrapped.sendMessage(DUMMY_MESSAGES, undefined, undefined, {
-      config: { modelIntent: "quality-optimized" },
-    });
-
-    const config = seen?.config as Record<string, unknown>;
-    // Legacy path uses model-intents.ts mapping for "quality-optimized" on
-    // anthropic, which is "claude-opus-4-7". It must NOT be the llm.default
-    // value, which would indicate the new path was triggered.
-    expect(config.model).toBe("claude-opus-4-7");
-    expect(config.model).not.toBe("MUST-NOT-LEAK");
-    expect(config.model).not.toBe("ALSO-MUST-NOT-LEAK");
-    expect(config.modelIntent).toBeUndefined();
-  });
-
-  test("no callSite and no modelIntent leaves config untouched (existing fast-path)", async () => {
-    let seen: SendMessageOptions | undefined;
-    const wrapped = new RetryProvider(
-      makeProvider("anthropic", (options) => {
-        seen = options;
-      }),
-    );
-
     await wrapped.sendMessage(DUMMY_MESSAGES, undefined, undefined, {
       config: { model: "explicit-model", max_tokens: 1234 },
     });
@@ -293,6 +267,8 @@ describe("RetryProvider — legacy modelIntent path (no callSite)", () => {
     const config = seen?.config as Record<string, unknown>;
     expect(config.model).toBe("explicit-model");
     expect(config.max_tokens).toBe(1234);
+    expect(config.model).not.toBe("MUST-NOT-LEAK");
+    expect(config.model).not.toBe("ALSO-MUST-NOT-LEAK");
   });
 });
 
@@ -327,15 +303,4 @@ describe("getConfiguredProvider — callSite routing", () => {
     expect(provider?.name).toBe("anthropic");
   });
 
-  test("legacy call (no callSite arg) uses services.inference.provider", async () => {
-    // The legacy path consults `services.inference.provider`. The shared
-    // loader mock reads `mockInferenceProvider` at call time, so we just
-    // overwrite it for this test.
-    mockInferenceProvider = "fireworks";
-    mockProviders.set("fireworks", { name: "fireworks" });
-
-    const result = await resolveConfiguredProvider();
-    expect(result?.configuredProviderName).toBe("fireworks");
-    expect(result?.provider.name).toBe("fireworks");
-  });
 });
diff --git a/assistant/src/providers/provider-send-message.ts b/assistant/src/providers/provider-send-message.ts
index 3c28710295e..11cb8f737c8 100644
--- a/assistant/src/providers/provider-send-message.ts
+++ b/assistant/src/providers/provider-send-message.ts
@@ -4,8 +4,8 @@
  * and response extraction helpers.
  */
 
-import { getConfig } from "../config/loader.js";
 import { resolveCallSiteConfig } from "../config/llm-resolver.js";
+import { getConfig } from "../config/loader.js";
 import type { LLMCallSite } from "../config/schemas/llm.js";
 import {
   getProvider,
@@ -38,15 +38,16 @@ let lazyInitPromise: Promise<void> | null = null;
  * If providers haven't been initialized yet (e.g. non-daemon code paths),
  * performs a one-shot `initializeProviders(getConfig())`.
  *
- * When `callSite` is provided, the provider name comes from
+ * The provider name is sourced from
  * `resolveCallSiteConfig(callSite, config.llm).provider` — i.e. the unified
- * `llm` block drives selection. Otherwise the legacy
- * `services.inference.provider` is used unchanged.
+ * `llm` block drives selection. The `callSite` argument is required so the
+ * resolver can layer per-call-site overrides; pass the closest matching
+ * call-site identifier from `LLMCallSiteEnum` when adding a new caller.
  *
  * Returns `null` when no providers are available at all.
  */
 export async function resolveConfiguredProvider(
-  callSite?: LLMCallSite,
+  callSite: LLMCallSite,
 ): Promise<ConfiguredProviderResult | null> {
   const config = getConfig();
 
@@ -63,10 +64,7 @@ export async function resolveConfiguredProvider(
     }
   }
 
-  const inferenceProvider =
-    callSite !== undefined
-      ? resolveCallSiteConfig(callSite, config.llm).provider
-      : config.services.inference.provider;
+  const inferenceProvider = resolveCallSiteConfig(callSite, config.llm).provider;
 
   try {
     const provider = getProvider(inferenceProvider);
@@ -84,14 +82,11 @@ export async function resolveConfiguredProvider(
  * Thin wrapper around `resolveConfiguredProvider()` for callsites
  * that only need the Provider instance.
  *
- * When `callSite` is provided, resolves the provider via the unified
- * `llm` block (see `resolveConfiguredProvider`). Otherwise preserves the
- * legacy behavior of selecting `services.inference.provider`.
- *
- * Returns `null` when no providers are available.
+ * `callSite` is required — see `resolveConfiguredProvider`. Returns `null`
+ * when no providers are available.
  */
 export async function getConfiguredProvider(
-  callSite?: LLMCallSite,
+  callSite: LLMCallSite,
 ): Promise<Provider | null> {
   const result = await resolveConfiguredProvider(callSite);
   return result?.provider ?? null;
diff --git a/assistant/src/providers/registry.ts b/assistant/src/providers/registry.ts
index 569b1953b81..f159ca08099 100644
--- a/assistant/src/providers/registry.ts
+++ b/assistant/src/providers/registry.ts
@@ -51,8 +51,6 @@ export interface ProvidersConfig {
   services: {
     inference: {
       mode: "managed" | "your-own";
-      provider: string;
-      model: string;
     };
     "image-generation": {
       mode: "managed" | "your-own";
@@ -64,12 +62,18 @@ export interface ProvidersConfig {
       provider: string;
     };
   };
+  llm: {
+    default: {
+      provider: string;
+      model: string;
+    };
+  };
   timeouts?: { providerStreamTimeoutSec?: number };
 }
 
 function resolveModel(config: ProvidersConfig, providerName: string): string {
-  const inferenceProvider = config.services.inference.provider;
-  const inferenceModel = config.services.inference.model;
+  const inferenceProvider = config.llm.default.provider;
+  const inferenceModel = config.llm.default.model;
   if (inferenceProvider === providerName) {
     // If a non-Anthropic provider is selected with the untouched global default
     // model, use a provider-appropriate fallback instead.
@@ -203,7 +207,7 @@ export async function initializeProviders(
 
   // Ollama (keyless provider — always init when configured or key present)
   const ollamaKey = await getProviderKeyAsync("ollama");
-  if (config.services.inference.provider === "ollama" || ollamaKey) {
+  if (config.llm.default.provider === "ollama" || ollamaKey) {
     const model = resolveModel(config, "ollama");
     registerProvider(
       "ollama",
diff --git a/assistant/src/providers/retry.ts b/assistant/src/providers/retry.ts
index 27193c89cf3..e3e8df9d6f0 100644
--- a/assistant/src/providers/retry.ts
+++ b/assistant/src/providers/retry.ts
@@ -1,5 +1,5 @@
-import { getConfig } from "../config/loader.js";
 import { resolveCallSiteConfig } from "../config/llm-resolver.js";
+import { getConfig } from "../config/loader.js";
 import { ProviderError } from "../util/errors.js";
 import { getLogger } from "../util/logger.js";
 import {
@@ -9,7 +9,6 @@ import {
   isRetryableNetworkError,
   sleep,
 } from "../util/retry.js";
-import { isModelIntent, resolveModelIntent } from "./model-intents.js";
 import type {
   Message,
   Provider,
@@ -71,6 +70,23 @@ function isRetryableError(error: unknown): boolean {
   return isRetryableNetworkError(error);
 }
 
+/**
+ * Normalize per-call options before handing them to the wrapped provider.
+ *
+ * When `config.callSite` is set, resolves provider/model/maxTokens/effort/
+ * speed/temperature/thinking/contextWindow via `resolveCallSiteConfig` and
+ * writes them into `nextConfig` using the wire-format names that downstream
+ * provider clients consume (`max_tokens` snake-case for the token cap;
+ * camelCase for the rest, which matches the resolver's shape). Per-call
+ * explicit overrides on the original `config` object win over the resolved
+ * values, so callers can pin a model or other parameter for a single request.
+ *
+ * Whether or not `callSite` is set, this function applies per-provider
+ * stripping (`thinking`/`effort`/`speed`) based on the wrapped provider's
+ * name — agent-loop callers that pre-resolve provider/model still need this
+ * stripping so they don't accidentally send a knob (e.g. the Anthropic-only
+ * `speed`) to a provider that does not support it.
+ */
 function normalizeSendMessageOptions(
   providerName: string,
   options?: SendMessageOptions,
@@ -78,143 +94,47 @@ function normalizeSendMessageOptions(
   const config = options?.config;
   if (!config) return options;
 
-  // ── Call-site path ──────────────────────────────────────────────────
-  // When `config.callSite` is set, route through `resolveCallSiteConfig`
-  // to fully resolve provider/model/maxTokens/effort/speed/temperature/
-  // thinking/contextWindow from `llm.default + profile + site` overrides.
-  // This is the new unified path; the legacy `modelIntent` branch below is
-  // preserved unchanged for unmigrated callers.
-  if (config.callSite !== undefined) {
-    return normalizeViaCallSite(providerName, options, config);
-  }
-
-  // ── Legacy `modelIntent` path (preserved) ───────────────────────────
-  const explicitModel =
-    typeof config.model === "string" && config.model.trim().length > 0
-      ? config.model.trim()
-      : undefined;
-  const intent = isModelIntent(config.modelIntent)
-    ? config.modelIntent
-    : undefined;
-  const hasIntent = config.modelIntent !== undefined;
-
-  const needsThinkingStrip =
-    !THINKING_AWARE_PROVIDERS.has(providerName) && config.thinking !== undefined;
-  const needsEffortStrip =
-    !EFFORT_SUPPORTED_PROVIDERS.has(providerName) && config.effort !== undefined;
-  const needsSpeedStrip =
-    providerName !== "anthropic" && config.speed !== undefined;
-
-  if (
-    !hasIntent &&
-    explicitModel === config.model &&
-    !needsThinkingStrip &&
-    !needsEffortStrip &&
-    !needsSpeedStrip
-  ) {
-    return options;
-  }
-
   const nextConfig: Record<string, unknown> = { ...config };
-  delete nextConfig.modelIntent;
-
-  // thinking is Anthropic-specific on the wire; OpenRouter reads it as a
-  // signal for its unified reasoning parameter. Strip it for other providers.
-  if (
-    !THINKING_AWARE_PROVIDERS.has(providerName) &&
-    nextConfig.thinking !== undefined
-  ) {
-    delete nextConfig.thinking;
-  }
-
-  // effort is supported by Anthropic, OpenAI, and OpenAI-compatible providers; strip for others
-  if (
-    !EFFORT_SUPPORTED_PROVIDERS.has(providerName) &&
-    nextConfig.effort !== undefined
-  ) {
-    delete nextConfig.effort;
-  }
-
-  // speed (fast mode) is Anthropic-specific; strip for other providers
-  if (providerName !== "anthropic" && nextConfig.speed !== undefined) {
-    delete nextConfig.speed;
-  }
 
-  if (explicitModel) {
-    nextConfig.model = explicitModel;
-  } else if (intent) {
-    nextConfig.model = resolveModelIntent(providerName, intent);
-  } else {
-    delete nextConfig.model;
-  }
-
-  return {
-    ...options,
-    config: nextConfig,
-  };
-}
-
-/**
- * Normalize options when the caller opted into call-site resolution.
- *
- * Resolves provider/model/maxTokens/effort/speed/temperature/thinking/
- * contextWindow via `resolveCallSiteConfig` and writes them into `nextConfig`
- * using the wire-format names that downstream provider clients consume
- * (`max_tokens` snake-case for the token cap; camelCase for the rest, which
- * matches the resolver's shape). Per-call explicit overrides on the original
- * `config` object win over the resolved values, mirroring the legacy
- * "explicit `config.model` beats `modelIntent`" semantics so unmigrated
- * callers that pass both can't be silently broken.
- *
- * Both `callSite` and `modelIntent` are stripped from the downstream config.
- * Per-provider stripping (`thinking`/`effort`/`speed`) is applied based on
- * the wrapped provider's name, identical to the legacy path.
- */
-function normalizeViaCallSite(
-  providerName: string,
-  options: SendMessageOptions | undefined,
-  config: NonNullable<SendMessageOptions["config"]>,
-): SendMessageOptions | undefined {
-  const callSite = config.callSite!;
-  const resolved = resolveCallSiteConfig(callSite, getConfig().llm);
+  if (config.callSite !== undefined) {
+    const resolved = resolveCallSiteConfig(config.callSite, getConfig().llm);
 
-  const explicitModel =
-    typeof config.model === "string" && config.model.trim().length > 0
-      ? config.model.trim()
-      : undefined;
+    const explicitModel =
+      typeof config.model === "string" && config.model.trim().length > 0
+        ? config.model.trim()
+        : undefined;
 
-  const nextConfig: Record<string, unknown> = { ...config };
-  // Both opt-in routing keys are consumed by the RetryProvider layer and
-  // must not leak downstream.
-  delete nextConfig.callSite;
-  delete nextConfig.modelIntent;
+    // Routing key is consumed by the RetryProvider layer and must not leak
+    // downstream.
+    delete nextConfig.callSite;
 
-  // Apply resolved values, letting per-call explicit fields win where set.
-  nextConfig.model = explicitModel ?? resolved.model;
-  if (nextConfig.max_tokens === undefined) {
-    nextConfig.max_tokens = resolved.maxTokens;
-  }
-  if (nextConfig.effort === undefined) {
-    nextConfig.effort = resolved.effort;
-  }
-  if (nextConfig.speed === undefined) {
-    nextConfig.speed = resolved.speed;
-  }
-  if (nextConfig.temperature === undefined) {
-    nextConfig.temperature = resolved.temperature;
-  }
-  if (nextConfig.thinking === undefined) {
-    nextConfig.thinking = resolved.thinking;
-  }
-  if (nextConfig.contextWindow === undefined) {
-    nextConfig.contextWindow = resolved.contextWindow;
-  }
-  // Provider name from the resolver — informational; the wrapped provider
-  // is the actual transport. Downstream consumers may inspect this for
-  // diagnostics or wire-format decisions, but the request still routes
-  // through the inner provider that this RetryProvider wraps.
-  if (nextConfig.provider === undefined) {
-    nextConfig.provider = resolved.provider;
+    // Apply resolved values, letting per-call explicit fields win where set.
+    nextConfig.model = explicitModel ?? resolved.model;
+    if (nextConfig.max_tokens === undefined) {
+      nextConfig.max_tokens = resolved.maxTokens;
+    }
+    if (nextConfig.effort === undefined) {
+      nextConfig.effort = resolved.effort;
+    }
+    if (nextConfig.speed === undefined) {
+      nextConfig.speed = resolved.speed;
+    }
+    if (nextConfig.temperature === undefined) {
+      nextConfig.temperature = resolved.temperature;
+    }
+    if (nextConfig.thinking === undefined) {
+      nextConfig.thinking = resolved.thinking;
+    }
+    if (nextConfig.contextWindow === undefined) {
+      nextConfig.contextWindow = resolved.contextWindow;
+    }
+    // Provider name from the resolver — informational; the wrapped provider
+    // is the actual transport. Downstream consumers may inspect this for
+    // diagnostics or wire-format decisions, but the request still routes
+    // through the inner provider that this RetryProvider wraps.
+    if (nextConfig.provider === undefined) {
+      nextConfig.provider = resolved.provider;
+    }
   }
 
   // thinking is Anthropic-specific on the wire; OpenRouter reads it as a
diff --git a/assistant/src/providers/types.ts b/assistant/src/providers/types.ts
index 6358277b411..1179a23b00b 100644
--- a/assistant/src/providers/types.ts
+++ b/assistant/src/providers/types.ts
@@ -132,14 +132,12 @@ export type ProviderEvent =
 
 export interface SendMessageConfig {
   model?: string;
-  modelIntent?: ModelIntent;
   /**
-   * Opt-in routing through the unified LLM call-site resolver. When set,
-   * `RetryProvider` resolves provider/model/maxTokens/effort/speed/temperature/
-   * thinking/contextWindow via `resolveCallSiteConfig(callSite, config.llm)`
-   * instead of consulting `modelIntent`. Both fields may coexist; `callSite`
-   * wins when present, and the legacy `modelIntent` path is preserved for
-   * unmigrated callers.
+   * LLM call-site identifier. When set, `RetryProvider` resolves
+   * provider/model/maxTokens/effort/speed/temperature/thinking/contextWindow
+   * via `resolveCallSiteConfig(callSite, config.llm)`; when omitted, the
+   * config is forwarded pre-resolved. New callers should always set it (the
+   * legacy `modelIntent` fallback was removed in PR 19 of unify-llm-callsites).
    */
   callSite?: LLMCallSite;
   effort?: "low" | "medium" | "high" | "max";
diff --git a/assistant/src/runtime/btw-sidechain.ts b/assistant/src/runtime/btw-sidechain.ts
index ec62f2d5707..19bffd33a8a 100644
--- a/assistant/src/runtime/btw-sidechain.ts
+++ b/assistant/src/runtime/btw-sidechain.ts
@@ -8,7 +8,6 @@ import {
 } from "../providers/provider-send-message.js";
 import type {
   Message,
-  ModelIntent,
   Provider,
   ProviderEvent,
   ProviderResponse,
@@ -30,12 +29,10 @@ export interface RunBtwSidechainParams {
   systemPrompt?: string;
   tools?: ToolDefinition[];
   maxTokens?: number;
-  modelIntent?: ModelIntent;
   /**
-   * Unified call-site identifier. When set, the provider layer resolves
+   * Unified call-site identifier. The provider layer resolves
    * provider/model/maxTokens/effort/speed/temperature/thinking/contextWindow
-   * via `resolveCallSiteConfig(callSite, config.llm)`. `callSite` wins over
-   * `modelIntent` when both are passed. When neither is passed, defaults to
+   * via `resolveCallSiteConfig(callSite, config.llm)`. Defaults to
    * `'identityIntro'` since this side-chain runner was originally introduced
    * for the identity intro generation path; callers (greeting, title, etc.)
    * override it with their own call-site ID.
@@ -100,16 +97,11 @@ export async function runBtwSidechain(
       config: {
         max_tokens: params.maxTokens ?? 1024,
         tool_choice: { type: "none" },
-        // Resolution precedence: explicit callSite → explicit modelIntent →
-        // default callSite "identityIntro" (the original purpose of this
-        // side-chain runner). PR 5's contract says `callSite` wins over
-        // `modelIntent` when both are present, so we set them mutually
-        // exclusively here for clarity.
-        ...(params.callSite !== undefined
-          ? { callSite: params.callSite }
-          : params.modelIntent !== undefined
-            ? { modelIntent: params.modelIntent }
-            : { callSite: "identityIntro" as LLMCallSite }),
+        // Resolution: explicit callSite → default "identityIntro" (the
+        // original purpose of this side-chain runner). The legacy
+        // `modelIntent` parameter was removed in PR 19 of the
+        // unify-llm-callsites plan.
+        callSite: params.callSite ?? ("identityIntro" as LLMCallSite),
       },
       onEvent: (event) => {
         if (event.type === "text_delta") {
diff --git a/assistant/src/runtime/invite-instruction-generator.ts b/assistant/src/runtime/invite-instruction-generator.ts
index bb03bfdb7e2..66155590735 100644
--- a/assistant/src/runtime/invite-instruction-generator.ts
+++ b/assistant/src/runtime/invite-instruction-generator.ts
@@ -70,7 +70,7 @@ export async function generateInviteInstruction(params: {
     ? `Send ${contact} this link: ${params.shareUrl} — or tell them to message me${handle} with the code below.`
     : `Tell ${contact} to message me${handle} with the code below.`;
 
-  const resolved = await resolveConfiguredProvider();
+  const resolved = await resolveConfiguredProvider("inviteInstructionGenerator");
   if (!resolved) {
     log.debug(
       "No provider available for invite instruction generation, using fallback",
diff --git a/assistant/src/runtime/routes/conversation-routes.ts b/assistant/src/runtime/routes/conversation-routes.ts
index eac67ecd545..50daba22530 100644
--- a/assistant/src/runtime/routes/conversation-routes.ts
+++ b/assistant/src/runtime/routes/conversation-routes.ts
@@ -1929,9 +1929,9 @@ export async function handleSendMessage(
     messageCount: conversation.getMessages().length,
     inputTokens: conversation.usageStats.inputTokens,
     outputTokens: conversation.usageStats.outputTokens,
-    maxInputTokens: config.contextWindow.maxInputTokens,
-    model: config.services.inference.model,
-    provider: config.services.inference.provider,
+    maxInputTokens: config.llm.default.contextWindow.maxInputTokens,
+    model: config.llm.default.model,
+    provider: config.llm.default.provider,
     estimatedCost: conversation.usageStats.estimatedCost,
     userMessageInterface: sourceInterface,
   };
@@ -2181,7 +2181,7 @@ async function generateLlmSuggestion(
     [{ role: "user", content: [{ type: "text", text: prompt }] }],
     [], // no tools
     systemPrompt,
-    { config: { modelIntent: "latency-optimized" } },
+    { config: { callSite: "conversationStarters" } },
   );
 
   const textBlock = response.content.find((b) => b.type === "text");
@@ -2300,7 +2300,7 @@ export async function handleGetSuggestion(
     }
 
     // Try LLM suggestion using the configured provider
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("conversationStarters");
     if (provider) {
       try {
         // Deduplicate concurrent requests
diff --git a/assistant/src/runtime/routes/debug-routes.ts b/assistant/src/runtime/routes/debug-routes.ts
index 83a741034dd..8e565237ad1 100644
--- a/assistant/src/runtime/routes/debug-routes.ts
+++ b/assistant/src/runtime/routes/debug-routes.ts
@@ -65,7 +65,7 @@ function handleDebug(): Response {
       startedAt: new Date(startedAt).toISOString(),
     },
     provider: {
-      configuredProvider: config.services.inference.provider,
+      configuredProvider: config.llm.default.provider,
       registeredProviders,
       routingSources,
       inferenceMode: config.services.inference.mode,
diff --git a/assistant/src/runtime/routes/diagnostics-routes.ts b/assistant/src/runtime/routes/diagnostics-routes.ts
index d0d7a398c83..69f9bcc57ec 100644
--- a/assistant/src/runtime/routes/diagnostics-routes.ts
+++ b/assistant/src/runtime/routes/diagnostics-routes.ts
@@ -222,7 +222,7 @@ async function handleDictation(body: DictationBody): Promise<Response> {
   const transcription = expandSnippets(body.transcription, profile.snippets);
 
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("interactionClassifier");
     if (!provider) {
       log.warn(
         "Dictation: no provider available, using heuristic + raw transcription",
@@ -288,7 +288,7 @@ async function handleDictation(body: DictationBody): Promise<Response> {
         systemPrompt,
         {
           config: {
-            modelIntent: "latency-optimized",
+            callSite: "interactionClassifier",
             max_tokens: maxTokens,
             tool_choice: {
               type: "tool" as const,
@@ -381,7 +381,7 @@ async function handleCommandMode(
   const maxTokens = Math.max(1024, computeMaxTokens(inputLength));
 
   try {
-    const provider = await getConfiguredProvider();
+    const provider = await getConfiguredProvider("interactionClassifier");
     if (!provider) {
       log.warn("Command mode: no provider available, returning selected text");
       const normalizedText = applyDictionary(
@@ -399,7 +399,9 @@ async function handleCommandMode(
       [userMessage(body.transcription)],
       [],
       systemPrompt,
-      { config: { modelIntent: "latency-optimized", max_tokens: maxTokens } },
+      {
+        config: { callSite: "interactionClassifier", max_tokens: maxTokens },
+      },
     );
 
     const textBlock = response.content.find((b) => b.type === "text");
diff --git a/assistant/src/subagent/manager.ts b/assistant/src/subagent/manager.ts
index 63cac65d41c..08f1e7834c2 100644
--- a/assistant/src/subagent/manager.ts
+++ b/assistant/src/subagent/manager.ts
@@ -211,7 +211,7 @@ export class SubagentManager {
 
     // ── Build conversation dependencies ─────────────────────────────
     const appConfig = getConfig();
-    let provider = getProvider(appConfig.services.inference.provider);
+    let provider = getProvider(appConfig.llm.default.provider);
     const { rateLimit } = appConfig;
     if (rateLimit.maxRequestsPerMinute > 0) {
       provider = new RateLimitProvider(
@@ -247,7 +247,7 @@ export class SubagentManager {
         config.systemPromptOverride ??
         buildSubagentSystemPrompt({ ...config, id: subagentId }, role);
     }
-    const maxTokens = appConfig.maxTokens;
+    const maxTokens = appConfig.llm.default.maxTokens;
     const workingDir = getSandboxWorkingDir();
 
     const memoryPolicy: ConversationMemoryPolicy = isFork
diff --git a/assistant/src/workspace/migrations/038-unify-llm-callsite-configs.ts b/assistant/src/workspace/migrations/038-unify-llm-callsite-configs.ts
index ccbb94455cb..57d9fb9c49a 100644
--- a/assistant/src/workspace/migrations/038-unify-llm-callsite-configs.ts
+++ b/assistant/src/workspace/migrations/038-unify-llm-callsite-configs.ts
@@ -279,131 +279,22 @@ export const unifyLlmCallSiteConfigsMigration: WorkspaceMigration = {
 
     writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n");
   },
-  down(workspaceDir: string): void {
-    const configPath = join(workspaceDir, "config.json");
-    if (!existsSync(configPath)) return;
-
-    let config: Record<string, unknown>;
-    try {
-      const raw = JSON.parse(readFileSync(configPath, "utf-8"));
-      if (!raw || typeof raw !== "object" || Array.isArray(raw)) return;
-      config = raw as Record<string, unknown>;
-    } catch {
-      return;
-    }
-
-    const llm = readObject(config.llm);
-    if (llm === null) return;
-
-    // ── Reverse llm.default → top-level + services.inference ──────────
-    const defaultBlock = readObject(llm.default);
-    if (defaultBlock !== null) {
-      const services = ensureObj(config, "services");
-      const inference = ensureObj(services, "inference");
-      const provider = readString(defaultBlock.provider);
-      if (provider !== undefined) {
-        inference.provider = provider;
-      }
-      const model = readString(defaultBlock.model);
-      if (model !== undefined) {
-        inference.model = model;
-      }
-      const maxTokens = readPositiveInt(defaultBlock.maxTokens);
-      if (maxTokens !== undefined) {
-        config.maxTokens = maxTokens;
-      }
-      const effort = readEnum(defaultBlock.effort, EFFORT_VALUES);
-      if (effort !== undefined) {
-        config.effort = effort;
-      }
-      const speed = readEnum(defaultBlock.speed, SPEED_VALUES);
-      if (speed !== undefined) {
-        config.speed = speed;
-      }
-      const thinking = readObject(defaultBlock.thinking);
-      if (thinking !== null) {
-        config.thinking = thinking;
-      }
-      const contextWindow = readObject(defaultBlock.contextWindow);
-      if (contextWindow !== null) {
-        config.contextWindow = contextWindow;
-      }
-    }
-
-    // ── Reverse llm.callSites → scattered keys ────────────────────────
-    const callSites = readObject(llm.callSites) ?? {};
-
-    const heartbeatAgent = readObject(callSites.heartbeatAgent);
-    if (heartbeatAgent !== null) {
-      const speed = readEnum(heartbeatAgent.speed, SPEED_VALUES);
-      if (speed !== undefined) {
-        const heartbeat = ensureObj(config, "heartbeat");
-        heartbeat.speed = speed;
-      }
-    }
-
-    const filingAgent = readObject(callSites.filingAgent);
-    if (filingAgent !== null) {
-      const speed = readEnum(filingAgent.speed, SPEED_VALUES);
-      if (speed !== undefined) {
-        const filing = ensureObj(config, "filing");
-        filing.speed = speed;
-      }
-    }
-
-    const analyzeConversation = readObject(callSites.analyzeConversation);
-    if (analyzeConversation !== null) {
-      const provider = readString(analyzeConversation.provider);
-      const model = readString(analyzeConversation.model);
-      const recombined =
-        provider !== undefined && model !== undefined
-          ? `${provider}/${model}`
-          : (model ?? undefined);
-      if (recombined !== undefined) {
-        const analysis = ensureObj(config, "analysis");
-        analysis.modelOverride = recombined;
-      }
-    }
-
-    const callAgent = readObject(callSites.callAgent);
-    if (callAgent !== null) {
-      const model = readString(callAgent.model);
-      if (model !== undefined) {
-        const calls = ensureObj(config, "calls");
-        calls.model = model;
-      }
-    }
-
-    const commitMessage = readObject(callSites.commitMessage);
-    if (commitMessage !== null) {
-      const cmMaxTokens = readPositiveInt(commitMessage.maxTokens);
-      const cmTemperature = readTemperature(commitMessage.temperature);
-      if (cmMaxTokens !== undefined || cmTemperature !== undefined) {
-        const workspaceGit = ensureObj(config, "workspaceGit");
-        const commitMessageLLM = ensureObj(workspaceGit, "commitMessageLLM");
-        if (cmMaxTokens !== undefined) {
-          commitMessageLLM.maxTokens = cmMaxTokens;
-        }
-        if (cmTemperature !== undefined) {
-          commitMessageLLM.temperature = cmTemperature;
-        }
-      }
-    }
-    // Note: `conversationSummarization`, `emptyStateGreeting`,
-    // `notificationDecision`, and `preferenceExtraction` were derived from
-    // `modelIntent` keys — `down()` intentionally does not synthesize a
-    // reverse intent (we only have a resolved model, not the intent that
-    // produced it). Callers reading those legacy keys after a rollback will
-    // fall back to schema defaults.
-
-    // ── Reverse llm.pricingOverrides → top-level pricingOverrides ─────
-    if (Array.isArray(llm.pricingOverrides)) {
-      config.pricingOverrides = llm.pricingOverrides;
-    }
-
-    delete config.llm;
-
-    writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n");
+  /**
+   * Documented no-op since PR 19 of the unify-llm-callsites plan.
+   *
+   * The legacy keys that this migration consolidates (`services.inference.
+   * {provider,model}`, top-level `maxTokens`/`effort`/`speed`/`thinking`/
+   * `contextWindow`/`pricingOverrides`, `heartbeat.speed`, `filing.speed`,
+   * `analysis.modelIntent`/`modelOverride`, `memory.summarization.modelIntent`,
+   * `notifications.decisionModelIntent`, `ui.greetingModelIntent`,
+   * `calls.model`, and `workspaceGit.commitMessageLLM.{maxTokens,temperature}`)
+   * were removed from `AssistantConfigSchema` in PR 19. Re-creating them in
+   * `down()` would have no effect on the running daemon (no code reads them
+   * any more), so a rollback that needs to undo this migration must instead
+   * roll back the application binary to a build that predates PR 19.
+   */
+  down(_workspaceDir: string): void {
+    // Forward-only after PR 19. See comment above.
   },
 };
 
@@ -509,18 +400,3 @@ function readTemperature(value: unknown): number | undefined {
     ? value
     : undefined;
 }
-
-function ensureObj(
-  parent: Record<string, unknown>,
-  key: string,
-): Record<string, unknown> {
-  if (
-    !(key in parent) ||
-    parent[key] == null ||
-    typeof parent[key] !== "object" ||
-    Array.isArray(parent[key])
-  ) {
-    parent[key] = {};
-  }
-  return parent[key] as Record<string, unknown>;
-}
diff --git a/assistant/src/workspace/migrations/039-drop-legacy-llm-keys.ts b/assistant/src/workspace/migrations/039-drop-legacy-llm-keys.ts
new file mode 100644
index 00000000000..5ea9c81c094
--- /dev/null
+++ b/assistant/src/workspace/migrations/039-drop-legacy-llm-keys.ts
@@ -0,0 +1,171 @@
+import { existsSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+
+import type { WorkspaceMigration } from "./types.js";
+
+/**
+ * Strip the now-removed legacy LLM-related keys from existing `config.json`
+ * files. PR 19 of the unify-llm-callsites plan removed these keys from
+ * `AssistantConfigSchema`; Zod silently strips unknown fields when re-parsing,
+ * but the keys would otherwise persist on disk forever and re-appear in any
+ * exported config snapshot. Erasing them keeps `config.json` lean and matches
+ * the schema that the in-memory loader sees.
+ *
+ * Keys removed:
+ *   - Top level: `maxTokens`, `effort`, `speed`, `thinking`, `contextWindow`,
+ *     `pricingOverrides`.
+ *   - `services.inference.{provider, model}` (the `mode` field stays — it
+ *     governs `managed` vs `your-own` routing, which is orthogonal to LLM
+ *     model selection).
+ *   - `heartbeat.speed`, `filing.speed`.
+ *   - `analysis.modelIntent`, `analysis.modelOverride`.
+ *   - `memory.summarization.modelIntent`.
+ *   - `notifications.decisionModelIntent`.
+ *   - `ui.greetingModelIntent`.
+ *   - `calls.model`.
+ *   - `workspaceGit.commitMessageLLM.{maxTokens, temperature,
+ *     useConfiguredProvider, providerFastModelOverrides}`.
+ *
+ * Preconditions: this migration depends on
+ * `038-unify-llm-callsite-configs` having already populated `llm.default` /
+ * `llm.callSites` / `llm.pricingOverrides` from these legacy keys. The
+ * registry guarantees ordering.
+ *
+ * Idempotency: each delete is wrapped in a key-exists check so re-runs are
+ * no-ops. Empty objects are left in place rather than recursively pruned —
+ * that matches Zod's default behavior of treating an absent value the same
+ * as an empty `{}` for nested schemas.
+ */
+export const dropLegacyLlmKeysMigration: WorkspaceMigration = {
+  id: "039-drop-legacy-llm-keys",
+  description:
+    "Strip deprecated scattered LLM-related keys from config.json (post-PR-19 cleanup)",
+  run(workspaceDir: string): void {
+    const configPath = join(workspaceDir, "config.json");
+    if (!existsSync(configPath)) return; // no config file: nothing to strip
+
+    let config: Record<string, unknown>;
+    try {
+      const raw = JSON.parse(readFileSync(configPath, "utf-8"));
+      if (!raw || typeof raw !== "object" || Array.isArray(raw)) return;
+      config = raw as Record<string, unknown>;
+    } catch {
+      return; // unreadable or malformed JSON: leave the file untouched
+    }
+
+    let mutated = false; // set on the first delete; gates the rewrite at the end
+
+    for (const key of [ // legacy top-level keys
+      "maxTokens",
+      "effort",
+      "speed",
+      "thinking",
+      "contextWindow",
+      "pricingOverrides",
+    ]) {
+      if (key in config) {
+        delete config[key];
+        mutated = true;
+      }
+    }
+
+    const services = readObject(config.services); // only inference.provider/model are removed
+    if (services !== null) {
+      const inference = readObject(services.inference);
+      if (inference !== null) {
+        for (const key of ["provider", "model"]) {
+          if (key in inference) {
+            delete inference[key];
+            mutated = true;
+          }
+        }
+      }
+    }
+
+    const heartbeat = readObject(config.heartbeat);
+    if (heartbeat !== null && "speed" in heartbeat) {
+      delete heartbeat.speed;
+      mutated = true;
+    }
+
+    const filing = readObject(config.filing);
+    if (filing !== null && "speed" in filing) {
+      delete filing.speed;
+      mutated = true;
+    }
+
+    const analysis = readObject(config.analysis);
+    if (analysis !== null) {
+      for (const key of ["modelIntent", "modelOverride"]) {
+        if (key in analysis) {
+          delete analysis[key];
+          mutated = true;
+        }
+      }
+    }
+
+    const memory = readObject(config.memory);
+    if (memory !== null) {
+      const summarization = readObject(memory.summarization);
+      if (summarization !== null && "modelIntent" in summarization) {
+        delete summarization.modelIntent;
+        mutated = true;
+      }
+    }
+
+    const notifications = readObject(config.notifications);
+    if (notifications !== null && "decisionModelIntent" in notifications) {
+      delete notifications.decisionModelIntent;
+      mutated = true;
+    }
+
+    const ui = readObject(config.ui);
+    if (ui !== null && "greetingModelIntent" in ui) {
+      delete ui.greetingModelIntent;
+      mutated = true;
+    }
+
+    const calls = readObject(config.calls);
+    if (calls !== null && "model" in calls) {
+      delete calls.model;
+      mutated = true;
+    }
+
+    const workspaceGit = readObject(config.workspaceGit);
+    if (workspaceGit !== null) {
+      const commitMessageLLM = readObject(workspaceGit.commitMessageLLM);
+      if (commitMessageLLM !== null) {
+        for (const key of [
+          "maxTokens",
+          "temperature",
+          "useConfiguredProvider",
+          "providerFastModelOverrides",
+        ]) {
+          if (key in commitMessageLLM) {
+            delete commitMessageLLM[key];
+            mutated = true;
+          }
+        }
+      }
+    }
+
+    if (!mutated) return; // nothing removed: skip the write so re-runs leave the file alone
+
+    writeFileSync(configPath, JSON.stringify(config, null, 2) + "\n");
+  },
+  /**
+   * Forward-only no-op: `run()` keeps no copy of the values it deletes, and
+   * re-adding keys the schema no longer declares would have no runtime effect
+   * — every reader migrated to `llm.default` / `llm.callSites` in PR 19.
+   */
+  down(_workspaceDir: string): void {
+    // Intentionally empty.
+  },
+};
+
+function readObject(value: unknown): Record<string, unknown> | null {
+  // Plain-object guard: `typeof` also reports "object" for null and arrays.
+  const isPlainObject =
+    typeof value === "object" && value !== null && !Array.isArray(value);
+  return isPlainObject ? (value as Record<string, unknown>) : null;
+}
diff --git a/assistant/src/workspace/migrations/registry.ts b/assistant/src/workspace/migrations/registry.ts
index 1e8ec0f511f..8a70d522872 100644
--- a/assistant/src/workspace/migrations/registry.ts
+++ b/assistant/src/workspace/migrations/registry.ts
@@ -36,6 +36,7 @@ import { seedSlackChannelPersonaMigration } from "./035-seed-slack-channel-perso
 import { updatePkbIndexBarMigration } from "./036-update-pkb-index-bar.js";
 import { createMeetsDirMigration } from "./037-create-meets-dir.js";
 import { unifyLlmCallSiteConfigsMigration } from "./038-unify-llm-callsite-configs.js";
+import { dropLegacyLlmKeysMigration } from "./039-drop-legacy-llm-keys.js";
 import { migrateToWorkspaceVolumeMigration } from "./migrate-to-workspace-volume.js";
 import type { WorkspaceMigration } from "./types.js";
 
@@ -83,4 +84,5 @@ export const WORKSPACE_MIGRATIONS: WorkspaceMigration[] = [
   updatePkbIndexBarMigration,
   createMeetsDirMigration,
   unifyLlmCallSiteConfigsMigration,
+  dropLegacyLlmKeysMigration,
 ];
diff --git a/assistant/src/workspace/provider-commit-message-generator.ts b/assistant/src/workspace/provider-commit-message-generator.ts
index c50d44bacf5..863f01e4419 100644
--- a/assistant/src/workspace/provider-commit-message-generator.ts
+++ b/assistant/src/workspace/provider-commit-message-generator.ts
@@ -14,7 +14,6 @@ export type LLMFallbackReason =
   | "missing_provider_api_key"
   | "breaker_open"
   | "insufficient_budget"
-  | "missing_fast_model"
   | "provider_not_initialized"
   | "timeout"
   | "provider_error"
@@ -40,19 +39,13 @@ Rules:
 - Total output must be under 300 characters
 - If you cannot determine a meaningful message, respond with exactly: FALLBACK`;
 
-const PROVIDER_DEFAULT_FAST_MODELS: Record<string, string> = {
-  anthropic: "claude-haiku-4-5-20251001",
-  openai: "gpt-4o-mini",
-  gemini: "gemini-2.0-flash",
-};
-
 // Providers that can be initialized without an API key (e.g., Ollama runs locally)
 const KEYLESS_PROVIDERS = new Set(["ollama"]);
 
 const deterministicProvider = new DefaultCommitMessageProvider();
 
 function getProviderCandidates(config: ReturnType<typeof getConfig>): string[] {
-  return [config.services.inference.provider];
+  return [config.llm.default.provider]; // NOTE(review): assumes the commitMessage call site cannot override the provider — confirm
 }
 
 function buildDeterministicResult(
@@ -118,22 +111,22 @@ export class ProviderCommitMessageGenerator {
     // 3. selected-provider API key preflight (except keyless providers)
     // 4. breaker_open
     // 5. insufficient_budget
-    // 6. missing_fast_model
-    // 7. call provider → timeout / provider_error / invalid_output
+    // 6. call provider → timeout / provider_error / invalid_output
     // ──────────────────────────────────────────────────────────────────
 
     // Step 1: Feature gate
     if (!llmConfig.enabled) {
       return buildDeterministicResult(context, "disabled");
     }
-    if (!llmConfig.useConfiguredProvider) {
-      return buildDeterministicResult(context, "disabled");
-    }
 
-    // Step 2: Resolve configured provider.
-    // If nothing is resolvable, differentiate likely missing-key cases from
-    // true registry/init failures.
-    const resolved = await resolveConfiguredProvider();
+    // Step 2: Resolve configured provider via the commit-message call site,
+    // so model + maxTokens + temperature come from `llm.callSites.commitMessage`
+    // (with `llm.default` as the fallback). Operational fields (`enabled`,
+    // `timeoutMs`, `breaker`, `maxFilesInPrompt`, `maxDiffBytes`,
+    // `minRemainingTurnBudgetMs`) remain on `workspaceGit.commitMessageLLM`
+    // and are read above. If nothing is resolvable, differentiate likely
+    // missing-key cases from true registry/init failures.
+    const resolved = await resolveConfiguredProvider("commitMessage");
     if (!resolved) {
       const candidates = getProviderCandidates(config);
       const hasAnyKeylessCandidate = candidates.some((name) =>
@@ -153,7 +146,7 @@ export class ProviderCommitMessageGenerator {
         return buildDeterministicResult(context, "missing_provider_api_key");
       }
       log.debug(
-        { provider: config.services.inference.provider },
+        { provider: config.llm.default.provider },
         "Provider not initialized; falling back to deterministic",
       );
       return buildDeterministicResult(context, "provider_not_initialized");
@@ -200,23 +193,7 @@ export class ProviderCommitMessageGenerator {
       }
     }
 
-    // Step 5: Fast model preflight — resolve before any provider call
-    const fastModel =
-      llmConfig.providerFastModelOverrides[providerName] ??
-      PROVIDER_DEFAULT_FAST_MODELS[providerName];
-
-    if (!fastModel) {
-      log.debug(
-        {
-          provider: providerName,
-          configuredProvider: config.services.inference.provider,
-        },
-        "No fast model resolvable for provider; falling back to deterministic",
-      );
-      return buildDeterministicResult(context, "missing_fast_model");
-    }
-
-    // Step 6 + 7: Call the provider
+    // Step 5: Call the provider
     try {
       // Build prompt
       const fileList = options.changedFiles
@@ -263,19 +240,13 @@ export class ProviderCommitMessageGenerator {
           {
             signal: ac.signal,
             config: {
-              // `callSite` lets the provider resolve `max_tokens` and
-              // `temperature` from `llm.callSites.commitMessage` (populated by
-              // the workspace migration from the legacy
-              // `workspaceGit.commitMessageLLM.{maxTokens,temperature}` keys).
-              // Operational fields (`enabled`, `timeoutMs`, `breaker`,
-              // `maxFilesInPrompt`, `maxDiffBytes`, `minRemainingTurnBudgetMs`)
-              // remain on `workspaceGit.commitMessageLLM` and are read above.
+              // `callSite` lets the provider resolve model, max_tokens, and
+              // temperature from `llm.callSites.commitMessage` (with
+              // `llm.default` as the fallback). Operational fields
+              // (`enabled`, `timeoutMs`, `breaker`, `maxFilesInPrompt`,
+              // `maxDiffBytes`, `minRemainingTurnBudgetMs`) remain on
+              // `workspaceGit.commitMessageLLM` and are read above.
               callSite: "commitMessage",
-              // `fastModel` overrides the resolver's `model` because commit
-              // message generation enforces its own provider-specific fast
-              // model selection (see `PROVIDER_DEFAULT_FAST_MODELS` and
-              // `providerFastModelOverrides`).
-              model: fastModel,
             },
           },
         );