From 4d0314e500d35c8d9980b589a2d76e7f199bf8a6 Mon Sep 17 00:00:00 2001
From: siddseethepalli <siddseethepalli@gmail.com>
Date: Fri, 17 Apr 2026 02:13:26 +0000
Subject: [PATCH] fix(cleanup): remove dead code, refresh comments, add
 migration test, update docs

---
 assistant/ARCHITECTURE.md                     |   2 +-
 .../src/__tests__/call-controller.test.ts     |   3 +-
 .../conversation-process-callsite.test.ts     |  11 +-
 .../src/__tests__/heartbeat-service.test.ts   |  17 +-
 assistant/src/__tests__/model-intents.test.ts |  12 +-
 ...migration-039-drop-legacy-llm-keys.test.ts | 343 ++++++++++++++++++
 assistant/src/agent/loop.ts                   |   9 +-
 .../phone-calls/references/CONFIG.md          |   2 +-
 assistant/src/config/schema.ts                |  12 +-
 assistant/src/config/schemas/inference.ts     |  22 --
 .../src/daemon/conversation-agent-loop.ts     |   9 +-
 assistant/src/daemon/conversation.ts          |   3 +-
 assistant/src/daemon/handlers/shared.ts       |   3 +-
 assistant/src/filing/filing-service.ts        |   3 +-
 assistant/src/notifications/README.md         |  14 +-
 assistant/src/providers/types.ts              |   5 +-
 assistant/src/runtime/btw-sidechain.ts        |   6 +-
 .../runtime/services/analyze-conversation.ts  |   4 +-
 cli/src/lib/config-utils.ts                   |   8 +-
 .../Features/Settings/SettingsStore.swift     |  70 +---
 .../MockSettingsClient.swift                  |   5 +-
 gateway/ARCHITECTURE.md                       |   2 +-
 22 files changed, 417 insertions(+), 148 deletions(-)
 create mode 100644 assistant/src/__tests__/workspace-migration-039-drop-legacy-llm-keys.test.ts

diff --git a/assistant/ARCHITECTURE.md b/assistant/ARCHITECTURE.md
index 8fc7a31baf8..6540811b5b8 100644
--- a/assistant/ARCHITECTURE.md
+++ b/assistant/ARCHITECTURE.md
@@ -2094,7 +2094,7 @@ Connected channels are resolved at signal emission time: vellum is always includ
 
 **Audit trail (SQLite):** `notification_events` → `notification_decisions` (with `conversationActions` in validation results) → `notification_deliveries` (with `conversation_id`, `message_id`, `conversation_strategy`, `conversation_action`, `conversation_target_id`, `conversation_fallback_used`)
 
-**Configuration:** `notifications.decisionModelIntent` in `config.json`.
+**Configuration:** `llm.callSites.notificationDecision` (decision engine) and `llm.callSites.preferenceExtraction` (preference extractor) in `config.json`. Both fall back to `llm.default` when unset.
 
 ---
 
diff --git a/assistant/src/__tests__/call-controller.test.ts b/assistant/src/__tests__/call-controller.test.ts
index 8d4208f62b0..3c204c9a193 100644
--- a/assistant/src/__tests__/call-controller.test.ts
+++ b/assistant/src/__tests__/call-controller.test.ts
@@ -35,10 +35,9 @@ mock.module("../config/loader.js", () => {
       silenceTimeoutSeconds: 30,
       disclosure: { enabled: false, text: "" },
       safety: { denyCategories: [] },
-      model: undefined,
     },
     memory: { enabled: false },
-    notifications: { decisionModelIntent: "latency-optimized" },
+    notifications: {},
     services: {
       tts: {
         mode: "your-own" as const,
diff --git a/assistant/src/__tests__/conversation-process-callsite.test.ts b/assistant/src/__tests__/conversation-process-callsite.test.ts
index bb14d7fc856..8abc7df9dd5 100644
--- a/assistant/src/__tests__/conversation-process-callsite.test.ts
+++ b/assistant/src/__tests__/conversation-process-callsite.test.ts
@@ -1,11 +1,14 @@
 /**
- * Verify that `callSite` threads from `Conversation.processMessage` options
- * all the way down to the per-call provider config, and that user-initiated
- * turns default to `'mainAgent'` when no caller-supplied `callSite` is set.
+ * Verifies that `callSite` threads from `Conversation.processMessage`
+ * options all the way down to the per-call provider config, and that
+ * user-initiated turns default to `'mainAgent'` when no caller-supplied
+ * `callSite` is set.
  *
  * The test mocks `AgentLoop.run()` so it can capture the `callSite` parameter
  * the conversation passes after `processMessage` runs the slash-resolver and
- * runtime-injection pipeline.
+ * runtime-injection pipeline. Adapter callers (heartbeat, filing, scheduler)
+ * pass an explicit `callSite` so `RetryProvider` resolves their per-call
+ * config from `llm.callSites.<id>`.
  */
 import { describe, expect, mock, test } from "bun:test";
 
diff --git a/assistant/src/__tests__/heartbeat-service.test.ts b/assistant/src/__tests__/heartbeat-service.test.ts
index 063d284bc44..1249c480368 100644
--- a/assistant/src/__tests__/heartbeat-service.test.ts
+++ b/assistant/src/__tests__/heartbeat-service.test.ts
@@ -9,7 +9,6 @@ let mockConfig = {
   heartbeat: {
     enabled: true,
     intervalMs: 60_000,
-    speed: "standard" as "standard" | "fast",
     activeHoursStart: undefined as number | undefined,
     activeHoursEnd: undefined as number | undefined,
   },
@@ -185,7 +184,6 @@ describe("HeartbeatService", () => {
       heartbeat: {
         enabled: true,
         intervalMs: 60_000,
-        speed: "standard",
         activeHoursStart: undefined,
         activeHoursEnd: undefined,
       },
@@ -547,17 +545,18 @@ describe("HeartbeatService", () => {
     });
   });
 
-  test("callSite is passed regardless of legacy heartbeat.speed value", async () => {
-    // The legacy `heartbeat.speed` field is no longer read by the heartbeat
-    // service — every run unconditionally passes `callSite: 'heartbeatAgent'`
-    // and the resolver maps that to whatever `llm.callSites.heartbeatAgent`
-    // (or the default) configures. PR 19 will remove the schema field.
-    mockConfig.heartbeat.speed = "fast";
+  test("processMessage receives only callSite — no legacy speed knob", async () => {
+    // The heartbeat service unconditionally passes `callSite:
+    // 'heartbeatAgent'` and nothing else. The resolver maps that identifier
+    // to whatever `llm.callSites.heartbeatAgent` (or `llm.default`)
+    // configures.
     const service = createService();
     await service.runOnce();
 
     expect(processMessageCalls).toHaveLength(1);
-    expect(processMessageCalls[0].options?.callSite).toBe("heartbeatAgent");
+    expect(processMessageCalls[0].options).toEqual({
+      callSite: "heartbeatAgent",
+    });
   });
 
   test("end-to-end: llm.callSites.heartbeatAgent.speed resolves to 'fast'", async () => {
diff --git a/assistant/src/__tests__/model-intents.test.ts b/assistant/src/__tests__/model-intents.test.ts
index 267acd55b1a..0841ca94a2f 100644
--- a/assistant/src/__tests__/model-intents.test.ts
+++ b/assistant/src/__tests__/model-intents.test.ts
@@ -38,8 +38,10 @@ describe("model intents", () => {
   });
 });
 
-// `RetryProvider`'s legacy `modelIntent` normalization path was removed in
-// PR 19 of the unify-llm-callsites plan. The remaining `resolveModelIntent`
-// helper lives in `providers/model-intents.ts` for use by the workspace
-// migration's snapshot table — see `workspace/migrations/038-unify-llm-
-// callsite-configs.ts`.
+// `RetryProvider` normalizes outbound calls through call-site routing
+// (`resolveCallSiteConfig` against `llm.callSites.<id>` / `llm.default`).
+// The `resolveModelIntent` helper exercised above lives in
+// `providers/model-intents.ts` and is used by the unify-llm workspace
+// migration's snapshot table (see
+// `workspace/migrations/038-unify-llm-callsite-configs.ts`) to seed
+// `llm.default` from any pre-existing `modelIntent` setting.
diff --git a/assistant/src/__tests__/workspace-migration-039-drop-legacy-llm-keys.test.ts b/assistant/src/__tests__/workspace-migration-039-drop-legacy-llm-keys.test.ts
new file mode 100644
index 00000000000..6da7ca7f411
--- /dev/null
+++ b/assistant/src/__tests__/workspace-migration-039-drop-legacy-llm-keys.test.ts
@@ -0,0 +1,343 @@
+import {
+  existsSync,
+  mkdirSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, describe, expect, test } from "bun:test";
+
+import { dropLegacyLlmKeysMigration } from "../workspace/migrations/039-drop-legacy-llm-keys.js";
+
+// ---------------------------------------------------------------------------
+// Helpers
+// ---------------------------------------------------------------------------
+
+let workspaceDir: string;
+
+function freshWorkspace(): void {
+  workspaceDir = join(
+    tmpdir(),
+    `vellum-migration-039-test-${Date.now()}-${Math.random().toString(36).slice(2)}`,
+  );
+  mkdirSync(workspaceDir, { recursive: true });
+}
+
+function writeConfig(data: Record<string, unknown>): void {
+  writeFileSync(
+    join(workspaceDir, "config.json"),
+    JSON.stringify(data, null, 2) + "\n",
+  );
+}
+
+function readConfig(): Record<string, unknown> {
+  return JSON.parse(readFileSync(join(workspaceDir, "config.json"), "utf-8"));
+}
+
+// ---------------------------------------------------------------------------
+// Setup / teardown
+// ---------------------------------------------------------------------------
+
+beforeEach(() => {
+  freshWorkspace();
+});
+
+afterEach(() => {
+  if (existsSync(workspaceDir)) {
+    rmSync(workspaceDir, { recursive: true, force: true });
+  }
+});
+
+// ---------------------------------------------------------------------------
+// Tests
+// ---------------------------------------------------------------------------
+
+describe("039-drop-legacy-llm-keys migration", () => {
+  test("has correct migration id and description", () => {
+    expect(dropLegacyLlmKeysMigration.id).toBe("039-drop-legacy-llm-keys");
+    expect(dropLegacyLlmKeysMigration.description).toBe(
+      "Strip deprecated scattered LLM-related keys from config.json (post-PR-19 cleanup)",
+    );
+  });
+
+  // ─── No-op cases ────────────────────────────────────────────────────────
+
+  test("no-op when config.json does not exist", () => {
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+    expect(existsSync(join(workspaceDir, "config.json"))).toBe(false);
+  });
+
+  test("gracefully handles invalid JSON in config file", () => {
+    writeFileSync(join(workspaceDir, "config.json"), "not-valid-json");
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+    expect(readFileSync(join(workspaceDir, "config.json"), "utf-8")).toBe(
+      "not-valid-json",
+    );
+  });
+
+  test("gracefully handles array-shaped config", () => {
+    writeFileSync(join(workspaceDir, "config.json"), JSON.stringify([1, 2, 3]));
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+    const raw = JSON.parse(
+      readFileSync(join(workspaceDir, "config.json"), "utf-8"),
+    );
+    expect(raw).toEqual([1, 2, 3]);
+  });
+
+  test("no-op when config has none of the targeted legacy keys", () => {
+    const original = {
+      llm: {
+        default: {
+          provider: "anthropic",
+          model: "claude-opus-4-6",
+          maxTokens: 64000,
+          effort: "max",
+          speed: "standard",
+          temperature: null,
+        },
+      },
+      services: { inference: { mode: "your-own" } },
+      otherSetting: true,
+    };
+    writeConfig(original);
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    expect(readConfig()).toEqual(original);
+  });
+
+  // ─── Full strip — every legacy key present ─────────────────────────────
+
+  test("strips every targeted legacy key when all are present", () => {
+    writeConfig({
+      // Top-level scattered keys
+      maxTokens: 16000,
+      effort: "low",
+      speed: "fast",
+      thinking: { enabled: false, streamThinking: false },
+      contextWindow: { enabled: true, maxInputTokens: 100000 },
+      pricingOverrides: [
+        {
+          provider: "anthropic",
+          modelPattern: "*",
+          inputPer1M: 1,
+          outputPer1M: 5,
+        },
+      ],
+      // services.inference.{provider, model}
+      services: {
+        inference: {
+          mode: "your-own",
+          provider: "openai",
+          model: "gpt-5.4",
+        },
+      },
+      // heartbeat.speed / filing.speed
+      heartbeat: { enabled: true, intervalMs: 60000, speed: "fast" },
+      filing: { enabled: true, intervalMs: 30000, speed: "fast" },
+      // analysis
+      analysis: {
+        enabled: true,
+        modelIntent: "quality-optimized",
+        modelOverride: "claude-opus-4-7",
+      },
+      // memory.summarization.modelIntent
+      memory: { summarization: { modelIntent: "latency-optimized" } },
+      // notifications.decisionModelIntent
+      notifications: { decisionModelIntent: "latency-optimized" },
+      // ui.greetingModelIntent
+      ui: { greetingModelIntent: "quality-optimized" },
+      // calls.model
+      calls: { enabled: true, model: "gpt-5.4-nano" },
+      // workspaceGit.commitMessageLLM.*
+      workspaceGit: {
+        commitMessageLLM: {
+          maxTokens: 1024,
+          temperature: 0.2,
+          useConfiguredProvider: true,
+          providerFastModelOverrides: { anthropic: "claude-haiku-3-5" },
+        },
+      },
+      // Unrelated key — must survive
+      otherSetting: "preserved",
+    });
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    const config = readConfig();
+
+    // Top-level
+    expect(config.maxTokens).toBeUndefined();
+    expect(config.effort).toBeUndefined();
+    expect(config.speed).toBeUndefined();
+    expect(config.thinking).toBeUndefined();
+    expect(config.contextWindow).toBeUndefined();
+    expect(config.pricingOverrides).toBeUndefined();
+
+    // services.inference: provider/model stripped, mode preserved
+    const services = config.services as { inference: Record<string, unknown> };
+    expect(services.inference.provider).toBeUndefined();
+    expect(services.inference.model).toBeUndefined();
+    expect(services.inference.mode).toBe("your-own");
+
+    // heartbeat / filing
+    const heartbeat = config.heartbeat as Record<string, unknown>;
+    expect(heartbeat.speed).toBeUndefined();
+    expect(heartbeat.enabled).toBe(true);
+    expect(heartbeat.intervalMs).toBe(60000);
+    const filing = config.filing as Record<string, unknown>;
+    expect(filing.speed).toBeUndefined();
+    expect(filing.enabled).toBe(true);
+    expect(filing.intervalMs).toBe(30000);
+
+    // analysis
+    const analysis = config.analysis as Record<string, unknown>;
+    expect(analysis.modelIntent).toBeUndefined();
+    expect(analysis.modelOverride).toBeUndefined();
+    expect(analysis.enabled).toBe(true);
+
+    // memory.summarization.modelIntent
+    const memory = config.memory as { summarization: Record<string, unknown> };
+    expect(memory.summarization.modelIntent).toBeUndefined();
+
+    // notifications.decisionModelIntent
+    const notifications = config.notifications as Record<string, unknown>;
+    expect(notifications.decisionModelIntent).toBeUndefined();
+
+    // ui.greetingModelIntent
+    const ui = config.ui as Record<string, unknown>;
+    expect(ui.greetingModelIntent).toBeUndefined();
+
+    // calls.model
+    const calls = config.calls as Record<string, unknown>;
+    expect(calls.model).toBeUndefined();
+    expect(calls.enabled).toBe(true);
+
+    // workspaceGit.commitMessageLLM
+    const workspaceGit = config.workspaceGit as {
+      commitMessageLLM: Record<string, unknown>;
+    };
+    expect(workspaceGit.commitMessageLLM.maxTokens).toBeUndefined();
+    expect(workspaceGit.commitMessageLLM.temperature).toBeUndefined();
+    expect(workspaceGit.commitMessageLLM.useConfiguredProvider).toBeUndefined();
+    expect(
+      workspaceGit.commitMessageLLM.providerFastModelOverrides,
+    ).toBeUndefined();
+
+    // Unrelated key untouched
+    expect(config.otherSetting).toBe("preserved");
+  });
+
+  // ─── Partial strip ─────────────────────────────────────────────────────
+
+  test("strips only the legacy keys actually present", () => {
+    writeConfig({
+      // Only a few legacy keys present
+      speed: "fast",
+      services: { inference: { mode: "managed", provider: "anthropic" } },
+      notifications: { decisionModelIntent: "latency-optimized" },
+      // Unrelated keys
+      maxStepsPerSession: 50,
+    });
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    const config = readConfig();
+    expect(config.speed).toBeUndefined();
+    const services = config.services as { inference: Record<string, unknown> };
+    expect(services.inference.provider).toBeUndefined();
+    expect(services.inference.mode).toBe("managed");
+    expect(
+      (config.notifications as Record<string, unknown>).decisionModelIntent,
+    ).toBeUndefined();
+    expect(config.maxStepsPerSession).toBe(50);
+  });
+
+  // ─── services.inference.mode preservation ──────────────────────────────
+
+  test("preserves services.inference.mode while stripping provider/model", () => {
+    writeConfig({
+      services: {
+        inference: { mode: "your-own", provider: "openai", model: "gpt-5.4" },
+      },
+    });
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    const config = readConfig();
+    const services = config.services as { inference: Record<string, unknown> };
+    expect(services.inference.mode).toBe("your-own");
+    expect(services.inference.provider).toBeUndefined();
+    expect(services.inference.model).toBeUndefined();
+  });
+
+  // ─── Idempotency ───────────────────────────────────────────────────────
+
+  test("idempotency: re-running the migration yields no further mutation", () => {
+    writeConfig({
+      speed: "fast",
+      services: {
+        inference: { mode: "your-own", provider: "openai", model: "gpt-5.4" },
+      },
+      heartbeat: { enabled: true, speed: "fast" },
+      calls: { enabled: true, model: "gpt-5.4-nano" },
+    });
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+    const afterFirst = readConfig();
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+    const afterSecond = readConfig();
+
+    expect(afterSecond).toEqual(afterFirst);
+  });
+
+  test("idempotency: writes nothing on a clean config (no targeted keys)", () => {
+    const clean = {
+      llm: { default: { provider: "anthropic", model: "claude-opus-4-6" } },
+      services: { inference: { mode: "your-own" } },
+    };
+    writeConfig(clean);
+
+    const beforeStat = JSON.parse(
+      readFileSync(join(workspaceDir, "config.json"), "utf-8"),
+    );
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    const afterStat = JSON.parse(
+      readFileSync(join(workspaceDir, "config.json"), "utf-8"),
+    );
+
+    expect(afterStat).toEqual(beforeStat);
+  });
+
+  // ─── Defensive shape handling ──────────────────────────────────────────
+
+  test("ignores non-object values at sub-paths (e.g. numbers, arrays)", () => {
+    // Attacker-style or otherwise corrupt config — the migration should not
+    // throw on unexpected shapes. Defensive readObject() helper coerces
+    // non-objects to null and the migration skips them.
+    writeConfig({
+      services: 42,
+      heartbeat: ["a", "b"],
+      calls: null,
+      workspaceGit: { commitMessageLLM: "not-an-object" },
+      // A real legacy key alongside garbage
+      speed: "fast",
+    });
+
+    dropLegacyLlmKeysMigration.run(workspaceDir);
+
+    const config = readConfig();
+    // The valid top-level legacy key still gets stripped
+    expect(config.speed).toBeUndefined();
+    // Garbage shapes preserved as-is
+    expect(config.services).toBe(42);
+    expect(config.heartbeat).toEqual(["a", "b"]);
+    expect(config.calls).toBeNull();
+    expect(config.workspaceGit).toEqual({ commitMessageLLM: "not-an-object" });
+  });
+});
diff --git a/assistant/src/agent/loop.ts b/assistant/src/agent/loop.ts
index b34791b9402..228bd8f472c 100644
--- a/assistant/src/agent/loop.ts
+++ b/assistant/src/agent/loop.ts
@@ -268,10 +268,11 @@ export class AgentLoop {
 
         // Per-call LLM call-site identifier. Surfaces on the per-call
         // `config.callSite` so `RetryProvider.normalizeSendMessageOptions`
-        // can route through `resolveCallSiteConfig`. User-initiated
-        // conversation turns default to `mainAgent` in the agent loop's
-        // caller; other invocation contexts (heartbeat, filing, analyze,
-        // etc.) pass their own `callSite`.
+        // can route through `resolveCallSiteConfig` against
+        // `llm.callSites.<id>` (falling back to `llm.default` when absent).
+        // User-initiated conversation turns default to `mainAgent` in the
+        // agent loop's caller; other invocation contexts (heartbeat, filing,
+        // analyze, etc.) pass their own `callSite`.
         if (callSite) {
           providerConfig.callSite = callSite;
         }
diff --git a/assistant/src/config/bundled-skills/phone-calls/references/CONFIG.md b/assistant/src/config/bundled-skills/phone-calls/references/CONFIG.md
index a494853f1ee..759df738ad6 100644
--- a/assistant/src/config/bundled-skills/phone-calls/references/CONFIG.md
+++ b/assistant/src/config/bundled-skills/phone-calls/references/CONFIG.md
@@ -10,7 +10,7 @@ All call-related settings can be managed via `assistant config`:
 | `calls.userConsultTimeoutSeconds`           | How long to wait for user answers                                                                                                                                                                                                         | `120` (2 min)                                                                                            |
 | `calls.disclosure.enabled`                  | Whether the AI announces itself at call start                                                                                                                                                                                             | `true`                                                                                                   |
 | `calls.disclosure.text`                     | The disclosure message spoken at call start                                                                                                                                                                                               | `"At the very beginning of the call, introduce yourself as an assistant calling on behalf of my human."` |
-| `calls.model`                               | Override LLM model for call orchestration                                                                                                                                                                                                 | _(uses default model)_                                                                                   |
+| `llm.callSites.callAgent.model`             | Override LLM model for call orchestration                                                                                                                                                                                                 | _(unset — falls back to `llm.default.model`)_                                                            |
 | `calls.callerIdentity.allowPerCallOverride` | Allow per-call caller identity selection                                                                                                                                                                                                  | `true`                                                                                                   |
 | `calls.callerIdentity.userNumber`           | E.164 phone number for user-number mode                                                                                                                                                                                                   | _(empty)_                                                                                                |
 | `calls.voice.language`                      | Language code for TTS and transcription                                                                                                                                                                                                   | `en-US`                                                                                                  |
diff --git a/assistant/src/config/schema.ts b/assistant/src/config/schema.ts
index fc6e12250dc..99f03a7eac7 100644
--- a/assistant/src/config/schema.ts
+++ b/assistant/src/config/schema.ts
@@ -62,17 +62,13 @@ export {
 export type {
   ContextOverflowRecoveryConfig,
   ContextWindowConfig,
-  Effort,
   ModelPricingOverride,
-  Speed,
   ThinkingConfig,
 } from "./schemas/inference.js";
 export {
   ContextOverflowRecoveryConfigSchema,
   ContextWindowConfigSchema,
-  EffortSchema,
   ModelPricingOverrideSchema,
-  SpeedSchema,
   ThinkingConfigSchema,
 } from "./schemas/inference.js";
 export type {
@@ -87,6 +83,14 @@ export {
 } from "./schemas/ingress.js";
 export type { JournalConfig } from "./schemas/journal.js";
 export { JournalConfigSchema } from "./schemas/journal.js";
+export type {
+  LLMCallSite,
+  LLMCallSiteConfig,
+  LLMConfig,
+  LLMConfigBase,
+  LLMConfigFragment,
+} from "./schemas/llm.js";
+export { LLMCallSiteEnum, LLMSchema } from "./schemas/llm.js";
 export type { AuditLogConfig, LogFileConfig } from "./schemas/logging.js";
 export {
   AuditLogConfigSchema,
diff --git a/assistant/src/config/schemas/inference.ts b/assistant/src/config/schemas/inference.ts
index 21fc696af18..208845851ba 100644
--- a/assistant/src/config/schemas/inference.ts
+++ b/assistant/src/config/schemas/inference.ts
@@ -25,28 +25,6 @@ export const ThinkingConfigSchema = z
   })
   .describe("Extended thinking (chain-of-thought) configuration");
 
-export const EffortSchema = z
-  .enum(["low", "medium", "high", "max"], {
-    error: 'effort must be "low", "medium", "high", or "max"',
-  })
-  .default("max")
-  .describe(
-    "How much effort the model should put into its response — lower effort is faster and cheaper",
-  );
-
-export type Effort = z.infer<typeof EffortSchema>;
-
-export const SpeedSchema = z
-  .enum(["standard", "fast"], {
-    error: 'speed must be "standard" or "fast"',
-  })
-  .default("standard")
-  .describe(
-    'Inference speed mode — "fast" enables higher output token throughput on supported models (Opus 4.6, Opus 4.7) at premium pricing',
-  );
-
-export type Speed = z.infer<typeof SpeedSchema>;
-
 export const ContextOverflowRecoveryConfigSchema = z
   .object({
     enabled: z
diff --git a/assistant/src/daemon/conversation-agent-loop.ts b/assistant/src/daemon/conversation-agent-loop.ts
index d23c0637f4a..e9f5a015c81 100644
--- a/assistant/src/daemon/conversation-agent-loop.ts
+++ b/assistant/src/daemon/conversation-agent-loop.ts
@@ -359,10 +359,9 @@ export async function runAgentLoopImpl(
     titleText?: string;
     /**
      * LLM call-site identifier threaded into the per-call provider config.
-     * When unset, defaults to `'mainAgent'` so this turn routes through the
-     * main-agent profile in the unified `llm` config. Adapter callers
-     * (heartbeat, filing, schedule, etc.) override with their own call-site
-     * id as PRs 7-11 migrate them off the legacy `speed` / `modelIntent` paths.
+     * Adapter callers (heartbeat, filing, scheduler, etc.) pass their own
+     * call-site id so the resolver picks `llm.callSites.<id>`. When unset,
+     * the agent loop defaults to `'mainAgent'` for user-initiated turns.
      */
     callSite?: LLMCallSite;
   },
@@ -392,7 +391,7 @@ export async function runAgentLoopImpl(
   // invocation contexts (heartbeat, filing, analyze, etc.) pass their own
   // `callSite`. The provider layer resolves provider/model/maxTokens via
   // `resolveCallSiteConfig`, picking up any user overrides under
-  // `llm.callSites.mainAgent`.
+  // `llm.callSites.mainAgent` (falling back to `llm.default` when absent).
   const turnCallSite: LLMCallSite = options?.callSite ?? "mainAgent";
 
   // Capture the turn channel context *before* any awaits so a second
diff --git a/assistant/src/daemon/conversation.ts b/assistant/src/daemon/conversation.ts
index 1c3fa9af48e..4d4167cf7df 100644
--- a/assistant/src/daemon/conversation.ts
+++ b/assistant/src/daemon/conversation.ts
@@ -24,8 +24,7 @@ import type {
 } from "../channels/types.js";
 import { isAssistantFeatureFlagEnabled } from "../config/assistant-feature-flags.js";
 import { getConfig } from "../config/loader.js";
-import type { Speed } from "../config/schemas/inference.js";
-import type { LLMCallSite } from "../config/schemas/llm.js";
+import type { LLMCallSite, Speed } from "../config/schemas/llm.js";
 import {
   ContextWindowManager,
   type ContextWindowResult,
diff --git a/assistant/src/daemon/handlers/shared.ts b/assistant/src/daemon/handlers/shared.ts
index 3ce19037d57..95beaca64ec 100644
--- a/assistant/src/daemon/handlers/shared.ts
+++ b/assistant/src/daemon/handlers/shared.ts
@@ -1,8 +1,7 @@
 import { v4 as uuid } from "uuid";
 
 import { getConfig } from "../../config/loader.js";
-import type { Speed } from "../../config/schemas/inference.js";
-import type { LLMCallSite } from "../../config/schemas/llm.js";
+import type { LLMCallSite, Speed } from "../../config/schemas/llm.js";
 import type { HeartbeatService } from "../../heartbeat/heartbeat-service.js";
 import type { SecretPromptResult } from "../../permissions/secret-prompter.js";
 import type { AuthContext } from "../../runtime/auth/types.js";
diff --git a/assistant/src/filing/filing-service.ts b/assistant/src/filing/filing-service.ts
index ae9bf729d4c..f007a305c21 100644
--- a/assistant/src/filing/filing-service.ts
+++ b/assistant/src/filing/filing-service.ts
@@ -2,7 +2,6 @@ import { existsSync, readFileSync } from "node:fs";
 import { join } from "node:path";
 
 import { getConfig } from "../config/loader.js";
-import type { Speed } from "../config/schemas/inference.js";
 import type { LLMCallSite } from "../config/schemas/llm.js";
 import { bootstrapConversation } from "../memory/conversation-bootstrap.js";
 import { getLogger } from "../util/logger.js";
@@ -43,7 +42,7 @@ export interface FilingDeps {
   processMessage: (
     conversationId: string,
     content: string,
-    options?: { speed?: Speed; callSite?: LLMCallSite },
+    options?: { callSite?: LLMCallSite },
   ) => Promise<{ messageId: string }>;
   onConversationCreated?: (info: {
     conversationId: string;
diff --git a/assistant/src/notifications/README.md b/assistant/src/notifications/README.md
index 710a68df6fa..5b371b94a64 100644
--- a/assistant/src/notifications/README.md
+++ b/assistant/src/notifications/README.md
@@ -31,7 +31,7 @@ The candidate set is serialized into a compact `<conversation-candidates>` block
 
 ### 3. Decision
 
-The decision engine (`decision-engine.ts`) sends the signal to an LLM (configured via `notifications.decisionModelIntent`) along with available channels, the user's preference summary, and the conversation candidate set. The LLM responds with a structured decision: whether to notify, which channels, rendered copy per channel, a deduplication key, and **per-channel conversation actions**.
+The decision engine (`decision-engine.ts`) sends the signal to an LLM (configured via `llm.callSites.notificationDecision`) along with available channels, the user's preference summary, and the conversation candidate set. The LLM responds with a structured decision: whether to notify, which channels, rendered copy per channel, a deduplication key, and **per-channel conversation actions**.
 
 **Conversation actions:** For each selected channel, the LLM decides:
 
@@ -538,10 +538,14 @@ Preferences are sanitized against prompt injection (angle brackets replaced with
 
 ## Configuration
 
-All settings live under the `notifications` key in `config.json`:
+The decision engine and preference extractor pick their per-call LLM config
+from the unified `llm` block. Override defaults by setting either of:
 
-| Key                                 | Type   | Default               | Description                                                              |
-| ----------------------------------- | ------ | --------------------- | ------------------------------------------------------------------------ |
-| `notifications.decisionModelIntent` | string | `"latency-optimized"` | Model intent used for both the decision engine and preference extraction |
+| Key                                  | Type    | Default        | Description                                                                 |
+| ------------------------------------ | ------- | -------------- | --------------------------------------------------------------------------- |
+| `llm.callSites.notificationDecision` | object  | _(unset)_      | Provider/model/effort/etc. override for the decision engine call site       |
+| `llm.callSites.preferenceExtraction` | object  | _(unset)_      | Provider/model/effort/etc. override for the preference extractor call site  |
+
+When a call site override is unset, the resolver falls back to `llm.default`.
 
 The notification pipeline is always active -- signals are processed and dispatched as soon as the daemon is running. The audit trail (events, decisions, deliveries) is written for every signal.
diff --git a/assistant/src/providers/types.ts b/assistant/src/providers/types.ts
index 1179a23b00b..8aed31acd9a 100644
--- a/assistant/src/providers/types.ts
+++ b/assistant/src/providers/types.ts
@@ -135,9 +135,8 @@ export interface SendMessageConfig {
   /**
    * LLM call-site identifier. `RetryProvider` resolves
    * provider/model/maxTokens/effort/speed/temperature/thinking/contextWindow
-   * via `resolveCallSiteConfig(callSite, config.llm)`. Required for any new
-   * caller; the legacy `modelIntent`-based fallback was removed in PR 19 of
-   * the unify-llm-callsites plan.
+   * via `resolveCallSiteConfig(callSite, config.llm)`, falling back to
+   * `llm.default` when no callSite-specific entry is present.
    */
   callSite?: LLMCallSite;
   effort?: "low" | "medium" | "high" | "max";
diff --git a/assistant/src/runtime/btw-sidechain.ts b/assistant/src/runtime/btw-sidechain.ts
index 19bffd33a8a..50c6776d73f 100644
--- a/assistant/src/runtime/btw-sidechain.ts
+++ b/assistant/src/runtime/btw-sidechain.ts
@@ -97,10 +97,8 @@ export async function runBtwSidechain(
       config: {
         max_tokens: params.maxTokens ?? 1024,
         tool_choice: { type: "none" },
-        // Resolution: explicit callSite → default "identityIntro" (the
-        // original purpose of this side-chain runner). The legacy
-        // `modelIntent` parameter was removed in PR 19 of the
-        // unify-llm-callsites plan.
+        // Default call site is "identityIntro" — the original purpose of
+        // this side-chain runner. Callers may override per invocation.
         callSite: params.callSite ?? ("identityIntro" as LLMCallSite),
       },
       onEvent: (event) => {
diff --git a/assistant/src/runtime/services/analyze-conversation.ts b/assistant/src/runtime/services/analyze-conversation.ts
index 90d584e6ce4..62cfa59e413 100644
--- a/assistant/src/runtime/services/analyze-conversation.ts
+++ b/assistant/src/runtime/services/analyze-conversation.ts
@@ -17,8 +17,8 @@
  *
  * Both triggers route the agent loop through `callSite: 'analyzeConversation'`
  * so per-call provider/model selection flows through `resolveCallSiteConfig`
- * against `llm.callSites.analyzeConversation` (populated by the unify-llm
- * workspace migration from the legacy `analysis.modelIntent`/`modelOverride`).
+ * against `llm.callSites.analyzeConversation` (falling back to `llm.default`
+ * when no override is set).
  */
 import type { ServerMessage } from "../../daemon/message-protocol.js";
 import {
diff --git a/cli/src/lib/config-utils.ts b/cli/src/lib/config-utils.ts
index 5d468f6cbc0..10eb785a470 100644
--- a/cli/src/lib/config-utils.ts
+++ b/cli/src/lib/config-utils.ts
@@ -5,8 +5,8 @@ import { join } from "path";
 /**
  * Convert flat dot-notation key=value pairs into a nested config object.
  *
- * e.g. {"services.inference.provider": "anthropic", "services.inference.model": "claude-opus-4-6"}
- *   → {services: {inference: {provider: "anthropic", model: "claude-opus-4-6"}}}
+ * e.g. {"llm.default.provider": "anthropic", "llm.default.model": "claude-opus-4-6"}
+ *   → {llm: {default: {provider: "anthropic", model: "claude-opus-4-6"}}}
  */
 export function buildNestedConfig(
   configValues: Record<string, string>,
@@ -39,8 +39,8 @@ export function buildNestedConfig(
  * values into its workspace config on first boot.
  *
  * Keys use dot-notation to address nested fields. For example:
- *   "services.inference.provider" → {services: {inference: {provider: ...}}}
- *   "services.inference.model"    → {services: {inference: {model: ...}}}
+ *   "llm.default.provider" → {llm: {default: {provider: ...}}}
+ *   "llm.default.model"    → {llm: {default: {model: ...}}}
  *
  * Returns undefined when configValues is empty (nothing to write).
  */
diff --git a/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift b/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift
index 51b7446edf8..2cafac0d82f 100644
--- a/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift
+++ b/clients/macos/vellum-assistant/Features/Settings/SettingsStore.swift
@@ -2170,19 +2170,15 @@ public final class SettingsStore: ObservableObject {
     /// Loads service modes (inference, image-generation) from workspace config.
     /// Called during init and when the daemon reconnects.
     func loadServiceModes(config: [String: Any]) {
-        // Resolve inference provider/model with `llm.default.*` as the
-        // canonical source (PR 4 backfills it from `services.inference`).
-        // Fall back to `services.inference.{provider,model}` for unmigrated
-        // configs — defensive, since the migration should have run for
-        // existing users but skips early-return cases (missing config,
-        // malformed JSON, etc.).
+        // Resolve inference provider/model from `llm.default.*` — the unified
+        // LLM config block is the only source of truth. The
+        // `services.inference.mode` field stays under `services` because it is
+        // an inference-delivery setting (managed vs. your-own), orthogonal to
+        // model selection.
         let services = config["services"] as? [String: Any]
         let llmDefault = (config["llm"] as? [String: Any])?["default"] as? [String: Any]
         let inference = services?["inference"] as? [String: Any]
 
-        // Inference mode remains under `services.inference.mode` — it is an
-        // inference-delivery setting (managed vs. your-own), not part of the
-        // LLM model config.
         if let inference, let mode = inference["mode"] as? String {
             self.inferenceMode = mode
         }
@@ -2191,10 +2187,10 @@ public final class SettingsStore: ObservableObject {
         // via applyModelInfoResponse, its values take precedence over local
         // config which may be stale (especially for remote assistants).
         if lastDaemonProvider == nil {
-            if let provider = (llmDefault?["provider"] as? String) ?? (inference?["provider"] as? String) {
+            if let provider = llmDefault?["provider"] as? String {
                 self.selectedInferenceProvider = provider
             }
-            if let model = (llmDefault?["model"] as? String) ?? (inference?["model"] as? String) {
+            if let model = llmDefault?["model"] as? String {
                 self.selectedModel = model
             }
         }
@@ -2960,29 +2956,8 @@ public final class SettingsStore: ObservableObject {
         return task
     }
 
-    // TODO PR 19: remove. Superseded by setLLMDefaultProvider — kept for any
-    // legacy callers / tests that still write to `services.inference.provider`
-    // directly while the unification rollout is in progress.
-    @discardableResult
-    func setInferenceProvider(_ provider: String) -> Task<Bool, Never> {
-        selectedInferenceProvider = provider
-        let task = Task {
-            let success = await settingsClient.patchConfig([
-                "services": ["inference": ["provider": provider]]
-            ])
-            if !success {
-                log.error("Failed to patch config for inference provider")
-            }
-            return success
-        }
-        scheduleRoutingSourceRefresh()
-        return task
-    }
-
     /// Persists the selected default LLM provider to the daemon config under
-    /// the unified `llm.default.provider` key. This is the canonical write
-    /// path now that the workspace migration consolidates LLM call-site
-    /// settings under `llm.*` (see PR 4 of the unify-llm-callsites plan).
+    /// the unified `llm.default.provider` key.
     ///
     /// Prefer `setLLMDefault(provider:model:)` when both keys change together
     /// — it writes them atomically in a single PATCH so the daemon's
@@ -3793,35 +3768,6 @@ public final class SettingsStore: ObservableObject {
         return "http://\(ip):\(LockfilePaths.resolveGatewayPort(connectedAssistantId: connectedId))"
     }
 
-    // MARK: - Model Actions
-
-    // TODO PR 19: remove. Superseded by setLLMDefaultModel — kept for any
-    // legacy callers (e.g. routes through the daemon's set-model HTTP endpoint
-    // which still enriches state via applyModelInfoResponse) while the
-    // unification rollout is in progress.
-    func setModel(_ model: String, provider: String? = nil, force: Bool = false) {
-        // Skip if neither model nor provider changed (unless forced,
-        // e.g. after an inference-mode switch that requires re-persisting
-        // the model+provider pair even when IDs haven't changed).
-        if !force {
-            let modelUnchanged = model == lastDaemonModel
-            let providerUnchanged = provider == nil || provider == lastDaemonProvider
-            guard !modelUnchanged || !providerUnchanged else { return }
-        }
-        lastDaemonModel = model
-        if let provider { lastDaemonProvider = provider }
-        Task {
-            let info = await settingsClient.setModel(model: model, provider: provider)
-            if let info {
-                applyModelInfoResponse(info)
-            } else if lastDaemonModel == model {
-                // Request failed — revert only if no newer call overwrote lastDaemonModel
-                lastDaemonModel = nil
-                lastDaemonProvider = nil
-            }
-        }
-    }
-
     // MARK: - User Timezone Actions
 
     /// Saves a user timezone override under `ui.userTimezone`.
diff --git a/clients/macos/vellum-assistantTests/MockSettingsClient.swift b/clients/macos/vellum-assistantTests/MockSettingsClient.swift
index 535a6c0f588..e6416f050fd 100644
--- a/clients/macos/vellum-assistantTests/MockSettingsClient.swift
+++ b/clients/macos/vellum-assistantTests/MockSettingsClient.swift
@@ -10,7 +10,6 @@ final class MockSettingsClient: SettingsClientProtocol {
     var saveVercelConfigCalls: [String] = []
     var deleteVercelConfigCallCount = 0
     var fetchModelInfoCallCount = 0
-    var setModelCalls: [(model: String, provider: String?)] = []
     var setImageGenModelCalls: [String] = []
     var fetchTelegramConfigCallCount = 0
     var setTelegramConfigCalls: [(action: String, botToken: String?, commands: [TelegramConfigRequestCommand]?)] = []
@@ -26,7 +25,6 @@ final class MockSettingsClient: SettingsClientProtocol {
     var saveVercelConfigResponse: VercelApiConfigResponseMessage?
     var deleteVercelConfigResponse: VercelApiConfigResponseMessage?
     var modelInfoResponse: ModelInfoMessage?
-    var setModelResponse: ModelInfoMessage?
     var setImageGenModelResponse: ModelInfoMessage?
     var embeddingConfigResponse: EmbeddingConfigMessage?
     var setEmbeddingConfigResponse: EmbeddingConfigMessage?
@@ -73,8 +71,7 @@ final class MockSettingsClient: SettingsClientProtocol {
     }
 
     func setModel(model: String, provider: String? = nil) async -> ModelInfoMessage? {
-        setModelCalls.append((model: model, provider: provider))
-        return setModelResponse
+        nil
     }
 
     func setImageGenModel(modelId: String) async -> ModelInfoMessage? {
diff --git a/gateway/ARCHITECTURE.md b/gateway/ARCHITECTURE.md
index b92b45f33eb..9e2450c2d6a 100644
--- a/gateway/ARCHITECTURE.md
+++ b/gateway/ARCHITECTURE.md
@@ -1112,7 +1112,7 @@ Call behavior is controlled via the `calls` config block in the assistant config
 | `calls.disclosure.enabled`        | boolean  | `true`                         | Whether the AI should disclose it is an AI at the start of the call.                                                                                                                      |
 | `calls.disclosure.text`           | string   | _(default disclosure prompt)_  | The disclosure instruction included in the system prompt.                                                                                                                                 |
 | `calls.safety.denyCategories`     | string[] | `[]`                           | Categories of calls to deny (e.g., emergency numbers are always denied regardless of this setting).                                                                                       |
-| `calls.model`                     | string   | _(unset — uses default model)_ | Optional override for the LLM model used in voice call conversations.                                                                                                                     |
+| `llm.callSites.callAgent.model`   | string   | _(unset — falls back to `llm.default.model`)_ | Optional override for the LLM model used in voice call conversations.                                                                                                                     |
 | `calls.voice.language`            | string   | `'en-US'`                      | Language code for TTS and transcription.                                                                                                                                                  |
 | `services.stt.provider`           | enum     | `'deepgram'`                   | STT provider for all boundaries including telephony. Determines the Twilio integration path (ConversationRelay-native for `deepgram`/`google-gemini`, media-stream for `openai-whisper`). |
 | `services.tts.provider`           | enum     | `'elevenlabs'`                 | Active TTS provider for speech synthesis (catalog-driven; see [TTS Provider Abstraction](../assistant/ARCHITECTURE.md#tts-provider-abstraction-servicestts)).                             |