From d870677bd5de393d8c2b55002f410544c202a563 Mon Sep 17 00:00:00 2001
From: Hannes Rudolph <hrudolph@gmail.com>
Date: Mon, 5 Jan 2026 13:03:57 -0700
Subject: [PATCH 1/4] feat(web-evals): remember last Roo model selection

---
 .roo/skills/evals-context/SKILL.md            | 188 ++++++++++++++++++
 apps/web-evals/src/app/runs/new/new-run.tsx   |  59 +++++-
 .../__tests__/normalize-create-run.spec.ts    |  43 ++++
 .../roo-last-model-selection.spec.ts          |  58 ++++++
 .../web-evals/src/lib/normalize-create-run.ts |  19 ++
 .../src/lib/roo-last-model-selection.ts       |  52 +++++
 6 files changed, 416 insertions(+), 3 deletions(-)
 create mode 100644 .roo/skills/evals-context/SKILL.md
 create mode 100644 apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
 create mode 100644 apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
 create mode 100644 apps/web-evals/src/lib/normalize-create-run.ts
 create mode 100644 apps/web-evals/src/lib/roo-last-model-selection.ts

diff --git a/.roo/skills/evals-context/SKILL.md b/.roo/skills/evals-context/SKILL.md
new file mode 100644
index 00000000000..985b788b94f
--- /dev/null
+++ b/.roo/skills/evals-context/SKILL.md
@@ -0,0 +1,188 @@
+---
+name: evals-context
+description: Provides context about the Roo Code evals system structure in this monorepo. Use when tasks mention "evals", "evaluation", "eval runs", "eval exercises", or working with the evals infrastructure. Helps distinguish between the evals execution system (packages/evals, apps/web-evals) and the public website evals display page (apps/web-roo-code/src/app/evals).
+---
+
+# Evals Codebase Context
+
+## When to Use This Skill
+
+Use this skill when the task involves:
+
+- Modifying or debugging the evals execution infrastructure
+- Adding new eval exercises or languages
+- Working with the evals web interface (apps/web-evals)
+- Modifying the public evals display page on roocode.com
+- Understanding where evals code lives in this monorepo
+
+## When NOT to Use This Skill
+
+Do NOT use this skill when:
+
+- Working on unrelated parts of the codebase (extension, webview-ui, etc.)
+- The task is purely about the VS Code extension's core functionality
+- Working on the main website pages that don't involve evals
+
+## Key Disambiguation: Two "Evals" Locations
+
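+This monorepo has **two distinct evals-related areas** that can cause confusion: the evals execution system (`packages/evals/` plus its management UI in `apps/web-evals/`) and the public website evals page. The exercises themselves live in an external repo: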
+
+| Component                   | Path                                                           | Purpose                                                        |
+| --------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------- |
+| **Evals Execution System**  | `packages/evals/`                                              | Core eval infrastructure: CLI, DB schema, Docker configs       |
+| **Evals Management UI**     | `apps/web-evals/`                                              | Next.js app for creating/monitoring eval runs (localhost:3446) |
+| **Website Evals Page**      | `apps/web-roo-code/src/app/evals/`                             | Public roocode.com page displaying eval results                |
+| **External Exercises Repo** | [Roo-Code-Evals](https://github.com/RooCodeInc/Roo-Code-Evals) | Actual coding exercises (NOT in this monorepo)                 |
+
+## Directory Structure Reference
+
+### `packages/evals/` - Core Evals Package
+
+```
+packages/evals/
+├── ARCHITECTURE.md          # Detailed architecture documentation
+├── ADDING-EVALS.md          # Guide for adding new exercises/languages
+├── README.md                # Setup and running instructions
+├── docker-compose.yml       # Container orchestration
+├── Dockerfile.runner        # Runner container definition
+├── Dockerfile.web           # Web app container
+├── drizzle.config.ts        # Database ORM config
+├── src/
+│   ├── index.ts             # Package exports
+│   ├── cli/                 # CLI commands for running evals
+│   │   ├── runEvals.ts      # Orchestrates complete eval runs
+│   │   ├── runTask.ts       # Executes individual tasks in containers
+│   │   ├── runUnitTest.ts   # Validates task completion via tests
+│   │   └── redis.ts         # Redis pub/sub integration
+│   ├── db/
+│   │   ├── schema.ts        # Database schema (runs, tasks)
+│   │   ├── queries/         # Database query functions
+│   │   └── migrations/      # SQL migrations
+│   └── exercises/
+│       └── index.ts         # Exercise loading utilities
+└── scripts/
+    └── setup.sh             # Local macOS setup script
+```
+
+### `apps/web-evals/` - Evals Management Web App
+
+```
+apps/web-evals/
+├── src/
+│   ├── app/
+│   │   ├── page.tsx         # Home page (runs list)
+│   │   ├── runs/
+│   │   │   ├── new/         # Create new eval run
+│   │   │   └── [id]/        # View specific run status
+│   │   └── api/runs/        # SSE streaming endpoint
+│   ├── actions/             # Server actions
+│   │   ├── runs.ts          # Run CRUD operations
+│   │   ├── tasks.ts         # Task queries
+│   │   ├── exercises.ts     # Exercise listing
+│   │   └── heartbeat.ts     # Controller health checks
+│   ├── hooks/               # React hooks (SSE, models, etc.)
+│   └── lib/                 # Utilities and schemas
+```
+
+### `apps/web-roo-code/src/app/evals/` - Public Website Evals Page
+
+```
+apps/web-roo-code/src/app/evals/
+├── page.tsx      # Fetches and displays public eval results
+├── evals.tsx     # Main evals display component
+├── plot.tsx      # Visualization component
+└── types.ts      # EvalRun type (extends packages/evals types)
+```
+
+This page **displays** eval results on the public roocode.com website. It imports types from `@roo-code/evals` but does NOT run evals.
+
+## Architecture Overview
+
+The evals system is a distributed evaluation platform that runs AI coding tasks in isolated VS Code environments:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│  Web App (apps/web-evals)  ──────────────────────────────── │
+│        │                                                    │
+│        ▼                                                    │
+│  PostgreSQL ◄────► Controller Container                     │
+│        │               │                                    │
+│        ▼               ▼                                    │
+│     Redis ◄───► Runner Containers (1-25 parallel)           │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Key components:**
+
+- **Controller**: Orchestrates eval runs, spawns runners, manages task queue (p-queue)
+- **Runner**: Isolated Docker container with VS Code + Roo Code extension + language runtimes
+- **Redis**: Pub/sub for real-time events (NOT task queuing)
+- **PostgreSQL**: Stores runs, tasks, metrics
+
+## Common Tasks Quick Reference
+
+### Adding a New Eval Exercise
+
+1. Add exercise to [Roo-Code-Evals](https://github.com/RooCodeInc/Roo-Code-Evals) repo (external)
+2. See [`packages/evals/ADDING-EVALS.md`](packages/evals/ADDING-EVALS.md) for structure
+
+### Modifying Eval CLI Behavior
+
+Edit files in [`packages/evals/src/cli/`](packages/evals/src/cli/):
+
+- [`runEvals.ts`](packages/evals/src/cli/runEvals.ts) - Run orchestration
+- [`runTask.ts`](packages/evals/src/cli/runTask.ts) - Task execution
+- [`runUnitTest.ts`](packages/evals/src/cli/runUnitTest.ts) - Test validation
+
+### Modifying the Evals Web Interface
+
+Edit files in [`apps/web-evals/src/`](apps/web-evals/src/):
+
+- [`app/runs/new/new-run.tsx`](apps/web-evals/src/app/runs/new/new-run.tsx) - New run form
+- [`actions/runs.ts`](apps/web-evals/src/actions/runs.ts) - Run server actions
+
+### Modifying the Public Evals Display Page
+
+Edit files in [`apps/web-roo-code/src/app/evals/`](apps/web-roo-code/src/app/evals/):
+
+- [`evals.tsx`](apps/web-roo-code/src/app/evals/evals.tsx) - Display component
+- [`plot.tsx`](apps/web-roo-code/src/app/evals/plot.tsx) - Charts
+
+### Database Schema Changes
+
+1. Edit [`packages/evals/src/db/schema.ts`](packages/evals/src/db/schema.ts)
+2. Generate migration: `cd packages/evals && pnpm drizzle-kit generate`
+3. Apply migration: `pnpm drizzle-kit migrate`
+
+## Running Evals Locally
+
+```bash
+# From repo root
+pnpm evals
+
+# Opens web UI at http://localhost:3446
+```
+
+**Ports (defaults):**
+
+- PostgreSQL: 5433
+- Redis: 6380
+- Web: 3446
+
+## Testing
+
+```bash
+# packages/evals tests
+cd packages/evals && npx vitest run
+
+# apps/web-evals tests
+cd apps/web-evals && npx vitest run
+```
+
+## Key Types/Exports from `@roo-code/evals`
+
+The package exports are defined in [`packages/evals/src/index.ts`](packages/evals/src/index.ts):
+
+- Database queries: `getRuns`, `getTasks`, `getTaskMetrics`, etc.
+- Schema types: `Run`, `Task`, `TaskMetrics`
+- Used by both `apps/web-evals` and `apps/web-roo-code`
diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx
index 28fb4abfd5c..aefd51528a4 100644
--- a/apps/web-evals/src/app/runs/new/new-run.tsx
+++ b/apps/web-evals/src/app/runs/new/new-run.tsx
@@ -48,6 +48,9 @@ import {
 } from "@/lib/schemas"
 import { cn } from "@/lib/utils"
 
+import { loadRooLastModelSelection, saveRooLastModelSelection } from "@/lib/roo-last-model-selection"
+import { normalizeCreateRunForSubmit } from "@/lib/normalize-create-run"
+
 import { useOpenRouterModels } from "@/hooks/use-open-router-models"
 import { useRooCodeCloudModels } from "@/hooks/use-roo-code-cloud-models"
 
@@ -147,6 +150,7 @@ export function NewRun() {
 	})
 
 	const {
+		register,
 		setValue,
 		clearErrors,
 		watch,
@@ -155,6 +159,33 @@ export function NewRun() {
 
 	const [suite, settings] = watch(["suite", "settings", "concurrency"])
 
+	const selectedModelIds = useMemo(
+		() => modelSelections.map((s) => s.model).filter((m) => m.length > 0),
+		[modelSelections],
+	)
+
+	const applyModelIds = useCallback(
+		(modelIds: string[]) => {
+			const unique = Array.from(new Set(modelIds.map((m) => m.trim()).filter((m) => m.length > 0)))
+
+			if (unique.length === 0) {
+				setModelSelections([{ id: crypto.randomUUID(), model: "", popoverOpen: false }])
+				setValue("model", "")
+				return
+			}
+
+			setModelSelections(unique.map((model) => ({ id: crypto.randomUUID(), model, popoverOpen: false })))
+			setValue("model", unique[0] ?? "")
+		},
+		[setValue],
+	)
+
+	// Ensure the `exercises` field is registered so RHF always includes it in submit values.
+	useEffect(() => {
+		register("exercises")
+	}, [register])
+
+	// Load settings from localStorage on mount
 	useEffect(() => {
 		const savedConcurrency = localStorage.getItem("evals-concurrency")
 
@@ -215,6 +246,24 @@ export function NewRun() {
 		}
 	}, [setValue])
 
+	// When switching to Roo provider, restore last-used selection if current selection is empty
+	useEffect(() => {
+		if (provider !== "roo") return
+		if (selectedModelIds.length > 0) return
+
+		const last = loadRooLastModelSelection()
+		if (last.length > 0) {
+			applyModelIds(last)
+		}
+	}, [applyModelIds, provider, selectedModelIds.length])
+
+	// Persist last-used Roo provider model selection
+	useEffect(() => {
+		if (provider !== "roo") return
+		saveRooLastModelSelection(selectedModelIds)
+	}, [provider, selectedModelIds])
+
+	// Extract unique languages from exercises
 	const languages = useMemo(() => {
 		if (!exercises.data) {
 			return []
@@ -337,7 +386,10 @@ export function NewRun() {
 	const onSubmit = useCallback(
 		async (values: CreateRun) => {
 			try {
-				if (provider === "roo" && !values.jobToken?.trim()) {
+				const baseValues = normalizeCreateRunForSubmit(values, selectedExercises, suite)
+
+				// Validate jobToken for Roo Code Cloud provider
+				if (provider === "roo" && !baseValues.jobToken?.trim()) {
 					toast.error("Roo Code Cloud Token is required")
 					return
 				}
@@ -374,8 +426,7 @@ export function NewRun() {
 						await new Promise((resolve) => setTimeout(resolve, 20_000))
 					}
 
-					const runValues = { ...values }
-					runValues.executionMethod = executionMethod
+					const runValues = { ...baseValues }
 
 					if (provider === "openrouter") {
 						runValues.model = selection.model
@@ -424,6 +475,8 @@ export function NewRun() {
 			}
 		},
 		[
+			suite,
+			selectedExercises,
 			provider,
 			executionMethod,
 			modelSelections,
diff --git a/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
new file mode 100644
index 00000000000..2e43eeeea72
--- /dev/null
+++ b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
@@ -0,0 +1,43 @@
+import { normalizeCreateRunForSubmit } from "../normalize-create-run"
+
+describe("normalizeCreateRunForSubmit", () => {
+	it("uses selectedExercises for partial suite", () => {
+		const result = normalizeCreateRunForSubmit(
+			{
+				model: "roo/model-a",
+				description: "",
+				suite: "partial",
+				exercises: [],
+				settings: undefined,
+				concurrency: 1,
+				timeout: 5,
+				iterations: 1,
+				jobToken: "",
+			},
+			["js/foo", "py/bar"],
+		)
+
+		expect(result.suite).toBe("partial")
+		expect(result.exercises).toEqual(["js/foo", "py/bar"])
+	})
+
+	it("clears exercises for full suite", () => {
+		const result = normalizeCreateRunForSubmit(
+			{
+				model: "roo/model-a",
+				description: "",
+				suite: "full",
+				exercises: ["js/foo"],
+				settings: undefined,
+				concurrency: 1,
+				timeout: 5,
+				iterations: 1,
+				jobToken: "",
+			},
+			["js/foo"],
+		)
+
+		expect(result.suite).toBe("full")
+		expect(result.exercises).toEqual([])
+	})
+})
diff --git a/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts b/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
new file mode 100644
index 00000000000..db853fd4a2b
--- /dev/null
+++ b/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
@@ -0,0 +1,58 @@
+import {
+	loadRooLastModelSelection,
+	ROO_LAST_MODEL_SELECTION_KEY,
+	saveRooLastModelSelection,
+} from "../roo-last-model-selection"
+
+class LocalStorageMock implements Storage {
+	private store = new Map<string, string>()
+
+	get length(): number {
+		return this.store.size
+	}
+
+	clear(): void {
+		this.store.clear()
+	}
+
+	getItem(key: string): string | null {
+		return this.store.get(key) ?? null
+	}
+
+	key(index: number): string | null {
+		return Array.from(this.store.keys())[index] ?? null
+	}
+
+	removeItem(key: string): void {
+		this.store.delete(key)
+	}
+
+	setItem(key: string, value: string): void {
+		this.store.set(key, value)
+	}
+}
+
+beforeEach(() => {
+	Object.defineProperty(globalThis, "localStorage", {
+		value: new LocalStorageMock(),
+		configurable: true,
+	})
+})
+
+describe("roo-last-model-selection", () => {
+	it("saves and loads (deduped + trimmed)", () => {
+		saveRooLastModelSelection([" roo/model-a ", "roo/model-a", "roo/model-b"])
+		expect(loadRooLastModelSelection()).toEqual(["roo/model-a", "roo/model-b"])
+	})
+
+	it("ignores invalid JSON", () => {
+		localStorage.setItem(ROO_LAST_MODEL_SELECTION_KEY, "{this is not json")
+		expect(loadRooLastModelSelection()).toEqual([])
+	})
+
+	it("clears when empty", () => {
+		localStorage.setItem(ROO_LAST_MODEL_SELECTION_KEY, JSON.stringify(["roo/model-a"]))
+		saveRooLastModelSelection([])
+		expect(localStorage.getItem(ROO_LAST_MODEL_SELECTION_KEY)).toBeNull()
+	})
+})
diff --git a/apps/web-evals/src/lib/normalize-create-run.ts b/apps/web-evals/src/lib/normalize-create-run.ts
new file mode 100644
index 00000000000..2b0020447be
--- /dev/null
+++ b/apps/web-evals/src/lib/normalize-create-run.ts
@@ -0,0 +1,19 @@
+import type { CreateRun } from "./schemas"
+
+/**
+ * The New Run UI keeps exercise selection in component state.
+ * This normalizer ensures we submit the *visible/selected* exercises when suite is partial, and an empty list otherwise.
+ */
+export function normalizeCreateRunForSubmit(
+	values: CreateRun,
+	selectedExercises: string[],
+	suiteOverride?: CreateRun["suite"],
+): CreateRun {
+	const suite = suiteOverride ?? values.suite
+
+	return {
+		...values,
+		suite,
+		exercises: suite === "partial" ? selectedExercises : [],
+	}
+}
diff --git a/apps/web-evals/src/lib/roo-last-model-selection.ts b/apps/web-evals/src/lib/roo-last-model-selection.ts
new file mode 100644
index 00000000000..c6969741bed
--- /dev/null
+++ b/apps/web-evals/src/lib/roo-last-model-selection.ts
@@ -0,0 +1,52 @@
+import { z } from "zod"
+
+export const ROO_LAST_MODEL_SELECTION_KEY = "evals-roo-last-model-selection"
+
+const modelIdListSchema = z.array(z.string())
+
+function hasLocalStorage(): boolean {
+	try {
+		return typeof localStorage !== "undefined"
+	} catch {
+		return false
+	}
+}
+
+function tryParseJson(raw: string | null): unknown {
+	if (raw === null) return undefined
+	try {
+		return JSON.parse(raw)
+	} catch {
+		return undefined
+	}
+}
+
+function normalizeModelIds(modelIds: string[]): string[] {
+	const unique = new Set<string>()
+	for (const id of modelIds) {
+		const trimmed = id.trim()
+		if (trimmed) unique.add(trimmed)
+	}
+	return Array.from(unique)
+}
+
+export function loadRooLastModelSelection(): string[] {
+	if (!hasLocalStorage()) return []
+
+	const parsed = modelIdListSchema.safeParse(tryParseJson(localStorage.getItem(ROO_LAST_MODEL_SELECTION_KEY)))
+	if (!parsed.success) return []
+
+	return normalizeModelIds(parsed.data)
+}
+
+export function saveRooLastModelSelection(modelIds: string[]): void {
+	if (!hasLocalStorage()) return
+
+	const normalized = normalizeModelIds(modelIds)
+	if (normalized.length === 0) {
+		localStorage.removeItem(ROO_LAST_MODEL_SELECTION_KEY)
+		return
+	}
+
+	localStorage.setItem(ROO_LAST_MODEL_SELECTION_KEY, JSON.stringify(normalized))
+}

From 1cf799b307624149a2a7320f526cb0ffe028216c Mon Sep 17 00:00:00 2001
From: Hannes Rudolph <hrudolph@gmail.com>
Date: Mon, 5 Jan 2026 13:59:57 -0700
Subject: [PATCH 2/4] fix(web-evals): reset model selections on provider switch
 and fix lint warning

- Add a useEffect that resets model selections when switching between
  providers. This prevents OpenRouter model IDs from persisting after a
  switch to Roo, which was causing Roo's stored selection to be
  overwritten with the wrong IDs.

- Remove unused 'executionMethod' from the onSubmit dependency array to
  fix the react-hooks/exhaustive-deps warning.
---
 apps/web-evals/src/app/runs/new/new-run.tsx | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx
index aefd51528a4..80fa47bff7b 100644
--- a/apps/web-evals/src/app/runs/new/new-run.tsx
+++ b/apps/web-evals/src/app/runs/new/new-run.tsx
@@ -246,6 +246,18 @@ export function NewRun() {
 		}
 	}, [setValue])
 
+	// Track previous provider to detect switches
+	const [prevProvider, setPrevProvider] = useState(provider)
+
+	// Reset model selections when switching providers to avoid cross-contamination
+	useEffect(() => {
+		if (provider !== prevProvider) {
+			setModelSelections([{ id: crypto.randomUUID(), model: "", popoverOpen: false }])
+			setValue("model", "")
+			setPrevProvider(provider)
+		}
+	}, [provider, prevProvider, setValue])
+
 	// When switching to Roo provider, restore last-used selection if current selection is empty
 	useEffect(() => {
 		if (provider !== "roo") return
@@ -478,7 +490,6 @@ export function NewRun() {
 			suite,
 			selectedExercises,
 			provider,
-			executionMethod,
 			modelSelections,
 			configSelections,
 			importedSettings,

From 0bcb8a35f3eb2618fc684ab273791fbbbbbe20f6 Mon Sep 17 00:00:00 2001
From: Hannes Rudolph <hrudolph@gmail.com>
Date: Mon, 5 Jan 2026 14:12:45 -0700
Subject: [PATCH 3/4] fix(web-evals): add missing executionMethod to test cases

---
 apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
index 2e43eeeea72..5543c589977 100644
--- a/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
+++ b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
@@ -13,6 +13,7 @@ describe("normalizeCreateRunForSubmit", () => {
 				timeout: 5,
 				iterations: 1,
 				jobToken: "",
+				executionMethod: "vscode",
 			},
 			["js/foo", "py/bar"],
 		)
@@ -33,6 +34,7 @@ describe("normalizeCreateRunForSubmit", () => {
 				timeout: 5,
 				iterations: 1,
 				jobToken: "",
+				executionMethod: "vscode",
 			},
 			["js/foo"],
 		)

From ef80ab4b4bc1219dcb880f5386e797d8f8f9de58 Mon Sep 17 00:00:00 2001
From: Hannes Rudolph <hrudolph@gmail.com>
Date: Tue, 6 Jan 2026 11:50:53 -0700
Subject: [PATCH 4/4] fix(web-evals): harden localStorage + keep provider
 selections

---
 apps/web-evals/src/app/runs/new/new-run.tsx   | 34 ++++++++++++++-----
 .../__tests__/normalize-create-run.spec.ts    | 20 +++++++++++
 .../roo-last-model-selection.spec.ts          | 20 +++++++++++
 .../web-evals/src/lib/normalize-create-run.ts |  3 +-
 .../src/lib/roo-last-model-selection.ts       | 30 ++++++++++++++--
 5 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/apps/web-evals/src/app/runs/new/new-run.tsx b/apps/web-evals/src/app/runs/new/new-run.tsx
index 80fa47bff7b..cea15c6ddd8 100644
--- a/apps/web-evals/src/app/runs/new/new-run.tsx
+++ b/apps/web-evals/src/app/runs/new/new-run.tsx
@@ -1,6 +1,6 @@
 "use client"
 
-import { useCallback, useEffect, useMemo, useState } from "react"
+import { useCallback, useEffect, useMemo, useRef, useState } from "react"
 import { useRouter } from "next/navigation"
 import { z } from "zod"
 import { useQuery } from "@tanstack/react-query"
@@ -106,6 +106,8 @@ type ConfigSelection = {
 
 export function NewRun() {
 	const router = useRouter()
+	const modelSelectionsByProviderRef = useRef<Record<string, ModelSelection[]>>({})
+	const modelValueByProviderRef = useRef<Record<string, string>>({})
 
 	const [provider, setModelSource] = useState<"roo" | "openrouter" | "other">("other")
 	const [executionMethod, setExecutionMethod] = useState<ExecutionMethod>("vscode")
@@ -154,6 +156,7 @@ export function NewRun() {
 		setValue,
 		clearErrors,
 		watch,
+		getValues,
 		formState: { isSubmitting },
 	} = form
 
@@ -249,14 +252,29 @@ export function NewRun() {
 	// Track previous provider to detect switches
 	const [prevProvider, setPrevProvider] = useState(provider)
 
-	// Reset model selections when switching providers to avoid cross-contamination
+	// Preserve selections per provider; avoids cross-contamination while keeping UX stable.
 	useEffect(() => {
-		if (provider !== prevProvider) {
-			setModelSelections([{ id: crypto.randomUUID(), model: "", popoverOpen: false }])
-			setValue("model", "")
-			setPrevProvider(provider)
-		}
-	}, [provider, prevProvider, setValue])
+		if (provider === prevProvider) return
+
+		modelSelectionsByProviderRef.current[prevProvider] = modelSelections
+		modelValueByProviderRef.current[prevProvider] = getValues("model")
+
+		const nextModelSelections =
+			modelSelectionsByProviderRef.current[provider] ??
+			([{ id: crypto.randomUUID(), model: "", popoverOpen: false }] satisfies ModelSelection[])
+
+		setModelSelections(nextModelSelections)
+
+		const nextModelValue =
+			modelValueByProviderRef.current[provider] ??
+			nextModelSelections.find((s) => s.model.trim().length > 0)?.model ??
+			(provider === "other" && importedSettings && configSelections[0]?.configName
+				? (getModelId(importedSettings.apiConfigs[configSelections[0].configName] ?? {}) ?? "")
+				: "")
+
+		setValue("model", nextModelValue)
+		setPrevProvider(provider)
+	}, [provider, prevProvider, modelSelections, setValue, getValues, importedSettings, configSelections])
 
 	// When switching to Roo provider, restore last-used selection if current selection is empty
 	useEffect(() => {
diff --git a/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
index 5543c589977..947df313545 100644
--- a/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
+++ b/apps/web-evals/src/lib/__tests__/normalize-create-run.spec.ts
@@ -22,6 +22,26 @@ describe("normalizeCreateRunForSubmit", () => {
 		expect(result.exercises).toEqual(["js/foo", "py/bar"])
 	})
 
+	it("dedupes selectedExercises for partial suite", () => {
+		const result = normalizeCreateRunForSubmit(
+			{
+				model: "roo/model-a",
+				description: "",
+				suite: "partial",
+				exercises: [],
+				settings: undefined,
+				concurrency: 1,
+				timeout: 5,
+				iterations: 1,
+				jobToken: "",
+				executionMethod: "vscode",
+			},
+			["js/foo", "js/foo", "py/bar"],
+		)
+
+		expect(result.exercises).toEqual(["js/foo", "py/bar"])
+	})
+
 	it("clears exercises for full suite", () => {
 		const result = normalizeCreateRunForSubmit(
 			{
diff --git a/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts b/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
index db853fd4a2b..45879b4be5b 100644
--- a/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
+++ b/apps/web-evals/src/lib/__tests__/roo-last-model-selection.spec.ts
@@ -55,4 +55,24 @@ describe("roo-last-model-selection", () => {
 		saveRooLastModelSelection([])
 		expect(localStorage.getItem(ROO_LAST_MODEL_SELECTION_KEY)).toBeNull()
 	})
+
+	it("does not throw if localStorage access fails", () => {
+		Object.defineProperty(globalThis, "localStorage", {
+			value: {
+				getItem: () => {
+					throw new Error("blocked")
+				},
+				setItem: () => {
+					throw new Error("blocked")
+				},
+				removeItem: () => {
+					throw new Error("blocked")
+				},
+			},
+			configurable: true,
+		})
+
+		expect(() => loadRooLastModelSelection()).not.toThrow()
+		expect(() => saveRooLastModelSelection(["roo/model-a"])).not.toThrow()
+	})
 })
diff --git a/apps/web-evals/src/lib/normalize-create-run.ts b/apps/web-evals/src/lib/normalize-create-run.ts
index 2b0020447be..a5f21ba5ad1 100644
--- a/apps/web-evals/src/lib/normalize-create-run.ts
+++ b/apps/web-evals/src/lib/normalize-create-run.ts
@@ -10,10 +10,11 @@ export function normalizeCreateRunForSubmit(
 	suiteOverride?: CreateRun["suite"],
 ): CreateRun {
 	const suite = suiteOverride ?? values.suite
+	const normalizedSelectedExercises = Array.from(new Set(selectedExercises))
 
 	return {
 		...values,
 		suite,
-		exercises: suite === "partial" ? selectedExercises : [],
+		exercises: suite === "partial" ? normalizedSelectedExercises : [],
 	}
 }
diff --git a/apps/web-evals/src/lib/roo-last-model-selection.ts b/apps/web-evals/src/lib/roo-last-model-selection.ts
index c6969741bed..b66d493172f 100644
--- a/apps/web-evals/src/lib/roo-last-model-selection.ts
+++ b/apps/web-evals/src/lib/roo-last-model-selection.ts
@@ -12,6 +12,30 @@ function hasLocalStorage(): boolean {
 	}
 }
 
+function safeGetItem(key: string): string | null {
+	try {
+		return localStorage.getItem(key)
+	} catch {
+		return null
+	}
+}
+
+function safeSetItem(key: string, value: string): void {
+	try {
+		localStorage.setItem(key, value)
+	} catch {
+		// Best-effort write: swallow storage errors (e.g. blocked access) so callers never throw.
+	}
+}
+
+function safeRemoveItem(key: string): void {
+	try {
+		localStorage.removeItem(key)
+	} catch {
+		// Best-effort removal: swallow storage errors (e.g. blocked access) so callers never throw.
+	}
+}
+
 function tryParseJson(raw: string | null): unknown {
 	if (raw === null) return undefined
 	try {
@@ -33,7 +57,7 @@ function normalizeModelIds(modelIds: string[]): string[] {
 export function loadRooLastModelSelection(): string[] {
 	if (!hasLocalStorage()) return []
 
-	const parsed = modelIdListSchema.safeParse(tryParseJson(localStorage.getItem(ROO_LAST_MODEL_SELECTION_KEY)))
+	const parsed = modelIdListSchema.safeParse(tryParseJson(safeGetItem(ROO_LAST_MODEL_SELECTION_KEY)))
 	if (!parsed.success) return []
 
 	return normalizeModelIds(parsed.data)
@@ -44,9 +68,9 @@ export function saveRooLastModelSelection(modelIds: string[]): void {
 
 	const normalized = normalizeModelIds(modelIds)
 	if (normalized.length === 0) {
-		localStorage.removeItem(ROO_LAST_MODEL_SELECTION_KEY)
+		safeRemoveItem(ROO_LAST_MODEL_SELECTION_KEY)
 		return
 	}
 
-	localStorage.setItem(ROO_LAST_MODEL_SELECTION_KEY, JSON.stringify(normalized))
+	safeSetItem(ROO_LAST_MODEL_SELECTION_KEY, JSON.stringify(normalized))
 }