diff --git a/apps/web-evals/src/actions/runs.ts b/apps/web-evals/src/actions/runs.ts index a3fb3feccc8..9d213547cee 100644 --- a/apps/web-evals/src/actions/runs.ts +++ b/apps/web-evals/src/actions/runs.ts @@ -13,6 +13,9 @@ import { exerciseLanguages, createRun as _createRun, deleteRun as _deleteRun, + updateRun as _updateRun, + getIncompleteRuns as _getIncompleteRuns, + deleteRunsByIds as _deleteRunsByIds, createTask, getExercisesForLanguage, } from "@roo-code/evals" @@ -20,6 +23,9 @@ import { import { CreateRun } from "@/lib/schemas" import { redisClient } from "@/lib/server/redis" +// Storage base path for eval logs +const EVALS_STORAGE_PATH = "/tmp/evals/runs" + const EVALS_REPO_PATH = path.resolve(path.dirname(fileURLToPath(import.meta.url)), "../../../../../evals") export async function createRun({ suite, exercises = [], timeout, iterations = 1, ...values }: CreateRun) { @@ -214,3 +220,150 @@ export async function killRun(runId: number): Promise { errors, } } + +export type DeleteIncompleteRunsResult = { + success: boolean + deletedCount: number + deletedRunIds: number[] + storageErrors: string[] +} + +/** + * Delete all incomplete runs (runs without a taskMetricsId/final score). + * Removes both database records and storage folders. 
+ *
+ * @returns Summary of the deletion: the ids that were removed plus any
+ * storage-folder failures. `success` is `true` whenever the database delete
+ * completed, even if some storage folders could not be removed.
+ */
+export async function deleteIncompleteRuns(): Promise<DeleteIncompleteRunsResult> {
+	const incompleteRuns = await _getIncompleteRuns()
+	return deleteRunsWithStorage(incompleteRuns.map((run) => run.id))
+}
+
+/**
+ * Get count of incomplete runs (for UI display).
+ *
+ * @returns Number of runs currently reported by `getIncompleteRuns`.
+ */
+export async function getIncompleteRunsCount(): Promise<number> {
+	const incompleteRuns = await _getIncompleteRuns()
+	return incompleteRuns.length
+}
+
+/**
+ * Delete all runs older than 30 days.
+ * Removes both database records and storage folders.
+ *
+ * @returns Summary of the deletion, in the same shape as `deleteIncompleteRuns`.
+ */
+export async function deleteOldRuns(): Promise<DeleteIncompleteRunsResult> {
+	const maxAgeMs = 30 * 24 * 60 * 60 * 1000
+	const cutoff = new Date(Date.now() - maxAgeMs)
+
+	// `getRuns` is loaded lazily rather than through the static import list at
+	// the top of the file.
+	// NOTE(review): this loads every run and filters in memory; a query that
+	// filters on `createdAt` in the database would scale better.
+	const { getRuns } = await import("@roo-code/evals")
+	const allRuns = await getRuns()
+	const oldRunIds = allRuns.filter((run) => run.createdAt < cutoff).map((run) => run.id)
+
+	return deleteRunsWithStorage(oldRunIds)
+}
+
+/**
+ * Shared cleanup used by `deleteIncompleteRuns` and `deleteOldRuns`: removes
+ * each run's storage folder and Redis keys, then deletes the database rows
+ * and revalidates the runs page.
+ *
+ * Storage and Redis cleanup are best-effort. Storage failures are collected in
+ * `storageErrors`; Redis failures are only logged. A failure of the database
+ * delete itself propagates to the caller.
+ *
+ * @param runIds Ids of the runs to remove. An empty list is a no-op.
+ * @returns Summary of what was deleted.
+ */
+async function deleteRunsWithStorage(runIds: number[]): Promise<DeleteIncompleteRunsResult> {
+	const storageErrors: string[] = []
+
+	if (runIds.length === 0) {
+		return { success: true, deletedCount: 0, deletedRunIds: [], storageErrors }
+	}
+
+	// Resolve the Redis client once instead of once per run. Redis cleanup is
+	// non-critical, so a connection failure only disables that step.
+	let redis: Awaited<ReturnType<typeof redisClient>> | undefined
+	try {
+		redis = await redisClient()
+	} catch (error) {
+		console.error("Failed to get Redis client, skipping Redis cleanup:", error)
+	}
+
+	for (const runId of runIds) {
+		const storagePath = path.join(EVALS_STORAGE_PATH, String(runId))
+
+		try {
+			if (fs.existsSync(storagePath)) {
+				// Asynchronous removal: a recursive delete of a large log folder
+				// must not block the event loop of the server action.
+				await fs.promises.rm(storagePath, { recursive: true, force: true })
+				console.log(`Deleted storage folder: ${storagePath}`)
+			}
+		} catch (error) {
+			console.error(`Failed to delete storage folder ${storagePath}:`, error)
+			storageErrors.push(`Failed to delete storage for run ${runId}`)
+		}
+
+		if (redis) {
+			try {
+				await redis.del(`heartbeat:${runId}`)
+				await redis.del(`runners:${runId}`)
+			} catch (error) {
+				// Non-critical error, just log it
+				console.error(`Failed to clear Redis state for run ${runId}:`, error)
+			}
+		}
+	}
+
+	// Delete from database
+	await _deleteRunsByIds(runIds)
+
+	revalidatePath("/runs")
+
+	return {
+		success: true,
+		deletedCount: runIds.length,
+		deletedRunIds: runIds,
+		storageErrors,
+	}
+}
+
+/**
+ * Update the description of a run.
+ */ +export async function updateRunDescription(runId: number, description: string | null): Promise<{ success: boolean }> { + try { + await _updateRun(runId, { description }) + revalidatePath("/runs") + revalidatePath(`/runs/${runId}`) + return { success: true } + } catch (error) { + console.error("Failed to update run description:", error) + return { success: false } + } +} diff --git a/apps/web-evals/src/components/home/run.tsx b/apps/web-evals/src/components/home/run.tsx index 4abbfc67b65..99950bae436 100644 --- a/apps/web-evals/src/components/home/run.tsx +++ b/apps/web-evals/src/components/home/run.tsx @@ -2,12 +2,12 @@ import { useCallback, useState, useRef } from "react" import Link from "next/link" import { useRouter } from "next/navigation" import { toast } from "sonner" -import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown } from "lucide-react" +import { Ellipsis, ClipboardList, Copy, Check, LoaderCircle, Trash, Settings, FileDown, StickyNote } from "lucide-react" import type { Run as EvalsRun, TaskMetrics as EvalsTaskMetrics } from "@roo-code/evals" import type { ToolName } from "@roo-code/types" -import { deleteRun } from "@/actions/runs" +import { deleteRun, updateRunDescription } from "@/actions/runs" import { formatCurrency, formatDateTime, @@ -20,6 +20,10 @@ import { Button, TableCell, TableRow, + Textarea, + Tooltip, + TooltipContent, + TooltipTrigger, DropdownMenu, DropdownMenuContent, DropdownMenuItem, @@ -34,6 +38,7 @@ import { AlertDialogTitle, Dialog, DialogContent, + DialogFooter, DialogHeader, DialogTitle, ScrollArea, @@ -43,16 +48,41 @@ type RunProps = { run: EvalsRun taskMetrics: EvalsTaskMetrics | null toolColumns: ToolName[] + consolidatedToolColumns: string[] } -export function Run({ run, taskMetrics, toolColumns }: RunProps) { +export function Run({ run, taskMetrics, toolColumns, consolidatedToolColumns }: RunProps) { const router = useRouter() const [deleteRunId, setDeleteRunId] = useState() const 
[showSettings, setShowSettings] = useState(false) const [isExportingLogs, setIsExportingLogs] = useState(false) + const [showNotesDialog, setShowNotesDialog] = useState(false) + const [editingDescription, setEditingDescription] = useState(run.description ?? "") + const [isSavingNotes, setIsSavingNotes] = useState(false) const continueRef = useRef(null) const { isPending, copyRun, copied } = useCopyRun(run.id) + const hasDescription = Boolean(run.description && run.description.trim().length > 0) + + const handleSaveDescription = useCallback(async () => { + setIsSavingNotes(true) + try { + const result = await updateRunDescription(run.id, editingDescription.trim() || null) + if (result.success) { + toast.success("Description saved") + setShowNotesDialog(false) + router.refresh() + } else { + toast.error("Failed to save description") + } + } catch (error) { + console.error("Error saving description:", error) + toast.error("Failed to save description") + } finally { + setIsSavingNotes(false) + } + }, [run.id, editingDescription, router]) + const onExportFailedLogs = useCallback(async () => { if (run.failed === 0) { toast.error("No failed tasks to export") @@ -140,6 +170,68 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { )} + {consolidatedToolColumns.length > 0 && ( + + {taskMetrics?.toolUsage ? ( + (() => { + // Calculate aggregated stats for consolidated tools + let totalAttempts = 0 + let totalFailures = 0 + const breakdown: Array<{ tool: string; attempts: number; rate: string }> = [] + + for (const toolName of consolidatedToolColumns) { + const usage = taskMetrics.toolUsage[toolName as ToolName] + if (usage) { + totalAttempts += usage.attempts + totalFailures += usage.failures + const rate = + usage.attempts > 0 + ? `${Math.round(((usage.attempts - usage.failures) / usage.attempts) * 100)}%` + : "0%" + breakdown.push({ tool: toolName, attempts: usage.attempts, rate }) + } + } + + const consolidatedRate = + totalAttempts > 0 ? 
((totalAttempts - totalFailures) / totalAttempts) * 100 : 100 + const rateColor = + consolidatedRate === 100 + ? "text-muted-foreground" + : consolidatedRate >= 80 + ? "text-yellow-500" + : "text-red-500" + + return totalAttempts > 0 ? ( + + +
+ {totalAttempts} + {Math.round(consolidatedRate)}% +
+
+ +
+
Consolidated Tools:
+ {breakdown.map(({ tool, attempts, rate }) => ( +
+ {tool}: + + {attempts} ({rate}) + +
+ ))} +
+
+
+ ) : ( + - + ) + })() + ) : ( + - + )} +
+ )} {toolColumns.map((toolName) => { const usage = taskMetrics?.toolUsage?.[toolName] const successRate = @@ -166,80 +258,107 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { {taskMetrics && formatCurrency(taskMetrics.cost)} {taskMetrics && formatDuration(taskMetrics.duration)} e.stopPropagation()}> - - - - - -
- -
View Tasks
-
- -
- {run.settings && ( - setShowSettings(true)}> -
- -
View Settings
-
-
- )} - {run.taskMetricsId && ( - copyRun()} disabled={isPending || copied}> -
- {isPending ? ( - <> - - Copying... - - ) : copied ? ( - <> - - Copied! - - ) : ( - <> - - Copy to Production - - )} -
+
+ {/* Note Icon */} + + + + + + {hasDescription ? ( +
{run.description}
+ ) : ( +
No description. Click to add one.
+ )} +
+
+ + {/* More Actions Menu */} + + + + + +
+ +
View Tasks
+
+
- )} - {run.failed > 0 && ( - + {run.settings && ( + setShowSettings(true)}> +
+ +
View Settings
+
+
+ )} + {run.taskMetricsId && ( + copyRun()} disabled={isPending || copied}> +
+ {isPending ? ( + <> + + Copying... + + ) : copied ? ( + <> + + Copied! + + ) : ( + <> + + Copy to Production + + )} +
+
+ )} + {run.failed > 0 && ( + +
+ {isExportingLogs ? ( + <> + + Exporting... + + ) : ( + <> + + Export Failed Logs + + )} +
+
+ )} + { + setDeleteRunId(run.id) + setTimeout(() => continueRef.current?.focus(), 0) + }}>
- {isExportingLogs ? ( - <> - - Exporting... - - ) : ( - <> - - Export Failed Logs - - )} + +
Delete
- )} - { - setDeleteRunId(run.id) - setTimeout(() => continueRef.current?.focus(), 0) - }}> -
- -
Delete
-
-
-
-
+ + +
setDeleteRunId(undefined)}> @@ -268,6 +387,39 @@ export function Run({ run, taskMetrics, toolColumns }: RunProps) { + + {/* Notes/Description Dialog */} + + + + Run Description + +
+