Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 24 additions & 101 deletions apps/web-roo-code/src/app/evals/evals.tsx
Original file line number Diff line number Diff line change
@@ -1,59 +1,33 @@
"use client"

import { useMemo } from "react"
import { ScatterChart, Scatter, XAxis, YAxis, Label, Customized, Cross } from "recharts"

import type { TaskMetrics, Run } from "@roo-code/evals"

import { formatTokens, formatCurrency, formatDuration, formatScore } from "@/lib"
import { useOpenRouterModels } from "@/lib/hooks"
import {
ChartContainer,
ChartTooltip,
ChartTooltipContent,
ChartConfig,
ChartLegend,
ChartLegendContent,
Table,
TableBody,
TableCaption,
TableCell,
TableHead,
TableHeader,
TableRow,
} from "@/components/ui"
import { Table, TableBody, TableCaption, TableCell, TableHead, TableHeader, TableRow } from "@/components/ui"

import type { EvalRun } from "./types"
import { Plot } from "./plot"

export function Evals({
runs,
}: {
runs: (Run & {
label: string
score: number
languageScores?: Record<"go" | "java" | "javascript" | "python" | "rust", number>
taskMetrics: TaskMetrics
modelId?: string
})[]
}) {
export function Evals({ runs }: { runs: EvalRun[] }) {
const { data: openRouterModels } = useOpenRouterModels()

const tableData = useMemo(
const tableData: (EvalRun & { label: string; cost: number })[] = useMemo(
() =>
runs.map((run) => ({
...run,
label: run.description || run.model,
score: run.score,
cost: run.taskMetrics.cost,
model: openRouterModels?.[run.modelId ?? ""],
modelInfo: openRouterModels?.[run.modelId ?? ""]?.modelInfo,
})),
[runs, openRouterModels],
)
runs.map((run) => {
const openRouterModelInfo = openRouterModels?.[run.modelId ?? ""]?.modelInfo

const chartData = useMemo(() => tableData.filter(({ cost }) => cost < 100), [tableData])

const chartConfig = useMemo(
() => chartData.reduce((acc, run) => ({ ...acc, [run.label]: run }), {} as ChartConfig),
[chartData],
return {
...run,
label: run.name || run.description || run.model,
cost: run.taskMetrics.cost,
description: run.description ?? openRouterModelInfo?.description ?? null,
contextWindow: run.contextWindow ?? openRouterModelInfo?.contextWindow ?? null,
inputPrice: run.inputPrice ?? openRouterModelInfo?.inputPrice ?? null,
outputPrice: run.outputPrice ?? openRouterModelInfo?.outputPrice ?? null,
}
}),
[runs, openRouterModels],
)

return (
Expand Down Expand Up @@ -127,17 +101,15 @@ export function Evals({
<TableBody className="font-mono">
{tableData.map((run) => (
<TableRow key={run.id}>
<TableCell title={run.model?.description}>
<TableCell title={run.description ?? undefined}>
<div className="font-sans">{run.label}</div>
<div className="text-xs opacity-50">
{formatTokens(run.modelInfo?.contextWindow ?? 0)}
</div>
<div className="text-xs opacity-50">{formatTokens(run.contextWindow ?? 0)}</div>
</TableCell>
<TableCell className="border-r">
<div className="flex flex-row gap-2">
<div>{formatCurrency(run.modelInfo?.inputPrice ?? 0)}</div>
<div>{formatCurrency(run.inputPrice ?? 0)}</div>
<div className="opacity-25">/</div>
<div>{formatCurrency(run.modelInfo?.outputPrice ?? 0)}</div>
<div>{formatCurrency(run.outputPrice ?? 0)}</div>
</div>
</TableCell>
<TableCell className="font-mono">{formatDuration(run.taskMetrics.duration)}</TableCell>
Expand Down Expand Up @@ -169,58 +141,9 @@ export function Evals({
))}
</TableBody>
<TableCaption>
<div className="pb-4 font-medium">Cost Versus Score</div>
<ChartContainer config={chartConfig} className="h-[500px] w-full">
<ScatterChart margin={{ top: 0, right: 0, bottom: 0, left: 20 }}>
<XAxis
type="number"
dataKey="cost"
name="Cost"
domain={[
(dataMin: number) => Math.round((dataMin - 5) / 5) * 5,
(dataMax: number) => Math.round((dataMax + 5) / 5) * 5,
]}
tickFormatter={(value) => formatCurrency(value)}>
<Label value="Cost" position="bottom" offset={0} />
</XAxis>
<YAxis
type="number"
dataKey="score"
name="Score"
domain={[
(dataMin: number) => Math.max(0, Math.round((dataMin - 5) / 5) * 5),
(dataMax: number) => Math.min(100, Math.round((dataMax + 5) / 5) * 5),
]}
tickFormatter={(value) => `${value}%`}>
<Label value="Score" angle={-90} position="left" dy={-15} />
</YAxis>
<ChartTooltip content={<ChartTooltipContent labelKey="label" hideIndicator />} />
<Customized component={renderQuadrant} />
{chartData.map((d, i) => (
<Scatter key={d.label} name={d.label} data={[d]} fill={`hsl(var(--chart-${i + 1}))`} />
))}
<ChartLegend content={<ChartLegendContent />} />
</ScatterChart>
</ChartContainer>
<div className="py-4 text-xs opacity-50">
(Note: Very expensive models are excluded from the scatter plot.)
</div>
<Plot tableData={tableData} />
</TableCaption>
</Table>
</div>
)
}

/**
 * Custom chart layer handed to recharts' `<Customized component={...} />`.
 *
 * Renders a `Cross` that spans the `width` and `height` found on `props`,
 * with its intersection at (width / 2 + 35, height / 2 - 15), stroked in the
 * current text color at 10% opacity.
 *
 * NOTE(review): the +35 / -15 offsets presumably compensate for the axis and
 * margin space around the plot area — confirm against the rendered chart.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const renderQuadrant = (props: any) => (
<Cross
width={props.width}
height={props.height}
x={props.width / 2 + 35}
y={props.height / 2 - 15}
top={0}
left={0}
stroke="currentColor"
opacity={0.1}
/>
)
128 changes: 128 additions & 0 deletions apps/web-roo-code/src/app/evals/plot.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
"use client"

import { useMemo } from "react"
import { ScatterChart, Scatter, XAxis, YAxis, Label, Customized, Cross, LabelList } from "recharts"

import { formatCurrency } from "@/lib"
import { ChartContainer, ChartTooltip, ChartTooltipContent, ChartConfig } from "@/components/ui"

import type { EvalRun } from "./types"

/**
 * Props accepted by the `Plot` component.
 */
type PlotProps = {
// Runs to chart; each entry is an `EvalRun` that also carries a `label` string and a numeric `cost`.
tableData: (EvalRun & { label: string; cost: number })[]
}

// Runs whose cost is at or above this value are left off the chart; the note
// rendered under the chart tells the reader that expensive runs are excluded.
const MAX_PLOTTED_COST = 50

/**
 * "Cost Versus Score" scatter plot for a list of eval runs.
 *
 * Each run below the cost cut-off becomes its own single-point `Scatter`
 * series so it can have its own color and label. The x axis is `cost`
 * (formatted as currency) and the y axis is `score` (formatted as a percentage
 * and clamped to the 0–100 range).
 *
 * @param tableData Runs to plot; only `label`, `cost`, `score` and `id` are read here.
 */
export const Plot = ({ tableData }: PlotProps) => {
	const chartData = useMemo(() => tableData.filter(({ cost }) => cost < MAX_PLOTTED_COST), [tableData])

	// Chart config keyed by run label. Built with a plain loop so the cost stays
	// linear in the number of runs (spreading the accumulator on every step is
	// quadratic). If two runs share a label the later one wins.
	const chartConfig = useMemo(() => {
		const config: ChartConfig = {}

		for (const run of chartData) {
			config[run.label] = run
		}

		return config
	}, [chartData])

	return (
		<>
			<div className="pb-4 font-medium">Cost Versus Score</div>
			<ChartContainer config={chartConfig} className="h-[500px] w-full">
				<ScatterChart margin={{ top: 0, right: 0, bottom: 0, left: 20 }}>
					<XAxis
						type="number"
						dataKey="cost"
						name="Cost"
						// Pad the data range by 5 on each side and snap it to a multiple of 5.
						domain={[
							(dataMin: number) => Math.round((dataMin - 5) / 5) * 5,
							(dataMax: number) => Math.round((dataMax + 5) / 5) * 5,
						]}
						tickFormatter={(value) => formatCurrency(value)}>
						<Label value="Cost" position="bottom" offset={0} />
					</XAxis>
					<YAxis
						type="number"
						dataKey="score"
						name="Score"
						// Same padding and snapping as the x axis, but never outside 0–100.
						domain={[
							(dataMin: number) => Math.max(0, Math.round((dataMin - 5) / 5) * 5),
							(dataMax: number) => Math.min(100, Math.round((dataMax + 5) / 5) * 5),
						]}
						tickFormatter={(value) => `${value}%`}>
						<Label value="Score" angle={-90} position="left" dy={-15} />
					</YAxis>
					<ChartTooltip content={<ChartTooltipContent labelKey="label" hideIndicator />} />
					<Customized component={renderQuadrant} />
					{chartData.map((d, index) => (
						// Key on the run id, not the label: labels are display text and two
						// runs may share one, which would give React duplicate keys.
						<Scatter
							key={d.id}
							name={d.label}
							data={[d]}
							fill={generateSpectrumColor(index, chartData.length)}>
							<LabelList dataKey="label" position="top" offset={8} content={renderCustomLabel} />
						</Scatter>
					))}
				</ScatterChart>
			</ChartContainer>
			<div className="py-4 text-xs opacity-50">
				(Note: Very expensive models are excluded from the scatter plot.)
			</div>
		</>
	)
}

/**
 * Layer component passed to recharts' `<Customized />`: draws a faint
 * crosshair across the chart.
 *
 * The cross covers the full `width` × `height` received in `props` and is
 * centered at (width / 2 + 35, height / 2 - 15).
 *
 * NOTE(review): the 35 / 15 pixel offsets presumably account for axis and
 * margin space — confirm against the rendered chart.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const renderQuadrant = (props: any) => {
	const { width, height } = props
	const centerX = width / 2 + 35
	const centerY = height / 2 - 15

	return (
		<Cross
			width={width}
			height={height}
			x={centerX}
			y={centerY}
			top={0}
			left={0}
			stroke="currentColor"
			opacity={0.1}
		/>
	)
}

/**
 * Shortens `text` to at most `maxChars` characters, replacing the tail with a
 * single "…" when it is too long. Strings that already fit are returned as-is.
 */
const truncateLabel = (text: string, maxChars: number = 12): string => {
	// Count and cut by code point so a surrogate pair (e.g. an emoji) is never
	// split in half, which would leave a broken character before the ellipsis.
	const chars = Array.from(text)

	if (chars.length <= maxChars) {
		return text
	}

	return chars.slice(0, maxChars - 1).join("") + "…"
}

/**
 * Custom `content` renderer for the `LabelList` on each scatter point.
 *
 * Draws the (truncated) label as small, centered SVG text 5 units above the
 * `y` position it is given, and ignores pointer events.
 *
 * @param props Untyped props supplied by the caller; `x`, `y` and `value` are read.
 * @returns An SVG `<text>` element, or `null` when there is no value to show.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const renderCustomLabel = (props: any) => {
	const { x, y, value } = props
	const maxWidth = 80 // Maximum width in pixels - adjust as needed.

	// `value` is untyped here; render nothing rather than throw when it is missing.
	if (value === undefined || value === null) {
		return null
	}

	return (
		<text
			x={x}
			y={y - 5}
			fontSize="10"
			fontWeight="500"
			fill="currentColor"
			opacity="0.8"
			textAnchor="middle"
			dominantBaseline="auto"
			// NOTE(review): the box-model properties below (maxWidth, overflow,
			// textOverflow) likely have no effect on an SVG <text> element; the
			// character-based truncation is what actually bounds the label. Confirm
			// before relying on them.
			style={{
				pointerEvents: "none",
				maxWidth: `${maxWidth}px`,
				overflow: "hidden",
				textOverflow: "ellipsis",
				whiteSpace: "nowrap",
			}}>
			{/* Coerce to string so a numeric value is truncated instead of crashing. */}
			{truncateLabel(String(value))}
		</text>
	)
}

/**
 * Picks a color for item `index` out of `total` by spacing hues evenly around
 * the color wheel, starting at 0 (red).
 *
 * @param index Zero-based position of the item.
 * @param total Number of items sharing the spectrum.
 * @returns A CSS `hsl(h, 70%, 50%)` string with the hue rounded to an integer.
 */
const generateSpectrumColor = (index: number, total: number): string => {
	// Use high saturation for vibrant colors.
	const saturation = 70
	// Use medium lightness for good visibility on both light and dark backgrounds.
	const lightness = 50

	// Distribute hues evenly across the color wheel (0-360 degrees). A
	// non-positive (or NaN) total would otherwise divide by zero and yield
	// "hsl(NaN, …)" / "hsl(Infinity, …)", which is not a valid color, so fall
	// back to hue 0.
	const hue = total > 0 ? (index * 360) / total : 0

	return `hsl(${Math.round(hue)}, ${saturation}%, ${lightness}%)`
}
9 changes: 9 additions & 0 deletions apps/web-roo-code/src/app/evals/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import type { TaskMetrics, Run } from "@roo-code/evals"

/**
 * A `Run` from `@roo-code/evals` extended with the extra fields the evals page
 * and its scatter plot read.
 */
export type EvalRun = Run & {
// Display text for the run.
label: string
// Overall score for the run. NOTE(review): the plot formats this as a percentage and clamps its axis to 0–100 — confirm the unit at the source.
score: number
// Optional per-language scores, one number for each of the five listed languages.
languageScores?: Record<"go" | "java" | "javascript" | "python" | "rust", number>
// Metrics for the run; the evals page reads `cost` and `duration` from it.
taskMetrics: TaskMetrics
// Optional model identifier; the evals page uses it as the lookup key into the OpenRouter models record.
modelId?: string
}
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ export const getOpenRouterModels = async (): Promise<OpenRouterModelRecord> => {

return result.data.data
.filter((rawModel) => {
// Skip image generation models (models that output images)
// Skip image generation models (models that output images).
return !rawModel.architecture?.output_modalities?.includes("image")
})
.sort((a, b) => a.name.localeCompare(b.name))
Expand Down
2 changes: 2 additions & 0 deletions packages/evals/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
"drizzle-kit:production": "dotenvx run -f .env.production -- tsx node_modules/drizzle-kit/bin.cjs",
"db:generate": "pnpm drizzle-kit generate",
"db:migrate": "pnpm drizzle-kit migrate",
"db:test:migrate": "pnpm drizzle-kit:test migrate",
"db:production:migrate": "pnpm drizzle-kit:production migrate",
"db:push": "pnpm drizzle-kit push",
"db:test:push": "pnpm drizzle-kit:test push",
"db:production:push": "pnpm drizzle-kit:production push",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
-- Adds six columns to "runs": a text name, an integer context window, and four real-valued price columns.
-- None of the columns is declared NOT NULL or given a DEFAULT, so they are nullable.
-- NOTE(review): the units of the price columns are not stated here — confirm against the code that writes them.
ALTER TABLE "runs" ADD COLUMN "name" text;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "contextWindow" integer;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "inputPrice" real;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "outputPrice" real;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "cacheWritesPrice" real;--> statement-breakpoint
ALTER TABLE "runs" ADD COLUMN "cacheReadsPrice" real;
Loading