diff --git a/.github/actions/ui-judge-comment/README.md b/.github/actions/ui-judge-comment/README.md index 8706ec6739..fc79efab0d 100644 --- a/.github/actions/ui-judge-comment/README.md +++ b/.github/actions/ui-judge-comment/README.md @@ -25,8 +25,19 @@ Example result payload: { "results": [ { + "demoId": "recs", "dimension": "visual-correctness", - "score": 5, + "dimensions": [ + { + "dimension": "usability-interaction", + "dimensionLabel": "Usability & Interaction", + "score": 4, + "steps": [], + "url": "http://127.0.0.1:3000/render.html?demo=recs", + "weight": 30 + } + ], + "score": 3, "steps": [], "url": "http://127.0.0.1:3000/render.html?demo=recs" } @@ -34,6 +45,10 @@ Example result payload: } ``` +When a result includes weighted GEQI `dimensions`, the comment renders one row +per example, adds one column per GEQI dimension, and shows the weighted +100-point GEQI score without replacing the visual-correctness score. + Inputs: - `result-file`: path to a JSON result file. diff --git a/.github/actions/ui-judge-comment/comment.mjs b/.github/actions/ui-judge-comment/comment.mjs index 280b1e0dcd..b07a6cab30 100644 --- a/.github/actions/ui-judge-comment/comment.mjs +++ b/.github/actions/ui-judge-comment/comment.mjs @@ -129,13 +129,17 @@ function normalizeResult(result, index) { } return { + demoId: stringValue(result.demoId), dimension: stringValue(result.dimension) || 'visual-correctness', + dimensionLabel: stringValue(result.dimensionLabel), + dimensions: normalizeDimensionResults(result.dimensions, index), error: normalizeError(result.error), reference: stringValue(result.reference), score: normalizeScore(result.score, index), steps: normalizeSteps(result.steps), task: stringValue(result.task), url: stringValue(result.url), + weight: normalizeWeight(result.weight, index), }; } @@ -149,6 +153,54 @@ function normalizeScore(value, index) { return Math.max(0, Math.min(5, Math.round(score))); } +function normalizeDimensionResults(dimensions, resultIndex) { + if (!Array.isArray(dimensions)) return []; + + return dimensions.map((dimensionResult, dimensionIndex) => + normalizeDimensionResult( + dimensionResult, + `${resultIndex}.${dimensionIndex}`, + ) + ); +} + +function normalizeDimensionResult(result, index) { + if (!result || typeof result !== 'object') { + throw new Error( + `UI Judge dimension result at index ${index} must be an object.`, + ); + } + + const dimension = stringValue(result.dimension); + if (!dimension) { + throw new Error( + `UI Judge dimension result at index ${index} is missing dimension.`, + ); + } + + return { + dimension, + dimensionLabel: stringValue(result.dimensionLabel) || dimension, + error: normalizeError(result.error), + score: normalizeScore(result.score, index), + steps: normalizeSteps(result.steps), + url: stringValue(result.url), + weight: normalizeWeight(result.weight, index), + }; +} + +function normalizeWeight(value, index) { + if (value === undefined || value === null || value === '') return undefined; + + const weight = typeof value === 'number' ? value : Number(value); + if (!Number.isFinite(weight) || weight <= 0) { + throw new Error( + `UI Judge result at index ${index} has an invalid dimension weight.`, + ); + } + return weight; +} + function normalizeError(error) { if (!error) return undefined; if (typeof error === 'string') return { message: error }; @@ -169,17 +221,31 @@ function normalizeSteps(steps) { function formatComment({ marker, results, title }) { const average = results.reduce((sum, result) => sum + result.score, 0) / results.length; - const failedCount = results.filter((result) => result.error).length; + const dimensionColumns = buildDimensionColumns(results); + const weightedSummary = buildWeightedSummary(results); + const failedCount = results.filter((result) => hasResultError(result)).length; const runLink = getRunLink(); const lines = [ marker, `### ${escapeMarkdown(title)}`, '', - `Average score: **${formatScore(average)} / 5** across ${ - pluralize(results.length, 'result') - }.`, ]; + if (weightedSummary) { + lines.push( + `GEQI weighted score: **${ + formatScore(weightedSummary.score) + } / 100** across ${pluralize(results.length, 'example')}.`, + `Average visual-correctness score: **${formatScore(average)} / 5**.`, + ); + } else { + lines.push( + `Average score: **${formatScore(average)} / 5** across ${ + pluralize(results.length, 'result') + }.`, + ); + } + if (failedCount > 0) { lines.push( `${failedCount} ${ @@ -188,12 +254,27 @@ function formatComment({ marker, results, title }) { ); } - lines.push( - '', - '| # | Dimension | Score | Page | Status |', - '| - | - | -: | - | - |', - ...results.map((result, index) => formatTableRow(result, index)), - ); + if (weightedSummary) { + lines.push( + '', + '| Dimension | Weight | Average | Results | Status |', + '| - | -: | -: | -: | - |', + ...weightedSummary.dimensions.map((dimension) => + formatDimensionSummaryRow(dimension) + ), + ); + } + + lines.push(''); + if (dimensionColumns.length > 0) { + lines.push(...formatDimensionColumnTable(results, dimensionColumns)); + } else { + lines.push( + '| # | Example | Dimension | Weight | Score | Page | Status |', + '| - | - | - | -: | -: | - | - |', + ...results.map((result, index) => formatTableRow(result, index)), + ); + } const details = results .map((result, index) => formatResultDetails(result, index)) @@ -216,23 +297,234 @@ function formatComment({ marker, results, title }) { return lines.join('\n'); } +function hasResultError(result) { + return Boolean(result.error) + || result.dimensions.some((dimensionResult) => dimensionResult.error); +} + +function buildDimensionColumns(results) { + const columns = new Map(); + for (const result of results) { + for (const dimensionResult of result.dimensions) { + if (columns.has(dimensionResult.dimension)) continue; + + columns.set(dimensionResult.dimension, { + dimension: dimensionResult.dimension, + label: dimensionResult.dimensionLabel || dimensionResult.dimension, + weight: dimensionResult.weight, + }); + } + } + return [...columns.values()]; +} + +function buildWeightedSummary(results) { + const weightedResults = getWeightedDimensionResults(results); + if (weightedResults.length === 0) return undefined; + + const dimensionsById = new Map(); + for (const result of weightedResults) { + const existing = dimensionsById.get(result.dimension); + if (existing) { + existing.count += 1; + existing.errorCount += result.error ? 1 : 0; + existing.score += result.score; + continue; + } + + dimensionsById.set(result.dimension, { + count: 1, + dimension: result.dimension, + errorCount: result.error ? 1 : 0, + label: result.dimensionLabel || result.dimension, + score: result.score, + weight: result.weight, + }); + } + + const dimensions = [...dimensionsById.values()].map((dimension) => ({ + ...dimension, + average: dimension.score / dimension.count, + })); + const totalWeight = dimensions.reduce( + (sum, dimension) => sum + dimension.weight, + 0, + ); + if (totalWeight <= 0) return undefined; + + return { + dimensions, + score: dimensions.reduce( + (sum, dimension) => + sum + (dimension.average / 5) * (dimension.weight / totalWeight) * 100, + 0, + ), + }; +} + +function getWeightedDimensionResults(results) { + const weightedResults = []; + for (const result of results) { + if (result.dimensions.length > 0) { + weightedResults.push( + ...result.dimensions.filter((dimensionResult) => + dimensionResult.weight + ), + ); + continue; + } + + if (result.weight) { + weightedResults.push(result); + } + } + return weightedResults; +} + +function formatDimensionSummaryRow(dimension) { + const status = dimension.errorCount > 0 + ? `${dimension.errorCount} error${dimension.errorCount === 1 ? '' : 's'}` + : 'OK'; + return [ + escapeTableCell(dimension.label), + `${formatScore(dimension.weight)}%`, + `${formatScore(dimension.average)} / 5`, + String(dimension.count), + status, + ].join(' | ').replace(/^/, '| ').replace(/$/, ' |'); +} + +function formatDimensionColumnTable(results, dimensionColumns) { + const headers = [ + '#', + 'Example', + 'Visual Correctness', + ...dimensionColumns.map((dimension) => + formatDimensionColumnHeader(dimension) + ), + 'GEQI', + 'Page', + 'Status', + ]; + const alignment = [ + '-', + '-', + '-:', + ...dimensionColumns.map(() => '-:'), + '-:', + '-', + '-', + ]; + + return [ + formatTableLine(headers), + formatTableLine(alignment), + ...results.map((result, index) => + formatDimensionColumnTableRow(result, index, dimensionColumns) + ), + ]; +} + +function formatDimensionColumnHeader(dimension) { + const weight = dimension.weight ? ` (${formatScore(dimension.weight)}%)` : ''; + return `${dimension.label}${weight}`; +} + +function formatDimensionColumnTableRow(result, index, dimensionColumns) { + const page = result.url + ? `[preview](${sanitizeUrlForMarkdown(result.url)})` + : 'n/a'; + const status = hasResultError(result) ? 'Error' : 'OK'; + return formatTableLine([ + String(index + 1), + result.demoId || 'n/a', + `${result.score} / 5`, + ...dimensionColumns.map((dimension) => + formatDimensionScoreCell(result, dimension) + ), + formatGeqiScoreCell(result.dimensions), + page, + status, + ]); +} + +function formatDimensionScoreCell(result, dimension) { + const dimensionResult = result.dimensions.find((candidate) => + candidate.dimension === dimension.dimension + ); + if (!dimensionResult) return 'n/a'; + return `${dimensionResult.score} / 5`; +} + +function formatGeqiScoreCell(dimensions) { + const geqiScore = calculateGeqiScore(dimensions); + return geqiScore === undefined ? 'n/a' : `${formatScore(geqiScore)} / 100`; +} + +function calculateGeqiScore(dimensions) { + const weightedDimensions = dimensions.filter((dimension) => dimension.weight); + const totalWeight = weightedDimensions.reduce( + (sum, dimension) => sum + dimension.weight, + 0, + ); + if (totalWeight <= 0) return undefined; + + return weightedDimensions.reduce( + (sum, dimension) => + sum + (dimension.score / 5) * (dimension.weight / totalWeight) * 100, + 0, + ); +} + function formatTableRow(result, index) { const page = result.url ? `[preview](${sanitizeUrlForMarkdown(result.url)})` : 'n/a'; - const status = result.error ? 'Error' : 'OK'; - return [ + const status = hasResultError(result) ? 'Error' : 'OK'; + return formatTableLine([ String(index + 1), - escapeTableCell(result.dimension), + result.demoId || 'n/a', + result.dimensionLabel || result.dimension, + result.weight ? `${formatScore(result.weight)}%` : 'n/a', `${result.score} / 5`, page, status, - ].join(' | ').replace(/^/, '| ').replace(/$/, ' |'); + ]); } function formatResultDetails(result, index) { const lines = [`#### Result ${index + 1}`, '']; + if (result.demoId) { + lines.push(`- Example: ${truncateText(result.demoId)}`); + } + if (result.dimensionLabel || result.dimension) { + lines.push( + `- Dimension: ${truncateText(result.dimensionLabel || result.dimension)}`, + ); + } + if (result.weight) { + lines.push(`- Weight: ${formatScore(result.weight)}%`); + } + lines.push(`- Visual correctness: ${result.score} / 5`); + if (result.dimensions.length > 0) { + lines.push( + '- GEQI dimensions:', + ...result.dimensions.map((dimensionResult) => + ` - ${ + truncateText(dimensionResult.dimensionLabel) + }: ${dimensionResult.score} / 5${ + dimensionResult.weight + ? ` (${formatScore(dimensionResult.weight)}%)` + : '' + }${ + dimensionResult.error + ? `, error: ${truncateText(dimensionResult.error.message)}` + : '' + }` + ), + ); + } if (result.task) { lines.push(`- Task: ${truncateText(result.task)}`); } @@ -389,6 +681,13 @@ function escapeTableCell(value) { return escapeMarkdown(value).replaceAll('|', '\\|'); } +function formatTableLine(values) { + return values.map((value) => escapeTableCell(value)).join(' | ').replace( + /^/, + '| ', + ).replace(/$/, ' |'); +} + function escapeMarkdown(value) { return String(value).replaceAll('\n', ' ').trim(); } diff --git a/.github/scripts/write-ui-judge-result.mjs b/.github/scripts/write-ui-judge-result.mjs index babb1ac98c..d9cc47c410 100644 --- a/.github/scripts/write-ui-judge-result.mjs +++ b/.github/scripts/write-ui-judge-result.mjs @@ -8,6 +8,13 @@ const resultFile = process.env.UI_JUDGE_RESULT_FILE || join(process.env.GITHUB_WORKSPACE, 'ui-judge-results.json'); const errorMessage = process.env.UI_JUDGE_RESULT_ERROR_MESSAGE || 'UI Judge did not produce a model result. See the workflow logs for details.'; +const geqiDimensions = [ + ['usability-interaction', 'Usability & Interaction', 30], + ['visual-aesthetics', 'Visual & Aesthetics', 25], + ['consistency-standards', 'Consistency & Standards', 15], + ['architecture-writing', 'Architecture & UX Writing', 15], + ['accessibility-performance', 'Accessibility & Performance', 15], +]; if (!existsSync(resultFile)) { writeFileSync( @@ -18,10 +25,23 @@ if (!existsSync(resultFile)) { results: [ { dimension: 'visual-correctness', - score: 0, + dimensions: geqiDimensions.map(( + [dimension, dimensionLabel, weight], + ) => ({ + dimension, + dimensionLabel, + error: { + message: errorMessage, + }, + score: 0, + steps: [], + url: '', + weight, + })), error: { message: errorMessage, }, + score: 0, steps: [], url: '', }, diff --git a/.github/ui-judge.instructions.md b/.github/ui-judge.instructions.md index 2cee725d64..74376d404c 100644 --- a/.github/ui-judge.instructions.md +++ b/.github/ui-judge.instructions.md @@ -6,6 +6,8 @@ When extending `@lynx-js/ui-judge`, keep `judgePage` as the only public runtime Midscene scoring in this package should use `aiNumber()` and return a JSON-serializable integer score from 0 to 5. Prompt text must cooperate with Midscene's `aiNumber()` parser by asking for the requested `Number` field, not a bare JSON number. Do not reintroduce letter grades or `GRADE:` output in prompts. +GEQI model-backed scoring should run each playground demo across the five weighted dimensions: usability-interaction (30), visual-aesthetics (25), consistency-standards (15), architecture-writing (15), and accessibility-performance (15). Keep the original visual-correctness judge as its own test and result score. Attach GEQI scores under each example result's `dimensions` array with `dimensionLabel` and `weight`, so the PR comment can summarize the weighted 100-point GEQI score while rendering one table row per example. + Avoid writing screenshots by default. Playwright and Midscene may capture the page internally, but persistent screenshot artifacts should require an explicit future option. Midscene currently brings in `sharp`; keep its pnpm build-script policy explicit in `pnpm-workspace.yaml` rather than letting `pnpm install` leave the placeholder value. diff --git a/packages/genui/ui-judge/README.md b/packages/genui/ui-judge/README.md index 73904c9bdd..f25694f163 100644 --- a/packages/genui/ui-judge/README.md +++ b/packages/genui/ui-judge/README.md @@ -5,7 +5,7 @@ The first public API is `judgePage`. Callers own the Playwright page lifecycle, including navigation, viewport, cookies, route mocks, and authentication. The judge reads `page.url()` for the returned JSON object and produces a single -`visual-correctness` score from `0` to `5`. +score from `0` to `5`. ```ts import { test } from '@playwright/test'; @@ -16,6 +16,7 @@ test('judges generated UI', async ({ page }) => { await page.goto('http://localhost:3000/render.html'); const result = await judgePage({ + dimension: 'usability-interaction', page, task: 'The page should render a login form with email, password, and submit.', @@ -24,6 +25,15 @@ test('judges generated UI', async ({ page }) => { }); ``` +When `dimension` is omitted, `judgePage` keeps the legacy +`visual-correctness` prompt. GEQI scoring can pass one of these dimensions: + +- `usability-interaction` +- `visual-aesthetics` +- `consistency-standards` +- `architecture-writing` +- `accessibility-performance` + Midscene reads its model configuration from the standard Midscene environment variables, such as `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_MODEL_FAMILY`. diff --git a/packages/genui/ui-judge/src/index.ts b/packages/genui/ui-judge/src/index.ts index 7e4add210b..b35ad65cbb 100644 --- a/packages/genui/ui-judge/src/index.ts +++ b/packages/genui/ui-judge/src/index.ts @@ -4,14 +4,23 @@ import { PlaywrightAgent } from '@midscene/web/playwright'; import type { Page } from '@playwright/test'; -const VISUAL_CORRECTNESS_DIMENSION = 'visual-correctness'; +const DEFAULT_DIMENSION = 'visual-correctness'; const DEFAULT_TIMEOUT_MS = 60_000; const MIN_SCORE = 0; const MAX_SCORE = 5; +export type UiJudgeDimension = + | 'visual-correctness' + | 'usability-interaction' + | 'visual-aesthetics' + | 'consistency-standards' + | 'architecture-writing' + | 'accessibility-performance'; + export type UiJudgeScore = 0 | 1 | 2 | 3 | 4 | 5; export interface JudgePageOptions { + dimension?: UiJudgeDimension; page: Page; reference?: string; steps?: string[]; @@ -24,7 +33,7 @@ export interface UiJudgeError { } export interface UiJudgeResult { - dimension: typeof VISUAL_CORRECTNESS_DIMENSION; + dimension: UiJudgeDimension; error?: UiJudgeError; score: UiJudgeScore; steps: string[]; @@ -32,6 +41,7 @@ export interface UiJudgeResult { } interface NormalizedJudgePageOptions { + dimension: UiJudgeDimension; page: Page; reference?: string; steps: string[]; @@ -46,14 +56,14 @@ export async function judgePage( const normalized = normalizeOptions(options); const score = await judgePageUnsafe(normalized); return { - dimension: VISUAL_CORRECTNESS_DIMENSION, + dimension: normalized.dimension, score, steps: normalized.steps, url: normalized.page.url(), }; } catch (error) { return { - dimension: VISUAL_CORRECTNESS_DIMENSION, + dimension: getResultDimension(options?.dimension), error: { message: toErrorMessage(error) }, score: 0, steps: normalizeSteps(options?.steps), @@ -84,7 +94,7 @@ async function judgePageUnsafe( } const rawScore = await withTimeout( - agent.aiNumber(buildVisualCorrectnessPrompt(options), { + agent.aiNumber(buildJudgePrompt(options), { domIncluded: 'visible-only', screenshotIncluded: true, }), @@ -113,6 +123,7 @@ function normalizeOptions( } const normalized: NormalizedJudgePageOptions = { + dimension: normalizeDimension(options.dimension), page: options.page, steps: normalizeSteps(options.steps), task, @@ -127,6 +138,34 @@ function normalizeOptions( return normalized; } +function normalizeDimension( + dimension: UiJudgeDimension | undefined, +): UiJudgeDimension { + if (dimension === undefined) return DEFAULT_DIMENSION; + if (isKnownDimension(dimension)) return dimension; + + throw new Error( + `judgePage dimension must be one of: ${getDimensionNames().join(', ')}.`, + ); +} + +function getResultDimension( + dimension: UiJudgeDimension | undefined, +): UiJudgeDimension { + return isKnownDimension(dimension) ? dimension : DEFAULT_DIMENSION; +} + +function isKnownDimension( + dimension: UiJudgeDimension | undefined, +): dimension is UiJudgeDimension { + return typeof dimension === 'string' + && Object.hasOwn(JUDGE_DIMENSION_PROMPTS, dimension); +} + +function getDimensionNames(): string[] { + return Object.keys(JUDGE_DIMENSION_PROMPTS).sort(); +} + function normalizeSteps(steps: string[] | undefined): string[] { return (steps ?? []) .filter((step): step is string => typeof step === 'string') @@ -142,14 +181,95 @@ function normalizeTimeout(timeoutMs: number | undefined): number { return timeoutMs; } -function buildVisualCorrectnessPrompt( +interface JudgeDimensionPromptDefinition { + criteria: readonly string[]; + focus: string; + title: string; +} + +const JUDGE_DIMENSION_PROMPTS: Record< + UiJudgeDimension, + JudgeDimensionPromptDefinition +> = { + 'accessibility-performance': { + title: 'Accessibility & Performance', + focus: + 'Judge whether the UI feels inclusive, robust across screen sizes, and technically mature under real usage conditions.', + criteria: [ + 'WCAG contrast and non-color cues: text/background contrast should meet AA expectations, and important states should not rely only on color.', + 'Touch targets and responsive behavior: interactive areas should be easy to tap, and the layout should avoid overlap, truncation, or broken adaptation.', + 'Perceived performance: loading, large data, or waiting states should use skeletons, progressive loading, optimistic feedback, or other anxiety-reducing patterns when relevant.', + ], + }, + 'architecture-writing': { + title: 'Information Architecture & UX Writing', + focus: + 'Judge whether users can quickly find what they need, understand where they are, and act on clear product language.', + criteria: [ + 'Wayfinding and navigation: navigation should be flat enough for the task, with clear current location, next destinations, and return paths when relevant.', + 'Microcopy: buttons, labels, and helper text should be concise, consistent, action-oriented, and free of ambiguity.', + 'Empty states: no-data, first-use, or no-result states should feel intentional and provide a useful next action instead of dead ends.', + ], + }, + 'consistency-standards': { + title: 'Consistency & Standards', + focus: + 'Judge whether the UI follows expected design-system, product, and platform conventions so it lowers both implementation and learning cost.', + criteria: [ + 'Design-system fit: components, spacing, radius, color, and typography should look tokenized and reusable rather than improvised.', + 'Internal consistency: repeated components and behaviors should stay consistent across cards, lists, controls, dialogs, and modules.', + 'Platform conventions: icons, gestures, search, settings, navigation, and form behaviors should match familiar iOS, Android, or web standards for the visible context.', + ], + }, + 'usability-interaction': { + title: 'Usability & Interaction Logic', + focus: + 'Judge whether the product is easy to understand, easy to operate, and resilient when users take normal actions.', + criteria: [ + 'Cognitive load: information density should be reasonable, and the page purpose should be understandable within about one second.', + 'System feedback: clicks, hover states, loading, success, and error transitions should provide immediate and clear feedback when visible in the current state.', + 'Error recovery: destructive or high-stakes actions should show confirmation, and errors should use human language with a clear recovery path when relevant.', + 'Task efficiency: the core flow should minimize unnecessary steps and use smart defaults, history, shortcuts, or direct actions for frequent tasks when appropriate.', + ], + }, + 'visual-aesthetics': { + title: 'Visual Communication & Aesthetics', + focus: + 'Judge whether the interface looks professional, trustworthy, and visually comfortable while guiding attention to the right actions.', + criteria: [ + 'Visual hierarchy: the primary action and most important information should be prominent, with clear contrast in size, weight, color, and placement.', + 'Typography and whitespace: spacing should follow Gestalt proximity, related elements should group naturally, and the layout should have enough breathing room.', + 'Color semantics: brand, neutral, warning, success, and emphasis colors should be restrained, meaningful, and consistent.', + 'Graphics and icons: icon stroke, corner style, illustration quality, imagery, and decorative graphics should feel consistent and support comprehension.', + ], + }, + 'visual-correctness': { + title: 'Visual Correctness', + focus: + 'Judge whether the generated UI visually satisfies the requested task and reference content.', + criteria: [ + 'Required content: the expected components, labels, data, and relationships should be present.', + 'Task fit: the visible UI should match the requested scenario rather than merely showing related generic content.', + 'Rendering quality: the page should not be blank, broken, clipped, or impossible to inspect.', + ], + }, +}; + +function buildJudgePrompt( options: NormalizedJudgePageOptions, ): string { + const dimensionPrompt = JUDGE_DIMENSION_PROMPTS[options.dimension]; const reference = options.reference ? `\nReference answer or target:\n${options.reference}\n` : ''; - return `You are judging the visual correctness of a generated UI. + return `You are a senior product and design reviewer judging one GEQI dimension of a generated UI. + +Dimension: +${dimensionPrompt.title} + +Dimension focus: +${dimensionPrompt.focus} Task: ${options.task} @@ -158,22 +278,28 @@ Set Midscene's requested Number result to exactly one integer from 0 to 5. Do not return a bare JSON number; the structured result must use the Number field. Do not return "GRADE:", letters, Markdown, prose, or explanation. -Use this scale: -5 = The UI fully satisfies the task and reference. -4 = The UI is mostly correct, with only minor visual, wording, layout, punctuation, capitalization, or spacing differences. -3 = The UI is partially correct: the core structure is present, but meaningful components, states, data, or relationships are missing or wrong. -2 = A small amount of relevant content is correct, but most important requirements are missing or wrong. -1 = The UI is barely related to the task, with only weakly relevant elements. +Use this 1-5 Likert scale for the requested dimension: +5 = Excellent benchmark: exceptional craft, thoughtful details, and an "aha moment" that exceeds expectations. +4 = Strong professional quality: smooth, comfortable, and aligned with industry best practices. +3 = Acceptable baseline: the core task works with no fatal issue, but the experience is ordinary or under-polished. +2 = Poor with clear defects: noticeable friction, inconsistency, confusion, or frustration. +1 = Disaster or blocker: seriously violates interaction common sense or blocks the core flow and should be redone. 0 = The UI is unrelated, blank, failed to render, impossible to inspect, or completely wrong. +Subcriteria for this dimension: +${ + dimensionPrompt.criteria.map((criterion, index) => + `${index + 1}. ${criterion}` + ).join('\n') + } + Grading notes: -1. Variations in capitalization, punctuation, and minor spacing differences are acceptable when the semantic intent and required components are present. -2. Unless a specific vertical or horizontal order is explicitly requested, variations in component order within a container are acceptable. -3. Generated component IDs do not need to match any specific pattern or example, as long as they are unique and correctly establish the requested parent-child relationships. -4. Minor label variations that preserve the core semantic meaning are acceptable unless exact literal text was requested. +1. Score only the requested dimension; do not collapse all GEQI dimensions into one general quality score. +2. Variations in capitalization, punctuation, and minor spacing differences are acceptable when semantic intent and required components are present. +3. Unless a specific vertical or horizontal order is explicitly requested, variations in component order within a container are acceptable. +4. Minor label variations that preserve core semantic meaning are acceptable unless exact literal text was requested. 5. Valid optional properties, such as accessibility hints or default values, should not be penalized when they make sense in context. -6. If data binding paths are not explicitly specified, accept any logically sound path structure. -7. Do not award a high score when required components are missing or substantive behavior is wrong. +6. Do not award a high score when required components are missing or substantive behavior is wrong for this dimension. Think through the criteria internally, then return only the structured Number result.`; } diff --git a/packages/genui/ui-judge/tests/judge-page.spec.ts b/packages/genui/ui-judge/tests/judge-page.spec.ts index 5882bebc51..f530be7024 100644 --- a/packages/genui/ui-judge/tests/judge-page.spec.ts +++ b/packages/genui/ui-judge/tests/judge-page.spec.ts @@ -8,7 +8,7 @@ import { expect, test } from '@playwright/test'; import type { Page } from '@playwright/test'; import { judgePage } from '../src/index.js'; -import type { UiJudgeResult } from '../src/index.js'; +import type { UiJudgeDimension, UiJudgeResult } from '../src/index.js'; import { startPlaygroundPreviewServer, } from './helpers/playground-preview-server.js'; @@ -25,6 +25,40 @@ interface PlaygroundDemoCase { task: string; } +interface GeqiDimensionCase { + dimension: UiJudgeDimension; + label: string; + weight: number; +} + +const GEQI_DIMENSION_CASES: GeqiDimensionCase[] = [ + { + dimension: 'usability-interaction', + label: 'Usability & Interaction', + weight: 30, + }, + { + dimension: 'visual-aesthetics', + label: 'Visual & Aesthetics', + weight: 25, + }, + { + dimension: 'consistency-standards', + label: 'Consistency & Standards', + weight: 15, + }, + { + dimension: 'architecture-writing', + label: 'Architecture & UX Writing', + weight: 15, + }, + { + dimension: 'accessibility-performance', + label: 'Accessibility & Performance', + weight: 15, + }, +]; + const PLAYGROUND_DEMO_CASES: PlaygroundDemoCase[] = [ { demoId: 'recs', @@ -84,6 +118,7 @@ const PLAYGROUND_DEMO_CASES: PlaygroundDemoCase[] = [ }, ]; const UI_JUDGE_RESULT_FILE_ENV = 'UI_JUDGE_RESULT_FILE'; +const judgedResultsByDemo = new Map(); test.describe('A2UI playground preview', () => { test.skip( @@ -128,7 +163,6 @@ test.describe('A2UI playground preview', () => { const server = previewServer; await page.setViewportSize({ width: 390, height: 844 }); - const judgedResults: JudgedPlaygroundResult[] = []; for (const demo of PLAYGROUND_DEMO_CASES) { await test.step(`score ${demo.demoId}`, async () => { @@ -147,11 +181,8 @@ test.describe('A2UI playground preview', () => { timeoutMs: 180_000, }); - judgedResults.push({ - result, - task: demo.task, - }); - await writeUiJudgeResults(judgedResults); + upsertVisualJudgeResult(demo, result); + await writeUiJudgeResults(); expect(result).toMatchObject({ dimension: 'visual-correctness', @@ -164,6 +195,53 @@ test.describe('A2UI playground preview', () => { }); } }); + + test('adds GEQI dimension scores for playground render.html demos with speed zero', async ({ page }) => { + test.setTimeout(1_500_000); + + if (!previewServer) { + throw new Error('A2UI playground preview server was not started.'); + } + + const server = previewServer; + await page.setViewportSize({ width: 390, height: 844 }); + + for (const demo of PLAYGROUND_DEMO_CASES) { + await test.step(`score GEQI dimensions for ${demo.demoId}`, async () => { + const previewUrl = server.createDemoPreviewUrl({ + demoId: demo.demoId, + speed: 0, + }); + + await page.goto(previewUrl); + await waitForPreviewText(page, demo.readyText); + await waitForPreviewText(page, demo.expectedText, 2_000); + + for (const dimensionCase of GEQI_DIMENSION_CASES) { + await test.step(`${demo.demoId} ${dimensionCase.dimension}`, async () => { + const result = await judgePage({ + dimension: dimensionCase.dimension, + page, + task: demo.task, + timeoutMs: 90_000, + }); + + upsertGeqiDimensionJudgeResult(demo, dimensionCase, result); + await writeUiJudgeResults(); + + expect(result).toMatchObject({ + dimension: dimensionCase.dimension, + steps: [], + url: previewUrl, + }); + expect(result.error).toBeUndefined(); + expect(result.score).toBeGreaterThanOrEqual(0); + expect(result.score).toBeLessThanOrEqual(5); + }); + } + }); + } + }); }); test('returns a JSON error when input validation fails', async ({ page }) => { @@ -203,13 +281,91 @@ async function waitForPreviewText( } interface JudgedPlaygroundResult { + demoId: string; + dimensions: JudgedGeqiDimensionResult[]; result: UiJudgeResult; task: string; } -async function writeUiJudgeResults( - judgedResults: JudgedPlaygroundResult[], -): Promise { +interface JudgedGeqiDimensionResult { + dimension: UiJudgeDimension; + dimensionLabel: string; + error?: UiJudgeResult['error']; + score: UiJudgeResult['score']; + steps: string[]; + url: string; + weight: number; +} + +function upsertVisualJudgeResult( + demo: PlaygroundDemoCase, + result: UiJudgeResult, +): void { + const existing = judgedResultsByDemo.get(demo.demoId); + judgedResultsByDemo.set(demo.demoId, { + demoId: demo.demoId, + dimensions: existing?.dimensions ?? [], + result, + task: demo.task, + }); +} + +function upsertGeqiDimensionJudgeResult( + demo: PlaygroundDemoCase, + dimensionCase: GeqiDimensionCase, + result: UiJudgeResult, +): void { + const judgedResult = judgedResultsByDemo.get(demo.demoId) ?? { + demoId: demo.demoId, + dimensions: [], + result: createMissingVisualJudgeResult(result), + task: demo.task, + }; + const dimensions = judgedResult.dimensions.filter( + (dimensionResult) => dimensionResult.dimension !== dimensionCase.dimension, + ); + dimensions.push({ + dimension: result.dimension, + dimensionLabel: dimensionCase.label, + error: result.error, + score: result.score, + steps: result.steps, + url: result.url, + weight: dimensionCase.weight, + }); + + judgedResultsByDemo.set(demo.demoId, { + ...judgedResult, + dimensions: sortGeqiDimensions(dimensions), + }); +} + +function createMissingVisualJudgeResult(result: UiJudgeResult): UiJudgeResult { + return { + dimension: 'visual-correctness', + error: { + message: 'visual-correctness judge did not run before GEQI scoring.', + }, + score: 0, + steps: [], + url: result.url, + }; +} + +function sortGeqiDimensions( + dimensions: JudgedGeqiDimensionResult[], +): JudgedGeqiDimensionResult[] { + return GEQI_DIMENSION_CASES.map((dimensionCase) => + dimensions.find((dimensionResult) => + dimensionResult.dimension === dimensionCase.dimension + ) + ).filter((dimension): dimension is JudgedGeqiDimensionResult => + dimension !== undefined + ); +} + +async function writeUiJudgeResults(): Promise { + const judgedResults = [...judgedResultsByDemo.values()]; if (judgedResults.length === 0) return; const resultFile = process.env[UI_JUDGE_RESULT_FILE_ENV]; @@ -221,8 +377,12 @@ async function writeUiJudgeResults( `${ JSON.stringify( { - results: judgedResults.map(({ result, task }) => ({ + results: judgedResults.map(( + { demoId, dimensions, result, task }, + ) => ({ ...result, + demoId, + dimensions, task, })), },