diff --git a/.github/actions/ui-judge-comment/README.md b/.github/actions/ui-judge-comment/README.md new file mode 100644 index 0000000000..8706ec6739 --- /dev/null +++ b/.github/actions/ui-judge-comment/README.md @@ -0,0 +1,47 @@ +# UI Judge Comment + +Creates or updates a pull request comment with `@lynx-js/ui-judge` results. + +The action expects JSON shaped as one `UiJudgeResult`, an array of results, or +an object with a `results` array. + +```yaml +permissions: + pull-requests: write + +steps: + - run: pnpm --filter @lynx-js/ui-judge test + env: + UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json + + - uses: ./.github/actions/ui-judge-comment + with: + result-file: ui-judge-results.json +``` + +Example result payload: + +```json +{ + "results": [ + { + "dimension": "visual-correctness", + "score": 5, + "steps": [], + "url": "http://127.0.0.1:3000/render.html?demo=recs" + } + ] +} +``` + +Inputs: + +- `result-file`: path to a JSON result file. +- `result-json`: inline JSON result payload. Use this instead of + `result-file`. +- `pr-number`: pull request number. Defaults to the `pull_request` event. +- `title`: comment heading. Defaults to `UI Judge`. +- `marker`: hidden marker used to update a previous comment. +- `update-existing`: update the previous marked comment. Defaults to `true`. +- `dry-run`: print the comment body without calling the GitHub API. +- `github-token`: token for the GitHub API. Defaults to `github.token`. diff --git a/.github/actions/ui-judge-comment/action.yml b/.github/actions/ui-judge-comment/action.yml new file mode 100644 index 0000000000..2ce0c2fd13 --- /dev/null +++ b/.github/actions/ui-judge-comment/action.yml @@ -0,0 +1,66 @@ +name: UI Judge Comment + +description: Create or update a pull request comment with @lynx-js/ui-judge results. + +inputs: + result-file: + description: Path to a JSON file containing a UiJudgeResult, an array of results, or an object with a results array. 
+ required: false + result-json: + description: Inline JSON containing a UiJudgeResult, an array of results, or an object with a results array. + required: false + pr-number: + description: Pull request number. Defaults to the pull_request payload number. + required: false + title: + description: Heading shown in the pull request comment. + default: UI Judge + required: false + marker: + description: Hidden marker used to find and update the previous UI Judge comment. + default: + required: false + update-existing: + description: Update the previous marked comment instead of creating a new one. + default: "true" + required: false + dry-run: + description: Print the rendered comment without calling the GitHub API. + default: "false" + required: false + github-token: + description: Token used to create or update the pull request comment. + required: false + +outputs: + body: + description: The rendered pull request comment body. + value: ${{ steps.comment.outputs.body }} + comment-id: + description: The created or updated issue comment id. + value: ${{ steps.comment.outputs.comment-id }} + comment-url: + description: The created or updated issue comment URL. 
+ value: ${{ steps.comment.outputs.comment-url }} + +runs: + using: composite + steps: + - uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5 + with: + node-version: "22" + package-manager-cache: false + - name: Create or update UI Judge comment + id: comment + shell: bash + run: node "$GITHUB_ACTION_PATH/comment.mjs" + env: + INPUT_RESULT_FILE: ${{ inputs.result-file }} + INPUT_RESULT_JSON: ${{ inputs.result-json }} + INPUT_PR_NUMBER: ${{ inputs.pr-number }} + INPUT_TITLE: ${{ inputs.title }} + INPUT_MARKER: ${{ inputs.marker }} + INPUT_UPDATE_EXISTING: ${{ inputs.update-existing }} + INPUT_DRY_RUN: ${{ inputs.dry-run }} + INPUT_GITHUB_TOKEN: ${{ inputs.github-token }} + GITHUB_TOKEN: ${{ inputs.github-token || github.token }} diff --git a/.github/actions/ui-judge-comment/comment.mjs b/.github/actions/ui-judge-comment/comment.mjs new file mode 100644 index 0000000000..280b1e0dcd --- /dev/null +++ b/.github/actions/ui-judge-comment/comment.mjs @@ -0,0 +1,432 @@ +#!/usr/bin/env node + +// Copyright 2026 The Lynx Authors. All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree. +import { appendFile, readFile } from 'node:fs/promises'; +import { isAbsolute, resolve } from 'node:path'; + +const MAX_DETAIL_LENGTH = 1_200; +const MAX_COMMENT_LENGTH = 64_000; + +main().catch((error) => { + console.error(error instanceof Error ? 
error.message : String(error)); + process.exitCode = 1; +}); + +async function main() { + const inputs = readInputs(); + const results = normalizeResults(await readResultPayload(inputs)); + const body = truncateComment(formatComment({ + marker: inputs.marker, + results, + title: inputs.title, + })); + + await writeOutput('body', body); + + if (inputs.dryRun) { + console.info(body); + return; + } + + const event = await readEventPayload(); + const repository = parseRepository(process.env.GITHUB_REPOSITORY); + const prNumber = inputs.prNumber ?? getPullRequestNumber(event); + if (!prNumber) { + throw new Error( + 'Unable to determine the pull request number. Run this action on a pull_request event or pass pr-number.', + ); + } + + const token = inputs.githubToken || process.env.GITHUB_TOKEN; + if (!token) { + throw new Error( + 'Missing github-token. Pass github-token or allow the action to use github.token.', + ); + } + + const client = createGitHubClient(token); + const existingComment = inputs.updateExisting + ? await findExistingComment(client, repository, prNumber, inputs.marker) + : undefined; + const comment = existingComment + ? await updateComment(client, repository, existingComment.id, body) + : await createComment(client, repository, prNumber, body); + + await writeOutput('comment-id', String(comment.id ?? '')); + await writeOutput('comment-url', String(comment.html_url ?? '')); + console.info( + existingComment + ? 
`Updated UI Judge comment: ${comment.html_url}`
+      : `Created UI Judge comment: ${comment.html_url}`,
+  );
+}
+
+function readInputs() {
+  const resultFile = emptyToUndefined(process.env.INPUT_RESULT_FILE);
+  const resultJson = emptyToUndefined(process.env.INPUT_RESULT_JSON);
+  if (!resultFile && !resultJson) {
+    throw new Error('Pass result-file or result-json to ui-judge-comment.');
+  }
+  if (resultFile && resultJson) {
+    throw new Error('Pass only one of result-file or result-json.');
+  }
+
+  return {
+    dryRun: parseBoolean(process.env.INPUT_DRY_RUN, false),
+    githubToken: emptyToUndefined(process.env.INPUT_GITHUB_TOKEN),
+    marker: process.env.INPUT_MARKER?.trim() || '<!-- lynx-ui-judge-comment -->', // must be non-empty: it is the hidden HTML comment findExistingComment searches for
+    prNumber: parseOptionalPositiveInteger(process.env.INPUT_PR_NUMBER),
+    resultFile,
+    resultJson,
+    title: process.env.INPUT_TITLE?.trim() || 'UI Judge',
+    updateExisting: parseBoolean(process.env.INPUT_UPDATE_EXISTING, true),
+  };
+}
+
+async function readResultPayload(inputs) {
+  if (inputs.resultJson) {
+    return parseJson(inputs.resultJson, 'result-json');
+  }
+
+  const workspace = process.env.GITHUB_WORKSPACE || process.cwd();
+  const filePath = isAbsolute(inputs.resultFile)
+    ? inputs.resultFile
+    : resolve(workspace, inputs.resultFile);
+  const content = await readFile(filePath, 'utf8');
+  return parseJson(content, filePath);
+}
+
+function parseJson(content, source) {
+  try {
+    return JSON.parse(content);
+  } catch (error) {
+    const message = error instanceof Error ? error.message : String(error);
+    throw new Error(`Failed to parse ${source} as JSON: ${message}`);
+  }
+}
+
+function normalizeResults(payload) {
+  const rawResults = Array.isArray(payload)
+    ? payload
+    : (Array.isArray(payload?.results)
+      ? 
payload.results + : [payload]); + + const results = rawResults.map((result, index) => + normalizeResult(result, index) + ); + if (results.length === 0) { + throw new Error('UI Judge result payload did not contain any results.'); + } + return results; +} + +function normalizeResult(result, index) { + if (!result || typeof result !== 'object') { + throw new Error(`UI Judge result at index ${index} must be an object.`); + } + + return { + dimension: stringValue(result.dimension) || 'visual-correctness', + error: normalizeError(result.error), + reference: stringValue(result.reference), + score: normalizeScore(result.score, index), + steps: normalizeSteps(result.steps), + task: stringValue(result.task), + url: stringValue(result.url), + }; +} + +function normalizeScore(value, index) { + const score = typeof value === 'number' ? value : Number(value); + if (!Number.isFinite(score)) { + throw new Error( + `UI Judge result at index ${index} has a non-numeric score.`, + ); + } + return Math.max(0, Math.min(5, Math.round(score))); +} + +function normalizeError(error) { + if (!error) return undefined; + if (typeof error === 'string') return { message: error }; + if (typeof error === 'object') { + return { + message: stringValue(error.message) || JSON.stringify(error), + }; + } + return { message: String(error) }; +} + +function normalizeSteps(steps) { + if (!Array.isArray(steps)) return []; + return steps.filter((step) => typeof step === 'string' && step.trim()) + .map((step) => step.trim()); +} + +function formatComment({ marker, results, title }) { + const average = results.reduce((sum, result) => sum + result.score, 0) + / results.length; + const failedCount = results.filter((result) => result.error).length; + const runLink = getRunLink(); + const lines = [ + marker, + `### ${escapeMarkdown(title)}`, + '', + `Average score: **${formatScore(average)} / 5** across ${ + pluralize(results.length, 'result') + }.`, + ]; + + if (failedCount > 0) { + lines.push( + `${failedCount} 
${
+        failedCount === 1 ? 'result has' : 'results have'
+      } an error.`,
+    );
+  }
+
+  lines.push(
+    '',
+    '| # | Dimension | Score | Page | Status |',
+    '| - | - | -: | - | - |',
+    ...results.map((result, index) => formatTableRow(result, index)),
+  );
+
+  const details = results
+    .map((result, index) => formatResultDetails(result, index))
+    .filter(Boolean);
+  if (details.length > 0) {
+    lines.push(
+      '',
+      '<details>', // collapsible wrapper keeps the per-result details folded by default
+      '<summary>Details</summary>',
+      '', // blank line so the Markdown inside <details> is rendered
+      ...details,
+      '</details>',
+    );
+  }
+
+  if (runLink) {
+    lines.push('', `[${runLink.label}](${runLink.url})`);
+  }
+
+  return lines.join('\n');
+}
+
+function formatTableRow(result, index) {
+  const page = result.url
+    ? `[preview](${sanitizeUrlForMarkdown(result.url)})`
+    : 'n/a';
+  const status = result.error ? 'Error' : 'OK';
+  return [
+    String(index + 1),
+    escapeTableCell(result.dimension),
+    `${result.score} / 5`,
+    page,
+    status,
+  ].join(' | ').replace(/^/, '| ').replace(/$/, ' |');
+}
+
+function formatResultDetails(result, index) {
+  const lines = [`#### Result ${index + 1}`, ''];
+
+  if (result.task) {
+    lines.push(`- Task: ${truncateText(result.task)}`);
+  }
+  if (result.reference) {
+    lines.push(`- Reference: ${truncateText(result.reference)}`);
+  }
+  if (result.steps.length > 0) {
+    lines.push(
+      '- Steps:',
+      ...result.steps.map((step) => `  - ${truncateText(step)}`), // two-space indent nests each step under "- Steps:"
+    );
+  }
+  if (result.error) {
+    lines.push(`- Error: ${truncateText(result.error.message)}`);
+  }
+
+  return lines.length > 2 ? [...lines, ''].join('\n') : '';
+}
+
+async function readEventPayload() {
+  const eventPath = process.env.GITHUB_EVENT_PATH;
+  if (!eventPath) return {};
+
+  try {
+    return parseJson(await readFile(eventPath, 'utf8'), eventPath);
+  } catch {
+    return {};
+  }
+}
+
+function getPullRequestNumber(event) {
+  const number = event?.pull_request?.number
+    ?? (event?.issue?.pull_request ? event.issue.number : undefined);
+  return Number.isInteger(number) && number > 0 ? 
number : undefined;
+}
+
+function parseRepository(repository) {
+  const [owner, repo] = String(repository || '').split('/');
+  if (!owner || !repo) {
+    throw new Error('GITHUB_REPOSITORY must be set to owner/repo.');
+  }
+  return { owner, repo };
+}
+
+function createGitHubClient(token) {
+  const apiUrl = process.env.GITHUB_API_URL || 'https://api.github.com';
+  return async function request(path, options = {}) {
+    const response = await fetch(`${apiUrl}${path}`, {
+      ...options,
+      headers: {
+        accept: 'application/vnd.github+json',
+        authorization: `Bearer ${token}`,
+        'content-type': 'application/json',
+        'user-agent': 'lynx-ui-judge-comment',
+        'x-github-api-version': '2022-11-28',
+        ...options.headers,
+      },
+    });
+
+    const text = await response.text();
+    const data = text ? parseJsonResponse(text) : {};
+    if (!response.ok) {
+      const message = data?.message || response.statusText;
+      throw new Error(
+        `GitHub API ${
+          options.method || 'GET'
+        } ${path} failed with ${response.status}: ${message}`,
+      );
+    }
+    return data;
+  };
+}
+
+function parseJsonResponse(text) {
+  try {
+    return JSON.parse(text);
+  } catch {
+    return { message: text };
+  }
+}
+
+async function findExistingComment(client, repository, prNumber, marker) {
+  const comments = marker
+    ? await client(`/repos/${repository.owner}/${repository.repo}/issues/${prNumber}/comments?per_page=100`)
+    : []; // no marker means nothing to look for; String#includes('') is true for every comment
+  return comments.find((comment) =>
+    typeof comment.body === 'string' && comment.body.includes(marker)
+  );
+}
+
+async function createComment(client, repository, prNumber, body) {
+  return await client(
+    `/repos/${repository.owner}/${repository.repo}/issues/${prNumber}/comments`,
+    {
+      body: JSON.stringify({ body }),
+      method: 'POST',
+    },
+  );
+}
+
+async function updateComment(client, repository, commentId, body) {
+  return await client(
+    `/repos/${repository.owner}/${repository.repo}/issues/comments/${commentId}`,
+    {
+      body: JSON.stringify({ body }),
+      method: 'PATCH',
+    },
+  );
+}
+
+async function 
writeOutput(name, value) { + const outputPath = process.env.GITHUB_OUTPUT; + if (!outputPath) return; + + const delimiter = `ui_judge_${name}_${Date.now()}`; + const content = `${name}<<${delimiter}\n${value}\n${delimiter}\n`; + await appendFile(outputPath, content, 'utf8'); +} + +function parseOptionalPositiveInteger(value) { + const normalized = emptyToUndefined(value); + if (!normalized) return undefined; + + const number = Number(normalized); + if (!Number.isInteger(number) || number <= 0) { + throw new Error(`Expected a positive integer, received: ${normalized}`); + } + return number; +} + +function parseBoolean(value, defaultValue) { + const normalized = emptyToUndefined(value); + if (!normalized) return defaultValue; + return ['1', 'true', 'yes', 'on'].includes(normalized.toLowerCase()); +} + +function stringValue(value) { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function emptyToUndefined(value) { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function formatScore(value) { + return Number.isInteger(value) ? String(value) : value.toFixed(1); +} + +function pluralize(count, word) { + return `${count} ${count === 1 ? 
word : `${word}s`}`; +} + +function escapeTableCell(value) { + return escapeMarkdown(value).replaceAll('|', '\\|'); +} + +function escapeMarkdown(value) { + return String(value).replaceAll('\n', ' ').trim(); +} + +function sanitizeUrlForMarkdown(url) { + return String(url).replaceAll(')', '%29'); +} + +function truncateText(value) { + const text = escapeMarkdown(value); + if (text.length <= MAX_DETAIL_LENGTH) return text; + return `${text.slice(0, MAX_DETAIL_LENGTH - 3)}...`; +} + +function truncateComment(body) { + if (body.length <= MAX_COMMENT_LENGTH) return body; + return `${ + body.slice(0, MAX_COMMENT_LENGTH - 120) + }\n\n_Comment truncated because it exceeded ${MAX_COMMENT_LENGTH} characters._`; +} + +function getRunLink() { + const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; + const repository = process.env.GITHUB_REPOSITORY; + const runId = process.env.GITHUB_RUN_ID; + if (!repository || !runId) return undefined; + + const runUrl = `${serverUrl}/${repository}/actions/runs/${runId}`; + const runAttempt = Number(process.env.GITHUB_RUN_ATTEMPT || '1'); + if (!Number.isInteger(runAttempt) || runAttempt <= 1) { + return { + label: 'Workflow run', + url: runUrl, + }; + } + + return { + label: `Workflow run (attempt ${runAttempt})`, + url: `${runUrl}/attempts/${runAttempt}`, + }; +} diff --git a/.github/scripts/write-ui-judge-result.mjs b/.github/scripts/write-ui-judge-result.mjs new file mode 100644 index 0000000000..babb1ac98c --- /dev/null +++ b/.github/scripts/write-ui-judge-result.mjs @@ -0,0 +1,35 @@ +// Copyright 2026 The Lynx Authors. All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree. 
+import { existsSync, writeFileSync } from 'node:fs';
+import { join } from 'node:path';
+
+const resultFile = process.env.UI_JUDGE_RESULT_FILE
+  || join(process.env.GITHUB_WORKSPACE || process.cwd(), 'ui-judge-results.json'); // cwd fallback: join(undefined, …) throws when run outside GitHub Actions
+const errorMessage = process.env.UI_JUDGE_RESULT_ERROR_MESSAGE
+  || 'UI Judge did not produce a model result. See the workflow logs for details.';
+
+if (!existsSync(resultFile)) { // only write the placeholder when no result file exists yet
+  writeFileSync(
+    resultFile,
+    `${
+      JSON.stringify(
+        {
+          results: [
+            {
+              dimension: 'visual-correctness',
+              score: 0,
+              error: {
+                message: errorMessage,
+              },
+              steps: [],
+              url: '',
+            },
+          ],
+        },
+        null,
+        2,
+      )
+    }\n`,
+  );
+}
diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md
new file mode 100644
index 0000000000..095fd73f1f
--- /dev/null
+++ b/.github/ui-judge-ci.instructions.md
@@ -0,0 +1,21 @@
+---
+applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/write-ui-judge-result.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**"
+---
+
+When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Prefer running UI Judge through the reusable `workflow-test.yml` job with `is-web: true`, uploading `ui-judge-results.json` as an artifact, and posting the comment from a separate thin job with `issues: write` and `pull-requests: write`.
+
+Keep long UI Judge work inside a job with a bounded timeout. If UI Judge setup is ever split back into custom steps outside the reusable workflow, use step-level `timeout-minutes` on long setup, build, and model execution steps so the PR comment action can still run when a result artifact exists.
+
+Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Let the reusable `workflow-test.yml` run its default `pnpm turbo build --summarize`; do not add UI Judge-specific build overrides. 
The A2UI playground Turbo config already makes `build` depend on `build:lynx`.
+
+Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter <package> build`, and do not pass `--force`; use Turbo commands so dependency ordering and cached outputs remain consistent.
+
+Do not add changed-file gating, GitHub API calls, or extra reusable workflow inputs for UI Judge CI. If Midscene secrets are unavailable, the UI Judge test command should write a clear skipped result and exit successfully; fork pull requests should skip the comment steps rather than requiring write permissions.
+
+Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`.
+
+Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`.
+
+When rendering the UI Judge PR comment, include `GITHUB_RUN_ATTEMPT` in the workflow footer/link. GitHub reruns keep the same `GITHUB_RUN_ID`, so relying only on the run URL can make a successful rerun write an identical comment body and appear not to update.
+
+Keep `.github/actions/ui-judge-comment` self-contained for self-hosted runners: set up Node inside the composite action before invoking `comment.mjs`, rather than requiring every caller job to prepare `node` separately.
diff --git a/.github/ui-judge.instructions.md b/.github/ui-judge.instructions.md index 73b2d2b846..2cee725d64 100644 --- a/.github/ui-judge.instructions.md +++ b/.github/ui-judge.instructions.md @@ -4,10 +4,18 @@ applyTo: "packages/genui/ui-judge/**/*" When extending `@lynx-js/ui-judge`, keep `judgePage` as the only public runtime API until a caller needs more surface area. Callers own Playwright page setup, navigation, viewport, cookies, route mocks, and authentication. Additional dimensions should remain internal unless they are intentionally added to the package exports. -Midscene scoring in this package should use `aiNumber()` and return a JSON-serializable integer score from 0 to 5. Do not reintroduce letter grades or `GRADE:` output in prompts. +Midscene scoring in this package should use `aiNumber()` and return a JSON-serializable integer score from 0 to 5. Prompt text must cooperate with Midscene's `aiNumber()` parser by asking for the requested `Number` field, not a bare JSON number. Do not reintroduce letter grades or `GRADE:` output in prompts. Avoid writing screenshots by default. Playwright and Midscene may capture the page internally, but persistent screenshot artifacts should require an explicit future option. Midscene currently brings in `sharp`; keep its pnpm build-script policy explicit in `pnpm-workspace.yaml` rather than letting `pnpm install` leave the placeholder value. -Model-backed Playwright tests should use the real Midscene service when `MIDSCENE_MODEL_NAME` is configured, and skip only the model-dependent cases when that environment variable is absent. +Model-backed Playwright tests should use the real Midscene service when `MIDSCENE_MODEL_NAME` is configured, and skip only the model-dependent cases when that environment variable is absent. Keep the playground server startup inside the skipped model-backed test group so non-model validation tests do not bind local ports. 
+ +Prefer `page.setContent()` or another non-listening fixture setup for static `@lynx-js/ui-judge` Playwright fixtures. Avoid starting local HTTP servers in package tests unless the behavior under test specifically needs network navigation. + +When a `@lynx-js/ui-judge` Playwright test needs real network navigation, use the A2UI playground preview server rather than a package-local scratch HTTP server. Start `pnpm dev` from `packages/genui/a2ui-playground` and navigate Playwright to the playground `render.html` demo route, such as `/render.html?protocol=a2ui&demoUrl=.%2Fa2ui.web.js&theme=light&demo=recs&speed=0`. + +The A2UI playground preview server requires generated catalog artifacts from `@lynx-js/a2ui-reactlynx`. If they are missing, fail with a clear prerequisite message that points to `pnpm --filter @lynx-js/a2ui-reactlynx build` instead of silently running broad cross-package builds from Playwright hooks. + +The Codex sandbox blocks TCP listeners on loopback addresses such as `127.0.0.1`, `localhost`, `0.0.0.0`, and `::1`, so bind-dependent verification should use an escalated command such as `pnpm --filter @lynx-js/ui-judge test` rather than rewriting the test to avoid the bind. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index be0a4228a0..e5d35023b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,6 +72,59 @@ jobs: runs-on: lynx-ubuntu-24.04-medium run: pnpm eslint . 
--flag v10_config_lookup_from_file + ui-judge: + needs: build + uses: ./.github/workflows/workflow-test.yml + secrets: + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} + MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} + with: + runs-on: lynx-custom-container + is-web: true + upload-codecov: false + artifact-name: ui-judge-results + artifact-path: ui-judge-results.json + artifact-if-no-files-found: error + web-report-name: ui-judge-playwright-report + web-report-path: packages/genui/ui-judge/playwright-report + run: | + cd "$GITHUB_WORKSPACE" + set -eu + export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" + if [ -z "${MIDSCENE_MODEL_NAME:-}" ] || [ -z "${MIDSCENE_MODEL_API_KEY:-}" ]; then + UI_JUDGE_RESULT_ERROR_MESSAGE="Midscene secrets are unavailable; UI Judge model test was skipped." node .github/scripts/write-ui-judge-result.mjs + echo "Midscene secrets are unavailable; skipping UI Judge." 
+ exit 0 + fi + ulimit -Sn 655350 + pnpm --filter @lynx-js/ui-judge test + + ui-judge-comment: + needs: ui-judge + if: always() + runs-on: lynx-ubuntu-24.04-medium + permissions: + contents: read + issues: write + pull-requests: write + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} + with: + persist-credentials: false + - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5 + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} + with: + name: ui-judge-results + - name: Comment UI Judge result + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} + uses: ./.github/actions/ui-judge-comment + with: + result-file: ui-judge-results.json + lighthouse: needs: build uses: ./.github/workflows/workflow-test.yml @@ -350,6 +403,8 @@ jobs: - benchmark - code-style-check - eslint + - ui-judge + - ui-judge-comment # - playwright-linux - playwright-web-elements - test-api diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index bb2651c7f6..0fe2ddeee3 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -6,6 +6,16 @@ on: required: false LHCI_GITHUB_APP_TOKEN: required: false + MIDSCENE_MODEL_API_KEY: + required: false + MIDSCENE_MODEL_BASE_URL: + required: false + MIDSCENE_MODEL_FAMILY: + required: false + MIDSCENE_MODEL_NAME: + required: false + MIDSCENE_OPENAI_INIT_CONFIG_JSON: + required: false inputs: runs-on: required: true @@ -18,6 +28,10 @@ on: required: false type: 
boolean default: false + upload-codecov: + required: false + type: boolean + default: true web-report-name: required: false type: string @@ -26,6 +40,18 @@ on: required: false type: string default: "packages/web-platform/web-core-e2e/playwright-report" + artifact-name: + required: false + type: string + default: "" + artifact-path: + required: false + type: string + default: "" + artifact-if-no-files-found: + required: false + type: string + default: "warn" codecov-flags: required: false type: string @@ -70,8 +96,7 @@ jobs: corepack enable pnpm install --frozen-lockfile - name: Build - run: | - pnpm turbo build --summarize + run: pnpm turbo build --summarize - name: Test # zizmor: ignore[template-injection] The inputs.run is provided by us. id: test env: @@ -80,20 +105,36 @@ jobs: CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} LHCI_GITHUB_APP_TOKEN: ${{ secrets.LHCI_GITHUB_APP_TOKEN }} + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} + MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} run: ${{ inputs.run }} - name: Upload coverage reports to Codecov + if: ${{ inputs.upload-codecov }} uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5 with: token: ${{ secrets.CODECOV_TOKEN }} flags: ${{ inputs.codecov-flags }} - name: Upload test results to Codecov - if: ${{ !cancelled() }} + if: ${{ !cancelled() && inputs.upload-codecov }} continue-on-error: true uses: codecov/test-results-action@0fa95f0e1eeaafde2c782583b36b28ad0d8c77d3 # v1 with: token: ${{ secrets.CODECOV_TOKEN }} flags: ${{ inputs.codecov-flags }} override_branch: ${{ github.event_name == 'merge_group' && 'main' || '' }} + - name: Upload Artifact + if: ${{ always() && !cancelled() && inputs.artifact-name != '' && 
inputs.artifact-path != '' }} + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: ${{ inputs.artifact-name }} + path: ${{ inputs.artifact-path }} + if-no-files-found: ${{ inputs.artifact-if-no-files-found }} + retention-days: 1 + overwrite: true + include-hidden-files: true - name: Upload Test Result if: ${{ inputs.is-web && failure() }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 diff --git a/.gitignore b/.gitignore index eef93622aa..ea9ee57f63 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ www playwright-report test-results trace.zip +midscene_run .turbo **/test/js .swc diff --git a/AGENTS.md b/AGENTS.md index 0a9aff6c8a..5dbb93cedf 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,12 +35,17 @@ rustc --version # Required for native bindings # Full build (REQUIRED before running tests) pnpm turbo build +# Focused package builds should still go through Turbo filtering +pnpm turbo build --filter + # Development build with watching pnpm turbo watch build ``` **⚠️ Critical**: Always run full build before tests. Watch mode only compiles TypeScript, not Rust components. +When narrowing builds to a package or task, prefer `pnpm turbo build --filter ...` over `pnpm build --filter ...` so Turbo dependency ordering, task outputs, and cache behavior remain consistent. + ### 3. Code Quality ```bash diff --git a/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx b/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx index 73f69c1053..058d1b6249 100644 --- a/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx +++ b/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx @@ -344,14 +344,16 @@ export function App() { const playbackTargetCountRef = useRef(0); // Per-batch delay (ms) the mock agent waits between successive - // protocol messages. Configurable via `?speed=2` (faster) etc. + // protocol messages. Configurable via `?speed=2` (faster); + // `?speed=0` paints the full stream with no delay. 
const streamDelay = useMemo(() => { const raw = (globalProps as Record | null)?.speed ?? (rawInitData as Record | null)?.speed; const speed = typeof raw === 'string' ? Number(raw) : (typeof raw === 'number' ? raw : 1); - if (!speed || speed <= 0) return DEFAULT_STREAM_DELAY_MS; + if (!Number.isFinite(speed) || speed < 0) return DEFAULT_STREAM_DELAY_MS; + if (speed === 0) return 0; return DEFAULT_STREAM_DELAY_MS / speed; }, [globalProps, rawInitData]); diff --git a/packages/genui/a2ui-playground/src/render.tsx b/packages/genui/a2ui-playground/src/render.tsx index ffc63eaf79..b890a199ce 100644 --- a/packages/genui/a2ui-playground/src/render.tsx +++ b/packages/genui/a2ui-playground/src/render.tsx @@ -13,7 +13,7 @@ import { decodeBase64Url } from './utils/base64url.js'; import { DEFAULT_A2UI_DEMO_URL } from './utils/demoUrl.js'; interface InitData { - protocol?: '0.9'; + protocol?: '0.9' | 'a2ui' | 'openui'; messagesUrl?: string; messages?: unknown; actionMocksUrl?: string; @@ -107,7 +107,10 @@ function parseInitDataFromQuery(): InitData | null { return null; } - const protocolValue = protocol === '0.9' ? '0.9' : undefined; + const protocolValue = protocol === '0.9' || protocol === 'a2ui' + || protocol === 'openui' + ? protocol + : undefined; const speedRaw = params.get('speed'); const speedVal = speedRaw === null ? undefined : Number(speedRaw); @@ -118,7 +121,7 @@ function parseInitDataFromQuery(): InitData | null { actionMocksUrl: actionMocksUrl ?? undefined, demoUrl: demoUrl ?? undefined, messages: [], // Default to an empty array - speed: speedVal && Number.isFinite(speedVal) && speedVal > 0 + speed: speedVal !== undefined && Number.isFinite(speedVal) && speedVal >= 0 ? speedVal : undefined, instant: instant === '1' ? 
true : undefined, diff --git a/packages/genui/a2ui-playground/src/utils/renderUrl.ts b/packages/genui/a2ui-playground/src/utils/renderUrl.ts index 76d2b1492a..8dcc7ba5c9 100644 --- a/packages/genui/a2ui-playground/src/utils/renderUrl.ts +++ b/packages/genui/a2ui-playground/src/utils/renderUrl.ts @@ -13,7 +13,7 @@ export interface RenderInit { theme?: 'light' | 'dark'; /** When set, use a short `?demo=` param instead of inlining the payload. */ demoId?: string; - /** Simulation speed multiplier (e.g. 0.5, 1, 2, 4). */ + /** Simulation speed multiplier (e.g. 0, 0.5, 1, 2, 4); 0 disables delay. */ speed?: number; /** When true, render the final UI immediately without streaming playback. */ instant?: boolean; diff --git a/packages/genui/ui-judge/README.md b/packages/genui/ui-judge/README.md index 3a9be88494..73904c9bdd 100644 --- a/packages/genui/ui-judge/README.md +++ b/packages/genui/ui-judge/README.md @@ -31,3 +31,17 @@ variables, such as `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_API_KEY`, The Playwright test suite uses the real Midscene service when `MIDSCENE_MODEL_NAME` is present. Without model configuration, the model-backed test is skipped and the error-path test still runs. + +The model-backed package test uses the A2UI playground preview server instead +of a scratch HTTP fixture. It opens the playground's `render.html` demo route +with `speed=0`, for example +`/render.html?protocol=a2ui&demoUrl=.%2Fa2ui.web.js&theme=light&demo=recs&speed=0`. +Prepare the playground artifacts first: + +```sh +pnpm turbo build:lynx --filter a2ui-playground +pnpm --filter @lynx-js/ui-judge test +``` + +The playground dev server binds to a local TCP port, so sandboxed runs need +local-bind permission. 
diff --git a/packages/genui/ui-judge/src/index.ts b/packages/genui/ui-judge/src/index.ts index 0d41bbf67a..7e4add210b 100644 --- a/packages/genui/ui-judge/src/index.ts +++ b/packages/genui/ui-judge/src/index.ts @@ -154,7 +154,9 @@ function buildVisualCorrectnessPrompt( Task: ${options.task} ${reference} -Return exactly one integer from 0 to 5. Do not return "GRADE:", letters, Markdown, prose, or explanation. +Set Midscene's requested Number result to exactly one integer from 0 to 5. +Do not return a bare JSON number; the structured result must use the Number field. +Do not return "GRADE:", letters, Markdown, prose, or explanation. Use this scale: 5 = The UI fully satisfies the task and reference. @@ -173,7 +175,7 @@ Grading notes: 6. If data binding paths are not explicitly specified, accept any logically sound path structure. 7. Do not award a high score when required components are missing or substantive behavior is wrong. -Think through the criteria internally, then return only the final integer score.`; +Think through the criteria internally, then return only the structured Number result.`; } async function waitForNetworkIdleBestEffort( diff --git a/packages/genui/ui-judge/tests/fixtures/interactive.html b/packages/genui/ui-judge/tests/fixtures/interactive.html deleted file mode 100644 index f11faef9bd..0000000000 --- a/packages/genui/ui-judge/tests/fixtures/interactive.html +++ /dev/null @@ -1,118 +0,0 @@ - - - - - - UI Judge Fixture - - - -
-

Order confirmed

-

- The generated UI should show a confirmation card and reveal shipping - details after interaction. -

- -
-
-
Status
-
Paid
-
-
-
Shipping
-
Arrives Friday
-
-
-
Viewport
-
unknown
-
-
-
- - -
diff --git a/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts b/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts
new file mode 100644
index 0000000000..41e4e63287
--- /dev/null
+++ b/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts
@@ -0,0 +1,309 @@
+// Copyright 2026 The Lynx Authors. All rights reserved.
+// Licensed under the Apache License Version 2.0 that can be found in the
+// LICENSE file in the root directory of this source tree.
+import { spawn } from 'node:child_process';
+import { existsSync } from 'node:fs';
+import { createServer } from 'node:net';
+import { dirname, relative, resolve } from 'node:path';
+import { fileURLToPath } from 'node:url';
+
+/** Query parameters used to build a playground `render.html` demo URL. */
+interface PlaygroundDemoPreviewOptions {
+  demoId: string;
+  demoUrl?: string;
+  protocol?: 'a2ui' | 'openui';
+  speed?: number;
+  theme?: 'light' | 'dark';
+}
+
+/** Handle to a running A2UI playground dev server spawned for tests. */
+export interface PlaygroundPreviewServer {
+  /** Origin of the dev server, e.g. `http://127.0.0.1:<port>`. */
+  readonly baseUrl: string;
+  /** Builds a `/render.html` URL for the given demo. */
+  createDemoPreviewUrl(options: PlaygroundDemoPreviewOptions): string;
+  /** Stops the dev server if it has not exited yet. */
+  dispose(): Promise<void>;
+  /** Returns the captured (truncated) stdout/stderr of the dev server. */
+  getLogs(): string;
+}
+
+const HELPER_DIR = dirname(fileURLToPath(import.meta.url));
+const WORKSPACE_ROOT = resolve(HELPER_DIR, '../../../../..');
+const PLAYGROUND_CWD = resolve(
+  WORKSPACE_ROOT,
+  'packages/genui/a2ui-playground',
+);
+// Build outputs that must exist before the playground dev server is started.
+const REQUIRED_CATALOG_ARTIFACTS = [
+  'packages/genui/a2ui/dist/catalog/Button/catalog.json',
+  'packages/genui/a2ui/dist/catalog/Text/catalog.json',
+];
+
+const READY_TIMEOUT_MS = 120_000;
+const POLL_INTERVAL_MS = 250;
+const FETCH_TIMEOUT_MS = 2_500;
+const DISPOSE_TIMEOUT_MS = 5_000;
+const LOG_LIMIT = 12_000;
+
+/** Accumulates process output, keeping only the last `LOG_LIMIT` characters. */
+class BoundedLog {
+  #value = '';
+
+  append(chunk: unknown): void {
+    this.#value += Buffer.isBuffer(chunk)
+      ? chunk.toString('utf8')
+      : String(chunk);
+    if (this.#value.length > LOG_LIMIT) {
+      this.#value = this.#value.slice(-LOG_LIMIT);
+    }
+  }
+
+  toString(): string {
+    return this.#value;
+  }
+}
+
+/**
+ * Spawns `pnpm dev` in the A2UI playground package with `PORT` set to a free
+ * local port and resolves once `/render.html` and `/a2ui.web.js` both respond.
+ *
+ * @returns A handle for building demo URLs, reading logs, and stopping the server.
+ * @throws If required catalog artifacts are missing, if the process fails to
+ * start or exits early, or if the server is not ready within `READY_TIMEOUT_MS`.
+ */
+export async function startPlaygroundPreviewServer(): Promise<
+  PlaygroundPreviewServer
+> {
+  assertPlaygroundPrerequisites();
+
+  const port = await findFreePort();
+  const baseUrl = `http://127.0.0.1:${port}`;
+  const stdout = new BoundedLog();
+  const stderr = new BoundedLog();
+  let spawnError: Error | null = null;
+  let exitState: { code: number | null; signal: NodeJS.Signals | null } | null =
+    null;
+
+  // Non-Windows: run detached so the child's process group can be signalled.
+  const detached = process.platform !== 'win32';
+  const child = spawn('pnpm', ['dev'], {
+    cwd: PLAYGROUND_CWD,
+    detached,
+    env: {
+      ...process.env,
+      PORT: String(port),
+    },
+    stdio: ['ignore', 'pipe', 'pipe'],
+  });
+
+  child.stdout?.on('data', (chunk) => stdout.append(chunk));
+  child.stderr?.on('data', (chunk) => stderr.append(chunk));
+  child.once('error', (error) => {
+    spawnError = error;
+  });
+
+  const exitPromise = new Promise<void>((resolveExit) => {
+    child.once('exit', (code, signal) => {
+      exitState = { code, signal };
+      resolveExit();
+    });
+  });
+
+  const processStateError = () => {
+    if (spawnError) {
+      return new Error(
+        `Failed to start the A2UI playground dev server: ${spawnError.message}\n\n${
+          formatLogs(stdout, stderr)
+        }`,
+      );
+    }
+    if (exitState) {
+      return new Error(
+        `A2UI playground dev server exited before it became ready. code=${
+          String(exitState.code)
+        } signal=${String(exitState.signal)}\n\n${formatLogs(stdout, stderr)}`,
+      );
+    }
+    return null;
+  };
+
+  try {
+    await waitForPlaygroundReady(baseUrl, processStateError);
+  } catch (error) {
+    if (!exitState) {
+      await disposeChildProcess(child.pid, detached, exitPromise);
+    }
+    throw error;
+  }
+
+  return {
+    baseUrl,
+    createDemoPreviewUrl(options) {
+      const renderUrl = new URL('/render.html', baseUrl);
+      renderUrl.searchParams.set('protocol', options.protocol ?? 'a2ui');
+      renderUrl.searchParams.set('demoUrl', options.demoUrl ?? './a2ui.web.js');
+      renderUrl.searchParams.set('theme', options.theme ?? 'light');
+      renderUrl.searchParams.set('demo', options.demoId);
+      if (options.speed !== undefined) {
+        renderUrl.searchParams.set('speed', String(options.speed));
+      }
+      return renderUrl.toString();
+    },
+    async dispose() {
+      if (!exitState) {
+        await disposeChildProcess(child.pid, detached, exitPromise);
+      }
+    },
+    getLogs() {
+      return formatLogs(stdout, stderr);
+    },
+  };
+}
+
+/** Throws a descriptive error when any required catalog artifact is missing. */
+function assertPlaygroundPrerequisites(): void {
+  const missing = REQUIRED_CATALOG_ARTIFACTS.filter((artifact) =>
+    !existsSync(resolve(WORKSPACE_ROOT, artifact))
+  );
+  if (missing.length === 0) return;
+
+  const formatted = missing.map((artifact) => `- ${artifact}`).join('\n');
+  throw new Error(
+    `Missing A2UI catalog artifacts required by the playground preview server:\n${formatted}\n\nRun \`pnpm turbo build --filter @lynx-js/a2ui-reactlynx\` before starting @lynx-js/ui-judge model-backed tests.`,
+  );
+}
+
+/**
+ * Reserves an ephemeral port by listening on `127.0.0.1:0`, then closes the
+ * probe server and resolves with the port number it was assigned.
+ */
+async function findFreePort(): Promise<number> {
+  return await new Promise<number>((resolvePort, reject) => {
+    const server = createServer();
+    server.once('error', reject);
+    server.listen(0, '127.0.0.1', () => {
+      const address = server.address();
+      if (!address || typeof address === 'string') {
+        server.close(() => reject(new Error('Failed to allocate a TCP port.')));
+        return;
+      }
+
+      const port = address.port;
+      server.close((error) => {
+        if (error) {
+          reject(error);
+          return;
+        }
+        resolvePort(port);
+      });
+    });
+  });
+}
+
+/**
+ * Polls `/render.html` and `/a2ui.web.js` until both respond OK, rethrowing
+ * any error reported by `getProcessError`; rejects after `READY_TIMEOUT_MS`.
+ */
+async function waitForPlaygroundReady(
+  baseUrl: string,
+  getProcessError: () => Error | null,
+): Promise<void> {
+  const deadline = Date.now() + READY_TIMEOUT_MS;
+  const renderUrl = new URL('/render.html', baseUrl).toString();
+  const bundleUrl = new URL('/a2ui.web.js', baseUrl).toString();
+
+  while (Date.now() < deadline) {
+    const processError = getProcessError();
+    if (processError) throw processError;
+
+    if (await fetchOk(renderUrl) && await fetchOk(bundleUrl)) {
+      return;
+    }
+
+    await sleep(POLL_INTERVAL_MS);
+  }
+
+  const processError = getProcessError();
+  if (processError) throw processError;
+  throw new Error(
+    `Timed out waiting for the A2UI playground preview server at ${baseUrl}.`,
+  );
+}
+
+/** Returns whether `url` answers with an OK status within `FETCH_TIMEOUT_MS`. */
+async function fetchOk(url: string): Promise<boolean> {
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS);
+  try {
+    const response = await fetch(url, {
+      cache: 'no-store',
+      signal: controller.signal,
+    });
+    return response.ok;
+  } catch {
+    return false;
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+/**
+ * Sends SIGTERM, waits up to `DISPOSE_TIMEOUT_MS` for the exit event, then
+ * sends SIGKILL and waits at most one more second. No-op without a pid.
+ */
+async function disposeChildProcess(
+  pid: number | undefined,
+  detached: boolean,
+  exitPromise: Promise<void>,
+): Promise<void> {
+  if (!pid) return;
+
+  tryKill(pid, detached, 'SIGTERM');
+  const didExit = await Promise.race([
+    exitPromise.then(() => true),
+    sleep(DISPOSE_TIMEOUT_MS).then(() => false),
+  ]);
+  if (didExit) return;
+
+  tryKill(pid, detached, 'SIGKILL');
+  await Promise.race([exitPromise, sleep(1_000)]);
+}
+
+/**
+ * Signals `-pid` (the process group on POSIX) when `detached` is set and
+ * retries with the plain pid if that throws; a second failure is ignored.
+ */
+function tryKill(
+  pid: number,
+  detached: boolean,
+  signal: NodeJS.Signals,
+): void {
+  try {
+    process.kill(detached ? -pid : pid, signal);
+  } catch {
+    try {
+      process.kill(pid, signal);
+    } catch {
+      // The process is already gone.
+    }
+  }
+}
+
+/** Formats the command, working directory, and captured output for errors. */
+function formatLogs(stdout: BoundedLog, stderr: BoundedLog): string {
+  const out = stdout.toString().trim();
+  const err = stderr.toString().trim();
+  const cwd = relative(WORKSPACE_ROOT, PLAYGROUND_CWD);
+  return [
+    `command: pnpm dev`,
+    `cwd: ${cwd}`,
+    `stdout:\n${out || '(empty)'}`,
+    `stderr:\n${err || '(empty)'}`,
+  ].join('\n\n');
+}
+
+/** Resolves after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise<void>((resolveSleep) => setTimeout(resolveSleep, ms));
+}
diff --git a/packages/genui/ui-judge/tests/judge-page.spec.ts b/packages/genui/ui-judge/tests/judge-page.spec.ts index 85dc8f1384..781a97768d 100644 --- a/packages/genui/ui-judge/tests/judge-page.spec.ts +++ b/packages/genui/ui-judge/tests/judge-page.spec.ts @@ -1,90 +1,132 @@ // Copyright 2026 The Lynx Authors. All rights reserved. // Licensed under the Apache License Version 2.0 that can be found in the // LICENSE file in the root directory of this source tree. -import { readFile } from 'node:fs/promises'; -import { createServer } from 'node:http'; -import type { Server } from 'node:http'; -import type { AddressInfo } from 'node:net'; +import { mkdir, writeFile } from 'node:fs/promises'; +import { dirname } from 'node:path'; import { expect, test } from '@playwright/test'; +import type { Page } from '@playwright/test'; import { judgePage } from '../src/index.js'; - -let server: Server; -let baseUrl: string; +import type { UiJudgeResult } from '../src/index.js'; +import { + startPlaygroundPreviewServer, +} from './helpers/playground-preview-server.js'; +import type { PlaygroundPreviewServer } from './helpers/playground-preview-server.js'; function hasMidsceneModelConfig(): boolean { return Boolean(process.env['MIDSCENE_MODEL_NAME']); } -test.beforeAll(async () => { - const fixtureHtml = await readFile( - new URL('./fixtures/interactive.html', import.meta.url), - 'utf8', +interface PlaygroundDemoCase { + demoId: string; + expectedText: string; + readyText: string; +} + +const PLAYGROUND_DEMO_CASES: 
PlaygroundDemoCase[] = [ + { + demoId: 'recs', + readyText: 'Recommendations: Date-Night Dining Ideas', + expectedText: 'Sea Breeze Kitchen', + }, + { + demoId: 'trip-planner', + readyText: 'Trip Planner: Kyoto in 48 Hours', + expectedText: 'Monkey Park Viewpoint', + }, + { + demoId: 'weather-current', + readyText: 'Austin, TX', + expectedText: 'Clear skies with light breeze', + }, + { + demoId: 'product-card', + readyText: 'Wireless Headphones Pro', + expectedText: 'Add to Cart', + }, +]; +const JUDGE_DEMO: PlaygroundDemoCase = PLAYGROUND_DEMO_CASES[0]!; +const UI_JUDGE_RESULT_FILE_ENV = 'UI_JUDGE_RESULT_FILE'; + +test.describe('A2UI playground preview', () => { + test.skip( + !hasMidsceneModelConfig(), + 'MIDSCENE_MODEL_NAME is required for the real Midscene model test.', ); - server = createServer((req, res) => { - const url = new URL(req.url ?? '/', 'http://127.0.0.1'); - if (url.pathname === '/' || url.pathname === '/interactive') { - res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); - res.end(fixtureHtml); - return; - } + let previewServer: PlaygroundPreviewServer | undefined; - res.writeHead(404, { 'Content-Type': 'text/plain; charset=utf-8' }); - res.end('not found'); + test.beforeAll(async () => { + previewServer = await startPlaygroundPreviewServer(); }); - await new Promise((resolve) => { - server.listen(0, '127.0.0.1', resolve); + test.afterAll(async () => { + await previewServer?.dispose(); }); - const address = server.address() as AddressInfo; - baseUrl = `http://127.0.0.1:${address.port}`; -}); - -test.afterAll(async () => { - await new Promise((resolve, reject) => { - server.close((error) => { - if (error) { - reject(error); - return; + for (const demo of PLAYGROUND_DEMO_CASES) { + test(`renders playground example ${demo.demoId} with speed zero`, async ({ page }) => { + if (!previewServer) { + throw new Error('A2UI playground preview server was not started.'); } - resolve(); + + const previewUrl = 
previewServer.createDemoPreviewUrl({ + demoId: demo.demoId, + speed: 0, + }); + + await page.setViewportSize({ width: 390, height: 844 }); + await page.goto(previewUrl); + await waitForPreviewText(page, demo.readyText); + await waitForPreviewText(page, demo.expectedText, 2_000); }); - }); -}); + } -test('scores a caller-provided page after Midscene interactions', async ({ page }) => { - test.skip( - !hasMidsceneModelConfig(), - 'MIDSCENE_MODEL_NAME is required for the real Midscene model test.', - ); + test('scores a playground render.html demo with speed zero', async ({ page }) => { + test.setTimeout(300_000); - const steps = ['Click the Reveal details button.']; - await page.setViewportSize({ width: 390, height: 844 }); - await page.goto(`${baseUrl}/interactive`); + if (!previewServer) { + throw new Error('A2UI playground preview server was not started.'); + } - const result = await judgePage({ - page, - task: - 'The page should show an order confirmation card with a revealed status, shipping date, and 390x844 viewport label.', - steps, - timeoutMs: 120_000, - }); + const previewUrl = previewServer.createDemoPreviewUrl({ + demoId: JUDGE_DEMO.demoId, + speed: 0, + }); - expect(result).toMatchObject({ - dimension: 'visual-correctness', - steps, - url: `${baseUrl}/interactive`, + await page.setViewportSize({ width: 390, height: 844 }); + await page.goto(previewUrl); + await waitForPreviewText(page, JUDGE_DEMO.readyText); + await waitForPreviewText(page, JUDGE_DEMO.expectedText, 2_000); + + const task = + 'The A2UI playground preview should show date-night dining recommendations for Moonlight Terrace, Pinewood Bistro, and Sea Breeze Kitchen.'; + const result = await judgePage({ + page, + task, + timeoutMs: 180_000, + }); + + await writeUiJudgeResult({ + result, + task, + }); + + expect(result).toMatchObject({ + dimension: 'visual-correctness', + steps: [], + url: previewUrl, + }); + expect(result.error).toBeUndefined(); + 
expect(result.score).toBeGreaterThanOrEqual(0); + expect(result.score).toBeLessThanOrEqual(5); }); - expect(result.error).toBeUndefined(); - expect(result.score).toBeGreaterThanOrEqual(0); - expect(result.score).toBeLessThanOrEqual(5); }); test('returns a JSON error when input validation fails', async ({ page }) => { - await page.goto(`${baseUrl}/interactive`); + await page.setContent('<h1>Order Confirmed</h1>'); + const url = page.url(); const result = await judgePage({ page, 
@@ -96,7 +138,51 @@ test('returns a JSON error when input validation fails', async ({ page }) => {
     dimension: 'visual-correctness',
     score: 0,
     steps: [],
-    url: `${baseUrl}/interactive`,
+    url,
   });
   expect(result.error?.message).toBeTruthy();
 });
+
+/**
+ * Waits until `text` appears in the `lynx-view` shadow root or the document
+ * body, rejecting when `timeout` (ms) elapses first.
+ */
+async function waitForPreviewText(
+  page: Page,
+  text: string,
+  timeout = 30_000,
+): Promise<void> {
+  await page.waitForFunction(
+    (expectedText) => {
+      const lynxView = document.querySelector('lynx-view');
+      const shadowText = lynxView?.shadowRoot?.textContent ?? '';
+      return shadowText.includes(expectedText)
+        || document.body.textContent?.includes(expectedText) === true;
+    },
+    text,
+    { timeout },
+  );
+}
+
+/**
+ * Writes `{ results: [{ ...result, task }] }` as pretty-printed JSON to the
+ * path in `UI_JUDGE_RESULT_FILE`, creating parent directories; no-op when the
+ * variable is unset.
+ */
+async function writeUiJudgeResult({
+  result,
+  task,
+}: {
+  result: UiJudgeResult;
+  task: string;
+}): Promise<void> {
+  const resultFile = process.env[UI_JUDGE_RESULT_FILE_ENV];
+  if (!resultFile) return;
+
+  await mkdir(dirname(resultFile), { recursive: true });
+  await writeFile(
+    resultFile,
+    `${JSON.stringify({ results: [{ ...result, task }] }, null, 2)}\n`,
+    'utf8',
+  );
+}