From a968ffecbb4127915a2d1574e8e511ec5a58f67d Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Wed, 20 May 2026 11:29:24 +0800 Subject: [PATCH 01/15] Use playground preview for ui-judge tests --- .github/actions/ui-judge-comment/README.md | 47 ++ .github/actions/ui-judge-comment/action.yml | 62 +++ .github/actions/ui-judge-comment/comment.mjs | 419 ++++++++++++++++++ .github/ui-judge-ci.instructions.md | 11 + .github/ui-judge.instructions.md | 12 +- .github/workflows/test.yml | 199 +++++++++ .gitignore | 1 + .../a2ui-playground/lynx-src/a2ui/App.tsx | 6 +- packages/genui/a2ui-playground/src/render.tsx | 9 +- .../a2ui-playground/src/utils/renderUrl.ts | 2 +- packages/genui/ui-judge/README.md | 14 + packages/genui/ui-judge/src/index.ts | 6 +- .../ui-judge/tests/fixtures/interactive.html | 118 ----- .../helpers/playground-preview-server.ts | 272 ++++++++++++ .../genui/ui-judge/tests/judge-page.spec.ts | 195 +++++--- 15 files changed, 1186 insertions(+), 187 deletions(-) create mode 100644 .github/actions/ui-judge-comment/README.md create mode 100644 .github/actions/ui-judge-comment/action.yml create mode 100644 .github/actions/ui-judge-comment/comment.mjs create mode 100644 .github/ui-judge-ci.instructions.md delete mode 100644 packages/genui/ui-judge/tests/fixtures/interactive.html create mode 100644 packages/genui/ui-judge/tests/helpers/playground-preview-server.ts diff --git a/.github/actions/ui-judge-comment/README.md b/.github/actions/ui-judge-comment/README.md new file mode 100644 index 0000000000..8706ec6739 --- /dev/null +++ b/.github/actions/ui-judge-comment/README.md @@ -0,0 +1,47 @@ +# UI Judge Comment + +Creates or updates a pull request comment with `@lynx-js/ui-judge` results. + +The action expects JSON shaped as one `UiJudgeResult`, an array of results, or +an object with a `results` array. 
+ +```yaml +permissions: + pull-requests: write + +steps: + - run: pnpm --filter @lynx-js/ui-judge test + env: + UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json + + - uses: ./.github/actions/ui-judge-comment + with: + result-file: ui-judge-results.json +``` + +Example result payload: + +```json +{ + "results": [ + { + "dimension": "visual-correctness", + "score": 5, + "steps": [], + "url": "http://127.0.0.1:3000/render.html?demo=recs" + } + ] +} +``` + +Inputs: + +- `result-file`: path to a JSON result file. +- `result-json`: inline JSON result payload. Use this instead of + `result-file`. +- `pr-number`: pull request number. Defaults to the `pull_request` event. +- `title`: comment heading. Defaults to `UI Judge`. +- `marker`: hidden marker used to update a previous comment. +- `update-existing`: update the previous marked comment. Defaults to `true`. +- `dry-run`: print the comment body without calling the GitHub API. +- `github-token`: token for the GitHub API. Defaults to `github.token`. diff --git a/.github/actions/ui-judge-comment/action.yml b/.github/actions/ui-judge-comment/action.yml new file mode 100644 index 0000000000..f277e5ef13 --- /dev/null +++ b/.github/actions/ui-judge-comment/action.yml @@ -0,0 +1,62 @@ +name: UI Judge Comment + +description: Create or update a pull request comment with @lynx-js/ui-judge results. + +inputs: + result-file: + description: Path to a JSON file containing a UiJudgeResult, an array of results, or an object with a results array. + required: false + result-json: + description: Inline JSON containing a UiJudgeResult, an array of results, or an object with a results array. + required: false + pr-number: + description: Pull request number. Defaults to the pull_request payload number. + required: false + title: + description: Heading shown in the pull request comment. + default: UI Judge + required: false + marker: + description: Hidden marker used to find and update the previous UI Judge comment. 
+ default: "<!-- ui-judge-comment -->" + required: false + update-existing: + description: Update the previous marked comment instead of creating a new one. + default: "true" + required: false + dry-run: + description: Print the rendered comment without calling the GitHub API. + default: "false" + required: false + github-token: + description: Token used to create or update the pull request comment. + required: false + +outputs: + body: + description: The rendered pull request comment body. + value: ${{ steps.comment.outputs.body }} + comment-id: + description: The created or updated issue comment id. + value: ${{ steps.comment.outputs.comment-id }} + comment-url: + description: The created or updated issue comment URL. + value: ${{ steps.comment.outputs.comment-url }} + +runs: + using: composite + steps: + - name: Create or update UI Judge comment + id: comment + shell: bash + run: node "$GITHUB_ACTION_PATH/comment.mjs" + env: + INPUT_RESULT_FILE: ${{ inputs.result-file }} + INPUT_RESULT_JSON: ${{ inputs.result-json }} + INPUT_PR_NUMBER: ${{ inputs.pr-number }} + INPUT_TITLE: ${{ inputs.title }} + INPUT_MARKER: ${{ inputs.marker }} + INPUT_UPDATE_EXISTING: ${{ inputs.update-existing }} + INPUT_DRY_RUN: ${{ inputs.dry-run }} + INPUT_GITHUB_TOKEN: ${{ inputs.github-token }} + GITHUB_TOKEN: ${{ inputs.github-token || github.token }} diff --git a/.github/actions/ui-judge-comment/comment.mjs b/.github/actions/ui-judge-comment/comment.mjs new file mode 100644 index 0000000000..43a62aa649 --- /dev/null +++ b/.github/actions/ui-judge-comment/comment.mjs @@ -0,0 +1,419 @@ +#!/usr/bin/env node + +// Copyright 2026 The Lynx Authors. All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree.
+import { appendFile, readFile } from 'node:fs/promises'; +import { isAbsolute, resolve } from 'node:path'; + +const MAX_DETAIL_LENGTH = 1_200; +const MAX_COMMENT_LENGTH = 64_000; + +main().catch((error) => { + console.error(error instanceof Error ? error.message : String(error)); + process.exitCode = 1; +}); + +async function main() { + const inputs = readInputs(); + const results = normalizeResults(await readResultPayload(inputs)); + const body = truncateComment(formatComment({ + marker: inputs.marker, + results, + title: inputs.title, + })); + + await writeOutput('body', body); + + if (inputs.dryRun) { + console.info(body); + return; + } + + const event = await readEventPayload(); + const repository = parseRepository(process.env.GITHUB_REPOSITORY); + const prNumber = inputs.prNumber ?? getPullRequestNumber(event); + if (!prNumber) { + throw new Error( + 'Unable to determine the pull request number. Run this action on a pull_request event or pass pr-number.', + ); + } + + const token = inputs.githubToken || process.env.GITHUB_TOKEN; + if (!token) { + throw new Error( + 'Missing github-token. Pass github-token or allow the action to use github.token.', + ); + } + + const client = createGitHubClient(token); + const existingComment = inputs.updateExisting + ? await findExistingComment(client, repository, prNumber, inputs.marker) + : undefined; + const comment = existingComment + ? await updateComment(client, repository, existingComment.id, body) + : await createComment(client, repository, prNumber, body); + + await writeOutput('comment-id', String(comment.id ?? '')); + await writeOutput('comment-url', String(comment.html_url ?? '')); + console.info( + existingComment + ? 
`Updated UI Judge comment: ${comment.html_url}` + : `Created UI Judge comment: ${comment.html_url}`, + ); +} + +function readInputs() { + const resultFile = emptyToUndefined(process.env.INPUT_RESULT_FILE); + const resultJson = emptyToUndefined(process.env.INPUT_RESULT_JSON); + if (!resultFile && !resultJson) { + throw new Error('Pass result-file or result-json to ui-judge-comment.'); + } + if (resultFile && resultJson) { + throw new Error('Pass only one of result-file or result-json.'); + } + + return { + dryRun: parseBoolean(process.env.INPUT_DRY_RUN, false), + githubToken: emptyToUndefined(process.env.INPUT_GITHUB_TOKEN), + marker: process.env.INPUT_MARKER?.trim() || '<!-- ui-judge-comment -->', + prNumber: parseOptionalPositiveInteger(process.env.INPUT_PR_NUMBER), + resultFile, + resultJson, + title: process.env.INPUT_TITLE?.trim() || 'UI Judge', + updateExisting: parseBoolean(process.env.INPUT_UPDATE_EXISTING, true), + }; +} + +async function readResultPayload(inputs) { + if (inputs.resultJson) { + return parseJson(inputs.resultJson, 'result-json'); + } + + const workspace = process.env.GITHUB_WORKSPACE || process.cwd(); + const filePath = isAbsolute(inputs.resultFile) + ? inputs.resultFile + : resolve(workspace, inputs.resultFile); + const content = await readFile(filePath, 'utf8'); + return parseJson(content, filePath); +} + +function parseJson(content, source) { + try { + return JSON.parse(content); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + throw new Error(`Failed to parse ${source} as JSON: ${message}`); + } +} + +function normalizeResults(payload) { + const rawResults = Array.isArray(payload) + ? payload + : (Array.isArray(payload?.results) + ?
payload.results + : [payload]); + + const results = rawResults.map((result, index) => + normalizeResult(result, index) + ); + if (results.length === 0) { + throw new Error('UI Judge result payload did not contain any results.'); + } + return results; +} + +function normalizeResult(result, index) { + if (!result || typeof result !== 'object') { + throw new Error(`UI Judge result at index ${index} must be an object.`); + } + + return { + dimension: stringValue(result.dimension) || 'visual-correctness', + error: normalizeError(result.error), + reference: stringValue(result.reference), + score: normalizeScore(result.score, index), + steps: normalizeSteps(result.steps), + task: stringValue(result.task), + url: stringValue(result.url), + }; +} + +function normalizeScore(value, index) { + const score = typeof value === 'number' ? value : Number(value); + if (!Number.isFinite(score)) { + throw new Error( + `UI Judge result at index ${index} has a non-numeric score.`, + ); + } + return Math.max(0, Math.min(5, Math.round(score))); +} + +function normalizeError(error) { + if (!error) return undefined; + if (typeof error === 'string') return { message: error }; + if (typeof error === 'object') { + return { + message: stringValue(error.message) || JSON.stringify(error), + }; + } + return { message: String(error) }; +} + +function normalizeSteps(steps) { + if (!Array.isArray(steps)) return []; + return steps.filter((step) => typeof step === 'string' && step.trim()) + .map((step) => step.trim()); +} + +function formatComment({ marker, results, title }) { + const average = results.reduce((sum, result) => sum + result.score, 0) + / results.length; + const failedCount = results.filter((result) => result.error).length; + const runUrl = getRunUrl(); + const lines = [ + marker, + `### ${escapeMarkdown(title)}`, + '', + `Average score: **${formatScore(average)} / 5** across ${ + pluralize(results.length, 'result') + }.`, + ]; + + if (failedCount > 0) { + lines.push( + `${failedCount} ${ 
+ failedCount === 1 ? 'result has' : 'results have' + } an error.`, + ); + } + + lines.push( + '', + '| # | Dimension | Score | Page | Status |', + '| - | - | -: | - | - |', + ...results.map((result, index) => formatTableRow(result, index)), + ); + + const details = results + .map((result, index) => formatResultDetails(result, index)) + .filter(Boolean); + if (details.length > 0) { + lines.push( + '', + '<details>', + '<summary>Details</summary>', + '', + ...details, + '</details>', + ); + } + + if (runUrl) { + lines.push('', `[Workflow run](${runUrl})`); + } + + return lines.join('\n'); +} + +function formatTableRow(result, index) { + const page = result.url + ? `[preview](${sanitizeUrlForMarkdown(result.url)})` + : 'n/a'; + const status = result.error ? 'Error' : 'OK'; + return [ + String(index + 1), + escapeTableCell(result.dimension), + `${result.score} / 5`, + page, + status, + ].join(' | ').replace(/^/, '| ').replace(/$/, ' |'); +} + +function formatResultDetails(result, index) { + const lines = [`#### Result ${index + 1}`, '']; + + if (result.task) { + lines.push(`- Task: ${truncateText(result.task)}`); + } + if (result.reference) { + lines.push(`- Reference: ${truncateText(result.reference)}`); + } + if (result.steps.length > 0) { + lines.push( + '- Steps:', + ...result.steps.map((step) => ` - ${truncateText(step)}`), + ); + } + if (result.error) { + lines.push(`- Error: ${truncateText(result.error.message)}`); + } + + return lines.length > 2 ? [...lines, ''].join('\n') : ''; +} + +async function readEventPayload() { + const eventPath = process.env.GITHUB_EVENT_PATH; + if (!eventPath) return {}; + + try { + return parseJson(await readFile(eventPath, 'utf8'), eventPath); + } catch { + return {}; + } +} + +function getPullRequestNumber(event) { + const number = event?.pull_request?.number + ?? (event?.issue?.pull_request ? event.issue.number : undefined); + return Number.isInteger(number) && number > 0 ?
number : undefined; +} + +function parseRepository(repository) { + const [owner, repo] = String(repository || '').split('/'); + if (!owner || !repo) { + throw new Error('GITHUB_REPOSITORY must be set to owner/repo.'); + } + return { owner, repo }; +} + +function createGitHubClient(token) { + const apiUrl = process.env.GITHUB_API_URL || 'https://api.github.com'; + return async function request(path, options = {}) { + const response = await fetch(`${apiUrl}${path}`, { + ...options, + headers: { + accept: 'application/vnd.github+json', + authorization: `Bearer ${token}`, + 'content-type': 'application/json', + 'user-agent': 'lynx-ui-judge-comment', + 'x-github-api-version': '2022-11-28', + ...options.headers, + }, + }); + + const text = await response.text(); + const data = text ? parseJsonResponse(text) : {}; + if (!response.ok) { + const message = data?.message || response.statusText; + throw new Error( + `GitHub API ${ + options.method || 'GET' + } ${path} failed with ${response.status}: ${message}`, + ); + } + return data; + }; +} + +function parseJsonResponse(text) { + try { + return JSON.parse(text); + } catch { + return { message: text }; + } +} + +async function findExistingComment(client, repository, prNumber, marker) { + const comments = await client( + `/repos/${repository.owner}/${repository.repo}/issues/${prNumber}/comments?per_page=100`, + ); + return comments.find((comment) => + typeof comment.body === 'string' && comment.body.includes(marker) + ); +} + +async function createComment(client, repository, prNumber, body) { + return await client( + `/repos/${repository.owner}/${repository.repo}/issues/${prNumber}/comments`, + { + body: JSON.stringify({ body }), + method: 'POST', + }, + ); +} + +async function updateComment(client, repository, commentId, body) { + return await client( + `/repos/${repository.owner}/${repository.repo}/issues/comments/${commentId}`, + { + body: JSON.stringify({ body }), + method: 'PATCH', + }, + ); +} + +async function 
writeOutput(name, value) { + const outputPath = process.env.GITHUB_OUTPUT; + if (!outputPath) return; + + const delimiter = `ui_judge_${name}_${Date.now()}`; + const content = `${name}<<${delimiter}\n${value}\n${delimiter}\n`; + await appendFile(outputPath, content, 'utf8'); +} + +function parseOptionalPositiveInteger(value) { + const normalized = emptyToUndefined(value); + if (!normalized) return undefined; + + const number = Number(normalized); + if (!Number.isInteger(number) || number <= 0) { + throw new Error(`Expected a positive integer, received: ${normalized}`); + } + return number; +} + +function parseBoolean(value, defaultValue) { + const normalized = emptyToUndefined(value); + if (!normalized) return defaultValue; + return ['1', 'true', 'yes', 'on'].includes(normalized.toLowerCase()); +} + +function stringValue(value) { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function emptyToUndefined(value) { + return typeof value === 'string' && value.trim() ? value.trim() : undefined; +} + +function formatScore(value) { + return Number.isInteger(value) ? String(value) : value.toFixed(1); +} + +function pluralize(count, word) { + return `${count} ${count === 1 ? 
word : `${word}s`}`; +} + +function escapeTableCell(value) { + return escapeMarkdown(value).replaceAll('|', '\\|'); +} + +function escapeMarkdown(value) { + return String(value).replaceAll('\n', ' ').trim(); +} + +function sanitizeUrlForMarkdown(url) { + return String(url).replaceAll(')', '%29'); +} + +function truncateText(value) { + const text = escapeMarkdown(value); + if (text.length <= MAX_DETAIL_LENGTH) return text; + return `${text.slice(0, MAX_DETAIL_LENGTH - 3)}...`; +} + +function truncateComment(body) { + if (body.length <= MAX_COMMENT_LENGTH) return body; + return `${ + body.slice(0, MAX_COMMENT_LENGTH - 120) + }\n\n_Comment truncated because it exceeded ${MAX_COMMENT_LENGTH} characters._`; +} + +function getRunUrl() { + const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; + const repository = process.env.GITHUB_REPOSITORY; + const runId = process.env.GITHUB_RUN_ID; + if (!repository || !runId) return undefined; + return `${serverUrl}/${repository}/actions/runs/${runId}`; +} diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md new file mode 100644 index 0000000000..5fef91e49f --- /dev/null +++ b/.github/ui-judge-ci.instructions.md @@ -0,0 +1,11 @@ +--- +applyTo: ".github/workflows/test.yml,.github/actions/ui-judge-comment/**" +--- + +When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Use `continue-on-error` only on the judge execution step, run the comment action afterward with `always()`, then add a final failing step keyed to `steps.<id>.outcome == 'failure'`. + +Use step-level `timeout-minutes` on long UI Judge setup, build, and model execution steps so a hung prerequisite fails early enough for the fallback result writer and PR comment action to run before the job-level timeout kills the whole job.
+ +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` pattern. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, run `pnpm turbo build --summarize` in the Playwright container, then run the UI Judge-specific playground artifact preparation and package test. + +Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. diff --git a/.github/ui-judge.instructions.md b/.github/ui-judge.instructions.md index 73b2d2b846..2cee725d64 100644 --- a/.github/ui-judge.instructions.md +++ b/.github/ui-judge.instructions.md @@ -4,10 +4,18 @@ applyTo: "packages/genui/ui-judge/**/*" When extending `@lynx-js/ui-judge`, keep `judgePage` as the only public runtime API until a caller needs more surface area. Callers own Playwright page setup, navigation, viewport, cookies, route mocks, and authentication. Additional dimensions should remain internal unless they are intentionally added to the package exports. -Midscene scoring in this package should use `aiNumber()` and return a JSON-serializable integer score from 0 to 5. Do not reintroduce letter grades or `GRADE:` output in prompts. +Midscene scoring in this package should use `aiNumber()` and return a JSON-serializable integer score from 0 to 5. Prompt text must cooperate with Midscene's `aiNumber()` parser by asking for the requested `Number` field, not a bare JSON number. Do not reintroduce letter grades or `GRADE:` output in prompts. Avoid writing screenshots by default. Playwright and Midscene may capture the page internally, but persistent screenshot artifacts should require an explicit future option. Midscene currently brings in `sharp`; keep its pnpm build-script policy explicit in `pnpm-workspace.yaml` rather than letting `pnpm install` leave the placeholder value. 
-Model-backed Playwright tests should use the real Midscene service when `MIDSCENE_MODEL_NAME` is configured, and skip only the model-dependent cases when that environment variable is absent. +Model-backed Playwright tests should use the real Midscene service when `MIDSCENE_MODEL_NAME` is configured, and skip only the model-dependent cases when that environment variable is absent. Keep the playground server startup inside the skipped model-backed test group so non-model validation tests do not bind local ports. + +Prefer `page.setContent()` or another non-listening fixture setup for static `@lynx-js/ui-judge` Playwright fixtures. Avoid starting local HTTP servers in package tests unless the behavior under test specifically needs network navigation. + +When a `@lynx-js/ui-judge` Playwright test needs real network navigation, use the A2UI playground preview server rather than a package-local scratch HTTP server. Start `pnpm dev` from `packages/genui/a2ui-playground` and navigate Playwright to the playground `render.html` demo route, such as `/render.html?protocol=a2ui&demoUrl=.%2Fa2ui.web.js&theme=light&demo=recs&speed=0`. + +The A2UI playground preview server requires generated catalog artifacts from `@lynx-js/a2ui-reactlynx`. If they are missing, fail with a clear prerequisite message that points to `pnpm --filter @lynx-js/a2ui-reactlynx build` instead of silently running broad cross-package builds from Playwright hooks. + +The Codex sandbox blocks TCP listeners on loopback addresses such as `127.0.0.1`, `localhost`, `0.0.0.0`, and `::1`, so bind-dependent verification should use an escalated command such as `pnpm --filter @lynx-js/ui-judge test` rather than rewriting the test to avoid the bind. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index f23e07fcbb..2e042a31c5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,6 +72,204 @@ jobs: runs-on: lynx-ubuntu-24.04-medium run: pnpm eslint . 
--flag v10_config_lookup_from_file + ui-judge: + needs: build + runs-on: lynx-custom-container + timeout-minutes: 30 + permissions: + contents: read + issues: write + pull-requests: write + container: + image: mcr.microsoft.com/playwright:v1.58.2-noble + env: + CI: 1 + TURBO_TELEMETRY_DISABLED: 1 + steps: + - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + with: + fetch-depth: 0 + persist-credentials: false + - uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5 + with: + node-version: "22" + package-manager-cache: false + - name: Check UI Judge inputs + id: ui-judge-inputs + shell: bash + env: + GITHUB_TOKEN: ${{ github.token }} + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + run: | + set -euo pipefail + node --input-type=module <<'NODE' + import { appendFileSync, readFileSync } from 'node:fs'; + + const relevantFilePatterns = [ + /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, + /^\.github\/actions\/ui-judge-comment\//, + /^\.github\/workflows\/test\.yml$/, + /^\.github\/ui-judge\.instructions\.md$/, + ]; + + let shouldRun = false; + let reason = 'UI Judge only comments on pull_request events.'; + + if (process.env.GITHUB_EVENT_NAME === 'pull_request') { + if ( + !process.env.MIDSCENE_MODEL_NAME + || !process.env.MIDSCENE_MODEL_API_KEY + ) { + reason = + 'Midscene model secrets are not configured for this pull request.'; + } else { + const changedFiles = await listPullRequestFiles(); + shouldRun = changedFiles.some((file) => + relevantFilePatterns.some((pattern) => pattern.test(file)) + ); + reason = shouldRun + ? 'Relevant UI Judge files changed.' 
+ : 'No UI Judge, A2UI, or playground files changed.'; + } + } + + appendFileSync(process.env.GITHUB_OUTPUT, `should-run=${shouldRun}\n`); + appendFileSync(process.env.GITHUB_OUTPUT, `reason=${reason}\n`); + console.info(reason); + + async function listPullRequestFiles() { + const event = JSON.parse(readFileSync(process.env.GITHUB_EVENT_PATH, 'utf8')); + const pullRequestNumber = event.pull_request?.number ?? event.number; + const repository = process.env.GITHUB_REPOSITORY; + const token = process.env.GITHUB_TOKEN; + + if (!pullRequestNumber || !repository || !token) { + throw new Error( + 'Unable to list pull request files for the UI Judge gate.', + ); + } + + const apiUrl = process.env.GITHUB_API_URL || 'https://api.github.com'; + const files = []; + for (let page = 1; page < 11; page++) { + const url = `${apiUrl}/repos/${repository}/pulls/${pullRequestNumber}/files?per_page=100&page=${page}`; + const response = await fetch(url, { + headers: { + accept: 'application/vnd.github+json', + authorization: `Bearer ${token}`, + 'x-github-api-version': '2022-11-28', + }, + }); + if (!response.ok) { + throw new Error( + `Failed to list pull request files: ${response.status} ${response.statusText}`, + ); + } + + const pageFiles = await response.json(); + files.push( + ...pageFiles + .map((file) => file.filename) + .filter((file) => typeof file === 'string'), + ); + if (pageFiles.length < 100) break; + } + return files; + } + NODE + - name: TurboCache + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + uses: lynx-infra/cache@5c6160a6a4c7fca80a2f3057bb9dfc9513fcb732 + with: + path: .turbo + # Match the reusable Playwright test workflow: the build job must have + # already produced the cache for this exact commit. 
+ key: turbo-v4-${{ runner.os }}-${{ hashFiles('**/packages/**/src/**/*.rs') }}-${{ github.sha }} + fail-on-cache-miss: true + - name: Install native build tools + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + run: | + apt-get update + apt-get install -y --no-install-recommends build-essential + - uses: ./.github/actions/rustup + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + with: + key: ui-judge + - name: Install Rust wasm targets + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + run: rustup target add wasm32-unknown-unknown wasm32-wasip1 + - name: Install + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + working-directory: ${{ github.workspace }} + run: | + cd "$GITHUB_WORKSPACE" + npm install -g corepack@latest + corepack enable + pnpm install --frozen-lockfile + - name: Build + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + timeout-minutes: 10 + working-directory: ${{ github.workspace }} + env: + NODE_OPTIONS: --max-old-space-size=32768 + run: | + cd "$GITHUB_WORKSPACE" + pnpm turbo build --summarize + - name: Prepare A2UI playground artifacts + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + timeout-minutes: 12 + working-directory: ${{ github.workspace }} + run: | + cd "$GITHUB_WORKSPACE" + pnpm turbo build:lynx --filter a2ui-playground + - name: Run UI Judge + id: ui-judge + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + continue-on-error: true + timeout-minutes: 10 + working-directory: ${{ github.workspace }} + env: + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} + MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} + UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json + run: | + cd 
"$GITHUB_WORKSPACE" + pnpm --filter @lynx-js/ui-judge test + - name: Write UI Judge failure result + if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' && (failure() || steps.ui-judge.outcome == 'failure') && hashFiles('ui-judge-results.json') == '' }} + working-directory: ${{ github.workspace }} + env: + UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json + run: | + cd "$GITHUB_WORKSPACE" + cat > "$UI_JUDGE_RESULT_FILE" <<'JSON' + { + "results": [ + { + "dimension": "visual-correctness", + "score": 0, + "error": { + "message": "UI Judge CI failed before writing a model result. See the workflow logs for details." + }, + "steps": [], + "url": "" + } + ] + } + JSON + - name: Comment UI Judge result + if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' }} + uses: ./.github/actions/ui-judge-comment + with: + result-file: ui-judge-results.json + - name: Fail when UI Judge fails + if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' && steps.ui-judge.outcome == 'failure' }} + run: exit 1 + lighthouse: needs: build uses: ./.github/workflows/workflow-test.yml @@ -381,6 +579,7 @@ jobs: - benchmark - code-style-check - eslint + - ui-judge # - playwright-linux - playwright-web-elements - test-api diff --git a/.gitignore b/.gitignore index eef93622aa..ea9ee57f63 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ www playwright-report test-results trace.zip +midscene_run .turbo **/test/js .swc diff --git a/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx b/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx index f8bcf6186c..337349ca8f 100644 --- a/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx +++ b/packages/genui/a2ui-playground/lynx-src/a2ui/App.tsx @@ -341,14 +341,16 @@ export function App() { const playbackTargetCountRef = useRef(0); // Per-batch delay (ms) the mock agent waits between successive - // protocol messages. Configurable via `?speed=2` (faster) etc. + // protocol messages. 
Configurable via `?speed=2` (faster); + // `?speed=0` paints the full stream with no delay. const streamDelay = useMemo(() => { const raw = (globalProps as Record<string, unknown> | null)?.speed ?? (rawInitData as Record<string, unknown> | null)?.speed; const speed = typeof raw === 'string' ? Number(raw) : (typeof raw === 'number' ? raw : 1); - if (!speed || speed <= 0) return DEFAULT_STREAM_DELAY_MS; + if (!Number.isFinite(speed) || speed < 0) return DEFAULT_STREAM_DELAY_MS; + if (speed === 0) return 0; return DEFAULT_STREAM_DELAY_MS / speed; }, [globalProps, rawInitData]); diff --git a/packages/genui/a2ui-playground/src/render.tsx b/packages/genui/a2ui-playground/src/render.tsx index ffc63eaf79..b890a199ce 100644 --- a/packages/genui/a2ui-playground/src/render.tsx +++ b/packages/genui/a2ui-playground/src/render.tsx @@ -13,7 +13,7 @@ import { decodeBase64Url } from './utils/base64url.js'; import { DEFAULT_A2UI_DEMO_URL } from './utils/demoUrl.js'; interface InitData { - protocol?: '0.9'; + protocol?: '0.9' | 'a2ui' | 'openui'; messagesUrl?: string; messages?: unknown; actionMocksUrl?: string; @@ -107,7 +107,10 @@ function parseInitDataFromQuery(): InitData | null { return null; } - const protocolValue = protocol === '0.9' ? '0.9' : undefined; + const protocolValue = protocol === '0.9' || protocol === 'a2ui' + || protocol === 'openui' + ? protocol + : undefined; const speedRaw = params.get('speed'); const speedVal = speedRaw === null ? undefined : Number(speedRaw); @@ -118,7 +121,7 @@ function parseInitDataFromQuery(): InitData | null { actionMocksUrl: actionMocksUrl ?? undefined, demoUrl: demoUrl ?? undefined, messages: [], // Default to an empty array - speed: speedVal && Number.isFinite(speedVal) && speedVal > 0 + speed: speedVal !== undefined && Number.isFinite(speedVal) && speedVal >= 0 ? speedVal : undefined, instant: instant === '1' ?
true : undefined, diff --git a/packages/genui/a2ui-playground/src/utils/renderUrl.ts b/packages/genui/a2ui-playground/src/utils/renderUrl.ts index 76d2b1492a..8dcc7ba5c9 100644 --- a/packages/genui/a2ui-playground/src/utils/renderUrl.ts +++ b/packages/genui/a2ui-playground/src/utils/renderUrl.ts @@ -13,7 +13,7 @@ export interface RenderInit { theme?: 'light' | 'dark'; /** When set, use a short `?demo=` param instead of inlining the payload. */ demoId?: string; - /** Simulation speed multiplier (e.g. 0.5, 1, 2, 4). */ + /** Simulation speed multiplier (e.g. 0, 0.5, 1, 2, 4); 0 disables delay. */ speed?: number; /** When true, render the final UI immediately without streaming playback. */ instant?: boolean; diff --git a/packages/genui/ui-judge/README.md b/packages/genui/ui-judge/README.md index 3a9be88494..73904c9bdd 100644 --- a/packages/genui/ui-judge/README.md +++ b/packages/genui/ui-judge/README.md @@ -31,3 +31,17 @@ variables, such as `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_API_KEY`, The Playwright test suite uses the real Midscene service when `MIDSCENE_MODEL_NAME` is present. Without model configuration, the model-backed test is skipped and the error-path test still runs. + +The model-backed package test uses the A2UI playground preview server instead +of a scratch HTTP fixture. It opens the playground's `render.html` demo route +with `speed=0`, for example +`/render.html?protocol=a2ui&demoUrl=.%2Fa2ui.web.js&theme=light&demo=recs&speed=0`. +Prepare the playground artifacts first: + +```sh +pnpm turbo build:lynx --filter a2ui-playground +pnpm --filter @lynx-js/ui-judge test +``` + +The playground dev server binds to a local TCP port, so sandboxed runs need +local-bind permission. 
diff --git a/packages/genui/ui-judge/src/index.ts b/packages/genui/ui-judge/src/index.ts index 0d41bbf67a..7e4add210b 100644 --- a/packages/genui/ui-judge/src/index.ts +++ b/packages/genui/ui-judge/src/index.ts @@ -154,7 +154,9 @@ function buildVisualCorrectnessPrompt( Task: ${options.task} ${reference} -Return exactly one integer from 0 to 5. Do not return "GRADE:", letters, Markdown, prose, or explanation. +Set Midscene's requested Number result to exactly one integer from 0 to 5. +Do not return a bare JSON number; the structured result must use the Number field. +Do not return "GRADE:", letters, Markdown, prose, or explanation. Use this scale: 5 = The UI fully satisfies the task and reference. @@ -173,7 +175,7 @@ Grading notes: 6. If data binding paths are not explicitly specified, accept any logically sound path structure. 7. Do not award a high score when required components are missing or substantive behavior is wrong. -Think through the criteria internally, then return only the final integer score.`; +Think through the criteria internally, then return only the structured Number result.`; } async function waitForNetworkIdleBestEffort( diff --git a/packages/genui/ui-judge/tests/fixtures/interactive.html b/packages/genui/ui-judge/tests/fixtures/interactive.html deleted file mode 100644 index f11faef9bd..0000000000 --- a/packages/genui/ui-judge/tests/fixtures/interactive.html +++ /dev/null @@ -1,118 +0,0 @@ - - - - - - UI Judge Fixture - - - -
-

Order confirmed

-

- The generated UI should show a confirmation card and reveal shipping - details after interaction. -

- -
-
-
Status
-
Paid
-
-
-
Shipping
-
Arrives Friday
-
-
-
Viewport
-
unknown
-
-
-
- - - diff --git a/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts b/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts new file mode 100644 index 0000000000..41e4e63287 --- /dev/null +++ b/packages/genui/ui-judge/tests/helpers/playground-preview-server.ts @@ -0,0 +1,272 @@ +// Copyright 2026 The Lynx Authors. All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree. +import { spawn } from 'node:child_process'; +import { existsSync } from 'node:fs'; +import { createServer } from 'node:net'; +import { dirname, relative, resolve } from 'node:path'; +import { fileURLToPath } from 'node:url'; + +interface PlaygroundDemoPreviewOptions { + demoId: string; + demoUrl?: string; + protocol?: 'a2ui' | 'openui'; + speed?: number; + theme?: 'light' | 'dark'; +} + +export interface PlaygroundPreviewServer { + readonly baseUrl: string; + createDemoPreviewUrl(options: PlaygroundDemoPreviewOptions): string; + dispose(): Promise; + getLogs(): string; +} + +const HELPER_DIR = dirname(fileURLToPath(import.meta.url)); +const WORKSPACE_ROOT = resolve(HELPER_DIR, '../../../../..'); +const PLAYGROUND_CWD = resolve( + WORKSPACE_ROOT, + 'packages/genui/a2ui-playground', +); +const REQUIRED_CATALOG_ARTIFACTS = [ + 'packages/genui/a2ui/dist/catalog/Button/catalog.json', + 'packages/genui/a2ui/dist/catalog/Text/catalog.json', +]; + +const READY_TIMEOUT_MS = 120_000; +const POLL_INTERVAL_MS = 250; +const FETCH_TIMEOUT_MS = 2_500; +const DISPOSE_TIMEOUT_MS = 5_000; +const LOG_LIMIT = 12_000; + +class BoundedLog { + #value = ''; + + append(chunk: unknown): void { + this.#value += Buffer.isBuffer(chunk) + ? 
chunk.toString('utf8') + : String(chunk); + if (this.#value.length > LOG_LIMIT) { + this.#value = this.#value.slice(-LOG_LIMIT); + } + } + + toString(): string { + return this.#value; + } +} + +export async function startPlaygroundPreviewServer(): Promise< + PlaygroundPreviewServer +> { + assertPlaygroundPrerequisites(); + + const port = await findFreePort(); + const baseUrl = `http://127.0.0.1:${port}`; + const stdout = new BoundedLog(); + const stderr = new BoundedLog(); + let spawnError: Error | null = null; + let exitState: { code: number | null; signal: NodeJS.Signals | null } | null = + null; + + const detached = process.platform !== 'win32'; + const child = spawn('pnpm', ['dev'], { + cwd: PLAYGROUND_CWD, + detached, + env: { + ...process.env, + PORT: String(port), + }, + stdio: ['ignore', 'pipe', 'pipe'], + }); + + child.stdout?.on('data', (chunk) => stdout.append(chunk)); + child.stderr?.on('data', (chunk) => stderr.append(chunk)); + child.once('error', (error) => { + spawnError = error; + }); + + const exitPromise = new Promise((resolveExit) => { + child.once('exit', (code, signal) => { + exitState = { code, signal }; + resolveExit(); + }); + }); + + const processStateError = () => { + if (spawnError) { + return new Error( + `Failed to start the A2UI playground dev server: ${spawnError.message}\n\n${ + formatLogs(stdout, stderr) + }`, + ); + } + if (exitState) { + return new Error( + `A2UI playground dev server exited before it became ready. code=${ + String(exitState.code) + } signal=${String(exitState.signal)}\n\n${formatLogs(stdout, stderr)}`, + ); + } + return null; + }; + + try { + await waitForPlaygroundReady(baseUrl, processStateError); + } catch (error) { + if (!exitState) { + await disposeChildProcess(child.pid, detached, exitPromise); + } + throw error; + } + + return { + baseUrl, + createDemoPreviewUrl(options) { + const renderUrl = new URL('/render.html', baseUrl); + renderUrl.searchParams.set('protocol', options.protocol ?? 
'a2ui'); + renderUrl.searchParams.set('demoUrl', options.demoUrl ?? './a2ui.web.js'); + renderUrl.searchParams.set('theme', options.theme ?? 'light'); + renderUrl.searchParams.set('demo', options.demoId); + if (options.speed !== undefined) { + renderUrl.searchParams.set('speed', String(options.speed)); + } + return renderUrl.toString(); + }, + async dispose() { + if (!exitState) { + await disposeChildProcess(child.pid, detached, exitPromise); + } + }, + getLogs() { + return formatLogs(stdout, stderr); + }, + }; +} + +function assertPlaygroundPrerequisites(): void { + const missing = REQUIRED_CATALOG_ARTIFACTS.filter((artifact) => + !existsSync(resolve(WORKSPACE_ROOT, artifact)) + ); + if (missing.length === 0) return; + + const formatted = missing.map((artifact) => `- ${artifact}`).join('\n'); + throw new Error( + `Missing A2UI catalog artifacts required by the playground preview server:\n${formatted}\n\nRun \`pnpm --filter @lynx-js/a2ui-reactlynx build\` before starting @lynx-js/ui-judge model-backed tests.`, + ); +} + +async function findFreePort(): Promise { + return await new Promise((resolvePort, reject) => { + const server = createServer(); + server.once('error', reject); + server.listen(0, '127.0.0.1', () => { + const address = server.address(); + if (!address || typeof address === 'string') { + server.close(() => reject(new Error('Failed to allocate a TCP port.'))); + return; + } + + const port = address.port; + server.close((error) => { + if (error) { + reject(error); + return; + } + resolvePort(port); + }); + }); + }); +} + +async function waitForPlaygroundReady( + baseUrl: string, + getProcessError: () => Error | null, +): Promise { + const deadline = Date.now() + READY_TIMEOUT_MS; + const renderUrl = new URL('/render.html', baseUrl).toString(); + const bundleUrl = new URL('/a2ui.web.js', baseUrl).toString(); + + while (Date.now() < deadline) { + const processError = getProcessError(); + if (processError) throw processError; + + if (await 
fetchOk(renderUrl) && await fetchOk(bundleUrl)) { + return; + } + + await sleep(POLL_INTERVAL_MS); + } + + const processError = getProcessError(); + if (processError) throw processError; + throw new Error( + `Timed out waiting for the A2UI playground preview server at ${baseUrl}.`, + ); +} + +async function fetchOk(url: string): Promise { + const controller = new AbortController(); + const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS); + try { + const response = await fetch(url, { + cache: 'no-store', + signal: controller.signal, + }); + return response.ok; + } catch { + return false; + } finally { + clearTimeout(timeout); + } +} + +async function disposeChildProcess( + pid: number | undefined, + detached: boolean, + exitPromise: Promise, +): Promise { + if (!pid) return; + + tryKill(pid, detached, 'SIGTERM'); + const didExit = await Promise.race([ + exitPromise.then(() => true), + sleep(DISPOSE_TIMEOUT_MS).then(() => false), + ]); + if (didExit) return; + + tryKill(pid, detached, 'SIGKILL'); + await Promise.race([exitPromise, sleep(1_000)]); +} + +function tryKill( + pid: number, + detached: boolean, + signal: NodeJS.Signals, +): void { + try { + process.kill(detached ? -pid : pid, signal); + } catch { + try { + process.kill(pid, signal); + } catch { + // The process is already gone. 
+ } + } +} + +function formatLogs(stdout: BoundedLog, stderr: BoundedLog): string { + const out = stdout.toString().trim(); + const err = stderr.toString().trim(); + const cwd = relative(WORKSPACE_ROOT, PLAYGROUND_CWD); + return [ + `command: pnpm dev`, + `cwd: ${cwd}`, + `stdout:\n${out || '(empty)'}`, + `stderr:\n${err || '(empty)'}`, + ].join('\n\n'); +} + +function sleep(ms: number): Promise { + return new Promise((resolveSleep) => setTimeout(resolveSleep, ms)); +} diff --git a/packages/genui/ui-judge/tests/judge-page.spec.ts b/packages/genui/ui-judge/tests/judge-page.spec.ts index 85dc8f1384..781a97768d 100644 --- a/packages/genui/ui-judge/tests/judge-page.spec.ts +++ b/packages/genui/ui-judge/tests/judge-page.spec.ts @@ -1,90 +1,132 @@ // Copyright 2026 The Lynx Authors. All rights reserved. // Licensed under the Apache License Version 2.0 that can be found in the // LICENSE file in the root directory of this source tree. -import { readFile } from 'node:fs/promises'; -import { createServer } from 'node:http'; -import type { Server } from 'node:http'; -import type { AddressInfo } from 'node:net'; +import { mkdir, writeFile } from 'node:fs/promises'; +import { dirname } from 'node:path'; import { expect, test } from '@playwright/test'; +import type { Page } from '@playwright/test'; import { judgePage } from '../src/index.js'; - -let server: Server; -let baseUrl: string; +import type { UiJudgeResult } from '../src/index.js'; +import { + startPlaygroundPreviewServer, +} from './helpers/playground-preview-server.js'; +import type { PlaygroundPreviewServer } from './helpers/playground-preview-server.js'; function hasMidsceneModelConfig(): boolean { return Boolean(process.env['MIDSCENE_MODEL_NAME']); } -test.beforeAll(async () => { - const fixtureHtml = await readFile( - new URL('./fixtures/interactive.html', import.meta.url), - 'utf8', +interface PlaygroundDemoCase { + demoId: string; + expectedText: string; + readyText: string; +} + +const PLAYGROUND_DEMO_CASES: 
PlaygroundDemoCase[] = [ + { + demoId: 'recs', + readyText: 'Recommendations: Date-Night Dining Ideas', + expectedText: 'Sea Breeze Kitchen', + }, + { + demoId: 'trip-planner', + readyText: 'Trip Planner: Kyoto in 48 Hours', + expectedText: 'Monkey Park Viewpoint', + }, + { + demoId: 'weather-current', + readyText: 'Austin, TX', + expectedText: 'Clear skies with light breeze', + }, + { + demoId: 'product-card', + readyText: 'Wireless Headphones Pro', + expectedText: 'Add to Cart', + }, +]; +const JUDGE_DEMO: PlaygroundDemoCase = PLAYGROUND_DEMO_CASES[0]!; +const UI_JUDGE_RESULT_FILE_ENV = 'UI_JUDGE_RESULT_FILE'; + +test.describe('A2UI playground preview', () => { + test.skip( + !hasMidsceneModelConfig(), + 'MIDSCENE_MODEL_NAME is required for the real Midscene model test.', ); - server = createServer((req, res) => { - const url = new URL(req.url ?? '/', 'http://127.0.0.1'); - if (url.pathname === '/' || url.pathname === '/interactive') { - res.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' }); - res.end(fixtureHtml); - return; - } + let previewServer: PlaygroundPreviewServer | undefined; - res.writeHead(404, { 'Content-Type': 'text/plain; charset=utf-8' }); - res.end('not found'); + test.beforeAll(async () => { + previewServer = await startPlaygroundPreviewServer(); }); - await new Promise((resolve) => { - server.listen(0, '127.0.0.1', resolve); + test.afterAll(async () => { + await previewServer?.dispose(); }); - const address = server.address() as AddressInfo; - baseUrl = `http://127.0.0.1:${address.port}`; -}); - -test.afterAll(async () => { - await new Promise((resolve, reject) => { - server.close((error) => { - if (error) { - reject(error); - return; + for (const demo of PLAYGROUND_DEMO_CASES) { + test(`renders playground example ${demo.demoId} with speed zero`, async ({ page }) => { + if (!previewServer) { + throw new Error('A2UI playground preview server was not started.'); } - resolve(); + + const previewUrl = 
previewServer.createDemoPreviewUrl({ + demoId: demo.demoId, + speed: 0, + }); + + await page.setViewportSize({ width: 390, height: 844 }); + await page.goto(previewUrl); + await waitForPreviewText(page, demo.readyText); + await waitForPreviewText(page, demo.expectedText, 2_000); }); - }); -}); + } -test('scores a caller-provided page after Midscene interactions', async ({ page }) => { - test.skip( - !hasMidsceneModelConfig(), - 'MIDSCENE_MODEL_NAME is required for the real Midscene model test.', - ); + test('scores a playground render.html demo with speed zero', async ({ page }) => { + test.setTimeout(300_000); - const steps = ['Click the Reveal details button.']; - await page.setViewportSize({ width: 390, height: 844 }); - await page.goto(`${baseUrl}/interactive`); + if (!previewServer) { + throw new Error('A2UI playground preview server was not started.'); + } - const result = await judgePage({ - page, - task: - 'The page should show an order confirmation card with a revealed status, shipping date, and 390x844 viewport label.', - steps, - timeoutMs: 120_000, - }); + const previewUrl = previewServer.createDemoPreviewUrl({ + demoId: JUDGE_DEMO.demoId, + speed: 0, + }); - expect(result).toMatchObject({ - dimension: 'visual-correctness', - steps, - url: `${baseUrl}/interactive`, + await page.setViewportSize({ width: 390, height: 844 }); + await page.goto(previewUrl); + await waitForPreviewText(page, JUDGE_DEMO.readyText); + await waitForPreviewText(page, JUDGE_DEMO.expectedText, 2_000); + + const task = + 'The A2UI playground preview should show date-night dining recommendations for Moonlight Terrace, Pinewood Bistro, and Sea Breeze Kitchen.'; + const result = await judgePage({ + page, + task, + timeoutMs: 180_000, + }); + + await writeUiJudgeResult({ + result, + task, + }); + + expect(result).toMatchObject({ + dimension: 'visual-correctness', + steps: [], + url: previewUrl, + }); + expect(result.error).toBeUndefined(); + 
expect(result.score).toBeGreaterThanOrEqual(0); + expect(result.score).toBeLessThanOrEqual(5); }); - expect(result.error).toBeUndefined(); - expect(result.score).toBeGreaterThanOrEqual(0); - expect(result.score).toBeLessThanOrEqual(5); }); test('returns a JSON error when input validation fails', async ({ page }) => { - await page.goto(`${baseUrl}/interactive`); + await page.setContent('

Order Confirmed

'); + const url = page.url(); const result = await judgePage({ page, @@ -96,7 +138,42 @@ test('returns a JSON error when input validation fails', async ({ page }) => { dimension: 'visual-correctness', score: 0, steps: [], - url: `${baseUrl}/interactive`, + url, }); expect(result.error?.message).toBeTruthy(); }); + +async function waitForPreviewText( + page: Page, + text: string, + timeout = 30_000, +): Promise { + await page.waitForFunction( + (expectedText) => { + const lynxView = document.querySelector('lynx-view'); + const shadowText = lynxView?.shadowRoot?.textContent ?? ''; + return shadowText.includes(expectedText) + || document.body.textContent?.includes(expectedText) === true; + }, + text, + { timeout }, + ); +} + +async function writeUiJudgeResult({ + result, + task, +}: { + result: UiJudgeResult; + task: string; +}): Promise { + const resultFile = process.env[UI_JUDGE_RESULT_FILE_ENV]; + if (!resultFile) return; + + await mkdir(dirname(resultFile), { recursive: true }); + await writeFile( + resultFile, + `${JSON.stringify({ results: [{ ...result, task }] }, null, 2)}\n`, + 'utf8', + ); +} From b021be07488082f3147644329418628257c44559 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 14:18:53 +0800 Subject: [PATCH 02/15] Show UI judge rerun attempts in PR comment --- .github/actions/ui-judge-comment/comment.mjs | 23 +++++++++++++++----- .github/ui-judge-ci.instructions.md | 2 ++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/.github/actions/ui-judge-comment/comment.mjs b/.github/actions/ui-judge-comment/comment.mjs index 43a62aa649..280b1e0dcd 100644 --- a/.github/actions/ui-judge-comment/comment.mjs +++ b/.github/actions/ui-judge-comment/comment.mjs @@ -170,7 +170,7 @@ function formatComment({ marker, results, title }) { const average = results.reduce((sum, result) => sum + result.score, 0) / results.length; const failedCount = results.filter((result) => result.error).length; - 
const runUrl = getRunUrl(); + const runLink = getRunLink(); const lines = [ marker, `### ${escapeMarkdown(title)}`, @@ -209,8 +209,8 @@ function formatComment({ marker, results, title }) { ); } - if (runUrl) { - lines.push('', `[Workflow run](${runUrl})`); + if (runLink) { + lines.push('', `[${runLink.label}](${runLink.url})`); } return lines.join('\n'); @@ -410,10 +410,23 @@ function truncateComment(body) { }\n\n_Comment truncated because it exceeded ${MAX_COMMENT_LENGTH} characters._`; } -function getRunUrl() { +function getRunLink() { const serverUrl = process.env.GITHUB_SERVER_URL || 'https://github.com'; const repository = process.env.GITHUB_REPOSITORY; const runId = process.env.GITHUB_RUN_ID; if (!repository || !runId) return undefined; - return `${serverUrl}/${repository}/actions/runs/${runId}`; + + const runUrl = `${serverUrl}/${repository}/actions/runs/${runId}`; + const runAttempt = Number(process.env.GITHUB_RUN_ATTEMPT || '1'); + if (!Number.isInteger(runAttempt) || runAttempt <= 1) { + return { + label: 'Workflow run', + url: runUrl, + }; + } + + return { + label: `Workflow run (attempt ${runAttempt})`, + url: `${runUrl}/attempts/${runAttempt}`, + }; } diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 5fef91e49f..80461e3e35 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -9,3 +9,5 @@ Use step-level `timeout-minutes` on long UI Judge setup, build, and model execut Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` pattern. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, run `pnpm turbo build --summarize` in the Playwright container, then run the UI Judge-specific playground artifact preparation and package test. 
Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. + +When rendering the UI Judge PR comment, include `GITHUB_RUN_ATTEMPT` in the workflow footer/link. GitHub reruns keep the same `GITHUB_RUN_ID`, so relying only on the run URL can make a successful rerun write an identical comment body and appear not to update. From 19935de1f8adaf2bf3f02028dad90c752542b12f Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 14:47:25 +0800 Subject: [PATCH 03/15] Scope ui judge CI build steps --- .github/ui-judge-ci.instructions.md | 2 +- .github/workflows/test.yml | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 80461e3e35..191620725d 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -6,7 +6,7 @@ When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment ev Use step-level `timeout-minutes` on long UI Judge setup, build, and model execution steps so a hung prerequisite fails early enough for the fallback result writer and PR comment action to run before the job-level timeout kills the whole job. -Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` pattern. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, run `pnpm turbo build --summarize` in the Playwright container, then run the UI Judge-specific playground artifact preparation and package test. +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. 
Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; build the UI Judge package and the A2UI playground prerequisites that the test actually consumes. Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2e042a31c5..8b940fbbc7 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -178,6 +178,9 @@ jobs: return files; } NODE + - name: Trust workspace + if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - name: TurboCache if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} uses: lynx-infra/cache@5c6160a6a4c7fca80a2f3057bb9dfc9513fcb732 @@ -207,7 +210,7 @@ jobs: npm install -g corepack@latest corepack enable pnpm install --frozen-lockfile - - name: Build + - name: Build UI Judge package if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} timeout-minutes: 10 working-directory: ${{ github.workspace }} @@ -215,13 +218,14 @@ jobs: NODE_OPTIONS: --max-old-space-size=32768 run: | cd "$GITHUB_WORKSPACE" - pnpm turbo build --summarize + pnpm --filter @lynx-js/ui-judge build - name: Prepare A2UI playground artifacts if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} timeout-minutes: 12 working-directory: ${{ github.workspace }} run: | cd "$GITHUB_WORKSPACE" + pnpm --filter @lynx-js/rspeedy build pnpm turbo build:lynx --filter a2ui-playground - name: Run UI Judge id: ui-judge From bed72e74e0b9e9ff0d47855cc1d79ddbec25b943 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 15:24:57 +0800 Subject: [PATCH 04/15] Build rspeedy prerequisites 
for ui judge CI --- .github/ui-judge-ci.instructions.md | 2 ++ .github/workflows/test.yml | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 191620725d..57d54c53f9 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -8,6 +8,8 @@ Use step-level `timeout-minutes` on long UI Judge setup, build, and model execut Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; build the UI Judge package and the A2UI playground prerequisites that the test actually consumes. +When the A2UI playground preview needs Rspeedy in CI, build `@lynx-js/rspeedy` through turbo with its workspace dependencies, for example `pnpm turbo build --filter @lynx-js/rspeedy... --force`, instead of calling `pnpm --filter @lynx-js/rspeedy build` directly. + Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. When rendering the UI Judge PR comment, include `GITHUB_RUN_ATTEMPT` in the workflow footer/link. GitHub reruns keep the same `GITHUB_RUN_ID`, so relying only on the run URL can make a successful rerun write an identical comment body and appear not to update. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8b940fbbc7..fc7460e0a6 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -225,7 +225,7 @@ jobs: working-directory: ${{ github.workspace }} run: | cd "$GITHUB_WORKSPACE" - pnpm --filter @lynx-js/rspeedy build + pnpm turbo build --filter @lynx-js/rspeedy... 
--summarize --force pnpm turbo build:lynx --filter a2ui-playground - name: Run UI Judge id: ui-judge From 664ae386b602daff334fdb69dd39094976e5d9f7 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 15:29:22 +0800 Subject: [PATCH 05/15] Use turbo cache for ui judge CI builds --- .github/ui-judge-ci.instructions.md | 4 ++-- .github/workflows/test.yml | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 57d54c53f9..cc524013a1 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -6,9 +6,9 @@ When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment ev Use step-level `timeout-minutes` on long UI Judge setup, build, and model execution steps so a hung prerequisite fails early enough for the fallback result writer and PR comment action to run before the job-level timeout kills the whole job. -Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; build the UI Judge package and the A2UI playground prerequisites that the test actually consumes. +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; run focused `pnpm turbo build` commands for the UI Judge package and the A2UI playground prerequisites that the test actually consumes. 
-When the A2UI playground preview needs Rspeedy in CI, build `@lynx-js/rspeedy` through turbo with its workspace dependencies, for example `pnpm turbo build --filter @lynx-js/rspeedy... --force`, instead of calling `pnpm --filter @lynx-js/rspeedy build` directly. +Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index fc7460e0a6..05e54cc008 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -218,15 +218,14 @@ jobs: NODE_OPTIONS: --max-old-space-size=32768 run: | cd "$GITHUB_WORKSPACE" - pnpm --filter @lynx-js/ui-judge build + pnpm turbo build --filter @lynx-js/ui-judge... --summarize - name: Prepare A2UI playground artifacts if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} timeout-minutes: 12 working-directory: ${{ github.workspace }} run: | cd "$GITHUB_WORKSPACE" - pnpm turbo build --filter @lynx-js/rspeedy... 
--summarize --force - pnpm turbo build:lynx --filter a2ui-playground + pnpm turbo build:lynx --filter a2ui-playground --summarize - name: Run UI Judge id: ui-judge if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} From 265227b582b59e798b529915d7c7e15b3c8772fd Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 16:01:24 +0800 Subject: [PATCH 06/15] Raise file limit for ui judge CI --- .github/ui-judge-ci.instructions.md | 2 ++ .github/workflows/test.yml | 1 + 2 files changed, 3 insertions(+) diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index cc524013a1..6ecdc38149 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -10,6 +10,8 @@ Keep the UI Judge Playwright job dependent on the repository `build` job, matchi Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. +Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`. + Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. When rendering the UI Judge PR comment, include `GITHUB_RUN_ATTEMPT` in the workflow footer/link. GitHub reruns keep the same `GITHUB_RUN_ID`, so relying only on the run URL can make a successful rerun write an identical comment body and appear not to update. 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 05e54cc008..e4a34fd0ec 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -241,6 +241,7 @@ jobs: UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json run: | cd "$GITHUB_WORKSPACE" + ulimit -Sn 655350 pnpm --filter @lynx-js/ui-judge test - name: Write UI Judge failure result if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' && (failure() || steps.ui-judge.outcome == 'failure') && hashFiles('ui-judge-results.json') == '' }} From 6383bd59e9bdff0f39fb9738afab8111cdada4f8 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 16:59:39 +0800 Subject: [PATCH 07/15] Clean up ui judge CI wiring --- .github/scripts/check-ui-judge-inputs.mjs | 52 ++++ .../scripts/write-ui-judge-failure-result.mjs | 34 +++ .github/ui-judge-ci.instructions.md | 8 +- .github/workflows/test.yml | 228 ++++-------------- .github/workflows/workflow-test.yml | 82 ++++++- 5 files changed, 211 insertions(+), 193 deletions(-) create mode 100644 .github/scripts/check-ui-judge-inputs.mjs create mode 100644 .github/scripts/write-ui-judge-failure-result.mjs diff --git a/.github/scripts/check-ui-judge-inputs.mjs b/.github/scripts/check-ui-judge-inputs.mjs new file mode 100644 index 0000000000..c492d0e7e4 --- /dev/null +++ b/.github/scripts/check-ui-judge-inputs.mjs @@ -0,0 +1,52 @@ +// Copyright 2026 The Lynx Authors. All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree. 
+import { execFileSync } from 'node:child_process'; +import { appendFileSync, readFileSync } from 'node:fs'; + +const relevantFilePatterns = [ + /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, + /^\.github\/actions\/ui-judge-comment\//, + /^\.github\/scripts\/(check-ui-judge-inputs|write-ui-judge-failure-result)\.mjs$/, + /^\.github\/workflows\/(test|workflow-test)\.yml$/, + /^\.github\/ui-judge(-ci)?\.instructions\.md$/, +]; + +let shouldRun = false; +let reason = 'UI Judge only comments on pull_request events.'; + +if (process.env.GITHUB_EVENT_NAME === 'pull_request') { + if (!process.env.MIDSCENE_MODEL_NAME || !process.env.MIDSCENE_MODEL_API_KEY) { + reason = 'Midscene model secrets are not configured for this pull request.'; + } else { + const changedFiles = listPullRequestFiles(); + shouldRun = changedFiles.some((file) => + relevantFilePatterns.some((pattern) => pattern.test(file)) + ); + reason = shouldRun + ? 'Relevant UI Judge files changed.' + : 'No UI Judge, A2UI, or playground files changed.'; + } +} + +appendFileSync(process.env.GITHUB_OUTPUT, `should-run=${shouldRun}\n`); +appendFileSync(process.env.GITHUB_OUTPUT, `reason=${reason}\n`); +console.info(reason); + +function listPullRequestFiles() { + const event = JSON.parse(readFileSync(process.env.GITHUB_EVENT_PATH, 'utf8')); + const base = event.pull_request?.base?.sha; + const head = event.pull_request?.head?.sha; + if (!base || !head) { + throw new Error( + 'Unable to resolve pull request base/head SHAs for the UI Judge gate.', + ); + } + + const output = execFileSync('git', [ + 'diff', + '--name-only', + `${base}...${head}`, + ], { encoding: 'utf8' }); + return output.split(/\r?\n/).filter(Boolean); +} diff --git a/.github/scripts/write-ui-judge-failure-result.mjs b/.github/scripts/write-ui-judge-failure-result.mjs new file mode 100644 index 0000000000..4ae1df8093 --- /dev/null +++ b/.github/scripts/write-ui-judge-failure-result.mjs @@ -0,0 +1,34 @@ +// Copyright 2026 The Lynx Authors. 
All rights reserved. +// Licensed under the Apache License Version 2.0 that can be found in the +// LICENSE file in the root directory of this source tree. +import { existsSync, writeFileSync } from 'node:fs'; +import { join } from 'node:path'; + +const resultFile = process.env.UI_JUDGE_RESULT_FILE + || join(process.env.GITHUB_WORKSPACE, 'ui-judge-results.json'); + +if (!existsSync(resultFile)) { + writeFileSync( + resultFile, + `${ + JSON.stringify( + { + results: [ + { + dimension: 'visual-correctness', + score: 0, + error: { + message: + 'UI Judge CI failed before writing a model result. See the workflow logs for details.', + }, + steps: [], + url: '', + }, + ], + }, + null, + 2, + ) + }\n`, + ); +} diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 6ecdc38149..45f59570ca 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -1,12 +1,12 @@ --- -applyTo: ".github/workflows/test.yml,.github/actions/ui-judge-comment/**" +applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/*ui-judge*.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**" --- -When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Use `continue-on-error` only on the judge execution step, run the comment action afterward with `always()`, then add a final failing step keyed to `steps.<id>.outcome == 'failure'`. +When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Prefer running UI Judge through the reusable `workflow-test.yml` job with `is-web: true`, uploading `ui-judge-results.json` as an artifact, and posting the comment from a separate thin job with `issues: write` and `pull-requests: write`. 
-Use step-level `timeout-minutes` on long UI Judge setup, build, and model execution steps so a hung prerequisite fails early enough for the fallback result writer and PR comment action to run before the job-level timeout kills the whole job. +Keep long UI Judge work inside a job with a bounded timeout, and write a fallback `ui-judge-results.json` before artifact upload when build or test execution fails. If UI Judge setup is ever split back into custom steps outside the reusable workflow, use step-level `timeout-minutes` on long setup, build, and model execution steps so the fallback result writer and PR comment action still run. -Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; run focused `pnpm turbo build` commands for the UI Judge package and the A2UI playground prerequisites that the test actually consumes. +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; pass focused `pnpm turbo build` commands through the reusable workflow's build command input for the UI Judge package and the A2UI playground prerequisites that the test actually consumes. Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter <package> build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e4a34fd0ec..75ed352145 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -74,205 +74,60 @@ jobs: ui-judge: needs: build - runs-on: lynx-custom-container - timeout-minutes: 30 + uses: ./.github/workflows/workflow-test.yml + secrets: + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} + MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} + with: + runs-on: lynx-custom-container + is-web: true + upload-codecov: false + artifact-name: ui-judge-results + artifact-path: ui-judge-results.json + artifact-if-no-files-found: error + web-report-name: ui-judge-playwright-report + web-report-path: packages/genui/ui-judge/playwright-report + preflight-run: | + set -euo pipefail + node .github/scripts/check-ui-judge-inputs.mjs + build-run: | + export NODE_OPTIONS="--max-old-space-size=32768" + pnpm turbo build --filter @lynx-js/ui-judge... 
--summarize + pnpm turbo build:lynx --filter a2ui-playground --summarize + run: | + set -euo pipefail + export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" + ulimit -Sn 655350 + pnpm --filter @lynx-js/ui-judge test + failure-result-run: | + set -euo pipefail + UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" node .github/scripts/write-ui-judge-failure-result.mjs + + ui-judge-comment: + needs: ui-judge + if: always() + runs-on: lynx-ubuntu-24.04-medium permissions: contents: read issues: write pull-requests: write - container: - image: mcr.microsoft.com/playwright:v1.58.2-noble - env: - CI: 1 - TURBO_TELEMETRY_DISABLED: 1 steps: - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 + if: ${{ needs.ui-judge.outputs.should-run == 'true' }} with: - fetch-depth: 0 persist-credentials: false - - uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5 + - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5 + if: ${{ needs.ui-judge.outputs.should-run == 'true' }} with: - node-version: "22" - package-manager-cache: false - - name: Check UI Judge inputs - id: ui-judge-inputs - shell: bash - env: - GITHUB_TOKEN: ${{ github.token }} - MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} - MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} - run: | - set -euo pipefail - node --input-type=module <<'NODE' - import { appendFileSync, readFileSync } from 'node:fs'; - - const relevantFilePatterns = [ - /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, - /^\.github\/actions\/ui-judge-comment\//, - /^\.github\/workflows\/test\.yml$/, - /^\.github\/ui-judge\.instructions\.md$/, - ]; - - let shouldRun = false; - let reason = 'UI Judge only comments on pull_request events.'; - - if (process.env.GITHUB_EVENT_NAME === 'pull_request') { - if ( - !process.env.MIDSCENE_MODEL_NAME - || !process.env.MIDSCENE_MODEL_API_KEY - ) { - reason = - 'Midscene model secrets are not configured for 
this pull request.'; - } else { - const changedFiles = await listPullRequestFiles(); - shouldRun = changedFiles.some((file) => - relevantFilePatterns.some((pattern) => pattern.test(file)) - ); - reason = shouldRun - ? 'Relevant UI Judge files changed.' - : 'No UI Judge, A2UI, or playground files changed.'; - } - } - - appendFileSync(process.env.GITHUB_OUTPUT, `should-run=${shouldRun}\n`); - appendFileSync(process.env.GITHUB_OUTPUT, `reason=${reason}\n`); - console.info(reason); - - async function listPullRequestFiles() { - const event = JSON.parse(readFileSync(process.env.GITHUB_EVENT_PATH, 'utf8')); - const pullRequestNumber = event.pull_request?.number ?? event.number; - const repository = process.env.GITHUB_REPOSITORY; - const token = process.env.GITHUB_TOKEN; - - if (!pullRequestNumber || !repository || !token) { - throw new Error( - 'Unable to list pull request files for the UI Judge gate.', - ); - } - - const apiUrl = process.env.GITHUB_API_URL || 'https://api.github.com'; - const files = []; - for (let page = 1; page < 11; page++) { - const url = `${apiUrl}/repos/${repository}/pulls/${pullRequestNumber}/files?per_page=100&page=${page}`; - const response = await fetch(url, { - headers: { - accept: 'application/vnd.github+json', - authorization: `Bearer ${token}`, - 'x-github-api-version': '2022-11-28', - }, - }); - if (!response.ok) { - throw new Error( - `Failed to list pull request files: ${response.status} ${response.statusText}`, - ); - } - - const pageFiles = await response.json(); - files.push( - ...pageFiles - .map((file) => file.filename) - .filter((file) => typeof file === 'string'), - ); - if (pageFiles.length < 100) break; - } - return files; - } - NODE - - name: Trust workspace - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - run: git config --global --add safe.directory "$GITHUB_WORKSPACE" - - name: TurboCache - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - uses: 
lynx-infra/cache@5c6160a6a4c7fca80a2f3057bb9dfc9513fcb732 - with: - path: .turbo - # Match the reusable Playwright test workflow: the build job must have - # already produced the cache for this exact commit. - key: turbo-v4-${{ runner.os }}-${{ hashFiles('**/packages/**/src/**/*.rs') }}-${{ github.sha }} - fail-on-cache-miss: true - - name: Install native build tools - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - run: | - apt-get update - apt-get install -y --no-install-recommends build-essential - - uses: ./.github/actions/rustup - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - with: - key: ui-judge - - name: Install Rust wasm targets - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - run: rustup target add wasm32-unknown-unknown wasm32-wasip1 - - name: Install - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - working-directory: ${{ github.workspace }} - run: | - cd "$GITHUB_WORKSPACE" - npm install -g corepack@latest - corepack enable - pnpm install --frozen-lockfile - - name: Build UI Judge package - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - timeout-minutes: 10 - working-directory: ${{ github.workspace }} - env: - NODE_OPTIONS: --max-old-space-size=32768 - run: | - cd "$GITHUB_WORKSPACE" - pnpm turbo build --filter @lynx-js/ui-judge... 
--summarize - - name: Prepare A2UI playground artifacts - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - timeout-minutes: 12 - working-directory: ${{ github.workspace }} - run: | - cd "$GITHUB_WORKSPACE" - pnpm turbo build:lynx --filter a2ui-playground --summarize - - name: Run UI Judge - id: ui-judge - if: ${{ steps.ui-judge-inputs.outputs.should-run == 'true' }} - continue-on-error: true - timeout-minutes: 10 - working-directory: ${{ github.workspace }} - env: - MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} - MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} - MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} - MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} - MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} - UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json - run: | - cd "$GITHUB_WORKSPACE" - ulimit -Sn 655350 - pnpm --filter @lynx-js/ui-judge test - - name: Write UI Judge failure result - if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' && (failure() || steps.ui-judge.outcome == 'failure') && hashFiles('ui-judge-results.json') == '' }} - working-directory: ${{ github.workspace }} - env: - UI_JUDGE_RESULT_FILE: ${{ github.workspace }}/ui-judge-results.json - run: | - cd "$GITHUB_WORKSPACE" - cat > "$UI_JUDGE_RESULT_FILE" <<'JSON' - { - "results": [ - { - "dimension": "visual-correctness", - "score": 0, - "error": { - "message": "UI Judge CI failed before writing a model result. See the workflow logs for details." 
- }, - "steps": [], - "url": "" - } - ] - } - JSON + name: ui-judge-results - name: Comment UI Judge result - if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' }} + if: ${{ needs.ui-judge.outputs.should-run == 'true' }} uses: ./.github/actions/ui-judge-comment with: result-file: ui-judge-results.json - - name: Fail when UI Judge fails - if: ${{ always() && steps.ui-judge-inputs.outputs.should-run == 'true' && steps.ui-judge.outcome == 'failure' }} - run: exit 1 lighthouse: needs: build @@ -584,6 +439,7 @@ jobs: - code-style-check - eslint - ui-judge + - ui-judge-comment # - playwright-linux - playwright-web-elements - test-api diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index c66d68c13d..b5fcf4dbcb 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -6,6 +6,16 @@ on: required: false LHCI_GITHUB_APP_TOKEN: required: false + MIDSCENE_MODEL_API_KEY: + required: false + MIDSCENE_MODEL_BASE_URL: + required: false + MIDSCENE_MODEL_FAMILY: + required: false + MIDSCENE_MODEL_NAME: + required: false + MIDSCENE_OPENAI_INIT_CONFIG_JSON: + required: false inputs: runs-on: required: true @@ -14,10 +24,29 @@ on: required: true type: string description: "Command run parameters, limited to predefined test commands" + preflight-run: + required: false + type: string + default: "" + description: "Optional command that writes should-run=true/false to GITHUB_OUTPUT" + build-run: + required: false + type: string + default: "pnpm turbo build --summarize" + description: "Build command to run after restoring the upstream turbo cache" + failure-result-run: + required: false + type: string + default: "" + description: "Optional command that writes a fallback result before artifacts are uploaded" is-web: required: false type: boolean default: false + upload-codecov: + required: false + type: boolean + default: true web-report-name: required: false type: string @@ -26,10 +55,26 @@ on: 
required: false type: string default: "packages/web-platform/web-tests/playwright-report" + artifact-name: + required: false + type: string + default: "" + artifact-path: + required: false + type: string + default: "" + artifact-if-no-files-found: + required: false + type: string + default: "warn" codecov-flags: required: false type: string default: "unittest" + outputs: + should-run: + description: "Whether this reusable test job ran past the optional preflight" + value: ${{ jobs.check.outputs.should-run }} # Set minimum permissions to prevent unnecessary access permissions: {} @@ -42,6 +87,8 @@ jobs: timeout-minutes: 30 runs-on: ${{ inputs.runs-on }} permissions: {} + outputs: + should-run: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} container: image: ${{ inputs.is-web && 'mcr.microsoft.com/playwright:v1.58.2-noble' || null }} env: @@ -57,7 +104,15 @@ jobs: with: node-version: "22" package-manager-cache: false + - name: Preflight # zizmor: ignore[template-injection] The inputs.preflight-run is provided by us. 
+ id: preflight + if: ${{ inputs.preflight-run != '' }} + env: + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + run: ${{ inputs.preflight-run }} - name: TurboCache + if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} uses: lynx-infra/cache@5c6160a6a4c7fca80a2f3057bb9dfc9513fcb732 with: path: .turbo @@ -65,35 +120,56 @@ jobs: key: turbo-v4-${{ runner.os }}-${{ hashFiles('**/packages/**/src/**/*.rs') }}-${{ github.sha }} fail-on-cache-miss: true - name: Install + if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} run: | npm install -g corepack@latest corepack enable pnpm install --frozen-lockfile - name: Build - run: | - pnpm turbo build --summarize + if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} + run: ${{ inputs.build-run }} - name: Test # zizmor: ignore[template-injection] The inputs.run is provided by us. id: test + if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} env: NODE_OPTIONS: --max-old-space-size=16384 GITHUB_SHA: ${{ github.event.pull_request.head.sha }} CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} LHCI_GITHUB_APP_TOKEN: ${{ secrets.LHCI_GITHUB_APP_TOKEN }} + MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} + MIDSCENE_MODEL_BASE_URL: ${{ secrets.MIDSCENE_MODEL_BASE_URL }} + MIDSCENE_MODEL_FAMILY: ${{ secrets.MIDSCENE_MODEL_FAMILY }} + MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} + MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} run: ${{ inputs.run }} + - name: Write Failure Result # zizmor: ignore[template-injection] The inputs.failure-result-run is provided by us. 
+ if: ${{ always() && inputs.failure-result-run != '' && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') && failure() }} + run: ${{ inputs.failure-result-run }} - name: Upload coverage reports to Codecov + if: ${{ inputs.upload-codecov && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5 with: token: ${{ secrets.CODECOV_TOKEN }} flags: ${{ inputs.codecov-flags }} - name: Upload test results to Codecov - if: ${{ !cancelled() }} + if: ${{ !cancelled() && inputs.upload-codecov && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} continue-on-error: true uses: codecov/test-results-action@0fa95f0e1eeaafde2c782583b36b28ad0d8c77d3 # v1 with: token: ${{ secrets.CODECOV_TOKEN }} flags: ${{ inputs.codecov-flags }} override_branch: ${{ github.event_name == 'merge_group' && 'main' || '' }} + - name: Upload Artifact + if: ${{ always() && !cancelled() && inputs.artifact-name != '' && inputs.artifact-path != '' && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} + uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + with: + name: ${{ inputs.artifact-name }} + path: ${{ inputs.artifact-path }} + if-no-files-found: ${{ inputs.artifact-if-no-files-found }} + retention-days: 1 + overwrite: true + include-hidden-files: true - name: Upload Test Result if: ${{ inputs.is-web && failure() }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 From 7a3b68f4bbd609de289115d19bd5041b8ce247fd Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 17:27:47 +0800 Subject: [PATCH 08/15] Make ui judge CI scripts sh-compatible --- .github/workflows/test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 
0e1e919445..cc738095c3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -91,19 +91,19 @@ jobs: web-report-name: ui-judge-playwright-report web-report-path: packages/genui/ui-judge/playwright-report preflight-run: | - set -euo pipefail + set -eu node .github/scripts/check-ui-judge-inputs.mjs build-run: | export NODE_OPTIONS="--max-old-space-size=32768" pnpm turbo build --filter @lynx-js/ui-judge... --summarize pnpm turbo build:lynx --filter a2ui-playground --summarize run: | - set -euo pipefail + set -eu export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" ulimit -Sn 655350 pnpm --filter @lynx-js/ui-judge test failure-result-run: | - set -euo pipefail + set -eu UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" node .github/scripts/write-ui-judge-failure-result.mjs ui-judge-comment: From 0654b5d1f3a545b057758fdb0aeecca9c6b9363c Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 17:46:25 +0800 Subject: [PATCH 09/15] Set ui judge CI workspace explicitly --- .github/scripts/check-ui-judge-inputs.mjs | 2 +- .github/workflows/test.yml | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/scripts/check-ui-judge-inputs.mjs b/.github/scripts/check-ui-judge-inputs.mjs index c492d0e7e4..c5d0db3eec 100644 --- a/.github/scripts/check-ui-judge-inputs.mjs +++ b/.github/scripts/check-ui-judge-inputs.mjs @@ -47,6 +47,6 @@ function listPullRequestFiles() { 'diff', '--name-only', `${base}...${head}`, - ], { encoding: 'utf8' }); + ], { cwd: process.env.GITHUB_WORKSPACE, encoding: 'utf8' }); return output.split(/\r?\n/).filter(Boolean); } diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index cc738095c3..caa17abd7a 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -91,18 +91,22 @@ jobs: web-report-name: ui-judge-playwright-report web-report-path: packages/genui/ui-judge/playwright-report 
preflight-run: | + cd "$GITHUB_WORKSPACE" set -eu node .github/scripts/check-ui-judge-inputs.mjs build-run: | + cd "$GITHUB_WORKSPACE" export NODE_OPTIONS="--max-old-space-size=32768" pnpm turbo build --filter @lynx-js/ui-judge... --summarize pnpm turbo build:lynx --filter a2ui-playground --summarize run: | + cd "$GITHUB_WORKSPACE" set -eu export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" ulimit -Sn 655350 pnpm --filter @lynx-js/ui-judge test failure-result-run: | + cd "$GITHUB_WORKSPACE" set -eu UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" node .github/scripts/write-ui-judge-failure-result.mjs From 74997d5806eda74f83bf5f7e2d8c0964fda2622c Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 18:07:31 +0800 Subject: [PATCH 10/15] Fix ui judge reusable preflight cwd --- .github/scripts/check-ui-judge-inputs.mjs | 5 ++++- .github/workflows/workflow-test.yml | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/scripts/check-ui-judge-inputs.mjs b/.github/scripts/check-ui-judge-inputs.mjs index c5d0db3eec..ad7ddab5e8 100644 --- a/.github/scripts/check-ui-judge-inputs.mjs +++ b/.github/scripts/check-ui-judge-inputs.mjs @@ -3,6 +3,9 @@ // LICENSE file in the root directory of this source tree. 
import { execFileSync } from 'node:child_process'; import { appendFileSync, readFileSync } from 'node:fs'; +import { fileURLToPath } from 'node:url'; + +const repoRoot = fileURLToPath(new URL('../..', import.meta.url)); const relevantFilePatterns = [ /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, @@ -47,6 +50,6 @@ function listPullRequestFiles() { 'diff', '--name-only', `${base}...${head}`, - ], { cwd: process.env.GITHUB_WORKSPACE, encoding: 'utf8' }); + ], { cwd: repoRoot, encoding: 'utf8' }); return output.split(/\r?\n/).filter(Boolean); } diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index 63d57ba38b..5aa3fbf8c7 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -171,7 +171,7 @@ jobs: overwrite: true include-hidden-files: true - name: Upload Test Result - if: ${{ inputs.is-web && failure() }} + if: ${{ inputs.is-web && failure() && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: ${{ inputs.web-report-name }} From e53f0d03e60ba4ba936882217238e29b1dabcd25 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 18:37:37 +0800 Subject: [PATCH 11/15] Make ui judge preflight independent of git metadata --- .github/scripts/check-ui-judge-inputs.mjs | 72 ++++++++++++++++------- .github/ui-judge-ci.instructions.md | 2 + .github/workflows/workflow-test.yml | 1 + 3 files changed, 53 insertions(+), 22 deletions(-) diff --git a/.github/scripts/check-ui-judge-inputs.mjs b/.github/scripts/check-ui-judge-inputs.mjs index ad7ddab5e8..9a529cb485 100644 --- a/.github/scripts/check-ui-judge-inputs.mjs +++ b/.github/scripts/check-ui-judge-inputs.mjs @@ -1,11 +1,7 @@ // Copyright 2026 The Lynx Authors. All rights reserved. 
// Licensed under the Apache License Version 2.0 that can be found in the // LICENSE file in the root directory of this source tree. -import { execFileSync } from 'node:child_process'; import { appendFileSync, readFileSync } from 'node:fs'; -import { fileURLToPath } from 'node:url'; - -const repoRoot = fileURLToPath(new URL('../..', import.meta.url)); const relevantFilePatterns = [ /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, @@ -22,13 +18,19 @@ if (process.env.GITHUB_EVENT_NAME === 'pull_request') { if (!process.env.MIDSCENE_MODEL_NAME || !process.env.MIDSCENE_MODEL_API_KEY) { reason = 'Midscene model secrets are not configured for this pull request.'; } else { - const changedFiles = listPullRequestFiles(); - shouldRun = changedFiles.some((file) => - relevantFilePatterns.some((pattern) => pattern.test(file)) - ); - reason = shouldRun - ? 'Relevant UI Judge files changed.' - : 'No UI Judge, A2UI, or playground files changed.'; + const changedFiles = await listPullRequestFiles(); + if (changedFiles === null) { + shouldRun = true; + reason = + 'Unable to list pull request files; running UI Judge by default.'; + } else { + shouldRun = changedFiles.some((file) => + relevantFilePatterns.some((pattern) => pattern.test(file)) + ); + reason = shouldRun + ? 'Relevant UI Judge files changed.' 
+ : 'No UI Judge, A2UI, or playground files changed.'; + } } } @@ -36,20 +38,46 @@ appendFileSync(process.env.GITHUB_OUTPUT, `should-run=${shouldRun}\n`); appendFileSync(process.env.GITHUB_OUTPUT, `reason=${reason}\n`); console.info(reason); -function listPullRequestFiles() { +async function listPullRequestFiles() { const event = JSON.parse(readFileSync(process.env.GITHUB_EVENT_PATH, 'utf8')); - const base = event.pull_request?.base?.sha; - const head = event.pull_request?.head?.sha; - if (!base || !head) { + const pullRequestUrl = event.pull_request?.url; + if (!pullRequestUrl) { throw new Error( - 'Unable to resolve pull request base/head SHAs for the UI Judge gate.', + 'Unable to resolve pull request API URL for the UI Judge gate.', ); } - const output = execFileSync('git', [ - 'diff', - '--name-only', - `${base}...${head}`, - ], { cwd: repoRoot, encoding: 'utf8' }); - return output.split(/\r?\n/).filter(Boolean); + const headers = { + Accept: 'application/vnd.github+json', + 'X-GitHub-Api-Version': '2022-11-28', + }; + if (process.env.GITHUB_TOKEN) { + headers.Authorization = `Bearer ${process.env.GITHUB_TOKEN}`; + } + + try { + const files = []; + for (let page = 1; page <= 30; page++) { + const url = new URL(`${pullRequestUrl}/files`); + url.searchParams.set('per_page', '100'); + url.searchParams.set('page', String(page)); + + const response = await fetch(url, { headers }); + if (!response.ok) { + throw new Error( + `GitHub API returned ${response.status} ${response.statusText}`, + ); + } + + const pageFiles = await response.json(); + files.push(...pageFiles.map((file) => file.filename)); + if (pageFiles.length < 100) { + break; + } + } + return files; + } catch (error) { + console.warn(error instanceof Error ? 
error.message : String(error)); + return null; + } } diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 45f59570ca..2917386715 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -10,6 +10,8 @@ Keep the UI Judge Playwright job dependent on the repository `build` job, matchi Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter <package> build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. +In UI Judge preflight code, do not depend on local `.git` metadata inside the custom Playwright container. The checkout can be available as files while `git diff` is unusable there, so use the pull request files API for changed-file gating and fail open by running UI Judge if the file list cannot be fetched. + Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`. Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. 
diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index 5aa3fbf8c7..693ab2e318 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -108,6 +108,7 @@ jobs: id: preflight if: ${{ inputs.preflight-run != '' }} env: + GITHUB_TOKEN: ${{ github.token }} MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} run: ${{ inputs.preflight-run }} From d3d19f9ea7a6f41783ddcd811742a18474a823c5 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 19:04:22 +0800 Subject: [PATCH 12/15] Make ui judge comment action set up node --- .github/actions/ui-judge-comment/action.yml | 4 ++++ .github/ui-judge-ci.instructions.md | 2 ++ 2 files changed, 6 insertions(+) diff --git a/.github/actions/ui-judge-comment/action.yml b/.github/actions/ui-judge-comment/action.yml index f277e5ef13..2ce0c2fd13 100644 --- a/.github/actions/ui-judge-comment/action.yml +++ b/.github/actions/ui-judge-comment/action.yml @@ -46,6 +46,10 @@ outputs: runs: using: composite steps: + - uses: actions/setup-node@a0853c24544627f65ddf259abe73b1d18a591444 # v5 + with: + node-version: "22" + package-manager-cache: false - name: Create or update UI Judge comment id: comment shell: bash diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 2917386715..1311271f75 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -17,3 +17,5 @@ Raise the soft open-file limit before running UI Judge Playwright tests in the P Inject the full Midscene/OpenAI model environment into the UI Judge execution step, including `MIDSCENE_MODEL_API_KEY`, `MIDSCENE_MODEL_BASE_URL`, `MIDSCENE_MODEL_FAMILY`, `MIDSCENE_MODEL_NAME`, and `MIDSCENE_OPENAI_INIT_CONFIG_JSON`. When rendering the UI Judge PR comment, include `GITHUB_RUN_ATTEMPT` in the workflow footer/link. 
GitHub reruns keep the same `GITHUB_RUN_ID`, so relying only on the run URL can make a successful rerun write an identical comment body and appear not to update. + +Keep `.github/actions/ui-judge-comment` self-contained for self-hosted runners: set up Node inside the composite action before invoking `comment.mjs`, rather than requiring every caller job to prepare `node` separately. From 92bd7dedc6ca836cc0900fcb87a276a871d104ef Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 19:48:29 +0800 Subject: [PATCH 13/15] Simplify ui judge secrets guard --- .github/scripts/check-ui-judge-inputs.mjs | 83 ----------------------- .github/ui-judge-ci.instructions.md | 4 +- .github/workflows/test.yml | 8 ++- .github/workflows/workflow-test.yml | 1 - 4 files changed, 8 insertions(+), 88 deletions(-) delete mode 100644 .github/scripts/check-ui-judge-inputs.mjs diff --git a/.github/scripts/check-ui-judge-inputs.mjs b/.github/scripts/check-ui-judge-inputs.mjs deleted file mode 100644 index 9a529cb485..0000000000 --- a/.github/scripts/check-ui-judge-inputs.mjs +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright 2026 The Lynx Authors. All rights reserved. -// Licensed under the Apache License Version 2.0 that can be found in the -// LICENSE file in the root directory of this source tree. 
-import { appendFileSync, readFileSync } from 'node:fs'; - -const relevantFilePatterns = [ - /^packages\/genui\/(ui-judge|a2ui|a2ui-playground)\//, - /^\.github\/actions\/ui-judge-comment\//, - /^\.github\/scripts\/(check-ui-judge-inputs|write-ui-judge-failure-result)\.mjs$/, - /^\.github\/workflows\/(test|workflow-test)\.yml$/, - /^\.github\/ui-judge(-ci)?\.instructions\.md$/, -]; - -let shouldRun = false; -let reason = 'UI Judge only comments on pull_request events.'; - -if (process.env.GITHUB_EVENT_NAME === 'pull_request') { - if (!process.env.MIDSCENE_MODEL_NAME || !process.env.MIDSCENE_MODEL_API_KEY) { - reason = 'Midscene model secrets are not configured for this pull request.'; - } else { - const changedFiles = await listPullRequestFiles(); - if (changedFiles === null) { - shouldRun = true; - reason = - 'Unable to list pull request files; running UI Judge by default.'; - } else { - shouldRun = changedFiles.some((file) => - relevantFilePatterns.some((pattern) => pattern.test(file)) - ); - reason = shouldRun - ? 'Relevant UI Judge files changed.' 
- : 'No UI Judge, A2UI, or playground files changed.'; - } - } -} - -appendFileSync(process.env.GITHUB_OUTPUT, `should-run=${shouldRun}\n`); -appendFileSync(process.env.GITHUB_OUTPUT, `reason=${reason}\n`); -console.info(reason); - -async function listPullRequestFiles() { - const event = JSON.parse(readFileSync(process.env.GITHUB_EVENT_PATH, 'utf8')); - const pullRequestUrl = event.pull_request?.url; - if (!pullRequestUrl) { - throw new Error( - 'Unable to resolve pull request API URL for the UI Judge gate.', - ); - } - - const headers = { - Accept: 'application/vnd.github+json', - 'X-GitHub-Api-Version': '2022-11-28', - }; - if (process.env.GITHUB_TOKEN) { - headers.Authorization = `Bearer ${process.env.GITHUB_TOKEN}`; - } - - try { - const files = []; - for (let page = 1; page <= 30; page++) { - const url = new URL(`${pullRequestUrl}/files`); - url.searchParams.set('per_page', '100'); - url.searchParams.set('page', String(page)); - - const response = await fetch(url, { headers }); - if (!response.ok) { - throw new Error( - `GitHub API returned ${response.status} ${response.statusText}`, - ); - } - - const pageFiles = await response.json(); - files.push(...pageFiles.map((file) => file.filename)); - if (pageFiles.length < 100) { - break; - } - } - return files; - } catch (error) { - console.warn(error instanceof Error ? 
error.message : String(error)); - return null; - } -} diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 1311271f75..091987e7f4 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -1,5 +1,5 @@ --- -applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/*ui-judge*.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**" +applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/write-ui-judge-failure-result.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**" --- When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Prefer running UI Judge through the reusable `workflow-test.yml` job with `is-web: true`, uploading `ui-judge-results.json` as an artifact, and posting the comment from a separate thin job with `issues: write` and `pull-requests: write`. @@ -10,7 +10,7 @@ Keep the UI Judge Playwright job dependent on the repository `build` job, matchi Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. -In UI Judge preflight code, do not depend on local `.git` metadata inside the custom Playwright container. The checkout can be available as files while `git diff` is unusable there, so use the pull request files API for changed-file gating and fail open by running UI Judge if the file list cannot be fetched. +Keep the UI Judge early-success gate limited to Midscene secret availability. 
Do not add changed-file gating or GitHub API calls to the reusable test path; fork pull requests without secrets should set `should-run=false` before install/build/test, while normal pull requests with secrets should run UI Judge. Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index caa17abd7a..51429e3eb2 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -91,9 +91,13 @@ jobs: web-report-name: ui-judge-playwright-report web-report-path: packages/genui/ui-judge/playwright-report preflight-run: | - cd "$GITHUB_WORKSPACE" set -eu - node .github/scripts/check-ui-judge-inputs.mjs + if [ -z "${MIDSCENE_MODEL_NAME:-}" ] || [ -z "${MIDSCENE_MODEL_API_KEY:-}" ]; then + echo "should-run=false" >> "$GITHUB_OUTPUT" + echo "Midscene secrets are unavailable; skipping UI Judge." 
+ exit 0 + fi + echo "should-run=true" >> "$GITHUB_OUTPUT" build-run: | cd "$GITHUB_WORKSPACE" export NODE_OPTIONS="--max-old-space-size=32768" diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index 693ab2e318..5aa3fbf8c7 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -108,7 +108,6 @@ jobs: id: preflight if: ${{ inputs.preflight-run != '' }} env: - GITHUB_TOKEN: ${{ github.token }} MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} run: ${{ inputs.preflight-run }} From 1b15be2f1a9e406f151edd029a785dce22a1ab82 Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 20:13:06 +0800 Subject: [PATCH 14/15] Simplify ui judge reusable workflow inputs --- .../scripts/write-ui-judge-failure-result.mjs | 5 ++- .github/ui-judge-ci.instructions.md | 6 +-- .github/workflows/test.yml | 22 ++++------- .github/workflows/workflow-test.yml | 39 +++---------------- AGENTS.md | 5 +++ 5 files changed, 24 insertions(+), 53 deletions(-) diff --git a/.github/scripts/write-ui-judge-failure-result.mjs b/.github/scripts/write-ui-judge-failure-result.mjs index 4ae1df8093..72373f1faf 100644 --- a/.github/scripts/write-ui-judge-failure-result.mjs +++ b/.github/scripts/write-ui-judge-failure-result.mjs @@ -6,6 +6,8 @@ import { join } from 'node:path'; const resultFile = process.env.UI_JUDGE_RESULT_FILE || join(process.env.GITHUB_WORKSPACE, 'ui-judge-results.json'); +const errorMessage = process.env.UI_JUDGE_RESULT_ERROR_MESSAGE + || 'UI Judge CI failed before writing a model result. See the workflow logs for details.'; if (!existsSync(resultFile)) { writeFileSync( @@ -18,8 +20,7 @@ if (!existsSync(resultFile)) { dimension: 'visual-correctness', score: 0, error: { - message: - 'UI Judge CI failed before writing a model result. 
See the workflow logs for details.', + message: errorMessage, }, steps: [], url: '', diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 091987e7f4..80c3466864 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -6,11 +6,11 @@ When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment ev Keep long UI Judge work inside a job with a bounded timeout, and write a fallback `ui-judge-results.json` before artifact upload when build or test execution fails. If UI Judge setup is ever split back into custom steps outside the reusable workflow, use step-level `timeout-minutes` on long setup, build, and model execution steps so the fallback result writer and PR comment action still run. -Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Restore the same strict `.turbo` cache key with `fail-on-cache-miss: true`, but do not repeat a broad full-repository build in the Playwright container; pass focused `pnpm turbo build` commands through the reusable workflow's build command input for the UI Judge package and the A2UI playground prerequisites that the test actually consumes. +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Let the reusable `workflow-test.yml` run its default `pnpm turbo build --summarize`; do not add UI Judge-specific `build-run` overrides. The A2UI playground Turbo config already makes `build` depend on `build:lynx`. -Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; the focused turbo commands should replay the upstream build outputs from cache. +Use the upstream build job's restored turbo cache in UI Judge CI. 
Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; use Turbo commands so dependency ordering and cached outputs remain consistent. -Keep the UI Judge early-success gate limited to Midscene secret availability. Do not add changed-file gating or GitHub API calls to the reusable test path; fork pull requests without secrets should set `should-run=false` before install/build/test, while normal pull requests with secrets should run UI Judge. +Do not add changed-file gating, GitHub API calls, or reusable-workflow `preflight-run` wiring to UI Judge CI. If Midscene secrets are unavailable, the UI Judge test command should write a clear skipped result and exit successfully; fork pull requests should skip the comment steps rather than requiring write permissions. Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`. diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 51429e3eb2..4bd4b5b6be 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -90,23 +90,15 @@ jobs: artifact-if-no-files-found: error web-report-name: ui-judge-playwright-report web-report-path: packages/genui/ui-judge/playwright-report - preflight-run: | + run: | + cd "$GITHUB_WORKSPACE" set -eu + export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" if [ -z "${MIDSCENE_MODEL_NAME:-}" ] || [ -z "${MIDSCENE_MODEL_API_KEY:-}" ]; then - echo "should-run=false" >> "$GITHUB_OUTPUT" + UI_JUDGE_RESULT_ERROR_MESSAGE="Midscene secrets are unavailable; UI Judge model test was skipped." node .github/scripts/write-ui-judge-failure-result.mjs echo "Midscene secrets are unavailable; skipping UI Judge." 
exit 0 fi - echo "should-run=true" >> "$GITHUB_OUTPUT" - build-run: | - cd "$GITHUB_WORKSPACE" - export NODE_OPTIONS="--max-old-space-size=32768" - pnpm turbo build --filter @lynx-js/ui-judge... --summarize - pnpm turbo build:lynx --filter a2ui-playground --summarize - run: | - cd "$GITHUB_WORKSPACE" - set -eu - export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" ulimit -Sn 655350 pnpm --filter @lynx-js/ui-judge test failure-result-run: | @@ -124,15 +116,15 @@ jobs: pull-requests: write steps: - uses: actions/checkout@93cb6efe18208431cddfb8368fd83d5badbf9bfd # v5 - if: ${{ needs.ui-judge.outputs.should-run == 'true' }} + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} with: persist-credentials: false - uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5 - if: ${{ needs.ui-judge.outputs.should-run == 'true' }} + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} with: name: ui-judge-results - name: Comment UI Judge result - if: ${{ needs.ui-judge.outputs.should-run == 'true' }} + if: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository && needs.ui-judge.result != 'skipped' && needs.ui-judge.result != 'cancelled' }} uses: ./.github/actions/ui-judge-comment with: result-file: ui-judge-results.json diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index 5aa3fbf8c7..dd3dfcafab 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -24,16 +24,6 @@ on: required: true type: string description: "Command run parameters, limited to predefined test commands" - preflight-run: - required: false - type: string - default: 
"" - description: "Optional command that writes should-run=true/false to GITHUB_OUTPUT" - build-run: - required: false - type: string - default: "pnpm turbo build --summarize" - description: "Build command to run after restoring the upstream turbo cache" failure-result-run: required: false type: string @@ -71,10 +61,6 @@ on: required: false type: string default: "unittest" - outputs: - should-run: - description: "Whether this reusable test job ran past the optional preflight" - value: ${{ jobs.check.outputs.should-run }} # Set minimum permissions to prevent unnecessary access permissions: {} @@ -87,8 +73,6 @@ jobs: timeout-minutes: 30 runs-on: ${{ inputs.runs-on }} permissions: {} - outputs: - should-run: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} container: image: ${{ inputs.is-web && 'mcr.microsoft.com/playwright:v1.58.2-noble' || null }} env: @@ -104,15 +88,7 @@ jobs: with: node-version: "22" package-manager-cache: false - - name: Preflight # zizmor: ignore[template-injection] The inputs.preflight-run is provided by us. 
- id: preflight - if: ${{ inputs.preflight-run != '' }} - env: - MIDSCENE_MODEL_API_KEY: ${{ secrets.MIDSCENE_MODEL_API_KEY }} - MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} - run: ${{ inputs.preflight-run }} - name: TurboCache - if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} uses: lynx-infra/cache@5c6160a6a4c7fca80a2f3057bb9dfc9513fcb732 with: path: .turbo @@ -120,17 +96,14 @@ jobs: key: turbo-v4-${{ runner.os }}-${{ hashFiles('**/packages/**/src/**/*.rs') }}-${{ github.sha }} fail-on-cache-miss: true - name: Install - if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} run: | npm install -g corepack@latest corepack enable pnpm install --frozen-lockfile - name: Build - if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} - run: ${{ inputs.build-run }} + run: pnpm turbo build --summarize - name: Test # zizmor: ignore[template-injection] The inputs.run is provided by us. id: test - if: ${{ inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true' }} env: NODE_OPTIONS: --max-old-space-size=16384 GITHUB_SHA: ${{ github.event.pull_request.head.sha }} @@ -144,16 +117,16 @@ jobs: MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} run: ${{ inputs.run }} - name: Write Failure Result # zizmor: ignore[template-injection] The inputs.failure-result-run is provided by us. 
- if: ${{ always() && inputs.failure-result-run != '' && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') && failure() }} + if: ${{ always() && inputs.failure-result-run != '' && failure() }} run: ${{ inputs.failure-result-run }} - name: Upload coverage reports to Codecov - if: ${{ inputs.upload-codecov && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} + if: ${{ inputs.upload-codecov }} uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5 with: token: ${{ secrets.CODECOV_TOKEN }} flags: ${{ inputs.codecov-flags }} - name: Upload test results to Codecov - if: ${{ !cancelled() && inputs.upload-codecov && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} + if: ${{ !cancelled() && inputs.upload-codecov }} continue-on-error: true uses: codecov/test-results-action@0fa95f0e1eeaafde2c782583b36b28ad0d8c77d3 # v1 with: @@ -161,7 +134,7 @@ jobs: flags: ${{ inputs.codecov-flags }} override_branch: ${{ github.event_name == 'merge_group' && 'main' || '' }} - name: Upload Artifact - if: ${{ always() && !cancelled() && inputs.artifact-name != '' && inputs.artifact-path != '' && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} + if: ${{ always() && !cancelled() && inputs.artifact-name != '' && inputs.artifact-path != '' }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: ${{ inputs.artifact-name }} @@ -171,7 +144,7 @@ jobs: overwrite: true include-hidden-files: true - name: Upload Test Result - if: ${{ inputs.is-web && failure() && (inputs.preflight-run == '' || steps.preflight.outputs.should-run == 'true') }} + if: ${{ inputs.is-web && failure() }} uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: ${{ inputs.web-report-name }} diff --git a/AGENTS.md b/AGENTS.md index 0a9aff6c8a..5dbb93cedf 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -35,12 +35,17 @@ rustc --version 
# Required for native bindings # Full build (REQUIRED before running tests) pnpm turbo build +# Focused package builds should still go through Turbo filtering +pnpm turbo build --filter <package> + +# Development build with watching pnpm turbo watch build ``` **⚠️ Critical**: Always run full build before tests. Watch mode only compiles TypeScript, not Rust components. +When narrowing builds to a package or task, prefer `pnpm turbo build --filter ...` over `pnpm build --filter ...` so Turbo dependency ordering, task outputs, and cache behavior remain consistent. + ### 3. Code Quality ```bash From 83162b26052600b1121683c7733840e353e1f0cc Mon Sep 17 00:00:00 2001 From: Haoyang Wang <12288479+PupilTong@users.noreply.github.com> Date: Thu, 21 May 2026 20:48:16 +0800 Subject: [PATCH 15/15] Remove ui judge failure result hook --- ...judge-failure-result.mjs => write-ui-judge-result.mjs} | 2 +- .github/ui-judge-ci.instructions.md | 8 ++++---- .github/workflows/test.yml | 6 +----- .github/workflows/workflow-test.yml | 8 -------- 4 files changed, 6 insertions(+), 18 deletions(-) rename .github/scripts/{write-ui-judge-failure-result.mjs => write-ui-judge-result.mjs} (90%) diff --git a/.github/scripts/write-ui-judge-failure-result.mjs b/.github/scripts/write-ui-judge-result.mjs similarity index 90% rename from .github/scripts/write-ui-judge-failure-result.mjs rename to .github/scripts/write-ui-judge-result.mjs index 72373f1faf..babb1ac98c 100644 --- a/.github/scripts/write-ui-judge-failure-result.mjs +++ b/.github/scripts/write-ui-judge-result.mjs @@ -7,7 +7,7 @@ import { join } from 'node:path'; const resultFile = process.env.UI_JUDGE_RESULT_FILE || join(process.env.GITHUB_WORKSPACE, 'ui-judge-results.json'); const errorMessage = process.env.UI_JUDGE_RESULT_ERROR_MESSAGE - || 'UI Judge CI failed before writing a model result. See the workflow logs for details.'; + || 'UI Judge did not produce a model result.
See the workflow logs for details.'; if (!existsSync(resultFile)) { writeFileSync( diff --git a/.github/ui-judge-ci.instructions.md b/.github/ui-judge-ci.instructions.md index 80c3466864..095fd73f1f 100644 --- a/.github/ui-judge-ci.instructions.md +++ b/.github/ui-judge-ci.instructions.md @@ -1,16 +1,16 @@ --- -applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/write-ui-judge-failure-result.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**" +applyTo: ".github/workflows/test.yml,.github/workflows/workflow-test.yml,.github/scripts/write-ui-judge-result.mjs,.github/ui-judge*.instructions.md,.github/actions/ui-judge-comment/**" --- When wiring `@lynx-js/ui-judge` into pull request CI, preserve the PR comment even when the model-backed test fails, but do not hide the failed test. Prefer running UI Judge through the reusable `workflow-test.yml` job with `is-web: true`, uploading `ui-judge-results.json` as an artifact, and posting the comment from a separate thin job with `issues: write` and `pull-requests: write`. -Keep long UI Judge work inside a job with a bounded timeout, and write a fallback `ui-judge-results.json` before artifact upload when build or test execution fails. If UI Judge setup is ever split back into custom steps outside the reusable workflow, use step-level `timeout-minutes` on long setup, build, and model execution steps so the fallback result writer and PR comment action still run. +Keep long UI Judge work inside a job with a bounded timeout. If UI Judge setup is ever split back into custom steps outside the reusable workflow, use step-level `timeout-minutes` on long setup, build, and model execution steps so the PR comment action can still run when a result artifact exists. -Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. 
Let the reusable `workflow-test.yml` run its default `pnpm turbo build --summarize`; do not add UI Judge-specific `build-run` overrides. The A2UI playground Turbo config already makes `build` depend on `build:lynx`. +Keep the UI Judge Playwright job dependent on the repository `build` job, matching the `playwright-web-elements` dependency shape. Let the reusable `workflow-test.yml` run its default `pnpm turbo build --summarize`; do not add UI Judge-specific build overrides. The A2UI playground Turbo config already makes `build` depend on `build:lynx`. Use the upstream build job's restored turbo cache in UI Judge CI. Do not call package scripts directly with `pnpm --filter build`, and do not pass `--force`; use Turbo commands so dependency ordering and cached outputs remain consistent. -Do not add changed-file gating, GitHub API calls, or reusable-workflow `preflight-run` wiring to UI Judge CI. If Midscene secrets are unavailable, the UI Judge test command should write a clear skipped result and exit successfully; fork pull requests should skip the comment steps rather than requiring write permissions. +Do not add changed-file gating, GitHub API calls, or extra reusable workflow inputs for UI Judge CI. If Midscene secrets are unavailable, the UI Judge test command should write a clear skipped result and exit successfully; fork pull requests should skip the comment steps rather than requiring write permissions. Raise the soft open-file limit before running UI Judge Playwright tests in the Playwright container. The A2UI playground dev server uses rsbuild/chokidar watchers, so mirror the web-elements Playwright pattern with `ulimit -Sn 655350` before invoking `pnpm --filter @lynx-js/ui-judge test`. 
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4bd4b5b6be..e5d35023b4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -95,16 +95,12 @@ jobs: set -eu export UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" if [ -z "${MIDSCENE_MODEL_NAME:-}" ] || [ -z "${MIDSCENE_MODEL_API_KEY:-}" ]; then - UI_JUDGE_RESULT_ERROR_MESSAGE="Midscene secrets are unavailable; UI Judge model test was skipped." node .github/scripts/write-ui-judge-failure-result.mjs + UI_JUDGE_RESULT_ERROR_MESSAGE="Midscene secrets are unavailable; UI Judge model test was skipped." node .github/scripts/write-ui-judge-result.mjs echo "Midscene secrets are unavailable; skipping UI Judge." exit 0 fi ulimit -Sn 655350 pnpm --filter @lynx-js/ui-judge test - failure-result-run: | - cd "$GITHUB_WORKSPACE" - set -eu - UI_JUDGE_RESULT_FILE="$GITHUB_WORKSPACE/ui-judge-results.json" node .github/scripts/write-ui-judge-failure-result.mjs ui-judge-comment: needs: ui-judge diff --git a/.github/workflows/workflow-test.yml b/.github/workflows/workflow-test.yml index dd3dfcafab..0fe2ddeee3 100644 --- a/.github/workflows/workflow-test.yml +++ b/.github/workflows/workflow-test.yml @@ -24,11 +24,6 @@ on: required: true type: string description: "Command run parameters, limited to predefined test commands" - failure-result-run: - required: false - type: string - default: "" - description: "Optional command that writes a fallback result before artifacts are uploaded" is-web: required: false type: boolean @@ -116,9 +111,6 @@ jobs: MIDSCENE_MODEL_NAME: ${{ secrets.MIDSCENE_MODEL_NAME }} MIDSCENE_OPENAI_INIT_CONFIG_JSON: ${{ secrets.MIDSCENE_OPENAI_INIT_CONFIG_JSON }} run: ${{ inputs.run }} - - name: Write Failure Result # zizmor: ignore[template-injection] The inputs.failure-result-run is provided by us. 
- if: ${{ always() && inputs.failure-result-run != '' && failure() }} - run: ${{ inputs.failure-result-run }} - name: Upload coverage reports to Codecov if: ${{ inputs.upload-codecov }} uses: codecov/codecov-action@671740ac38dd9b0130fbe1cec585b89eea48d3de # v5