/** Internal node for the O(1) FIFO waiting queue. Not exported. */
interface QueueNode {
  /** Next waiter in arrival order, or `null` when this node is the tail. */
  next: null | QueueNode;
  /** Wakes the waiter; `release()` calls it after reserving a slot on its behalf. */
  resolve: () => void;
}

/**
 * A concurrency limiter that restricts parallel execution to a maximum number of tasks.
 * Waiters are served in arrival order; enqueue and dequeue are O(1) (singly-linked list).
 */
export class ConcurrencyPool {
  /** Oldest waiter (next to be woken), or `null` when nobody is waiting. */
  private head: null | QueueNode = null;
  /** Number of slots currently held, including slots already handed to a woken waiter. */
  private running = 0;
  /** Newest waiter (append point), or `null` when nobody is waiting. */
  private tail: null | QueueNode = null;

  /**
   * @param max - Maximum number of concurrent tasks. Must be a positive integer >= 1.
   * @throws RangeError if `max` is not an integer or is below 1.
   */
  constructor(private readonly max: number) {
    if (!Number.isInteger(max) || max < 1) {
      throw new RangeError("ConcurrencyPool max must be a positive integer >= 1");
    }
  }

  /**
   * Executes the given async function, waiting if the pool is at capacity.
   * The slot is released whether `fn` resolves, rejects, or throws synchronously.
   * @param fn - Async function to execute within the pool.
   * @returns The result of the function.
   * @remarks Re-entrant calls using the same pool instance may deadlock when all slots are occupied.
   */
  async run<T>(fn: () => Promise<T>): Promise<T> {
    await this.acquire();
    try {
      return await fn();
    } finally {
      this.release();
    }
  }

  /**
   * Takes a slot immediately when one is free; otherwise appends a waiter to the queue
   * and resolves once `release()` hands it a slot.
   */
  private acquire(): Promise<void> {
    if (this.running < this.max) {
      this.running++;
      return Promise.resolve();
    }
    return new Promise<void>((resolve) => {
      const node: QueueNode = { next: null, resolve };
      if (this.tail === null) {
        this.head = node;
        this.tail = node;
      } else {
        this.tail.next = node;
        this.tail = node;
      }
    });
  }

  /**
   * Frees the caller's slot and, if anyone is waiting, transfers it to the oldest waiter.
   */
  private release(): void {
    this.running--;
    const next = this.head;
    if (next !== null) {
      this.head = next.next;
      if (this.head === null) {
        this.tail = null;
      }
      // Reserve the slot for the waiter *before* waking it so a concurrent acquire()
      // cannot grab the slot between the decrement above and the waiter resuming.
      this.running++;
      next.resolve();
    }
  }
}
/** Timeout in milliseconds for git push operations. */
export const PUSH_TIMEOUT_MS = 60_000;

/** Timeout in milliseconds for a single sandcastle task execution. */
export const TASK_TIMEOUT_MS = 15 * 60 * 1000;

/** Full validation command run after each implementation round. */
export const VALIDATION_COMMAND =
  "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2";

/** Timeout in milliseconds for the validation command. */
export const VALIDATION_TIMEOUT_MS = 120_000;

/**
 * Returns the current HEAD commit SHA for the given working directory.
 * The git call is bounded by `GIT_TIMEOUT_MS`, like the other git operations in this tooling,
 * so a stuck git process yields `null` instead of hanging the caller.
 * @param cwd - Absolute path to the git repository root.
 * @returns The full SHA string, or `null` if the command fails or times out.
 */
export async function getHeadSha(cwd: string): Promise<null | string> {
  try {
    const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], {
      cwd,
      timeout: GIT_TIMEOUT_MS,
    });
    return stdout.trim();
  } catch {
    // Best-effort lookup: callers treat `null` as "SHA unavailable".
    return null;
  }
}

/**
 * Converts an unknown thrown value to a human-readable error message.
 * @param err - The caught value (may be an `Error` or any other type).
 * @returns The `message` property if `err` is an `Error`, otherwise `String(err)`.
 */
export function toErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err);
}
Use EXACTLY this tag format: + +```text +[...] +``` + +Each finding must have this structure: + +```json +{ + "file": "path/to/file.ts", + "line": 42, + "title": "short description of the issue", + "severity": "CRITICAL|HIGH|MEDIUM|LOW", + "category": "security|logic|performance|architecture|style", + "confidence": "HIGH|MEDIUM|LOW", + "description": "detailed explanation of why this is a problem", + "suggestion": "how to fix it" +} +``` + +If no issues are found, output: + +```text +[] +``` + +## Rules + +- Report ≤5 findings. HIGH and CRITICAL only. Omit LOW/MEDIUM unless zero higher-severity issues exist. +- If >5 HIGH/CRITICAL issues exist, report the top 5 and add a summary note in the last finding's description. +- Do NOT modify any files. Do NOT commit. Do NOT push. +- Only report issues in the CHANGED code (not pre-existing issues). +- Use HIGH confidence only when you've verified the issue by reading the relevant code. +- Use MEDIUM confidence for pattern-based detection. +- Use LOW confidence for style preferences or uncertain issues. +- Focus on: logic errors, missing edge cases, security issues, type safety violations, test gaps. +- Do NOT report formatting issues (prettier handles those). + +## Known Design Decisions (do not flag) + +- Mid-loop validation convergence bypasses critic (ARCS pattern — deterministic tests > subjective review). +- `process.exit()` at script end kills timed-out sandboxes (no cooperative cancellation available in sandcastle). +- Content-addressed dedup hash includes line number (collision reduction tradeoff, bounded by hard cap). 
/** Output buffer cap for child processes whose output can be large (validation suite, gh). */
const EXEC_MAX_BUFFER_BYTES = 8 * 1024 * 1024;

/**
 * Finalizes a task after the refinement loop: validates, retries one implement round if
 * validation failed and round budget remains, rebases onto main, re-validates after a
 * successful rebase, pushes, and creates a PR (a draft when the loop did not converge or
 * validation did not pass).
 * @param spec - The task specification.
 * @param loopResult - The result from the refinement loop.
 * @param sandbox - The sandcastle sandbox instance.
 * @param cwd - Working directory (worktree path).
 * @returns Finalization result with PR and validation status.
 */
export async function finalizeTask(
  spec: TaskSpec,
  loopResult: LoopResult,
  sandbox: SandboxInstance,
  cwd: string,
): Promise<FinalizeResult> {
  let validationPassed = await runValidation(cwd, spec);

  // Retry one more round if validation failed and budget remains
  if (!validationPassed && loopResult.roundsCompleted < MAX_CRITIC_ROUNDS) {
    await runRetryRound(spec, loopResult, sandbox);
    validationPassed = await runValidation(
      cwd,
      spec,
      "Validation still fails after retry. Will create draft PR.",
    );
    if (validationPassed) {
      console.log(` #${spec.id}: Validation passed after retry round.`);
    }
  }

  // Rebase on latest main. A successful rebase changes the tree under test, so a previously
  // green validation has to be confirmed again; a red one stays red without re-running.
  const rebaseSucceeded = await attemptRebase(cwd);
  if (rebaseSucceeded && validationPassed) {
    validationPassed = await runValidation(cwd, spec, "Post-rebase validation failed.");
  }

  // Push
  const pushSucceeded = await pushBranch(cwd, spec, rebaseSucceeded);
  if (!pushSucceeded) {
    console.warn(` #${spec.id}: Push did not succeed; PR may reference unpushed commits.`);
  }

  // Build PR arguments and create PR
  const { isDraft, prArgs } = buildPrArgs(spec, loopResult, validationPassed, rebaseSucceeded);

  let prCreated = false;
  try {
    await execFileAsync("gh", prArgs, { cwd, maxBuffer: EXEC_MAX_BUFFER_BYTES });
    console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`);
    prCreated = true;
  } catch (err: unknown) {
    const msg = toErrorMessage(err);
    console.error(` #${spec.id}: PR creation failed: ${msg}`);
  }

  return { isDraft, prCreated, validationPassed };
}

/**
 * Runs one additional implementer round, feeding it the last critic findings (if any).
 * Errors from the implementer are logged and swallowed so finalization can continue.
 * @param spec - The task specification.
 * @param loopResult - The result from the refinement loop (source of the findings).
 * @param sandbox - The sandcastle sandbox instance.
 */
async function runRetryRound(
  spec: TaskSpec,
  loopResult: LoopResult,
  sandbox: SandboxInstance,
): Promise<void> {
  const retryBudget = ITERATION_BUDGET_PER_ROUND;
  console.log(
    ` #${spec.id}: Retrying one more implement round (budget: ${String(retryBudget)})`,
  );

  try {
    await sandbox.run({
      agent: sandcastle.opencode(AGENT_MODEL),
      maxIterations: retryBudget,
      name: `Implementer #${spec.id} retry`,
      promptArgs: {
        BRANCH: spec.branch,
        FINDINGS:
          loopResult.lastFindings.length > 0
            ? JSON.stringify(loopResult.lastFindings, null, 2)
            : "",
        ISSUE_BODY: spec.body,
        ISSUE_TITLE: spec.title,
        TASK_ID: spec.id,
      },
      promptFile: "./.sandcastle/implement-prompt.md",
    });
  } catch (retryErr: unknown) {
    const retryMsg = toErrorMessage(retryErr);
    console.warn(
      ` #${spec.id}: Implementer retry threw: ${retryMsg}. Falling through to PR creation.`,
    );
  }
}

/**
 * Fetches origin/main and rebases the current branch onto it.
 * On failure, aborts the rebase cleanly.
 * @param cwd - Working directory (worktree path).
 * @returns `true` if rebase succeeded, `false` otherwise.
 */
async function attemptRebase(cwd: string): Promise<boolean> {
  try {
    await execFileAsync("git", ["fetch", "origin", "main"], {
      cwd,
      timeout: GIT_TIMEOUT_MS,
    });
    await execFileAsync("git", ["rebase", "origin/main"], { cwd, timeout: GIT_TIMEOUT_MS });
    return true;
  } catch {
    try {
      // Bounded like every other git call so a wedged abort cannot stall finalization.
      await execFileAsync("git", ["rebase", "--abort"], { cwd, timeout: GIT_TIMEOUT_MS });
    } catch {
      /* no rebase in progress (e.g. the fetch failed) or abort itself failed — nothing to undo */
    }
    return false;
  }
}

/**
 * Builds the PR title, body, and `gh pr create` argument list.
 * The PR is a draft unless the loop converged and validation passed.
 * @param spec - The task specification.
 * @param loopResult - The result from the refinement loop.
 * @param validationPassed - Whether the validation suite passed.
 * @param rebaseSucceeded - Whether the rebase onto main succeeded.
 * @returns Object with `isDraft` flag and `prArgs` string array.
 */
function buildPrArgs(
  spec: TaskSpec,
  loopResult: LoopResult,
  validationPassed: boolean,
  rebaseSucceeded: boolean,
): { isDraft: boolean; prArgs: string[] } {
  const converged = loopResult.status === "converged";
  const isDraft = !converged || !validationPassed;
  const outstandingNote =
    loopResult.lastFindings.length > 0
      ? `\n\n${converged ? "ℹ️ Known findings (not addressed):" : "⚠️ Outstanding findings:"}\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}`
      : "";
  const validationNote = !validationPassed
    ? "\n\n⚠️ Validation did not pass. Manual review required."
    : "";
  const rebaseNote = !rebaseSucceeded
    ? "\n\n⚠️ Rebase failed. Branch is not rebased onto main."
    : "";

  const validationCheck = validationPassed ? "- [x]" : "- [ ]";
  // Conventional-commit prefix derived from issue labels; "feature request" wins over "bug".
  const commitPrefix = spec.labels.includes("feature request")
    ? "feat"
    : spec.labels.includes("bug")
      ? "fix"
      : "chore";
  const prTitle = `${commitPrefix}: resolve #${spec.id} \u2014 ${spec.title}`;
  const typeOfChange =
    commitPrefix === "feat"
      ? "New feature (non-breaking change that adds functionality)"
      : commitPrefix === "fix"
        ? "Bug fix (non-breaking change that fixes an issue)"
        : "Refactoring (no functional changes)";
  const prBody = `## Description\n\nAutomated ${commitPrefix} for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] ${typeOfChange}\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}${rebaseNote}`;

  const prArgs = [
    "pr",
    "create",
    ...(isDraft ? ["--draft"] : []),
    "--head",
    spec.branch,
    "--base",
    "main",
    "--title",
    prTitle,
    "--body",
    prBody,
  ];

  return { isDraft, prArgs };
}

/**
 * Extracts stderr from a caught error, truncated to `MAX_STDERR_CHARS` characters.
 * @param err - The caught error value.
 * @returns Stderr string, or empty string if `err` is not an `Error`, has no `stderr`
 *   property, or that property is `null`/`undefined`.
 */
function extractStderr(err: unknown): string {
  if (!(err instanceof Error) || !("stderr" in err)) {
    return "";
  }
  const raw = (err as { stderr: unknown }).stderr;
  // String(undefined) would surface the literal text "undefined" in the log output.
  if (raw === null || raw === undefined) {
    return "";
  }
  return String(raw).slice(0, MAX_STDERR_CHARS);
}

/**
 * Pushes the branch to origin. When rebase succeeded, uses force-with-lease
 * with a rescue-branch fallback. When rebase was aborted, does a plain push.
 * @param cwd - Working directory (worktree path).
 * @param spec - The task specification.
 * @param rebaseSucceeded - Whether the preceding rebase completed successfully.
 * @returns `true` if the primary push succeeded, `false` otherwise (including when only
 *   the rescue push succeeded).
 */
async function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): Promise<boolean> {
  if (!rebaseSucceeded) {
    try {
      await execFileAsync("git", ["push"], { cwd, timeout: PUSH_TIMEOUT_MS });
      return true;
    } catch (pushErr: unknown) {
      const pushMsg = toErrorMessage(pushErr);
      console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`);
      return false;
    }
  }

  try {
    await execFileAsync("git", ["push", "--force-with-lease"], {
      cwd,
      timeout: PUSH_TIMEOUT_MS,
    });
    return true;
  } catch (pushErr: unknown) {
    const pushMsg = toErrorMessage(pushErr);
    try {
      // Random suffix keeps repeated rescues of the same branch from colliding.
      const suffix = crypto.randomBytes(4).toString("hex");
      await execFileAsync(
        "git",
        ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${suffix}`],
        {
          cwd,
          timeout: PUSH_TIMEOUT_MS,
        },
      );
      console.warn(
        ` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-${suffix}`,
      );
    } catch {
      console.error(
        ` #${spec.id}: Push failed and rescue failed. Commits will be lost on sandbox disposal: ${pushMsg}`,
      );
    }
    return false;
  }
}

/**
 * Runs the full validation suite.
 * @param cwd - Working directory (worktree path).
 * @param spec - The task specification (used for logging).
 * @param failureMessage - Text logged (followed by truncated stderr, when available) if the
 *   suite fails. Defaults to the generic message.
 * @returns `true` if validation passed, `false` otherwise.
 */
async function runValidation(
  cwd: string,
  spec: TaskSpec,
  failureMessage = "Validation failed.",
): Promise<boolean> {
  try {
    await execFileAsync("sh", ["-c", VALIDATION_COMMAND], {
      cwd,
      maxBuffer: EXEC_MAX_BUFFER_BYTES,
      timeout: VALIDATION_TIMEOUT_MS,
    });
    return true;
  } catch (err: unknown) {
    const stderr = extractStderr(err);
    console.warn(` #${spec.id}: ${failureMessage}${stderr ? `\n${stderr}` : ""}`);
    return false;
  }
}
## Issue Details -!`gh issue view {{TASK_ID}} --json body,title,labels,comments` +{{ISSUE_BODY}} -## Recent Commits +## Review Findings -!`git log -n 10 --format="%h %s" --date=short` +{{FINDINGS}} ## Exploration @@ -22,24 +22,26 @@ Read `AGENTS.md` and `CONTRIBUTING.md` for project conventions. ## Implementation -1. Implement the fix/feature. Follow existing patterns: +1. If review findings are provided above, cross-validate each one against the code. Fix findings you agree with. Ignore findings that are incorrect or not applicable. + +2. If no findings are provided, implement the issue from scratch following existing patterns: - Strict TypeScript, JSDoc on public APIs - Co-located tests in `*.test.ts` files - Zod for runtime validation -2. Before every commit, run the full validation suite: +3. Before every commit, run the full validation suite: ```bash npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2 ``` -3. Commit with conventional commits: +4. Commit with conventional commits: - `fix: ` — bug fix - `feat: ` — new feature - `refactor: ` — restructuring - `chore: ` — tooling/config -4. Push the branch: +5. Push the branch: ```bash git push -u origin {{BRANCH}} @@ -51,6 +53,7 @@ Read `AGENTS.md` and `CONTRIBUTING.md` for project conventions. - Tests must pass before pushing. Zero type errors, zero test failures. - Do not modify unrelated files. - Do not bump version numbers. +- Push BEFORE signaling completion. 
/**
 * Races a promise against a timeout, rejecting with a descriptive error if the timeout fires first.
 * The timer is cancelled as soon as the race settles, so a task that finishes early does not
 * leave a pending timer behind. Note that the underlying `promise` is not cancelled on timeout;
 * it keeps running in the background.
 * @param promise - The promise to race against the timeout.
 * @param ms - Timeout duration in milliseconds.
 * @param label - Human-readable label used in the timeout error message.
 * @returns The resolved value of the promise if it completes before the timeout.
 */
async function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`${label} timed out after ${String(ms)}ms`));
    }, ms);
    // Do not let a pending timeout alone keep the process alive.
    timer.unref();
  });
  try {
    // Promise.race subscribes to both promises, so a late rejection of either one is
    // always observed and never surfaces as an unhandled rejection.
    return await Promise.race([promise, timeoutPromise]);
  } finally {
    clearTimeout(timer);
  }
}
const source = new GithubIssueSource({
  branchPrefix: BRANCH_PREFIX,
  dockerImage: DOCKER_IMAGE,
  label: ISSUE_LABEL,
});

const tasks = await source.discover();

/**
 * Handles a single task inside its own sandbox: runs the implement/critic refinement loop
 * and, only when that loop produced commits, finalizes the task (which may open a PR).
 * The sandbox is disposed automatically when this function settles.
 */
const processTask = async (spec: (typeof tasks)[number]) => {
  await using sandbox = await sandcastle.createSandbox({
    branch: spec.branch,
    copyToWorktree: ["node_modules"],
    hooks: {
      sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] },
    },
    sandbox: docker({ imageName: DOCKER_IMAGE }),
  });

  const loopResult = await runRefinementLoop(spec, sandbox, {
    iterationBudget: ITERATION_BUDGET_PER_ROUND,
    maxRounds: MAX_CRITIC_ROUNDS,
  });

  // Nothing was committed, so there is nothing to validate, push, or open a PR for.
  if (!(loopResult.totalCommits > 0)) {
    return { prCreated: false, spec };
  }

  const finalized = await finalizeTask(spec, loopResult, sandbox, sandbox.worktreePath);
  return { prCreated: finalized.prCreated, spec };
};

if (tasks.length > 0) {
  const pool = new ConcurrencyPool(MAX_PARALLEL);

  // Each task starts only once the pool grants it a slot, and is capped at TASK_TIMEOUT_MS.
  const outcomes = await Promise.allSettled(
    tasks.map((spec) =>
      pool.run(() => withTimeout(processTask(spec), TASK_TIMEOUT_MS, `Task #${spec.id}`)),
    ),
  );

  // The run counts as successful when at least one task ended with a created PR.
  const anyPrCreated = outcomes.some(
    (outcome) => outcome.status === "fulfilled" && outcome.value.prCreated,
  );

  outcomes.forEach((outcome, i) => {
    if (outcome.status !== "rejected") {
      return;
    }
    const spec = tasks[i];
    const reason: unknown = outcome.reason;
    const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason);
    console.error(` ✗ #${spec?.id ?? String(i)} failed: ${msg}`);
  });

  console.log("\nAll done.");

  if (!anyPrCreated) {
    process.exitCode = 1;
  }
} else {
  console.log("No tasks to process.");
}

// Explicit exit: timed-out tasks are not cancelled and would otherwise keep running.
process.exit(process.exitCode ?? 0);
Create a merge branch from main: - - ```bash - git branch -D agent/merge-batch 2>/dev/null || true - git checkout -b agent/merge-batch origin/main - ``` - -2. Ensure working tree is clean. - -3. Merge each branch with a merge commit: - - ```bash - git merge --no-ff - ``` - - Process branches in the order given. - -4. If a merge conflict occurs: - - Read the conflicting files. - - Resolve favoring the incoming branch changes. Validation in step 6 will catch regressions. - - Stage resolved files and complete the merge. - -5. After all merges, verify that `git diff main...agent/merge-batch` shows changes. If there are no file changes compared to main, do NOT create a PR — output `COMPLETE` and stop. - -6. Run full validation: - - ```bash - npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2 - ``` - -7. If validation fails, fix the issue and amend the merge commit. - -8. Push the branch and create a pull request. Read `.github/PULL_REQUEST_TEMPLATE.md` and fill in all sections. Use conventional commit format for the title (`feat:`, `fix:`, `chore:`, `refactor:`). Include `Fixes #N` for each resolved issue in the Related Issues section. - -## Rules - -- Every merge uses `--no-ff` to preserve branch history. -- Validation must pass after all merges complete. -- Do not push directly to main — create a PR for human review instead. -- Do not force-push. -- Do not delete remote branches (leave for cleanup elsewhere). -- Do not close issues manually — the PR merge handles it via "Fixes #N" in the body. -- Do not create a PR if there are zero file changes compared to main. 
/** Options for configuring the refinement loop. All fields are optional. */
export interface RefinementLoopOptions {
  /** Budget of iterations per round (flat constant applied to every round). */
  iterationBudget?: number;
  /** Maximum number of implement↔critic rounds. */
  maxRounds?: number;
  /**
   * Optional callback invoked after a round completes, with the round number and the critic's
   * full (not deduplicated) findings for that round. `runRefinementLoop` calls it only for
   * rounds that get past its early-exit, mid-loop-validation, and quality-ratchet checks.
   */
  onRoundComplete?: (round: number, findings: Finding[]) => void;
}

/** Result of a convergence check. */
interface ConvergenceResult {
  /**
   * Best SHA to restore (empty string = no update).
   * NOTE(review): `runRefinementLoop` assigns this value to its own `bestSha` unconditionally,
   * so an empty string replaces any SHA tracked so far — confirm that is the intended meaning
   * of "no update".
   */
  bestSha: string;
  /** Updated last findings. */
  lastFindings: Finding[];
  /** New loop status. */
  status: LoopStatus;
}

/**
 * Input descriptor for hashing a window of source lines around a finding.
 */
interface HashInput {
  /** Working directory (worktree path) for resolving the file. */
  readonly cwd: string;
  /** Relative file path of the finding. */
  readonly file: string;
  /** Line number of the finding (1-indexed). */
  readonly line: number;
}

/**
 * Context passed to the quality ratchet check.
 * Groups the per-round identifiers needed for regression detection and rollback.
 */
interface RatchetContext {
  /** SHA of HEAD before the implementer ran (used for rollback). */
  readonly beforeSha: string;
  /** Working directory for git operations. */
  readonly cwd: string;
  /** Current round number (1-indexed). */
  readonly round: number;
  /** The task specification. */
  readonly spec: TaskSpec;
}

/** Resolved loop options with defaults applied. */
interface ResolvedLoopOptions {
  /** Iteration budget per round. */
  budget: number;
  /** Maximum number of rounds. */
  maxRounds: number;
  /** Optional round-complete callback (no-op if not provided). */
  onRoundComplete: (round: number, findings: Finding[]) => void;
}

/** Result of a single implement↔critic round. */
interface RoundResult {
  /** SHA of HEAD before the implementer ran. */
  beforeSha: string;
  /** Number of commits made by the implementer. */
  commits: number;
  /** Parsed findings from the critic, or null on critic failure. */
  findings: Finding[] | null;
}
/**
 * Runs the implement↔critic refinement loop for a given task.
 *
 * Every round runs the implementer and the critic, then stops the loop on the
 * first of: an early-exit condition, a passing mid-loop validation, a quality
 * regression, or convergence. A run that did not converge is reset to the best
 * intermediate commit when one was recorded.
 * @param spec - The task specification.
 * @param sandbox - The sandcastle sandbox instance.
 * @param opts - Optional configuration for rounds, budget, and callbacks.
 * @returns The loop result with status, commits, findings, and rounds completed.
 */
export async function runRefinementLoop(
  spec: TaskSpec,
  sandbox: SandboxInstance,
  opts?: RefinementLoopOptions,
): Promise<LoopResult> {
  const { budget, maxRounds, onRoundComplete } = resolveLoopOptions(opts);

  const seenKeys = new Set<string>();
  let lastFindings: Finding[] = [];
  let status: LoopStatus = "exhausted";
  let totalCommits = 0;
  let roundsCompleted = 0;
  let previousFindingsCount = Infinity;
  let bestSha = "";
  let bestFindingsCount = Infinity;

  for (let round = 1; round <= maxRounds; round++) {
    roundsCompleted = round;

    console.log(
      ` #${spec.id} round ${String(round)}/${String(maxRounds)} (budget: ${String(budget)})`,
    );

    const result = await executeRound(spec, sandbox, round, budget, lastFindings);

    const earlyExit = checkEarlyExit(spec, round, result, totalCommits);
    if (earlyExit !== null) {
      totalCommits = earlyExit.totalCommits;
      status = earlyExit.status;
      break;
    }

    // checkEarlyExit already exits on null findings; this guard only narrows the type.
    if (result.findings === null) break;
    const findings: Finding[] = result.findings;

    // Deterministic convergence: new commits that pass validation end the loop.
    if (result.commits > 0 && (await runMidLoopValidation(sandbox.worktreePath))) {
      totalCommits += result.commits;
      status = "converged";
      break;
    }

    const cwd = sandbox.worktreePath;
    const newFindings = deduplicateFindings(findings, seenKeys, cwd);

    console.log(
      ` #${spec.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`,
    );

    const nonLowFindings = findings.filter((f) => f.confidence !== "LOW");
    if (
      await checkQualityRatchet(
        { beforeSha: result.beforeSha, cwd, round, spec },
        nonLowFindings.length,
        previousFindingsCount,
      )
    ) {
      status = "exhausted";
      break;
    }

    if (newFindings.length < bestFindingsCount) {
      // Only record a new best state when HEAD could actually be read; a failed
      // rev-parse must not replace a previously captured SHA with "".
      const headSha = await captureHeadSha(cwd);
      if (headSha !== "") {
        bestFindingsCount = newFindings.length;
        bestSha = headSha;
      }
    }

    totalCommits += result.commits;
    previousFindingsCount = nonLowFindings.length;
    onRoundComplete(round, findings);

    const convergenceResult = await checkConvergence(cwd, findings, newFindings, nonLowFindings);
    if (convergenceResult !== null) {
      lastFindings = convergenceResult.lastFindings;
      status = convergenceResult.status;
      bestSha = convergenceResult.bestSha;
      break;
    }

    lastFindings = newFindings;
  }

  if (shouldResetToBest(status, bestSha)) {
    totalCommits = await resetToBestState(sandbox.worktreePath, bestSha, totalCommits);
  }

  return { lastFindings, roundsCompleted, status, totalCommits };
}

/**
 * Captures the current HEAD SHA, returning empty string on failure.
 * @param cwd - Working directory for git operations.
 * @returns The trimmed output of `git rev-parse HEAD`, or "" if the command fails.
 */
async function captureHeadSha(cwd: string): Promise<string> {
  try {
    const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], { cwd });
    return stdout.trim();
  } catch {
    return "";
  }
}
/**
 * Checks whether the current round converged (no new findings).
 * @param cwd - Working directory for git operations.
 * @param allFindings - All findings from the critic.
 * @param newFindings - Deduplicated new findings.
 * @param nonLowFindings - Non-LOW-confidence findings.
 * @returns A ConvergenceResult if the loop should break, or null to continue.
 */
async function checkConvergence(
  cwd: string,
  allFindings: Finding[],
  newFindings: Finding[],
  nonLowFindings: Finding[],
): Promise<ConvergenceResult | null> {
  if (newFindings.length !== 0) return null;

  // Severity-weighted convergence (OpenHands pattern):
  // don't converge if CRITICAL/HIGH findings persist, even if already seen.
  const criticalPersistent = allFindings.filter(
    (f) => (f.severity === "CRITICAL" || f.severity === "HIGH") && f.confidence !== "LOW",
  );
  if (criticalPersistent.length > 0) {
    // Capture current HEAD so post-loop reset is a no-op (code matches findings).
    return {
      bestSha: await captureHeadSha(cwd),
      lastFindings: criticalPersistent,
      status: "exhausted",
    };
  }

  return { bestSha: "", lastFindings: nonLowFindings, status: "converged" };
}

/**
 * Checks whether the round result warrants an early exit from the loop.
 * @param spec - The task specification.
 * @param round - Current round number.
 * @param result - The round result.
 * @param totalCommits - Running total of commits before this round.
 * @returns An object with updated status and totalCommits if early exit, or null to continue.
 */
function checkEarlyExit(
  spec: TaskSpec,
  round: number,
  result: RoundResult,
  totalCommits: number,
): null | { status: LoopStatus; totalCommits: number } {
  // Nothing was committed in the very first round: there is nothing to refine.
  if (round === 1 && result.commits === 0) {
    console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`);
    return { status: "skipped", totalCommits };
  }
  // No usable critic output: keep this round's commits in the total but stop.
  if (result.findings === null) {
    console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`);
    return { status: "failed", totalCommits: totalCommits + result.commits };
  }
  // A later round that produced no commits cannot make further progress.
  if (round > 1 && result.commits === 0) {
    return { status: "exhausted", totalCommits };
  }
  return null;
}
/**
 * Quality ratchet: detects a rise in non-LOW findings and tries to roll the
 * worktree back to the commit that preceded the implementer run.
 *
 * The check is inactive for rounds 1 and 2.
 * @param ctx - Ratchet context containing spec, round, beforeSha, and cwd.
 * @param findingsCount - Number of non-LOW findings this round.
 * @param previousCount - Number of non-LOW findings from the previous round.
 * @returns True if a regression was detected. The rollback is best-effort: true is
 *   also returned when `beforeSha` is not a 40-hex SHA or when `git reset` fails.
 */
async function checkQualityRatchet(
  ctx: RatchetContext,
  findingsCount: number,
  previousCount: number,
): Promise<boolean> {
  const { beforeSha, cwd, round, spec } = ctx;
  if (round <= 2 || findingsCount <= previousCount) {
    return false;
  }

  // Validate SHA format before passing to execFileAsync
  if (!/^[0-9a-f]{40}$/.test(beforeSha)) {
    console.warn(` #${spec.id}: Invalid SHA for rollback, skipping reset.`);
    return true;
  }

  try {
    await execFileAsync("git", ["reset", "--hard", beforeSha], { cwd });
    console.warn(
      ` #${spec.id} R${String(round)}: Regression detected (${String(previousCount)} → ${String(findingsCount)}). Rolled back.`,
    );
  } catch {
    console.warn(` #${spec.id}: Failed to reset to ${beforeSha} after regression.`);
  }

  return true;
}

/**
 * Computes a deduplication key for a finding.
 *
 * Findings without a file or line are keyed by a hash of their normalized title;
 * all others are keyed by a hash of the source lines surrounding the finding.
 * @param f - Finding to compute a key for.
 * @param cwd - Working directory (worktree path) for reading file context.
 * @param fileCache - Optional cache of file contents keyed by resolved path.
 * @returns Composite dedup key of the form `file::category::hash`.
 */
function computeFindingKey(f: Finding, cwd: string, fileCache?: Map<string, string>): string {
  if (!f.file || f.line == null) {
    // Lowercase, strip punctuation and collapse whitespace before hashing the title.
    const normalizedTitle = f.title
      .toLowerCase()
      .replace(/[^\w\s]/g, "")
      .replace(/\s+/g, " ")
      .trim();
    const titleHash = crypto
      .createHash("sha256")
      .update(normalizedTitle)
      .digest("hex")
      .slice(0, HASH_PREFIX_LENGTH);
    return `${f.file || "global"}::${f.category}::${titleHash}`;
  }
  const contextHash = hashContextLines(
    { cwd, file: f.file, line: f.line },
    CONTEXT_HASH_RADIUS,
    fileCache,
  );
  return `${f.file}::${f.category}::${contextHash}`;
}
/**
 * Filters findings by confidence and deduplicates against previously seen keys.
 *
 * Each key is computed once and recorded as soon as its finding is accepted, so
 * repeated findings inside the same batch are reported only once.
 * @param findings - Raw findings from the critic.
 * @param seenKeys - Set of previously seen dedup keys (mutated: new keys are added).
 * @param cwd - Working directory for context hashing.
 * @returns Array of new, non-LOW-confidence findings.
 */
function deduplicateFindings(findings: Finding[], seenKeys: Set<string>, cwd: string): Finding[] {
  const fileCache = new Map<string, string>();
  const newFindings: Finding[] = [];
  for (const finding of findings) {
    // LOW-confidence findings are dropped and never recorded as seen.
    if (finding.confidence === "LOW") continue;
    const key = computeFindingKey(finding, cwd, fileCache);
    if (seenKeys.has(key)) continue;
    seenKeys.add(key);
    newFindings.push(finding);
  }
  return newFindings;
}

/**
 * Executes a single implement↔critic round.
 * @param spec - The task specification.
 * @param sandbox - The sandcastle sandbox instance.
 * @param round - Current round number (1-indexed).
 * @param budget - Iteration budget for the implementer.
 * @param lastFindings - Findings from the previous round to feed to the implementer.
 * @returns The round result containing commits, findings, and the pre-round SHA.
 *   If the implementer throws, the result has 0 commits and null findings.
 */
async function executeRound(
  spec: TaskSpec,
  sandbox: SandboxInstance,
  round: number,
  budget: number,
  lastFindings: Finding[],
): Promise<RoundResult> {
  const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : "";

  // Capture SHA before implementer runs (for quality ratchet rollback)
  let beforeSha = "";
  try {
    const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], {
      cwd: sandbox.worktreePath,
    });
    beforeSha = stdout.trim();
  } catch {
    console.warn(` #${spec.id}: Failed to capture HEAD SHA before round ${String(round)}.`);
  }

  // Implementer
  let implementerResult: Awaited<ReturnType<typeof sandbox.run>>;
  try {
    implementerResult = await sandbox.run({
      agent: sandcastle.opencode(AGENT_MODEL),
      maxIterations: budget,
      name: `Implementer #${spec.id} R${String(round)}`,
      promptArgs: {
        BRANCH: spec.branch,
        FINDINGS: findingsArg,
        ISSUE_BODY: spec.body,
        ISSUE_TITLE: spec.title,
        TASK_ID: spec.id,
      },
      promptFile: "./.sandcastle/implement-prompt.md",
    });
  } catch (err: unknown) {
    const msg = err instanceof Error ? (err.stack ?? err.message) : String(err);
    console.error(` #${spec.id} R${String(round)}: Implementer threw: ${msg}`);
    return { beforeSha, commits: 0, findings: null };
  }

  // Critic — a fresh random nonce tags the findings block for this round.
  const nonce = crypto.randomBytes(4).toString("hex");
  let findings: Finding[] | null;
  try {
    findings = await runCritic(sandbox, spec, round, nonce);
  } catch (err: unknown) {
    const msg = err instanceof Error ? err.message : String(err);
    console.error(` #${spec.id} R${String(round)}: Critic threw: ${msg}`);
    findings = null;
  }

  return { beforeSha, commits: implementerResult.commits.length, findings };
}
/**
 * Hashes a window of lines around the finding for dedup stability.
 *
 * The file is resolved inside `cwd`; anything that resolves outside it, or cannot
 * be read, falls back to a hash of the file name and line number only.
 * @param input - Hash input containing cwd, file, and line.
 * @param radius - Number of lines above/below to include in the context window.
 * @param fileCache - Optional cache of file contents keyed by resolved path.
 * @returns Truncated SHA-256 hex digest.
 */
function hashContextLines(
  input: HashInput,
  radius: number,
  fileCache?: Map<string, string>,
): string {
  const { cwd, file, line } = input;
  const digest = (payload: string): string =>
    crypto.createHash("sha256").update(payload).digest("hex").slice(0, HASH_PREFIX_LENGTH);
  try {
    const fullPath = realpathSync(join(cwd, file));
    // Reject paths that resolve (through ".." or symlinks) outside the worktree.
    if (!fullPath.startsWith(realpathSync(cwd) + sep)) {
      throw new Error("Path traversal");
    }
    let raw = fileCache?.get(fullPath);
    if (raw === undefined) {
      raw = readFileSync(fullPath, "utf-8");
      fileCache?.set(fullPath, raw);
    }
    const lines = raw.split("\n");
    // Clamp the 1-indexed line into the file, then take `radius` lines either side.
    const idx = Math.min(Math.max(0, line - 1), lines.length - 1);
    const start = Math.max(0, idx - radius);
    const end = Math.min(lines.length - 1, idx + radius);
    const normalized = lines
      .slice(start, end + 1)
      .join("\n")
      .replace(/\s+/g, " ")
      .trim();
    return digest(`${file}:${String(line)}:${normalized}`);
  } catch {
    return digest(`${file}:${String(line)}:fallback`);
  }
}

/**
 * Parses findings from agent stdout using nonce-tagged delimiters.
 *
 * Only text between `<findings-NONCE>` and `</findings-NONCE>` is considered. The
 * last block that holds a JSON array wins; an optional Markdown code fence around
 * the JSON is removed first.
 * @param stdout - Agent stdout to parse findings from.
 * @param nonce - Unique tag identifier for this run (lowercase hex).
 * @returns Parsed findings array, or null when no block holds a JSON array.
 */
function parseFindings(stdout: string, nonce: string): Finding[] | null {
  // The nonce is interpolated into a RegExp, so only plain hex is accepted.
  if (!/^[0-9a-f]+$/.test(nonce)) return null;
  const tagPattern = new RegExp(
    `<findings-${nonce}>([\\s\\S]*?)<\\/findings-${nonce}>`,
    "g",
  );
  const matches = [...stdout.matchAll(tagPattern)];
  if (matches.length === 0) return null;
  // Find last non-trivial match
  for (let i = matches.length - 1; i >= 0; i--) {
    const raw = matches[i]?.[1]?.trim() ?? "";
    if (raw.length < 2) continue;
    const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, "");
    try {
      const parsed: unknown = JSON.parse(cleaned);
      // A non-array payload is a malformed answer, not "zero findings".
      if (!Array.isArray(parsed)) continue;
      return parseFindingsSafe(parsed);
    } catch {
      continue;
    }
  }
  return null;
}

/**
 * Resets the worktree to the best intermediate state and recounts commits.
 * @param cwd - Working directory for git operations.
 * @param bestSha - The SHA to reset to.
 * @param currentCommits - Current total commits (fallback if reset or recount fails).
 * @returns Number of commits in `main..HEAD` after the reset (0 if the count is not
 *   a number), or `currentCommits` when a git command fails.
 */
async function resetToBestState(
  cwd: string,
  bestSha: string,
  currentCommits: number,
): Promise<number> {
  try {
    await execFileAsync("git", ["reset", "--hard", bestSha], { cwd });
    const { stdout } = await execFileAsync("git", ["rev-list", "--count", "main..HEAD"], { cwd });
    return parseInt(stdout.trim(), 10) || 0;
  } catch {
    return currentCommits;
  }
}

/**
 * Resolves loop options, applying defaults for missing values.
 * @param opts - Optional loop options.
 * @returns Resolved options with all fields populated.
 */
function resolveLoopOptions(opts: RefinementLoopOptions | undefined): ResolvedLoopOptions {
  return {
    budget: opts?.iterationBudget ?? ITERATION_BUDGET_PER_ROUND,
    maxRounds: opts?.maxRounds ?? MAX_CRITIC_ROUNDS,
    onRoundComplete: opts?.onRoundComplete ?? (() => undefined),
  };
}
/**
 * Runs the critic agent, retrying once on parse failure.
 *
 * Both attempts use the same prompt, branch and nonce; only the run name differs
 * (the second attempt is suffixed with " retry").
 * @param sandbox - The sandcastle sandbox instance.
 * @param spec - The task specification.
 * @param round - Current round number.
 * @param nonce - Unique nonce for parsing.
 * @returns Parsed findings or null if both attempts failed.
 */
async function runCritic(
  sandbox: SandboxInstance,
  spec: TaskSpec,
  round: number,
  nonce: string,
): Promise<Finding[] | null> {
  const maxAttempts = 2;
  const baseName = `Critic #${spec.id} R${String(round)}`;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const critic = await sandbox.run({
      agent: sandcastle.opencode(AGENT_MODEL),
      maxIterations: 1,
      name: attempt === 1 ? baseName : `${baseName} retry`,
      promptArgs: {
        BRANCH: spec.branch,
        NONCE: nonce,
      },
      promptFile: "./.sandcastle/critic-prompt.md",
    });

    const findings = parseFindings(critic.stdout, nonce);
    if (findings !== null) return findings;

    if (attempt < maxAttempts) {
      console.warn(` #${spec.id}: Critic parse failed. Retrying.`);
    }
  }

  return null;
}

/**
 * Runs the mid-loop validation command (ARCS pattern).
 * @param cwd - Working directory for the validation command.
 * @returns True if the command ran without error (deterministic convergence), false otherwise.
 */
async function runMidLoopValidation(cwd: string): Promise<boolean> {
  try {
    await execFileAsync("sh", ["-c", VALIDATION_COMMAND], {
      cwd,
      maxBuffer: 8 * 1024 * 1024,
      timeout: VALIDATION_TIMEOUT_MS,
    });
    return true;
  } catch {
    return false;
  }
}
/**
 * Decides whether the post-loop reset to the best recorded commit applies.
 * @param status - Final loop status.
 * @param bestSha - Best intermediate SHA (empty string if none captured).
 * @returns True when the loop did not converge and `bestSha` is a 40-character
 *   lowercase hex SHA.
 */
function shouldResetToBest(status: LoopStatus, bestSha: string): boolean {
  // A converged run keeps its final state.
  if (status === "converged") {
    return false;
  }
  const fullShaPattern = /^[0-9a-f]{40}$/;
  return fullShaPattern.test(bestSha);
}
/** Configuration for the GitHub issue task source. */
export interface GithubIssueSourceConfig {
  /** Git branch prefix for issue branches. */
  branchPrefix: string;
  /** Docker image name for the sandbox. */
  dockerImage: string;
  /** GitHub issue label to filter by. */
  label: string;
  /** Maximum planner retries. */
  maxRetries?: number;
}

/** Interface for task discovery sources. */
export interface TaskSource {
  /** Discovers tasks to work on. */
  discover(): Promise<TaskSpec[]>;
}

/** An issue after sanitization, in the shape handed to the planner prompt. */
interface SanitizedIssue {
  body: string;
  labels: string[];
  number: number;
  title: string;
}

/**
 * Task source that discovers work from GitHub issues via planner agent.
 */
export class GithubIssueSource implements TaskSource {
  private readonly branchPattern: RegExp;
  private readonly branchPrefix: string;
  private readonly dockerImage: string;
  private readonly label: string;
  private readonly maxRetries: number;

  /**
   * @param config - Configuration for the GitHub issue source.
   */
  constructor(config: GithubIssueSourceConfig) {
    this.branchPrefix = config.branchPrefix;
    this.dockerImage = config.dockerImage;
    this.label = config.label;
    this.maxRetries = config.maxRetries ?? 5;

    // Escape regex metacharacters so the prefix is matched literally.
    const escapedPrefix = this.branchPrefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    this.branchPattern = new RegExp(`^${escapedPrefix}-\\d+-[\\w-]+$`);
  }

  /**
   * Discovers tasks by fetching GitHub issues, running the planner, and validating the plan.
   *
   * The planner is retried up to `maxRetries` times. When every attempt fails,
   * `process.exitCode` is set to 1 and an empty array is returned.
   * @returns Array of task specifications to implement.
   */
  async discover(): Promise<TaskSpec[]> {
    const issuesJson = await this.fetchAndSanitizeIssues();

    if (issuesJson.length === 0) {
      console.log("No issues with label '%s'. Exiting.", this.label);
      return [];
    }

    for (let attempt = 1; attempt <= this.maxRetries; attempt++) {
      console.log(`\n=== Planner attempt ${String(attempt)}/${String(this.maxRetries)} ===\n`);

      let plan: Awaited<ReturnType<typeof sandcastle.run>>;
      try {
        plan = await this.runPlanner(issuesJson);
      } catch {
        console.error("Planner timed out or failed. Retrying.");
        continue;
      }

      // Use the last <plan> block in case the agent echoed an earlier draft.
      const planMatch = [...plan.stdout.matchAll(/<plan>([\s\S]*?)<\/plan>/g)].at(-1);
      if (!planMatch) {
        console.error("Planner did not produce a <plan> tag. Retrying.");
        continue;
      }

      const tasks = this.validatePlan(planMatch[1] ?? "", issuesJson);
      if (tasks === null) {
        continue;
      }

      if (tasks.length === 0) {
        console.log("No actionable issues. Exiting.");
        return [];
      }

      console.log(`Plan: ${String(tasks.length)} issue(s) to work on:`);
      for (const task of tasks) {
        console.log(` #${task.id}: ${task.title} → ${task.branch}`);
      }

      return tasks;
    }

    console.warn("Planner failed to produce a valid plan after all retries.");
    process.exitCode = 1;
    return [];
  }

  /**
   * Runs one planner attempt, rejecting if it outlasts TASK_TIMEOUT_MS.
   * The timeout timer is always cleared once the race settles.
   */
  private async runPlanner(
    issuesJson: SanitizedIssue[],
  ): Promise<Awaited<ReturnType<typeof sandcastle.run>>> {
    let timer: ReturnType<typeof setTimeout> | undefined;
    const timeoutPromise = new Promise<never>((_, reject) => {
      timer = setTimeout(() => {
        reject(new Error("Planner timed out"));
      }, TASK_TIMEOUT_MS);
      timer.unref();
    });
    try {
      return await Promise.race([
        sandcastle.run({
          agent: sandcastle.opencode(PLANNER_MODEL),
          maxIterations: 1,
          name: "Planner",
          promptArgs: {
            BRANCH_PREFIX: this.branchPrefix,
            ISSUES_JSON: JSON.stringify(issuesJson, null, 2),
          },
          promptFile: "./.sandcastle/plan-prompt.md",
          sandbox: docker({ imageName: this.dockerImage }),
        }),
        timeoutPromise,
      ]);
    } finally {
      clearTimeout(timer);
    }
  }

  /**
   * Lists open issues carrying the configured label through the `gh` CLI and
   * sanitizes their title and body. Exits the process with code 1 when the
   * command fails or its output does not match the expected schema.
   */
  private async fetchAndSanitizeIssues(): Promise<SanitizedIssue[]> {
    let rawIssuesJson: string;
    try {
      const { stdout } = await execFileAsync(
        "gh",
        [
          "issue",
          "list",
          "--state",
          "open",
          "--json",
          "number,title,labels,body",
          "--limit",
          "50",
          "--label",
          this.label,
        ],
        { encoding: "utf-8", maxBuffer: 8 * 1024 * 1024, timeout: GIT_TIMEOUT_MS },
      );
      rawIssuesJson = stdout;
    } catch (err: unknown) {
      console.error(
        `Failed to fetch issues: ${toErrorMessage(err)}. Ensure gh is installed and authenticated.`,
      );
      process.exit(1);
    }

    let rawIssues: z.infer<typeof RawIssuesSchema>;
    try {
      rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson));
    } catch (err: unknown) {
      console.error(
        `Failed to parse issues JSON: ${toErrorMessage(err)}. Unexpected format from gh CLI.`,
      );
      process.exit(1);
    }

    return rawIssues.map((issue) => ({
      body: sanitizeForPrompt(issue.body),
      labels: issue.labels.map((label) => label.name),
      number: issue.number,
      title: sanitizeForPrompt(issue.title),
    }));
  }

  /**
   * Validates planner output against the fetched issues.
   *
   * Entries are kept only if they have a numeric string id, a branch matching the
   * configured pattern, and a title within MAX_TITLE_LENGTH without control
   * characters, and if their id refers to a fetched issue. Body and labels always
   * come from the fetched issue, never from the planner.
   * @returns Validated task specs, or null when the plan is unusable and the
   *   planner should be retried.
   */
  private validatePlan(planContent: string, issuesJson: SanitizedIssue[]): null | TaskSpec[] {
    try {
      const parsed: unknown = JSON.parse(planContent);
      const issues =
        typeof parsed === "object" && parsed !== null
          ? (parsed as { issues?: unknown }).issues
          : undefined;
      if (!Array.isArray(issues)) {
        console.error("Planner output missing issues array. Retrying.");
        return null;
      }
      const validated = issues.filter(
        (entry): entry is { branch: string; id: string; title: string } => {
          if (typeof entry !== "object" || entry === null) return false;
          const item = entry as Record<string, unknown>;
          if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) return false;
          if (typeof item.branch !== "string" || !this.branchPattern.test(item.branch))
            return false;
          if (typeof item.title !== "string") return false;
          if (item.title.length > MAX_TITLE_LENGTH) return false;
          // eslint-disable-next-line no-control-regex
          if (/[\x00-\x1f]/.test(item.title)) return false;
          return true;
        },
      );

      const issueMap = new Map(issuesJson.map((issue) => [String(issue.number), issue]));
      return validated.flatMap((entry) => {
        const source = issueMap.get(entry.id);
        if (!source) return [];
        // Build the spec field by field so extra planner-supplied keys are dropped.
        return [
          {
            body: source.body,
            branch: entry.branch,
            id: entry.id,
            labels: source.labels,
            title: entry.title,
          },
        ];
      });
    } catch (err: unknown) {
      console.error(`Planner produced invalid JSON: ${toErrorMessage(err)}. Retrying.`);
      return null;
    }
  }
}

/**
 * Strips agent-control tags from text to reduce prompt-injection risk.
 *
 * The text is NFKC-normalized first so compatibility forms of `<` and `>` are
 * caught. Removal repeats until nothing changes, because deleting one tag can
 * splice the surrounding characters into a new one (`<pl<plan>an>` → `<plan>`).
 * @param text - Raw text to sanitize.
 * @returns Text with opening and closing plan, findings, promise, system, code,
 *   instructions, implement, review and tool_call tags removed.
 */
function sanitizeForPrompt(text: string): string {
  const controlTag =
    /<\/?(?:plan|findings|promise|system|code|instructions|implement|review|tool_call)[^>]*>/gi;
  let current = text.normalize("NFKC");
  // Each pass that changes anything shortens the string, so this terminates.
  for (;;) {
    const stripped = current.replace(controlTag, "");
    if (stripped === current) return stripped;
    current = stripped;
  }
}
/**
 * Parses a findings array with partial recovery — invalid entries are discarded.
 * @param data - Raw parsed JSON value to validate as a findings array.
 * @returns Array of valid findings in input order (may be empty). A non-array
 *   input yields an empty array.
 */
export function parseFindingsSafe(data: unknown): Finding[] {
  if (!Array.isArray(data)) return [];
  const valid: Finding[] = [];
  for (const entry of data) {
    const result = FindingSchema.safeParse(entry);
    // `success` discriminates the result, so `data` is typed without a hand-written predicate.
    if (result.success) valid.push(result.data);
  }
  return valid;
}
+ */ +export const ITERATION_BUDGET_PER_ROUND = 50; + +/** Specification for a task to be implemented. */ +export interface TaskSpec { + /** Sanitized issue body text. */ + body: string; + /** Git branch name for this task. */ + branch: string; + /** Task identifier (e.g. GitHub issue number as string). */ + id: string; + /** Label names associated with the task. */ + labels: string[]; + /** Task title. */ + title: string; +}