From eea2a1f74c93394f5e0517cdf88ed8bd57533006 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 01:25:57 +0200 Subject: [PATCH 01/31] feat: implement sandcastle refinement loop with critic-based convergence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace single-pass implement→review→merge with iterative implement↔critic loop per task. Key changes: - Orchestrator fetches and sanitizes issues (prevents prompt injection) - Implement↔Critic loop with deterministic dedup convergence - Critic produces structured findings (nonce-tagged JSON, zod-validated) - Decreasing iteration budget per round [100, 50, 25, 10, 10] - Host-side validation and rebase (no agent needed) - One PR per task (no merger agent) - Draft PR on non-convergence with outstanding findings listed Implements #110 --- .sandcastle/critic-prompt.md | 56 +++++ .sandcastle/implement-prompt.md | 17 +- .sandcastle/main.ts | 364 ++++++++++++++++++++++---------- .sandcastle/merge-prompt.md | 68 ------ .sandcastle/plan-prompt.md | 2 +- .sandcastle/review-prompt.md | 67 ------ 6 files changed, 325 insertions(+), 249 deletions(-) create mode 100644 .sandcastle/critic-prompt.md delete mode 100644 .sandcastle/merge-prompt.md delete mode 100644 .sandcastle/review-prompt.md diff --git a/.sandcastle/critic-prompt.md b/.sandcastle/critic-prompt.md new file mode 100644 index 0000000..98ec518 --- /dev/null +++ b/.sandcastle/critic-prompt.md @@ -0,0 +1,56 @@ +# Critic Agent + +Analyze the implementation on branch `{{BRANCH}}` and produce structured findings. + +## Task + +Run `git diff main...{{BRANCH}}` to see all changes. Examine the diff carefully. For each issue found, produce a structured finding. + +Read `CONTRIBUTING.md` for the project's coding standards. + +## Output Format + +Output your findings as JSON wrapped in nonce-tagged delimiters. Use EXACTLY this tag format: + +```text +[...] +``` + +Each finding must have this structure: + +```json +{ + "file": "path/to/file.ts", + "line": 42, + "title": "short description of the issue", + "severity": "CRITICAL|HIGH|MEDIUM|LOW", + "category": "security|logic|performance|architecture|style", + "confidence": "HIGH|MEDIUM|LOW", + "description": "detailed explanation of why this is a problem", + "suggestion": "how to fix it" +} +``` + +If no issues are found, output: + +```text +[] +``` + +## Rules + +- Do NOT modify any files. Do NOT commit. Do NOT push. +- Only report issues in the CHANGED code (not pre-existing issues). +- Use HIGH confidence only when you've verified the issue by reading the relevant code. +- Use MEDIUM confidence for pattern-based detection. +- Use LOW confidence for style preferences or uncertain issues. +- Focus on: logic errors, missing edge cases, security issues, type safety violations, test gaps. +- Do NOT report formatting issues (prettier handles those). + +## Completion + +After outputting the findings, output: + +```text +COMPLETE +``` diff --git a/.sandcastle/implement-prompt.md b/.sandcastle/implement-prompt.md index 79f39b1..d132f5c 100644 --- a/.sandcastle/implement-prompt.md +++ b/.sandcastle/implement-prompt.md @@ -4,11 +4,11 @@ Implement issue **#{{TASK_ID}}** ("{{ISSUE_TITLE}}") on branch `{{BRANCH}}`. ## Issue Details -!`gh issue view {{TASK_ID}} --json body,title,labels,comments` +{{ISSUE_BODY}} -## Recent Commits +## Review Findings -!`git log -n 10 --format="%h %s" --date=short` +{{FINDINGS}} ## Exploration @@ -22,24 +22,26 @@ Read `AGENTS.md` and `CONTRIBUTING.md` for project conventions. ## Implementation -1. Implement the fix/feature. Follow existing patterns: +1. If review findings are provided above, cross-validate each one against the code. Fix findings you agree with. Ignore findings that are incorrect or not applicable. + +2. If no findings are provided, implement the issue from scratch following existing patterns: - Strict TypeScript, JSDoc on public APIs - Co-located tests in `*.test.ts` files - Zod for runtime validation -2. Before every commit, run the full validation suite: +3. Before every commit, run the full validation suite: ```bash npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2 ``` -3. Commit with conventional commits: +4. Commit with conventional commits: - `fix: ` — bug fix - `feat: ` — new feature - `refactor: ` — restructuring - `chore: ` — tooling/config -4. Push the branch: +5. Push the branch: ```bash git push -u origin {{BRANCH}} @@ -51,6 +53,7 @@ Read `AGENTS.md` and `CONTRIBUTING.md` for project conventions. - Tests must pass before pushing. Zero type errors, zero test failures. - Do not modify unrelated files. - Do not bump version numbers. +- Push BEFORE signaling completion. ## Completion diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 2153268..cebbea6 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -1,28 +1,104 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; +import { execSync } from "node:child_process"; +import crypto from "node:crypto"; +import { z } from "zod"; const BRANCH_PREFIX = "agent/issue"; const ESCAPED_PREFIX = BRANCH_PREFIX.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); const BRANCH_PATTERN = new RegExp(`^${ESCAPED_PREFIX}-\\d+-[\\w-]+$`); const ISSUE_LABEL = "sandcastle"; -const LABEL_FILTER = `--label "${ISSUE_LABEL}"`; const MAX_PLANNER_RETRIES = 5; +const MAX_CRITIC_ROUNDS = 5; +const ITERATION_BUDGET = [100, 50, 25, 10, 10]; const MAX_PARALLEL = 3; const DOCKER_IMAGE = "sandcastle-sap-ai"; +const FindingSchema = z.object({ + category: z.enum(["security", "logic", "performance", "architecture", "style"]), + confidence: z.enum(["HIGH", "MEDIUM", "LOW"]), + description: z.string(), + file: z.string(), + line: z.number().optional(), + severity: z.enum(["CRITICAL", "HIGH", "MEDIUM", "LOW"]), + suggestion: z.string().optional(), + title: z.string(), +}); +const FindingsSchema = z.array(FindingSchema); +type Finding = z.infer; + +/** + * @param f - Finding to compute dedup key for. + * @returns Composite key for deterministic deduplication. + */ +function findingKey(f: Finding): string { + const normalizedTitle = f.title + .toLowerCase() + .replace(/[^\w\s]/g, "") + .replace(/\s+/g, " ") + .trim(); + return `${f.file}::${f.category}::${normalizedTitle}`; +} + +/** + * @param stdout - Agent stdout to parse findings from. + * @param nonce - Unique tag identifier for this run. + * @returns Parsed findings array or null on parse failure. + */ +function parseFindings(stdout: string, nonce: string): Finding[] | null { + const tagPattern = new RegExp(`([\\s\\S]*?)<\\/findings-${nonce}>`, "g"); + const matches = [...stdout.matchAll(tagPattern)]; + const raw = matches.at(-1)?.[1]?.trim() ?? "[]"; + const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); + try { + return FindingsSchema.parse(JSON.parse(cleaned)); + } catch { + return null; + } +} + +/** + * @param text - Raw text to strip injection-prone tags from. + * @returns Sanitized text safe for prompt injection. + */ +function sanitizeForPrompt(text: string): string { + return text.replace(/<\/?(?:plan|findings[\w-]*|promise)[^>]*>/gi, ""); +} + +// --- Phase 1: Fetch and sanitize issues --- + +const rawIssues = JSON.parse( + execSync( + `gh issue list --state open --json number,title,labels,body --limit 50 --label "${ISSUE_LABEL}"`, + ).toString(), +) as { body: string; labels: { name: string }[]; number: number; title: string }[]; + +const issuesJson = rawIssues.map((i) => ({ + body: sanitizeForPrompt(i.body), + labels: i.labels.map((l) => l.name), + number: i.number, + title: i.title, +})); + +if (issuesJson.length === 0) { + console.log("No issues with label '%s'. Exiting.", ISSUE_LABEL); + process.exit(0); +} + +// --- Phase 2: Plan --- + let workCompleted = false; -for (let iteration = 1; iteration <= MAX_PLANNER_RETRIES; iteration++) { - console.log(`\n=== Iteration ${String(iteration)}/${String(MAX_PLANNER_RETRIES)} ===\n`); +for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { + console.log(`\n=== Planner attempt ${String(attempt)}/${String(MAX_PLANNER_RETRIES)} ===\n`); - // Phase 1: Plan const plan = await sandcastle.run({ agent: sandcastle.opencode("github-copilot/claude-opus-4.6"), maxIterations: 1, name: "Planner", promptArgs: { BRANCH_PREFIX, - LABEL_FILTER, + ISSUES_JSON: JSON.stringify(issuesJson, null, 2), }, promptFile: "./.sandcastle/plan-prompt.md", sandbox: docker({ imageName: DOCKER_IMAGE }), @@ -31,62 +107,51 @@ for (let iteration = 1; iteration <= MAX_PLANNER_RETRIES; iteration++) { const planMatches = [...plan.stdout.matchAll(/([\s\S]*?)<\/plan>/g)]; const planMatch = planMatches.at(-1); if (!planMatch) { - console.error("Planner did not produce a tag. Skipping iteration."); + console.error("Planner did not produce a tag. Retrying."); continue; } const planContent = planMatch[1] ?? ""; - let issues: { branch: string; id: string; title: string }[]; + let issues: { body: string; branch: string; id: string; title: string }[]; try { const parsed = JSON.parse(planContent) as { issues: unknown[] }; if (!Array.isArray(parsed.issues)) { - console.error("Planner output missing issues array. Skipping iteration."); + console.error("Planner output missing issues array. Retrying."); continue; } const validated = parsed.issues.filter( - (entry): entry is { branch: string; id: string; title: string } => { - if (typeof entry !== "object" || entry === null) { - console.warn(" Skipping non-object issue entry"); - return false; - } + (entry): entry is { body: string; branch: string; id: string; title: string } => { + if (typeof entry !== "object" || entry === null) return false; const item = entry as Record; - if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) { - console.warn(` Skipping issue with invalid id: ${String(item.id)}`); - return false; - } - if (typeof item.branch !== "string") { - console.warn(" Skipping issue with missing branch"); - return false; - } - if (typeof item.title !== "string") { - console.warn(" Skipping issue with missing title"); - return false; - } - if (!BRANCH_PATTERN.test(item.branch)) { - console.warn(` Skipping issue with invalid branch: ${item.branch}`); - return false; - } + if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) return false; + if (typeof item.branch !== "string" || !BRANCH_PATTERN.test(item.branch)) return false; + if (typeof item.title !== "string") return false; return true; }, ); - issues = validated; + // Attach sanitized body from our fetched data + issues = validated.map((v) => ({ + ...v, + body: issuesJson.find((i) => String(i.number) === v.id)?.body ?? "", + })); } catch { - console.error("Planner produced invalid JSON. Skipping iteration."); + console.error("Planner produced invalid JSON. Retrying."); continue; } if (issues.length === 0) { - console.log("No issues to work on. Exiting."); + console.log("No actionable issues. Exiting."); workCompleted = true; break; } - console.log(`Planning complete. ${String(issues.length)} issue(s) to work in parallel:`); + console.log(`Plan: ${String(issues.length)} issue(s) to work on:`); for (const issue of issues) { console.log(` #${issue.id}: ${issue.title} → ${issue.branch}`); } - // Phase 2: Execute + Review (semaphore for MAX_PARALLEL) + // --- Phase 3: Implement ↔ Critic loop (parallel, max 3) --- + let running = 0; const queue: (() => void)[] = []; const acquire = () => @@ -115,99 +180,186 @@ for (let iteration = 1; iteration <= MAX_PLANNER_RETRIES; iteration++) { sandbox: docker({ imageName: DOCKER_IMAGE }), }); - const result = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: 100, - name: "Implementer #" + issue.id, - promptArgs: { - BRANCH: issue.branch, - ISSUE_TITLE: issue.title, - TASK_ID: issue.id, - }, - promptFile: "./.sandcastle/implement-prompt.md", - }); + const seenKeys = new Set(); + let lastFindings: Finding[] = []; + let converged = false; + let totalCommits = 0; - if (result.commits.length > 0) { - try { - await sandbox.run({ + for (let round = 1; round <= MAX_CRITIC_ROUNDS; round++) { + const budget = ITERATION_BUDGET[round - 1] ?? 10; + const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; + + console.log( + ` #${issue.id} round ${String(round)}/${String(MAX_CRITIC_ROUNDS)} (budget: ${String(budget)})`, + ); + + // Implementer + const impl = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: budget, + name: `Implementer #${issue.id} R${String(round)}`, + promptArgs: { + BRANCH: issue.branch, + FINDINGS: findingsArg, + ISSUE_BODY: issue.body, + ISSUE_TITLE: issue.title, + TASK_ID: issue.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + + totalCommits += impl.commits.length; + + if (round === 1 && impl.commits.length === 0) { + console.warn(` #${issue.id}: 0 commits on round 1. Skipping.`); + break; + } + + // Critic + const nonce = crypto.randomBytes(4).toString("hex"); + + let critic = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: 1, + name: `Critic #${issue.id} R${String(round)}`, + promptArgs: { + BRANCH: issue.branch, + NONCE: nonce, + }, + promptFile: "./.sandcastle/critic-prompt.md", + }); + + let findings = parseFindings(critic.stdout, nonce); + + // Retry once on parse failure + if (findings === null) { + console.warn(` #${issue.id}: Critic parse failed. Retrying.`); + critic = await sandbox.run({ agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: 10, - name: "Reviewer #" + issue.id, + maxIterations: 1, + name: `Critic #${issue.id} R${String(round)} retry`, promptArgs: { BRANCH: issue.branch, + NONCE: nonce, }, - promptFile: "./.sandcastle/review-prompt.md", + promptFile: "./.sandcastle/critic-prompt.md", }); - } catch (reviewError: unknown) { - const msg = reviewError instanceof Error ? reviewError.message : String(reviewError); - console.warn(` Reviewer for #${issue.id} failed, proceeding unreviewed: ${msg}`); + findings = parseFindings(critic.stdout, nonce); + } + + if (findings === null) { + console.warn(` #${issue.id}: Critic failed twice. Breaking (non-converged).`); + break; + } + + // Dedup + const newFindings = findings.filter( + (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f)), + ); + for (const f of newFindings) { + seenKeys.add(findingKey(f)); } + + console.log( + ` #${issue.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, + ); + + if (newFindings.length === 0) { + converged = true; + break; + } + + lastFindings = newFindings; } - return result; + // --- Final validation (host-side, deterministic) --- + if (totalCommits > 0) { + const cwd = sandbox.worktreePath; + let validationPassed = false; + + try { + execSync( + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", + { cwd, stdio: "pipe" }, + ); + validationPassed = true; + } catch { + console.warn(` #${issue.id}: Validation failed.`); + } + + // Rebase on latest main + try { + execSync("git fetch origin main && git rebase origin/main", { + cwd, + stdio: "pipe", + }); + if (validationPassed) { + // Post-rebase smoke test + try { + execSync("npm run type-check && npm run test", { + cwd, + stdio: "pipe", + }); + } catch { + validationPassed = false; + } + } + execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); + } catch { + // Rebase failed — push un-rebased + try { + execSync("git rebase --abort", { cwd, stdio: "pipe" }); + } catch { + /* empty */ + } + execSync("git push", { cwd, stdio: "pipe" }); + } + + // Create PR + const isDraft = !converged || !validationPassed; + const draftFlag = isDraft ? " --draft" : ""; + const outstandingNote = + !converged && lastFindings.length > 0 + ? `\n\n⚠️ Outstanding findings:\n${lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + : ""; + const validationNote = !validationPassed + ? "\n\n⚠️ Validation did not pass. Manual review required." + : ""; + + const prTitle = `fix: resolve #${issue.id} — ${issue.title}`; + const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n- [x] I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; + + try { + execSync( + `gh pr create${draftFlag} --head "${issue.branch}" --base main --title "${prTitle}" --body "${prBody.replace(/"/g, '\\"')}"`, + { cwd, stdio: "pipe" }, + ); + console.log(` #${issue.id}: PR created${isDraft ? " (draft)" : ""}.`); + workCompleted = true; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` #${issue.id}: PR creation failed: ${msg}`); + } + } + + return { converged, issue, totalCommits }; } finally { release(); } }), ); + // Log failures for (const [i, outcome] of settled.entries()) { if (outcome.status === "rejected") { - const currentIssue = issues[i]; + const issue = issues[i]; const reason: unknown = outcome.reason; - const errorMessage = - reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); - console.error( - ` ✗ #${currentIssue?.id ?? String(i)} (${currentIssue?.branch ?? "unknown"}) failed: ${errorMessage}`, - ); + const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); + console.error(` ✗ #${issue?.id ?? String(i)} failed: ${msg}`); } } - const completedIssues = settled - .map((outcome, i) => ({ issue: issues[i], outcome })) - .filter( - ( - entry, - ): entry is { - issue: (typeof issues)[number]; - outcome: PromiseFulfilledResult>>; - } => - entry.issue !== undefined && - entry.outcome.status === "fulfilled" && - entry.outcome.value.commits.length > 0, - ) - .map((entry) => entry.issue); - - const completedBranches = completedIssues.map((i) => i.branch); - - if (completedBranches.length === 0) { - console.log("No commits produced. Nothing to merge."); - break; - } - - // Phase 3: Merge - try { - await sandcastle.run({ - agent: sandcastle.opencode("github-copilot/claude-opus-4.6"), - maxIterations: 10, - name: "Merger", - promptArgs: { - BRANCHES: completedBranches.map((b) => `- ${b}`).join("\n"), - ISSUES: completedIssues.map((i) => `- #${i.id}: ${i.title}`).join("\n"), - }, - promptFile: "./.sandcastle/merge-prompt.md", - sandbox: docker({ imageName: DOCKER_IMAGE }), - }); - - console.log("\nPR created."); - workCompleted = true; - break; - } catch (error: unknown) { - const errorMessage = error instanceof Error ? (error.stack ?? error.message) : String(error); - console.error(`Merge phase failed: ${errorMessage}`); - console.error("Branches are pushed and available for manual merge."); - break; - } + break; // Plan executed — exit planner retry loop } console.log("\nAll done."); diff --git a/.sandcastle/merge-prompt.md b/.sandcastle/merge-prompt.md deleted file mode 100644 index 11aec41..0000000 --- a/.sandcastle/merge-prompt.md +++ /dev/null @@ -1,68 +0,0 @@ -# Merge Agent - -Merge completed branches and create a pull request. - -## Inputs - -- Branches: {{BRANCHES}} -- Issues: {{ISSUES}} - -## Current State - -!`git status --short` - -!`git branch -a | grep agent/ || true` - -## Steps - -1. Create a merge branch from main: - - ```bash - git branch -D agent/merge-batch 2>/dev/null || true - git checkout -b agent/merge-batch origin/main - ``` - -2. Ensure working tree is clean. - -3. Merge each branch with a merge commit: - - ```bash - git merge --no-ff - ``` - - Process branches in the order given. - -4. If a merge conflict occurs: - - Read the conflicting files. - - Resolve favoring the incoming branch changes. Validation in step 6 will catch regressions. - - Stage resolved files and complete the merge. - -5. After all merges, verify that `git diff main...agent/merge-batch` shows changes. If there are no file changes compared to main, do NOT create a PR — output `COMPLETE` and stop. - -6. Run full validation: - - ```bash - npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2 - ``` - -7. If validation fails, fix the issue and amend the merge commit. - -8. Push the branch and create a pull request. Read `.github/PULL_REQUEST_TEMPLATE.md` and fill in all sections. Use conventional commit format for the title (`feat:`, `fix:`, `chore:`, `refactor:`). Include `Fixes #N` for each resolved issue in the Related Issues section. - -## Rules - -- Every merge uses `--no-ff` to preserve branch history. -- Validation must pass after all merges complete. -- Do not push directly to main — create a PR for human review instead. -- Do not force-push. -- Do not delete remote branches (leave for cleanup elsewhere). -- Do not close issues manually — the PR merge handles it via "Fixes #N" in the body. -- Do not create a PR if there are zero file changes compared to main. - -## Completion - -When all branches are merged, validation passes, and the PR is created, output: - -```text -COMPLETE -``` diff --git a/.sandcastle/plan-prompt.md b/.sandcastle/plan-prompt.md index 9c67d7b..5d34138 100644 --- a/.sandcastle/plan-prompt.md +++ b/.sandcastle/plan-prompt.md @@ -9,7 +9,7 @@ Read `AGENTS.md` for project conventions. ## Open Issues -!`gh issue list --state open --json number,title,labels,body --limit 50 {{LABEL_FILTER}}` +{{ISSUES_JSON}} ## Steps diff --git a/.sandcastle/review-prompt.md b/.sandcastle/review-prompt.md deleted file mode 100644 index 854c9f1..0000000 --- a/.sandcastle/review-prompt.md +++ /dev/null @@ -1,67 +0,0 @@ -# Review Agent - -Review and validate the implementation on branch `{{BRANCH}}`. - -## Setup - -```bash -git checkout {{BRANCH}} -``` - -## Changes to Review - -!`git diff --stat main...{{BRANCH}}` - -## Commits on This Branch - -!`git log main..{{BRANCH}} --oneline` - -## Validation - -Run the full CI validation suite. Every command must exit 0: - -```bash -npm run type-check -npm run test -npm run test:node -npm run test:edge -npm run prettier-check -npm run lint -npm run build -npm run check-build -npm run build:v2 -npm run check-build:v2 -``` - -## On Failure - -If any command fails: - -1. Read the error output. -2. Fix the issue in the source code. -3. Commit the fix: `fix: `. -4. Re-run the full suite from the top. -5. Repeat until all commands pass. - -## Quality Checks - -After validation passes, verify compliance with the coding standards in `CONTRIBUTING.md`. Fix violations and commit. - -## Rules - -- Zero errors, zero warnings from type-check and lint. -- All tests pass in both Node.js and Edge environments. -- Both V3 and V2 builds succeed. -- Do not skip or disable tests. - -## Completion - -When the full suite passes cleanly, push the fixes and output: - -```bash -git push -``` - -```text -COMPLETE -``` From 6a5e9670d0b37ada2c3cfed5490db9536117ca39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 01:46:33 +0200 Subject: [PATCH 02/31] fix: address review findings (shell injection, false convergence, nesting, zod validation) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace execSync with execFileSync for gh pr create (prevents shell injection) - Guard parseFindings against empty matches (prevents false convergence) - Add try/catch on gh issue list startup call - Guard git push in rebase catch block - Extract finalizeIssue function (reduces nesting from 6+ to 3 levels) - Add zod schema for rawIssues (replaces unsafe 'as' cast) - Implement validation retry round per spec (one more implement→critic if budget remains) --- .sandcastle/main.ts | 261 ++++++++++++++++++++++++++++++++------------ 1 file changed, 192 insertions(+), 69 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index cebbea6..8d42337 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -1,6 +1,6 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; -import { execSync } from "node:child_process"; +import { execFileSync, execSync } from "node:child_process"; import crypto from "node:crypto"; import { z } from "zod"; @@ -27,6 +27,171 @@ const FindingSchema = z.object({ const FindingsSchema = z.array(FindingSchema); type Finding = z.infer; +// --- Zod schema for GitHub issue list response (fix #6) --- + +const RawIssueSchema = z.object({ + body: z.string(), + labels: z.array(z.object({ name: z.string() })), + number: z.number(), + title: z.string(), +}); +const RawIssuesSchema = z.array(RawIssueSchema); + +// --- Type alias for sandbox instance --- + +type SandboxInstance = Awaited>; + +/** + * @param sandbox - The sandcastle sandbox instance. + * @param cwd - Working directory (worktree path). + * @param issue - Issue metadata. + * @param issue.body + * @param issue.branch + * @param issue.id + * @param issue.title + * @param converged - Whether the critic loop converged. + * @param lastFindings - Outstanding findings from the last round. + * @param round - The round at which the critic loop ended. + * @returns Whether work was completed (PR created). + */ +async function finalizeIssue( + sandbox: SandboxInstance, + cwd: string, + issue: { body: string; branch: string; id: string; title: string }, + converged: boolean, + lastFindings: Finding[], + round: number, +): Promise { + let validationPassed = false; + + try { + execSync( + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", + { cwd, stdio: "pipe" }, + ); + validationPassed = true; + } catch { + console.warn(` #${issue.id}: Validation failed.`); + } + + // --- Validation retry round (fix #7) --- + if (!validationPassed && round < MAX_CRITIC_ROUNDS) { + const retryBudget = ITERATION_BUDGET[MAX_CRITIC_ROUNDS - 1] ?? 10; + console.log( + ` #${issue.id}: Retrying one more implement→critic round (budget: ${String(retryBudget)})`, + ); + + const retryNonce = crypto.randomBytes(4).toString("hex"); + + await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: retryBudget, + name: `Implementer #${issue.id} retry`, + promptArgs: { + BRANCH: issue.branch, + FINDINGS: lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : "", + ISSUE_BODY: issue.body, + ISSUE_TITLE: issue.title, + TASK_ID: issue.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + + await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: 1, + name: `Critic #${issue.id} retry-validation`, + promptArgs: { + BRANCH: issue.branch, + NONCE: retryNonce, + }, + promptFile: "./.sandcastle/critic-prompt.md", + }); + + // Re-validate after retry + try { + execSync( + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", + { cwd, stdio: "pipe" }, + ); + validationPassed = true; + console.log(` #${issue.id}: Validation passed after retry round.`); + } catch { + console.warn(` #${issue.id}: Validation still fails after retry. Will create draft PR.`); + } + } + + // Rebase on latest main + try { + execSync("git fetch origin main && git rebase origin/main", { + cwd, + stdio: "pipe", + }); + if (validationPassed) { + // Post-rebase smoke test + try { + execSync("npm run type-check && npm run test", { + cwd, + stdio: "pipe", + }); + } catch { + validationPassed = false; + } + } + execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); + } catch { + // Rebase failed — push un-rebased + try { + execSync("git rebase --abort", { cwd, stdio: "pipe" }); + } catch { + /* empty */ + } + try { + execSync("git push", { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + console.warn(` #${issue.id}: git push failed after rebase abort: ${pushMsg}`); + } + } + + // Create PR (fix #1: use execFileSync to avoid shell injection) + const isDraft = !converged || !validationPassed; + const outstandingNote = + !converged && lastFindings.length > 0 + ? `\n\n⚠️ Outstanding findings:\n${lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + : ""; + const validationNote = !validationPassed + ? "\n\n⚠️ Validation did not pass. Manual review required." + : ""; + + const prTitle = `fix: resolve #${issue.id} — ${issue.title}`; + const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n- [x] I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; + + const prArgs = [ + "pr", + "create", + ...(isDraft ? ["--draft"] : []), + "--head", + issue.branch, + "--base", + "main", + "--title", + prTitle, + "--body", + prBody, + ]; + + try { + execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); + console.log(` #${issue.id}: PR created${isDraft ? " (draft)" : ""}.`); + return true; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` #${issue.id}: PR creation failed: ${msg}`); + return false; + } +} + /** * @param f - Finding to compute dedup key for. * @returns Composite key for deterministic deduplication. @@ -48,6 +213,9 @@ function findingKey(f: Finding): string { function parseFindings(stdout: string, nonce: string): Finding[] | null { const tagPattern = new RegExp(`([\\s\\S]*?)<\\/findings-${nonce}>`, "g"); const matches = [...stdout.matchAll(tagPattern)]; + if (matches.length === 0) { + return null; + } const raw = matches.at(-1)?.[1]?.trim() ?? "[]"; const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); try { @@ -67,11 +235,18 @@ function sanitizeForPrompt(text: string): string { // --- Phase 1: Fetch and sanitize issues --- -const rawIssues = JSON.parse( - execSync( +let rawIssuesJson: string; +try { + rawIssuesJson = execSync( `gh issue list --state open --json number,title,labels,body --limit 50 --label "${ISSUE_LABEL}"`, - ).toString(), -) as { body: string; labels: { name: string }[]; number: number; title: string }[]; + { encoding: "utf-8" }, + ); +} catch { + console.error("Failed to fetch issues. Ensure gh is installed and authenticated."); + process.exit(1); +} + +const rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); const issuesJson = rawIssues.map((i) => ({ body: sanitizeForPrompt(i.body), @@ -184,8 +359,10 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { let lastFindings: Finding[] = []; let converged = false; let totalCommits = 0; + let lastRound = 0; for (let round = 1; round <= MAX_CRITIC_ROUNDS; round++) { + lastRound = round; const budget = ITERATION_BUDGET[round - 1] ?? 10; const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; @@ -272,73 +449,19 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { lastFindings = newFindings; } - // --- Final validation (host-side, deterministic) --- + // --- Final validation, rebase, and PR creation --- if (totalCommits > 0) { const cwd = sandbox.worktreePath; - let validationPassed = false; - - try { - execSync( - "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", - { cwd, stdio: "pipe" }, - ); - validationPassed = true; - } catch { - console.warn(` #${issue.id}: Validation failed.`); - } - - // Rebase on latest main - try { - execSync("git fetch origin main && git rebase origin/main", { - cwd, - stdio: "pipe", - }); - if (validationPassed) { - // Post-rebase smoke test - try { - execSync("npm run type-check && npm run test", { - cwd, - stdio: "pipe", - }); - } catch { - validationPassed = false; - } - } - execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); - } catch { - // Rebase failed — push un-rebased - try { - execSync("git rebase --abort", { cwd, stdio: "pipe" }); - } catch { - /* empty */ - } - execSync("git push", { cwd, stdio: "pipe" }); - } - - // Create PR - const isDraft = !converged || !validationPassed; - const draftFlag = isDraft ? " --draft" : ""; - const outstandingNote = - !converged && lastFindings.length > 0 - ? `\n\n⚠️ Outstanding findings:\n${lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` - : ""; - const validationNote = !validationPassed - ? "\n\n⚠️ Validation did not pass. Manual review required." - : ""; - - const prTitle = `fix: resolve #${issue.id} — ${issue.title}`; - const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n- [x] I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; - - try { - execSync( - `gh pr create${draftFlag} --head "${issue.branch}" --base main --title "${prTitle}" --body "${prBody.replace(/"/g, '\\"')}"`, - { cwd, stdio: "pipe" }, - ); - console.log(` #${issue.id}: PR created${isDraft ? " (draft)" : ""}.`); + const prCreated = await finalizeIssue( + sandbox, + cwd, + issue, + converged, + lastFindings, + lastRound, + ); + if (prCreated) { workCompleted = true; - } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); - console.error(` #${issue.id}: PR creation failed: ${msg}`); } } From d7cd5e2226f1ac2bd00e8596a8856be8e622dbbc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 01:52:46 +0200 Subject: [PATCH 03/31] fix: guard retry calls, split rebase logic, remove dead critic retry --- .sandcastle/main.ts | 62 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 8d42337..9cc64b8 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -81,32 +81,26 @@ async function finalizeIssue( ` #${issue.id}: Retrying one more implement→critic round (budget: ${String(retryBudget)})`, ); - const retryNonce = crypto.randomBytes(4).toString("hex"); - - await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: retryBudget, - name: `Implementer #${issue.id} retry`, - promptArgs: { - BRANCH: issue.branch, - FINDINGS: lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : "", - ISSUE_BODY: issue.body, - ISSUE_TITLE: issue.title, - TASK_ID: issue.id, - }, - promptFile: "./.sandcastle/implement-prompt.md", - }); - - await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: 1, - name: `Critic #${issue.id} retry-validation`, - promptArgs: { - BRANCH: issue.branch, - NONCE: retryNonce, - }, - promptFile: "./.sandcastle/critic-prompt.md", - }); + try { + await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: retryBudget, + name: `Implementer #${issue.id} retry`, + promptArgs: { + BRANCH: issue.branch, + FINDINGS: lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : "", + ISSUE_BODY: issue.body, + ISSUE_TITLE: issue.title, + TASK_ID: issue.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + } catch (retryErr: unknown) { + const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr); + console.warn( + ` #${issue.id}: Implementer retry threw: ${retryMsg}. Falling through to PR creation.`, + ); + } // Re-validate after retry try { @@ -122,11 +116,13 @@ async function finalizeIssue( } // Rebase on latest main + let rebaseSucceeded = false; try { execSync("git fetch origin main && git rebase origin/main", { cwd, stdio: "pipe", }); + rebaseSucceeded = true; if (validationPassed) { // Post-rebase smoke test try { @@ -138,9 +134,8 @@ async function finalizeIssue( validationPassed = false; } } - execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); } catch { - // Rebase failed — push un-rebased + // Rebase failed — abort and push un-rebased try { execSync("git rebase --abort", { cwd, stdio: "pipe" }); } catch { @@ -154,6 +149,17 @@ async function finalizeIssue( } } + if (rebaseSucceeded) { + try { + execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + console.warn( + ` #${issue.id}: git push --force-with-lease failed (branch un-pushed, PR creation will fail gracefully): ${pushMsg}`, + ); + } + } + // Create PR (fix #1: use execFileSync to avoid shell injection) const isDraft = !converged || !validationPassed; const outstandingNote = From bfad1d93ca5ae5715634eb5693185fbdd5233290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 01:57:37 +0200 Subject: [PATCH 04/31] fix: handle nullable issue body and guard JSON parse --- .sandcastle/main.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 9cc64b8..8dde19e 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -30,7 +30,10 @@ type Finding = z.infer; // --- Zod schema for GitHub issue list response (fix #6) --- const RawIssueSchema = z.object({ - body: z.string(), + body: z + .string() + .nullable() + .transform((b) => b ?? ""), labels: z.array(z.object({ name: z.string() })), number: z.number(), title: z.string(), @@ -252,7 +255,13 @@ try { process.exit(1); } -const rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); +let rawIssues: z.infer; +try { + rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); +} catch { + console.error("Failed to parse issues JSON. Unexpected format from gh CLI."); + process.exit(1); +} const issuesJson = rawIssues.map((i) => ({ body: sanitizeForPrompt(i.body), From e6b613ac3a7a23567fd03416c82a810da9fc1c3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 02:02:03 +0200 Subject: [PATCH 05/31] =?UTF-8?q?fix:=20distinguish=20stalled=20from=20con?= =?UTF-8?q?verged=20(re-reported=20findings=20=E2=86=92=20draft=20PR)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .sandcastle/main.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 8dde19e..4263733 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -457,7 +457,12 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { ); if (newFindings.length === 0) { - converged = true; + if (findings.length > 0) { + lastFindings = findings; + converged = false; + } else { + converged = true; + } break; } From 5901fbfde1159466a3546ef825c9138e6ee86c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 02:08:33 +0200 Subject: [PATCH 06/31] fix: LOW-only findings should not prevent convergence --- .sandcastle/main.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 4263733..09e7c8c 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -457,8 +457,9 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { ); if (newFindings.length === 0) { - if (findings.length > 0) { - lastFindings = findings; + const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); + if (nonLowFindings.length > 0) { + lastFindings = nonLowFindings; converged = false; } else { converged = true; From ee43546a4fa47f65e9308c8d08adb384de8fd4b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 02:17:08 +0200 Subject: [PATCH 07/31] fix: log validation errors, conditional checklist, derive PR title from labels --- .sandcastle/main.ts | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 09e7c8c..f5729cb 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -48,10 +48,11 @@ type SandboxInstance = Awaited>; * @param sandbox - The sandcastle sandbox instance. * @param cwd - Working directory (worktree path). * @param issue - Issue metadata. - * @param issue.body - * @param issue.branch - * @param issue.id - * @param issue.title + * @param issue.body - Sanitized issue body text. + * @param issue.branch - Git branch name for this issue. + * @param issue.id - GitHub issue number as string. + * @param issue.labels - Issue label names. + * @param issue.title - Issue title. * @param converged - Whether the critic loop converged. * @param lastFindings - Outstanding findings from the last round. * @param round - The round at which the critic loop ended. @@ -60,7 +61,7 @@ type SandboxInstance = Awaited>; async function finalizeIssue( sandbox: SandboxInstance, cwd: string, - issue: { body: string; branch: string; id: string; title: string }, + issue: { body: string; branch: string; id: string; labels: string[]; title: string }, converged: boolean, lastFindings: Finding[], round: number, @@ -73,15 +74,19 @@ async function finalizeIssue( { cwd, stdio: "pipe" }, ); validationPassed = true; - } catch { - console.warn(` #${issue.id}: Validation failed.`); + } catch (err: unknown) { + const stderr = + err instanceof Error && "stderr" in err + ? String((err as { stderr: unknown }).stderr).slice(0, 500) + : ""; + console.warn(` #${issue.id}: Validation failed.${stderr ? `\n${stderr}` : ""}`); } // --- Validation retry round (fix #7) --- if (!validationPassed && round < MAX_CRITIC_ROUNDS) { const retryBudget = ITERATION_BUDGET[MAX_CRITIC_ROUNDS - 1] ?? 10; console.log( - ` #${issue.id}: Retrying one more implement→critic round (budget: ${String(retryBudget)})`, + ` #${issue.id}: Retrying one more implement round (budget: ${String(retryBudget)})`, ); try { @@ -173,8 +178,14 @@ async function finalizeIssue( ? "\n\n⚠️ Validation did not pass. Manual review required." : ""; - const prTitle = `fix: resolve #${issue.id} — ${issue.title}`; - const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n- [x] I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; + const validationCheck = validationPassed ? "- [x]" : "- [ ]"; + const commitPrefix = issue.labels.includes("feature request") + ? "feat" + : issue.labels.includes("bug") + ? "fix" + : "chore"; + const prTitle = `${commitPrefix}: resolve #${issue.id} — ${issue.title}`; + const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; const prArgs = [ "pr", @@ -302,7 +313,7 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { } const planContent = planMatch[1] ?? ""; - let issues: { body: string; branch: string; id: string; title: string }[]; + let issues: { body: string; branch: string; id: string; labels: string[]; title: string }[]; try { const parsed = JSON.parse(planContent) as { issues: unknown[] }; if (!Array.isArray(parsed.issues)) { @@ -323,6 +334,7 @@ for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { issues = validated.map((v) => ({ ...v, body: issuesJson.find((i) => String(i.number) === v.id)?.body ?? "", + labels: issuesJson.find((i) => String(i.number) === v.id)?.labels ?? [], })); } catch { console.error("Planner produced invalid JSON. Retrying."); From d70bc29ab551e2f28b2cd66e79ba439562e2625f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 13:38:10 +0200 Subject: [PATCH 08/31] refactor: extract sandcastle into modular architecture MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Split main.ts (525 lines) into 6 self-contained modules: - types.ts: shared domain types (TaskSpec, Finding, LoopResult, FinalizeResult) - refinement-loop.ts: reusable implement↔critic loop engine - finalizer.ts: validation, rebase, PR creation - concurrency-pool.ts: semaphore utility - task-source.ts: TaskSource interface + GithubIssueSource - main.ts: 74-line thin orchestrator wiring all modules The refinement loop is now reusable by any task source (GitHub issues, CI failures, manual triggers) without coupling to the planner. --- .sandcastle/concurrency-pool.ts | 45 +++ .sandcastle/finalizer.ts | 164 ++++++++++ .sandcastle/main.ts | 545 +++----------------------------- .sandcastle/refinement-loop.ts | 193 +++++++++++ .sandcastle/task-source.ts | 190 +++++++++++ .sandcastle/types.ts | 63 ++++ 6 files changed, 703 insertions(+), 497 deletions(-) create mode 100644 .sandcastle/concurrency-pool.ts create mode 100644 .sandcastle/finalizer.ts create mode 100644 .sandcastle/refinement-loop.ts create mode 100644 .sandcastle/task-source.ts create mode 100644 .sandcastle/types.ts diff --git a/.sandcastle/concurrency-pool.ts b/.sandcastle/concurrency-pool.ts new file mode 100644 index 0000000..f2e6fee --- /dev/null +++ b/.sandcastle/concurrency-pool.ts @@ -0,0 +1,45 @@ +/** + * A concurrency limiter that restricts parallel execution to a maximum number of tasks. + */ +export class ConcurrencyPool { + private readonly queue: (() => void)[] = []; + private running = 0; + + /** + * @param max - Maximum number of concurrent tasks. + */ + constructor(private readonly max: number) {} + + /** + * Executes the given async function, waiting if the pool is at capacity. + * @param fn - Async function to execute within the pool. + * @returns The result of the function. + */ + async run(fn: () => Promise): Promise { + await this.acquire(); + try { + return await fn(); + } finally { + this.release(); + } + } + + private acquire(): Promise { + if (this.running < this.max) { + this.running++; + return Promise.resolve(); + } + return new Promise((resolve) => { + this.queue.push(resolve); + }); + } + + private release(): void { + this.running--; + const next = this.queue.shift(); + if (next) { + this.running++; + next(); + } + } +} diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts new file mode 100644 index 0000000..1ac076e --- /dev/null +++ b/.sandcastle/finalizer.ts @@ -0,0 +1,164 @@ +import * as sandcastle from "@ai-hero/sandcastle"; +import { execFileSync, execSync } from "node:child_process"; + +import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; + +const MAX_CRITIC_ROUNDS = 5; +const ITERATION_BUDGET = [100, 50, 25, 10, 10]; +const VALIDATION_COMMAND = + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; + +/** + * Finalizes a task after the refinement loop: validates, retries if needed, rebases, pushes, and creates a PR. + * @param spec - The task specification. + * @param loopResult - The result from the refinement loop. + * @param sandbox - The sandcastle sandbox instance. + * @param cwd - Working directory (worktree path). + * @returns Finalization result with PR and validation status. + */ +export async function finalizeTask( + spec: TaskSpec, + loopResult: LoopResult, + sandbox: SandboxInstance, + cwd: string, +): Promise { + let validationPassed = false; + + try { + execSync(VALIDATION_COMMAND, { cwd, stdio: "pipe" }); + validationPassed = true; + } catch (err: unknown) { + const stderr = + err instanceof Error && "stderr" in err + ? String((err as { stderr: unknown }).stderr).slice(0, 500) + : ""; + console.warn(` #${spec.id}: Validation failed.${stderr ? `\n${stderr}` : ""}`); + } + + if (!validationPassed && loopResult.roundsCompleted < MAX_CRITIC_ROUNDS) { + const retryBudget = ITERATION_BUDGET[MAX_CRITIC_ROUNDS - 1] ?? 10; + console.log( + ` #${spec.id}: Retrying one more implement round (budget: ${String(retryBudget)})`, + ); + + try { + await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: retryBudget, + name: `Implementer #${spec.id} retry`, + promptArgs: { + BRANCH: spec.branch, + FINDINGS: + loopResult.lastFindings.length > 0 + ? JSON.stringify(loopResult.lastFindings, null, 2) + : "", + ISSUE_BODY: spec.body, + ISSUE_TITLE: spec.title, + TASK_ID: spec.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + } catch (retryErr: unknown) { + const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr); + console.warn( + ` #${spec.id}: Implementer retry threw: ${retryMsg}. Falling through to PR creation.`, + ); + } + + try { + execSync(VALIDATION_COMMAND, { cwd, stdio: "pipe" }); + validationPassed = true; + console.log(` #${spec.id}: Validation passed after retry round.`); + } catch { + console.warn(` #${spec.id}: Validation still fails after retry. Will create draft PR.`); + } + } + + // Rebase on latest main + let rebaseSucceeded = false; + try { + execSync("git fetch origin main && git rebase origin/main", { + cwd, + stdio: "pipe", + }); + rebaseSucceeded = true; + if (validationPassed) { + try { + execSync("npm run type-check && npm run test", { + cwd, + stdio: "pipe", + }); + } catch { + validationPassed = false; + } + } + } catch { + try { + execSync("git rebase --abort", { cwd, stdio: "pipe" }); + } catch { + /* empty */ + } + try { + execSync("git push", { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`); + } + } + + if (rebaseSucceeded) { + try { + execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + console.warn( + ` #${spec.id}: git push --force-with-lease failed (branch un-pushed, PR creation will fail gracefully): ${pushMsg}`, + ); + } + } + + const converged = loopResult.status === "converged"; + const isDraft = !converged || !validationPassed; + const outstandingNote = + !converged && loopResult.lastFindings.length > 0 + ? `\n\n⚠️ Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + : ""; + const validationNote = !validationPassed + ? "\n\n⚠️ Validation did not pass. Manual review required." + : ""; + + const validationCheck = validationPassed ? "- [x]" : "- [ ]"; + const commitPrefix = spec.labels.includes("feature request") + ? "feat" + : spec.labels.includes("bug") + ? "fix" + : "chore"; + const prTitle = `${commitPrefix}: resolve #${spec.id} — ${spec.title}`; + const prBody = `## Description\n\nAutomated fix for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}`; + + const prArgs = [ + "pr", + "create", + ...(isDraft ? ["--draft"] : []), + "--head", + spec.branch, + "--base", + "main", + "--title", + prTitle, + "--body", + prBody, + ]; + + let prCreated = false; + try { + execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); + console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`); + prCreated = true; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` #${spec.id}: PR creation failed: ${msg}`); + } + + return { isDraft, prCreated, validationPassed }; +} diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index f5729cb..de9987d 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -1,521 +1,72 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; -import { execFileSync, execSync } from "node:child_process"; -import crypto from "node:crypto"; -import { z } from "zod"; + +import { ConcurrencyPool } from "./concurrency-pool.js"; +import { finalizeTask } from "./finalizer.js"; +import { runRefinementLoop } from "./refinement-loop.js"; +import { GithubIssueSource } from "./task-source.js"; const BRANCH_PREFIX = "agent/issue"; -const ESCAPED_PREFIX = BRANCH_PREFIX.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); -const BRANCH_PATTERN = new RegExp(`^${ESCAPED_PREFIX}-\\d+-[\\w-]+$`); const ISSUE_LABEL = "sandcastle"; -const MAX_PLANNER_RETRIES = 5; -const MAX_CRITIC_ROUNDS = 5; -const ITERATION_BUDGET = [100, 50, 25, 10, 10]; const MAX_PARALLEL = 3; const DOCKER_IMAGE = "sandcastle-sap-ai"; +const ITERATION_BUDGET = [100, 50, 25, 10, 10]; +const MAX_CRITIC_ROUNDS = 5; -const FindingSchema = z.object({ - category: z.enum(["security", "logic", "performance", "architecture", "style"]), - confidence: z.enum(["HIGH", "MEDIUM", "LOW"]), - description: z.string(), - file: z.string(), - line: z.number().optional(), - severity: z.enum(["CRITICAL", "HIGH", "MEDIUM", "LOW"]), - suggestion: z.string().optional(), - title: z.string(), -}); -const FindingsSchema = z.array(FindingSchema); -type Finding = z.infer; - -// --- Zod schema for GitHub issue list response (fix #6) --- - -const RawIssueSchema = z.object({ - body: z - .string() - .nullable() - .transform((b) => b ?? ""), - labels: z.array(z.object({ name: z.string() })), - number: z.number(), - title: z.string(), +const source = new GithubIssueSource({ + branchPrefix: BRANCH_PREFIX, + dockerImage: DOCKER_IMAGE, + label: ISSUE_LABEL, }); -const RawIssuesSchema = z.array(RawIssueSchema); - -// --- Type alias for sandbox instance --- - -type SandboxInstance = Awaited>; -/** - * @param sandbox - The sandcastle sandbox instance. - * @param cwd - Working directory (worktree path). - * @param issue - Issue metadata. - * @param issue.body - Sanitized issue body text. - * @param issue.branch - Git branch name for this issue. - * @param issue.id - GitHub issue number as string. - * @param issue.labels - Issue label names. - * @param issue.title - Issue title. - * @param converged - Whether the critic loop converged. - * @param lastFindings - Outstanding findings from the last round. - * @param round - The round at which the critic loop ended. - * @returns Whether work was completed (PR created). - */ -async function finalizeIssue( - sandbox: SandboxInstance, - cwd: string, - issue: { body: string; branch: string; id: string; labels: string[]; title: string }, - converged: boolean, - lastFindings: Finding[], - round: number, -): Promise { - let validationPassed = false; +const tasks = await source.discover(); - try { - execSync( - "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", - { cwd, stdio: "pipe" }, - ); - validationPassed = true; - } catch (err: unknown) { - const stderr = - err instanceof Error && "stderr" in err - ? String((err as { stderr: unknown }).stderr).slice(0, 500) - : ""; - console.warn(` #${issue.id}: Validation failed.${stderr ? `\n${stderr}` : ""}`); - } +if (tasks.length === 0) { + process.exit(0); +} - // --- Validation retry round (fix #7) --- - if (!validationPassed && round < MAX_CRITIC_ROUNDS) { - const retryBudget = ITERATION_BUDGET[MAX_CRITIC_ROUNDS - 1] ?? 10; - console.log( - ` #${issue.id}: Retrying one more implement round (budget: ${String(retryBudget)})`, - ); +const pool = new ConcurrencyPool(MAX_PARALLEL); - try { - await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: retryBudget, - name: `Implementer #${issue.id} retry`, - promptArgs: { - BRANCH: issue.branch, - FINDINGS: lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : "", - ISSUE_BODY: issue.body, - ISSUE_TITLE: issue.title, - TASK_ID: issue.id, +const settled = await Promise.allSettled( + tasks.map((spec) => + pool.run(async () => { + await using sandbox = await sandcastle.createSandbox({ + branch: spec.branch, + copyToWorktree: ["node_modules"], + hooks: { + sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, }, - promptFile: "./.sandcastle/implement-prompt.md", + sandbox: docker({ imageName: DOCKER_IMAGE }), }); - } catch (retryErr: unknown) { - const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr); - console.warn( - ` #${issue.id}: Implementer retry threw: ${retryMsg}. Falling through to PR creation.`, - ); - } - // Re-validate after retry - try { - execSync( - "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2", - { cwd, stdio: "pipe" }, - ); - validationPassed = true; - console.log(` #${issue.id}: Validation passed after retry round.`); - } catch { - console.warn(` #${issue.id}: Validation still fails after retry. Will create draft PR.`); - } - } + const loopResult = await runRefinementLoop(spec, sandbox, { + iterationBudget: ITERATION_BUDGET, + maxRounds: MAX_CRITIC_ROUNDS, + }); - // Rebase on latest main - let rebaseSucceeded = false; - try { - execSync("git fetch origin main && git rebase origin/main", { - cwd, - stdio: "pipe", - }); - rebaseSucceeded = true; - if (validationPassed) { - // Post-rebase smoke test - try { - execSync("npm run type-check && npm run test", { - cwd, - stdio: "pipe", - }); - } catch { - validationPassed = false; + let prCreated = false; + if (loopResult.totalCommits > 0) { + const cwd = sandbox.worktreePath; + const result = await finalizeTask(spec, loopResult, sandbox, cwd); + prCreated = result.prCreated; } - } - } catch { - // Rebase failed — abort and push un-rebased - try { - execSync("git rebase --abort", { cwd, stdio: "pipe" }); - } catch { - /* empty */ - } - try { - execSync("git push", { cwd, stdio: "pipe" }); - } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); - console.warn(` #${issue.id}: git push failed after rebase abort: ${pushMsg}`); - } - } - - if (rebaseSucceeded) { - try { - execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); - } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); - console.warn( - ` #${issue.id}: git push --force-with-lease failed (branch un-pushed, PR creation will fail gracefully): ${pushMsg}`, - ); - } - } - - // Create PR (fix #1: use execFileSync to avoid shell injection) - const isDraft = !converged || !validationPassed; - const outstandingNote = - !converged && lastFindings.length > 0 - ? `\n\n⚠️ Outstanding findings:\n${lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` - : ""; - const validationNote = !validationPassed - ? "\n\n⚠️ Validation did not pass. Manual review required." - : ""; - - const validationCheck = validationPassed ? "- [x]" : "- [ ]"; - const commitPrefix = issue.labels.includes("feature request") - ? "feat" - : issue.labels.includes("bug") - ? "fix" - : "chore"; - const prTitle = `${commitPrefix}: resolve #${issue.id} — ${issue.title}`; - const prBody = `## Description\n\nAutomated fix for #${issue.id}: ${issue.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${issue.id}${outstandingNote}${validationNote}`; - - const prArgs = [ - "pr", - "create", - ...(isDraft ? ["--draft"] : []), - "--head", - issue.branch, - "--base", - "main", - "--title", - prTitle, - "--body", - prBody, - ]; - - try { - execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); - console.log(` #${issue.id}: PR created${isDraft ? " (draft)" : ""}.`); - return true; - } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); - console.error(` #${issue.id}: PR creation failed: ${msg}`); - return false; - } -} - -/** - * @param f - Finding to compute dedup key for. - * @returns Composite key for deterministic deduplication. - */ -function findingKey(f: Finding): string { - const normalizedTitle = f.title - .toLowerCase() - .replace(/[^\w\s]/g, "") - .replace(/\s+/g, " ") - .trim(); - return `${f.file}::${f.category}::${normalizedTitle}`; -} - -/** - * @param stdout - Agent stdout to parse findings from. - * @param nonce - Unique tag identifier for this run. - * @returns Parsed findings array or null on parse failure. - */ -function parseFindings(stdout: string, nonce: string): Finding[] | null { - const tagPattern = new RegExp(`([\\s\\S]*?)<\\/findings-${nonce}>`, "g"); - const matches = [...stdout.matchAll(tagPattern)]; - if (matches.length === 0) { - return null; - } - const raw = matches.at(-1)?.[1]?.trim() ?? "[]"; - const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); - try { - return FindingsSchema.parse(JSON.parse(cleaned)); - } catch { - return null; - } -} - -/** - * @param text - Raw text to strip injection-prone tags from. - * @returns Sanitized text safe for prompt injection. - */ -function sanitizeForPrompt(text: string): string { - return text.replace(/<\/?(?:plan|findings[\w-]*|promise)[^>]*>/gi, ""); -} - -// --- Phase 1: Fetch and sanitize issues --- - -let rawIssuesJson: string; -try { - rawIssuesJson = execSync( - `gh issue list --state open --json number,title,labels,body --limit 50 --label "${ISSUE_LABEL}"`, - { encoding: "utf-8" }, - ); -} catch { - console.error("Failed to fetch issues. Ensure gh is installed and authenticated."); - process.exit(1); -} - -let rawIssues: z.infer; -try { - rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); -} catch { - console.error("Failed to parse issues JSON. Unexpected format from gh CLI."); - process.exit(1); -} - -const issuesJson = rawIssues.map((i) => ({ - body: sanitizeForPrompt(i.body), - labels: i.labels.map((l) => l.name), - number: i.number, - title: i.title, -})); -if (issuesJson.length === 0) { - console.log("No issues with label '%s'. Exiting.", ISSUE_LABEL); - process.exit(0); -} - -// --- Phase 2: Plan --- - -let workCompleted = false; - -for (let attempt = 1; attempt <= MAX_PLANNER_RETRIES; attempt++) { - console.log(`\n=== Planner attempt ${String(attempt)}/${String(MAX_PLANNER_RETRIES)} ===\n`); - - const plan = await sandcastle.run({ - agent: sandcastle.opencode("github-copilot/claude-opus-4.6"), - maxIterations: 1, - name: "Planner", - promptArgs: { - BRANCH_PREFIX, - ISSUES_JSON: JSON.stringify(issuesJson, null, 2), - }, - promptFile: "./.sandcastle/plan-prompt.md", - sandbox: docker({ imageName: DOCKER_IMAGE }), - }); - - const planMatches = [...plan.stdout.matchAll(/([\s\S]*?)<\/plan>/g)]; - const planMatch = planMatches.at(-1); - if (!planMatch) { - console.error("Planner did not produce a tag. Retrying."); - continue; - } - - const planContent = planMatch[1] ?? ""; - let issues: { body: string; branch: string; id: string; labels: string[]; title: string }[]; - try { - const parsed = JSON.parse(planContent) as { issues: unknown[] }; - if (!Array.isArray(parsed.issues)) { - console.error("Planner output missing issues array. Retrying."); - continue; - } - const validated = parsed.issues.filter( - (entry): entry is { body: string; branch: string; id: string; title: string } => { - if (typeof entry !== "object" || entry === null) return false; - const item = entry as Record; - if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) return false; - if (typeof item.branch !== "string" || !BRANCH_PATTERN.test(item.branch)) return false; - if (typeof item.title !== "string") return false; - return true; - }, - ); - // Attach sanitized body from our fetched data - issues = validated.map((v) => ({ - ...v, - body: issuesJson.find((i) => String(i.number) === v.id)?.body ?? "", - labels: issuesJson.find((i) => String(i.number) === v.id)?.labels ?? [], - })); - } catch { - console.error("Planner produced invalid JSON. Retrying."); - continue; - } - - if (issues.length === 0) { - console.log("No actionable issues. Exiting."); - workCompleted = true; - break; - } - - console.log(`Plan: ${String(issues.length)} issue(s) to work on:`); - for (const issue of issues) { - console.log(` #${issue.id}: ${issue.title} → ${issue.branch}`); - } - - // --- Phase 3: Implement ↔ Critic loop (parallel, max 3) --- - - let running = 0; - const queue: (() => void)[] = []; - const acquire = () => - running < MAX_PARALLEL - ? (running++, Promise.resolve()) - : new Promise((resolve) => queue.push(resolve)); - const release = () => { - running--; - const next = queue.shift(); - if (next) { - running++; - next(); - } - }; - - const settled = await Promise.allSettled( - issues.map(async (issue) => { - await acquire(); - try { - await using sandbox = await sandcastle.createSandbox({ - branch: issue.branch, - copyToWorktree: ["node_modules"], - hooks: { - sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, - }, - sandbox: docker({ imageName: DOCKER_IMAGE }), - }); - - const seenKeys = new Set(); - let lastFindings: Finding[] = []; - let converged = false; - let totalCommits = 0; - let lastRound = 0; - - for (let round = 1; round <= MAX_CRITIC_ROUNDS; round++) { - lastRound = round; - const budget = ITERATION_BUDGET[round - 1] ?? 10; - const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; - - console.log( - ` #${issue.id} round ${String(round)}/${String(MAX_CRITIC_ROUNDS)} (budget: ${String(budget)})`, - ); - - // Implementer - const impl = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: budget, - name: `Implementer #${issue.id} R${String(round)}`, - promptArgs: { - BRANCH: issue.branch, - FINDINGS: findingsArg, - ISSUE_BODY: issue.body, - ISSUE_TITLE: issue.title, - TASK_ID: issue.id, - }, - promptFile: "./.sandcastle/implement-prompt.md", - }); - - totalCommits += impl.commits.length; - - if (round === 1 && impl.commits.length === 0) { - console.warn(` #${issue.id}: 0 commits on round 1. Skipping.`); - break; - } - - // Critic - const nonce = crypto.randomBytes(4).toString("hex"); - - let critic = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: 1, - name: `Critic #${issue.id} R${String(round)}`, - promptArgs: { - BRANCH: issue.branch, - NONCE: nonce, - }, - promptFile: "./.sandcastle/critic-prompt.md", - }); - - let findings = parseFindings(critic.stdout, nonce); - - // Retry once on parse failure - if (findings === null) { - console.warn(` #${issue.id}: Critic parse failed. Retrying.`); - critic = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: 1, - name: `Critic #${issue.id} R${String(round)} retry`, - promptArgs: { - BRANCH: issue.branch, - NONCE: nonce, - }, - promptFile: "./.sandcastle/critic-prompt.md", - }); - findings = parseFindings(critic.stdout, nonce); - } - - if (findings === null) { - console.warn(` #${issue.id}: Critic failed twice. Breaking (non-converged).`); - break; - } - - // Dedup - const newFindings = findings.filter( - (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f)), - ); - for (const f of newFindings) { - seenKeys.add(findingKey(f)); - } - - console.log( - ` #${issue.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, - ); - - if (newFindings.length === 0) { - const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); - if (nonLowFindings.length > 0) { - lastFindings = nonLowFindings; - converged = false; - } else { - converged = true; - } - break; - } - - lastFindings = newFindings; - } - - // --- Final validation, rebase, and PR creation --- - if (totalCommits > 0) { - const cwd = sandbox.worktreePath; - const prCreated = await finalizeIssue( - sandbox, - cwd, - issue, - converged, - lastFindings, - lastRound, - ); - if (prCreated) { - workCompleted = true; - } - } - - return { converged, issue, totalCommits }; - } finally { - release(); - } + return { prCreated, spec }; }), - ); - - // Log failures - for (const [i, outcome] of settled.entries()) { - if (outcome.status === "rejected") { - const issue = issues[i]; - const reason: unknown = outcome.reason; - const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); - console.error(` ✗ #${issue?.id ?? String(i)} failed: ${msg}`); - } + ), +); + +const workCompleted = settled.some( + (outcome) => outcome.status === "fulfilled" && outcome.value.prCreated, +); + +for (const [i, outcome] of settled.entries()) { + if (outcome.status === "rejected") { + const spec = tasks[i]; + const reason: unknown = outcome.reason; + const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); + console.error(` ✗ #${spec?.id ?? String(i)} failed: ${msg}`); } - - break; // Plan executed — exit planner retry loop } console.log("\nAll done."); diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts new file mode 100644 index 0000000..ae66e9e --- /dev/null +++ b/.sandcastle/refinement-loop.ts @@ -0,0 +1,193 @@ +import * as sandcastle from "@ai-hero/sandcastle"; +import crypto from "node:crypto"; + +import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; + +import { FindingsSchema } from "./types.js"; + +const DEFAULT_MAX_ROUNDS = 5; +const DEFAULT_ITERATION_BUDGET = [100, 50, 25, 10, 10]; + +/** Options for configuring the refinement loop. */ +export interface RefinementLoopOptions { + /** Budget of iterations per round (array indexed by round - 1). */ + iterationBudget?: number[]; + /** Maximum number of implement↔critic rounds. */ + maxRounds?: number; + /** Optional callback invoked after each round completes. */ + onRoundComplete?: (round: number, findings: Finding[]) => void; +} + +/** + * Runs the implement↔critic refinement loop for a given task. + * @param spec - The task specification. + * @param sandbox - The sandcastle sandbox instance. + * @param opts - Optional configuration for rounds, budget, and callbacks. + * @returns The loop result with status, commits, findings, and rounds completed. + */ +export async function runRefinementLoop( + spec: TaskSpec, + sandbox: SandboxInstance, + opts?: RefinementLoopOptions, +): Promise { + const maxRounds = opts?.maxRounds ?? DEFAULT_MAX_ROUNDS; + const iterationBudget = opts?.iterationBudget ?? DEFAULT_ITERATION_BUDGET; + + const seenKeys = new Set(); + let lastFindings: Finding[] = []; + let status: LoopStatus = "exhausted"; + let totalCommits = 0; + let roundsCompleted = 0; + + for (let round = 1; round <= maxRounds; round++) { + roundsCompleted = round; + const budget = iterationBudget[round - 1] ?? 10; + const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; + + console.log( + ` #${spec.id} round ${String(round)}/${String(maxRounds)} (budget: ${String(budget)})`, + ); + + // Implementer + const impl = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: budget, + name: `Implementer #${spec.id} R${String(round)}`, + promptArgs: { + BRANCH: spec.branch, + FINDINGS: findingsArg, + ISSUE_BODY: spec.body, + ISSUE_TITLE: spec.title, + TASK_ID: spec.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + + totalCommits += impl.commits.length; + + if (round === 1 && impl.commits.length === 0) { + console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`); + status = "skipped"; + break; + } + + // Critic + const nonce = crypto.randomBytes(4).toString("hex"); + const findings = await runCritic(sandbox, spec, round, nonce); + + if (findings === null) { + console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); + status = "failed"; + break; + } + + // Dedup + const newFindings = findings.filter( + (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f)), + ); + for (const f of newFindings) { + seenKeys.add(findingKey(f)); + } + + console.log( + ` #${spec.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, + ); + + opts?.onRoundComplete?.(round, findings); + + if (newFindings.length === 0) { + const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); + if (nonLowFindings.length > 0) { + lastFindings = nonLowFindings; + status = "exhausted"; + } else { + status = "converged"; + } + break; + } + + lastFindings = newFindings; + } + + return { lastFindings, roundsCompleted, status, totalCommits }; +} + +/** + * Computes a deduplication key for a finding. + * @param f - Finding to compute a key for. + * @returns Composite dedup key. + */ +function findingKey(f: Finding): string { + const normalizedTitle = f.title + .toLowerCase() + .replace(/[^\w\s]/g, "") + .replace(/\s+/g, " ") + .trim(); + return `${f.file}::${f.category}::${normalizedTitle}`; +} + +/** + * Parses findings from agent stdout using nonce-tagged delimiters. + * @param stdout - Agent stdout to parse findings from. + * @param nonce - Unique tag identifier for this run. + * @returns Parsed findings array or null on parse failure. + */ +function parseFindings(stdout: string, nonce: string): Finding[] | null { + const tagPattern = new RegExp(`([\\s\\S]*?)<\\/findings-${nonce}>`, "g"); + const matches = [...stdout.matchAll(tagPattern)]; + if (matches.length === 0) { + return null; + } + const raw = matches.at(-1)?.[1]?.trim() ?? "[]"; + const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); + try { + return FindingsSchema.parse(JSON.parse(cleaned)); + } catch { + return null; + } +} + +/** + * Runs the critic agent, retrying once on parse failure. + * @param sandbox - The sandcastle sandbox instance. + * @param spec - The task specification. + * @param round - Current round number. + * @param nonce - Unique nonce for parsing. + * @returns Parsed findings or null if both attempts failed. + */ +async function runCritic( + sandbox: SandboxInstance, + spec: TaskSpec, + round: number, + nonce: string, +): Promise { + let critic = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: 1, + name: `Critic #${spec.id} R${String(round)}`, + promptArgs: { + BRANCH: spec.branch, + NONCE: nonce, + }, + promptFile: "./.sandcastle/critic-prompt.md", + }); + + let findings = parseFindings(critic.stdout, nonce); + + if (findings === null) { + console.warn(` #${spec.id}: Critic parse failed. Retrying.`); + critic = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: 1, + name: `Critic #${spec.id} R${String(round)} retry`, + promptArgs: { + BRANCH: spec.branch, + NONCE: nonce, + }, + promptFile: "./.sandcastle/critic-prompt.md", + }); + findings = parseFindings(critic.stdout, nonce); + } + + return findings; +} diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts new file mode 100644 index 0000000..e39e9c6 --- /dev/null +++ b/.sandcastle/task-source.ts @@ -0,0 +1,190 @@ +import * as sandcastle from "@ai-hero/sandcastle"; +import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; +import { execSync } from "node:child_process"; +import { z } from "zod"; + +import type { TaskSpec } from "./types.js"; + +const RawIssueSchema = z.object({ + body: z + .string() + .nullable() + .transform((b) => b ?? ""), + labels: z.array(z.object({ name: z.string() })), + number: z.number(), + title: z.string(), +}); +const RawIssuesSchema = z.array(RawIssueSchema); + +/** Configuration for the GitHub issue task source. */ +export interface GithubIssueSourceConfig { + /** Git branch prefix for issue branches. */ + branchPrefix: string; + /** Docker image name for the sandbox. */ + dockerImage: string; + /** GitHub issue label to filter by. */ + label: string; + /** Maximum planner retries. */ + maxRetries?: number; +} + +/** Interface for task discovery sources. */ +export interface TaskSource { + /** Discovers tasks to work on. */ + discover(): Promise; +} + +/** + * Task source that discovers work from GitHub issues via planner agent. + */ +export class GithubIssueSource implements TaskSource { + private readonly branchPattern: RegExp; + private readonly branchPrefix: string; + private readonly dockerImage: string; + private readonly label: string; + private readonly maxRetries: number; + + /** + * @param config - Configuration for the GitHub issue source. + */ + constructor(config: GithubIssueSourceConfig) { + this.branchPrefix = config.branchPrefix; + this.dockerImage = config.dockerImage; + this.label = config.label; + this.maxRetries = config.maxRetries ?? 5; + + const escapedPrefix = this.branchPrefix.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); + this.branchPattern = new RegExp(`^${escapedPrefix}-\\d+-[\\w-]+$`); + } + + /** + * Discovers tasks by fetching GitHub issues, running the planner, and validating the plan. + * @returns Array of task specifications to implement. + */ + async discover(): Promise { + const issuesJson = this.fetchAndSanitizeIssues(); + + if (issuesJson.length === 0) { + console.log("No issues with label '%s'. Exiting.", this.label); + return []; + } + + for (let attempt = 1; attempt <= this.maxRetries; attempt++) { + console.log(`\n=== Planner attempt ${String(attempt)}/${String(this.maxRetries)} ===\n`); + + const plan = await sandcastle.run({ + agent: sandcastle.opencode("github-copilot/claude-opus-4.6"), + maxIterations: 1, + name: "Planner", + promptArgs: { + BRANCH_PREFIX: this.branchPrefix, + ISSUES_JSON: JSON.stringify(issuesJson, null, 2), + }, + promptFile: "./.sandcastle/plan-prompt.md", + sandbox: docker({ imageName: this.dockerImage }), + }); + + const planMatches = [...plan.stdout.matchAll(/([\s\S]*?)<\/plan>/g)]; + const planMatch = planMatches.at(-1); + if (!planMatch) { + console.error("Planner did not produce a tag. Retrying."); + continue; + } + + const planContent = planMatch[1] ?? ""; + const tasks = this.validatePlan(planContent, issuesJson); + if (tasks === null) { + continue; + } + + if (tasks.length === 0) { + console.log("No actionable issues. Exiting."); + return []; + } + + console.log(`Plan: ${String(tasks.length)} issue(s) to work on:`); + for (const task of tasks) { + console.log(` #${task.id}: ${task.title} → ${task.branch}`); + } + + return tasks; + } + + return []; + } + + private fetchAndSanitizeIssues(): { + body: string; + labels: string[]; + number: number; + title: string; + }[] { + let rawIssuesJson: string; + try { + rawIssuesJson = execSync( + `gh issue list --state open --json number,title,labels,body --limit 50 --label "${this.label}"`, + { encoding: "utf-8" }, + ); + } catch { + console.error("Failed to fetch issues. Ensure gh is installed and authenticated."); + process.exit(1); + } + + let rawIssues: z.infer; + try { + rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); + } catch { + console.error("Failed to parse issues JSON. Unexpected format from gh CLI."); + process.exit(1); + } + + return rawIssues.map((i) => ({ + body: sanitizeForPrompt(i.body), + labels: i.labels.map((l) => l.name), + number: i.number, + title: i.title, + })); + } + + private validatePlan( + planContent: string, + issuesJson: { body: string; labels: string[]; number: number; title: string }[], + ): null | TaskSpec[] { + try { + const parsed = JSON.parse(planContent) as { issues: unknown[] }; + if (!Array.isArray(parsed.issues)) { + console.error("Planner output missing issues array. Retrying."); + return null; + } + const validated = parsed.issues.filter( + (entry): entry is { body: string; branch: string; id: string; title: string } => { + if (typeof entry !== "object" || entry === null) return false; + const item = entry as Record; + if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) return false; + if (typeof item.branch !== "string" || !this.branchPattern.test(item.branch)) + return false; + if (typeof item.title !== "string") return false; + return true; + }, + ); + + return validated.map((v) => ({ + ...v, + body: issuesJson.find((i) => String(i.number) === v.id)?.body ?? "", + labels: issuesJson.find((i) => String(i.number) === v.id)?.labels ?? [], + })); + } catch { + console.error("Planner produced invalid JSON. Retrying."); + return null; + } + } +} + +/** + * Strips injection-prone tags from text. + * @param text - Raw text to sanitize. + * @returns Sanitized text safe for prompt injection. + */ +function sanitizeForPrompt(text: string): string { + return text.replace(/<\/?(?:plan|findings[\w-]*|promise)[^>]*>/gi, ""); +} diff --git a/.sandcastle/types.ts b/.sandcastle/types.ts new file mode 100644 index 0000000..14d1b38 --- /dev/null +++ b/.sandcastle/types.ts @@ -0,0 +1,63 @@ +import type * as sandcastle from "@ai-hero/sandcastle"; + +import { z } from "zod"; + +/** Zod schema for a single critic finding. */ +export const FindingSchema = z.object({ + category: z.enum(["security", "logic", "performance", "architecture", "style"]), + confidence: z.enum(["HIGH", "MEDIUM", "LOW"]), + description: z.string(), + file: z.string(), + line: z.number().optional(), + severity: z.enum(["CRITICAL", "HIGH", "MEDIUM", "LOW"]), + suggestion: z.string().optional(), + title: z.string(), +}); + +/** Zod schema for an array of critic findings. */ +export const FindingsSchema = z.array(FindingSchema); + +/** Result of post-loop finalization. */ +export interface FinalizeResult { + /** Whether the PR was marked as draft. */ + isDraft: boolean; + /** Whether a PR was successfully created. */ + prCreated: boolean; + /** Whether validation passed. */ + validationPassed: boolean; +} + +/** A single critic finding parsed from agent output. */ +export type Finding = z.infer; + +/** Result returned by the refinement loop. */ +export interface LoopResult { + /** Outstanding findings from the last round. */ + lastFindings: Finding[]; + /** Number of rounds completed. */ + roundsCompleted: number; + /** Termination status. */ + status: LoopStatus; + /** Total commits produced across all rounds. */ + totalCommits: number; +} + +/** Outcome status of the refinement loop. */ +export type LoopStatus = "converged" | "exhausted" | "failed" | "skipped"; + +/** Type alias for a sandcastle sandbox instance. */ +export type SandboxInstance = Awaited>; + +/** Specification for a task to be implemented. */ +export interface TaskSpec { + /** Sanitized issue body text. */ + body: string; + /** Git branch name for this task. */ + branch: string; + /** Task identifier (e.g. GitHub issue number as string). */ + id: string; + /** Label names associated with the task. */ + labels: string[]; + /** Task title. */ + title: string; +} From 3215748573a25fe5c36acf7a2184d017e98c882f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 14:00:47 +0200 Subject: [PATCH 09/31] fix: centralize constants, fix PR type-of-change, sanitize titles --- .sandcastle/finalizer.ts | 12 +++++++++--- .sandcastle/main.ts | 3 +-- .sandcastle/refinement-loop.ts | 11 ++++------- .sandcastle/task-source.ts | 2 +- .sandcastle/types.ts | 6 ++++++ 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 1ac076e..a2e9db7 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -3,8 +3,8 @@ import { execFileSync, execSync } from "node:child_process"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; -const MAX_CRITIC_ROUNDS = 5; -const ITERATION_BUDGET = [100, 50, 25, 10, 10]; +import { ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; + const VALIDATION_COMMAND = "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; @@ -134,7 +134,13 @@ export async function finalizeTask( ? "fix" : "chore"; const prTitle = `${commitPrefix}: resolve #${spec.id} — ${spec.title}`; - const prBody = `## Description\n\nAutomated fix for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] Bug fix (non-breaking change that fixes an issue)\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}`; + const typeOfChange = + commitPrefix === "feat" + ? "New feature (non-breaking change that adds functionality)" + : commitPrefix === "fix" + ? "Bug fix (non-breaking change that fixes an issue)" + : "Refactoring (no functional changes)"; + const prBody = `## Description\n\nAutomated ${commitPrefix} for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] ${typeOfChange}\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}`; const prArgs = [ "pr", diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index de9987d..0b916fd 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -5,13 +5,12 @@ import { ConcurrencyPool } from "./concurrency-pool.js"; import { finalizeTask } from "./finalizer.js"; import { runRefinementLoop } from "./refinement-loop.js"; import { GithubIssueSource } from "./task-source.js"; +import { ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; const BRANCH_PREFIX = "agent/issue"; const ISSUE_LABEL = "sandcastle"; const MAX_PARALLEL = 3; const DOCKER_IMAGE = "sandcastle-sap-ai"; -const ITERATION_BUDGET = [100, 50, 25, 10, 10]; -const MAX_CRITIC_ROUNDS = 5; const source = new GithubIssueSource({ branchPrefix: BRANCH_PREFIX, diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index ae66e9e..61c8b54 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -3,15 +3,12 @@ import crypto from "node:crypto"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; -import { FindingsSchema } from "./types.js"; - -const DEFAULT_MAX_ROUNDS = 5; -const DEFAULT_ITERATION_BUDGET = [100, 50, 25, 10, 10]; +import { FindingsSchema, ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; /** Options for configuring the refinement loop. */ export interface RefinementLoopOptions { /** Budget of iterations per round (array indexed by round - 1). */ - iterationBudget?: number[]; + iterationBudget?: readonly number[]; /** Maximum number of implement↔critic rounds. */ maxRounds?: number; /** Optional callback invoked after each round completes. */ @@ -30,8 +27,8 @@ export async function runRefinementLoop( sandbox: SandboxInstance, opts?: RefinementLoopOptions, ): Promise { - const maxRounds = opts?.maxRounds ?? DEFAULT_MAX_ROUNDS; - const iterationBudget = opts?.iterationBudget ?? DEFAULT_ITERATION_BUDGET; + const maxRounds = opts?.maxRounds ?? MAX_CRITIC_ROUNDS; + const iterationBudget = opts?.iterationBudget ?? ITERATION_BUDGET; const seenKeys = new Set(); let lastFindings: Finding[] = []; diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index e39e9c6..347e120 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -142,7 +142,7 @@ export class GithubIssueSource implements TaskSource { body: sanitizeForPrompt(i.body), labels: i.labels.map((l) => l.name), number: i.number, - title: i.title, + title: sanitizeForPrompt(i.title), })); } diff --git a/.sandcastle/types.ts b/.sandcastle/types.ts index 14d1b38..849df1f 100644 --- a/.sandcastle/types.ts +++ b/.sandcastle/types.ts @@ -48,6 +48,12 @@ export type LoopStatus = "converged" | "exhausted" | "failed" | "skipped"; /** Type alias for a sandcastle sandbox instance. */ export type SandboxInstance = Awaited>; +/** Maximum implement↔critic rounds before giving up. */ +export const MAX_CRITIC_ROUNDS = 5; + +/** Token budget per round (decreasing). Index = round - 1. */ +export const ITERATION_BUDGET = [100, 50, 25, 10, 10] as const; + /** Specification for a task to be implemented. */ export interface TaskSpec { /** Sanitized issue body text. */ From 25e85de3173a2f56a04de4db00ca0f97fcb94e35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 14:06:55 +0200 Subject: [PATCH 10/31] fix: full validation post-rebase, execFileSync for gh issue list --- .sandcastle/finalizer.ts | 2 +- .sandcastle/task-source.ts | 18 +++++++++++++++--- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index a2e9db7..97efd38 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -84,7 +84,7 @@ export async function finalizeTask( rebaseSucceeded = true; if (validationPassed) { try { - execSync("npm run type-check && npm run test", { + execSync(VALIDATION_COMMAND, { cwd, stdio: "pipe", }); diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 347e120..28e9e49 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -1,6 +1,6 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; -import { execSync } from "node:child_process"; +import { execFileSync } from "node:child_process"; import { z } from "zod"; import type { TaskSpec } from "./types.js"; @@ -121,8 +121,20 @@ export class GithubIssueSource implements TaskSource { }[] { let rawIssuesJson: string; try { - rawIssuesJson = execSync( - `gh issue list --state open --json number,title,labels,body --limit 50 --label "${this.label}"`, + rawIssuesJson = execFileSync( + "gh", + [ + "issue", + "list", + "--state", + "open", + "--json", + "number,title,labels,body", + "--limit", + "50", + "--label", + this.label, + ], { encoding: "utf-8" }, ); } catch { From 4f83a6c72d140446004db6a4321e62988f3d9fd3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 14:12:40 +0200 Subject: [PATCH 11/31] perf: skip critic when implementer produces 0 commits on round 2+ --- .sandcastle/refinement-loop.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 61c8b54..d3c3aaf 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -68,6 +68,11 @@ export async function runRefinementLoop( break; } + if (round > 1 && impl.commits.length === 0) { + status = "exhausted"; + break; + } + // Critic const nonce = crypto.randomBytes(4).toString("hex"); const findings = await runCritic(sandbox, spec, round, nonce); From 301c3918ad05e680690453c2a90ee7d9c92752ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 14:44:57 +0200 Subject: [PATCH 12/31] fix: remove unsound type guard, filter unknown plan IDs, guard ConcurrencyPool, warn on planner exhaustion --- .sandcastle/concurrency-pool.ts | 8 ++++++-- .sandcastle/task-source.ts | 19 +++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/.sandcastle/concurrency-pool.ts b/.sandcastle/concurrency-pool.ts index f2e6fee..5809e53 100644 --- a/.sandcastle/concurrency-pool.ts +++ b/.sandcastle/concurrency-pool.ts @@ -6,9 +6,13 @@ export class ConcurrencyPool { private running = 0; /** - * @param max - Maximum number of concurrent tasks. + * @param max - Maximum number of concurrent tasks. Must be >= 1. */ - constructor(private readonly max: number) {} + constructor(private readonly max: number) { + if (max < 1) { + throw new RangeError("ConcurrencyPool max must be >= 1"); + } + } /** * Executes the given async function, waiting if the pool is at capacity. diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 28e9e49..8e67b8b 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -110,6 +110,7 @@ export class GithubIssueSource implements TaskSource { return tasks; } + console.warn("Planner failed to produce a valid plan after all retries."); return []; } @@ -169,7 +170,7 @@ export class GithubIssueSource implements TaskSource { return null; } const validated = parsed.issues.filter( - (entry): entry is { body: string; branch: string; id: string; title: string } => { + (entry): entry is { branch: string; id: string; title: string } => { if (typeof entry !== "object" || entry === null) return false; const item = entry as Record; if (typeof item.id !== "string" || !/^\d+$/.test(item.id)) return false; @@ -180,11 +181,17 @@ export class GithubIssueSource implements TaskSource { }, ); - return validated.map((v) => ({ - ...v, - body: issuesJson.find((i) => String(i.number) === v.id)?.body ?? "", - labels: issuesJson.find((i) => String(i.number) === v.id)?.labels ?? [], - })); + return validated + .map((v) => { + const source = issuesJson.find((i) => String(i.number) === v.id); + if (!source) return null; + return { + ...v, + body: source.body, + labels: source.labels, + }; + }) + .filter((v): v is NonNullable => v !== null); } catch { console.error("Planner produced invalid JSON. Retrying."); return null; From 8e05a151711dd56917398e98b06ed3126fc4ed37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 15:34:52 +0200 Subject: [PATCH 13/31] feat: state-of-the-art algorithmic improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Budget: flat constant 50/round (ARCS/SWE-Agent pattern, replaces suboptimal decreasing schedule) - Dedup: context-hash fingerprint of ±3 lines (CodeQL/Qodana pattern, drift-safe) - Quality ratchet: rollback round on regression (findings increase → git reset --hard) - Log: post-rebase validation failure now logs stderr - Safety: rescue branch on push failure (preserves commits before sandbox disposal) --- .sandcastle/finalizer.ts | 27 +++++++--- .sandcastle/main.ts | 4 +- .sandcastle/refinement-loop.ts | 95 ++++++++++++++++++++++++++++------ .sandcastle/types.ts | 4 +- 4 files changed, 104 insertions(+), 26 deletions(-) diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 97efd38..60527b9 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -3,7 +3,7 @@ import { execFileSync, execSync } from "node:child_process"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; -import { ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; +import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS } from "./types.js"; const VALIDATION_COMMAND = "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; @@ -36,7 +36,7 @@ export async function finalizeTask( } if (!validationPassed && loopResult.roundsCompleted < MAX_CRITIC_ROUNDS) { - const retryBudget = ITERATION_BUDGET[MAX_CRITIC_ROUNDS - 1] ?? 10; + const retryBudget = ITERATION_BUDGET_PER_ROUND; console.log( ` #${spec.id}: Retrying one more implement round (budget: ${String(retryBudget)})`, ); @@ -88,7 +88,14 @@ export async function finalizeTask( cwd, stdio: "pipe", }); - } catch { + } catch (postRebaseErr: unknown) { + const postRebaseStderr = + postRebaseErr instanceof Error && "stderr" in postRebaseErr + ? String((postRebaseErr as { stderr: unknown }).stderr).slice(0, 500) + : ""; + console.warn( + ` #${spec.id}: Post-rebase validation failed.${postRebaseStderr ? `\n${postRebaseStderr}` : ""}`, + ); validationPassed = false; } } @@ -111,9 +118,17 @@ export async function finalizeTask( execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); } catch (pushErr: unknown) { const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); - console.warn( - ` #${spec.id}: git push --force-with-lease failed (branch un-pushed, PR creation will fail gracefully): ${pushMsg}`, - ); + try { + execSync(`git push origin HEAD:refs/heads/rescue/${spec.branch}-${String(Date.now())}`, { + cwd, + stdio: "pipe", + }); + console.warn(` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-...`); + } catch { + console.error( + ` #${spec.id}: Push failed and rescue failed. Commits will be lost on sandbox disposal: ${pushMsg}`, + ); + } } } diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 0b916fd..988d832 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -5,7 +5,7 @@ import { ConcurrencyPool } from "./concurrency-pool.js"; import { finalizeTask } from "./finalizer.js"; import { runRefinementLoop } from "./refinement-loop.js"; import { GithubIssueSource } from "./task-source.js"; -import { ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; +import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS } from "./types.js"; const BRANCH_PREFIX = "agent/issue"; const ISSUE_LABEL = "sandcastle"; @@ -39,7 +39,7 @@ const settled = await Promise.allSettled( }); const loopResult = await runRefinementLoop(spec, sandbox, { - iterationBudget: ITERATION_BUDGET, + iterationBudget: ITERATION_BUDGET_PER_ROUND, maxRounds: MAX_CRITIC_ROUNDS, }); diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index d3c3aaf..8b08221 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -1,14 +1,15 @@ import * as sandcastle from "@ai-hero/sandcastle"; +import { execSync } from "node:child_process"; import crypto from "node:crypto"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; -import { FindingsSchema, ITERATION_BUDGET, MAX_CRITIC_ROUNDS } from "./types.js"; +import { FindingsSchema, ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS } from "./types.js"; /** Options for configuring the refinement loop. */ export interface RefinementLoopOptions { - /** Budget of iterations per round (array indexed by round - 1). */ - iterationBudget?: readonly number[]; + /** Budget of iterations per round (flat constant applied to every round). */ + iterationBudget?: number; /** Maximum number of implement↔critic rounds. */ maxRounds?: number; /** Optional callback invoked after each round completes. */ @@ -28,23 +29,35 @@ export async function runRefinementLoop( opts?: RefinementLoopOptions, ): Promise { const maxRounds = opts?.maxRounds ?? MAX_CRITIC_ROUNDS; - const iterationBudget = opts?.iterationBudget ?? ITERATION_BUDGET; + const budget = opts?.iterationBudget ?? ITERATION_BUDGET_PER_ROUND; const seenKeys = new Set(); let lastFindings: Finding[] = []; let status: LoopStatus = "exhausted"; let totalCommits = 0; let roundsCompleted = 0; + let previousFindingsCount = Infinity; for (let round = 1; round <= maxRounds; round++) { roundsCompleted = round; - const budget = iterationBudget[round - 1] ?? 10; const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; console.log( ` #${spec.id} round ${String(round)}/${String(maxRounds)} (budget: ${String(budget)})`, ); + // Capture SHA before implementer runs (for quality ratchet rollback) + let beforeRoundSha = ""; + try { + beforeRoundSha = execSync("git rev-parse HEAD", { + cwd: sandbox.worktreePath, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + /* empty */ + } + // Implementer const impl = await sandbox.run({ agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), @@ -83,18 +96,37 @@ export async function runRefinementLoop( break; } - // Dedup + // Dedup via context hash + const cwd = sandbox.worktreePath; const newFindings = findings.filter( - (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f)), + (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd)), ); for (const f of newFindings) { - seenKeys.add(findingKey(f)); + seenKeys.add(findingKey(f, cwd)); } console.log( ` #${spec.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, ); + // Quality ratchet: rollback if findings increased (regression) + if (round > 1 && findings.length > previousFindingsCount) { + try { + execSync(`git reset --hard ${beforeRoundSha}`, { + cwd: sandbox.worktreePath, + stdio: "pipe", + }); + console.warn( + ` #${spec.id} R${String(round)}: Regression detected (${String(previousFindingsCount)} → ${String(findings.length)}). Rolled back.`, + ); + } catch { + /* empty */ + } + status = "exhausted"; + break; + } + previousFindingsCount = findings.length; + opts?.onRoundComplete?.(round, findings); if (newFindings.length === 0) { @@ -115,17 +147,48 @@ export async function runRefinementLoop( } /** - * Computes a deduplication key for a finding. + * Computes a deduplication key for a finding using a context hash of surrounding lines. * @param f - Finding to compute a key for. + * @param cwd - Working directory (worktree path) for reading file context. * @returns Composite dedup key. */ -function findingKey(f: Finding): string { - const normalizedTitle = f.title - .toLowerCase() - .replace(/[^\w\s]/g, "") - .replace(/\s+/g, " ") - .trim(); - return `${f.file}::${f.category}::${normalizedTitle}`; +function findingKey(f: Finding, cwd: string): string { + if (!f.file || f.line == null) { + const truncTitle = f.title + .toLowerCase() + .slice(0, 50) + .replace(/[^\w\s]/g, "") + .replace(/\s+/g, " ") + .trim(); + return `${f.file || "global"}::${f.category}::${truncTitle}`; + } + const contextHash = hashContextLines(cwd, f.file, f.line, 3); + return `${f.file}::${f.category}::${contextHash}`; +} + +/** + * Hashes ±radius lines around the given line in a file for dedup stability. + * @param cwd - Working directory. + * @param file - Relative file path. + * @param line - Line number of the finding. + * @param radius - Number of context lines above and below. + * @returns Truncated SHA-256 hex digest. + */ +function hashContextLines(cwd: string, file: string, line: number, radius: number): string { + try { + const start = Math.max(1, line - radius); + const end = line + radius; + const content = execSync(`sed -n '${String(start)},${String(end)}p' "${file}"`, { + cwd, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }); + const normalized = content.replace(/\s+/g, " ").trim(); + return crypto.createHash("sha256").update(normalized).digest("hex").slice(0, 16); + } catch { + const truncTitle = file.slice(0, 30) + String(line); + return crypto.createHash("sha256").update(truncTitle).digest("hex").slice(0, 16); + } } /** diff --git a/.sandcastle/types.ts b/.sandcastle/types.ts index 849df1f..efea8f8 100644 --- a/.sandcastle/types.ts +++ b/.sandcastle/types.ts @@ -51,8 +51,8 @@ export type SandboxInstance = Awaited Date: Tue, 5 May 2026 16:08:35 +0200 Subject: [PATCH 14/31] fix(sandcastle): resolve all algorithmic audit findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace execSync+sed shell injection with readFileSync+join in hashContextLines (RCE, P0) - Fix quality ratchet to compare nonLowFindings.length not findings.length (logic bug, P1) - Replace title truncation with SHA-256 hash in findingKey (collision fix, P2) - Hash single target line + file salt instead of positional context window (line-drift fix, P2) - Replace overlapping regex quantifiers in sanitizeForPrompt (ReDoS O(n²), P2) - Replace Array FIFO queue with O(1) singly-linked list in ConcurrencyPool (P3) - Move totalCommits accumulation after ratchet check to fix phantom counting (P3) - Replace O(T×I) .find() with O(1) Map lookup in validatePlan (P3) --- .sandcastle/concurrency-pool.ts | 29 +++++++++++++++++++---- .sandcastle/refinement-loop.ts | 41 ++++++++++++++++++--------------- .sandcastle/task-source.ts | 25 ++++++++++---------- 3 files changed, 59 insertions(+), 36 deletions(-) diff --git a/.sandcastle/concurrency-pool.ts b/.sandcastle/concurrency-pool.ts index 5809e53..7ce4be2 100644 --- a/.sandcastle/concurrency-pool.ts +++ b/.sandcastle/concurrency-pool.ts @@ -1,8 +1,16 @@ +/** Internal node for the O(1) FIFO waiting queue. Not exported. */ +interface QueueNode { + resolve: () => void; + next: QueueNode | null; +} + /** * A concurrency limiter that restricts parallel execution to a maximum number of tasks. + * Queue operations are O(1) amortized (singly-linked list). */ export class ConcurrencyPool { - private readonly queue: (() => void)[] = []; + private head: QueueNode | null = null; + private tail: QueueNode | null = null; private running = 0; /** @@ -34,16 +42,27 @@ export class ConcurrencyPool { return Promise.resolve(); } return new Promise((resolve) => { - this.queue.push(resolve); + const node: QueueNode = { resolve, next: null }; + if (this.tail === null) { + this.head = node; + this.tail = node; + } else { + this.tail.next = node; + this.tail = node; + } }); } private release(): void { this.running--; - const next = this.queue.shift(); - if (next) { + const next = this.head; + if (next !== null) { + this.head = next.next; + if (this.head === null) { + this.tail = null; + } this.running++; - next(); + next.resolve(); } } } diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 8b08221..918170f 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -1,6 +1,8 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { execSync } from "node:child_process"; import crypto from "node:crypto"; +import { readFileSync } from "node:fs"; +import { join } from "node:path"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; @@ -73,8 +75,6 @@ export async function runRefinementLoop( promptFile: "./.sandcastle/implement-prompt.md", }); - totalCommits += impl.commits.length; - if (round === 1 && impl.commits.length === 0) { console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`); status = "skipped"; @@ -110,7 +110,8 @@ export async function runRefinementLoop( ); // Quality ratchet: rollback if findings increased (regression) - if (round > 1 && findings.length > previousFindingsCount) { + const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); + if (round > 1 && nonLowFindings.length > previousFindingsCount) { try { execSync(`git reset --hard ${beforeRoundSha}`, { cwd: sandbox.worktreePath, @@ -125,7 +126,8 @@ export async function runRefinementLoop( status = "exhausted"; break; } - previousFindingsCount = findings.length; + totalCommits += impl.commits.length; + previousFindingsCount = nonLowFindings.length; opts?.onRoundComplete?.(round, findings); @@ -154,37 +156,38 @@ export async function runRefinementLoop( */ function findingKey(f: Finding, cwd: string): string { if (!f.file || f.line == null) { - const truncTitle = f.title + const normalizedTitle = f.title .toLowerCase() - .slice(0, 50) .replace(/[^\w\s]/g, "") .replace(/\s+/g, " ") .trim(); - return `${f.file || "global"}::${f.category}::${truncTitle}`; + const titleHash = crypto.createHash("sha256").update(normalizedTitle).digest("hex").slice(0, 16); + return `${f.file || "global"}::${f.category}::${titleHash}`; } const contextHash = hashContextLines(cwd, f.file, f.line, 3); return `${f.file}::${f.category}::${contextHash}`; } /** - * Hashes ±radius lines around the given line in a file for dedup stability. + * Hashes the target line content for dedup stability. * @param cwd - Working directory. * @param file - Relative file path. * @param line - Line number of the finding. - * @param radius - Number of context lines above and below. + * @param _radius - Unused; kept for call-site compatibility. * @returns Truncated SHA-256 hex digest. */ -function hashContextLines(cwd: string, file: string, line: number, radius: number): string { +function hashContextLines(cwd: string, file: string, line: number, _radius: number): string { try { - const start = Math.max(1, line - radius); - const end = line + radius; - const content = execSync(`sed -n '${String(start)},${String(end)}p' "${file}"`, { - cwd, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }); - const normalized = content.replace(/\s+/g, " ").trim(); - return crypto.createHash("sha256").update(normalized).digest("hex").slice(0, 16); + const fullPath = join(cwd, file); + if (!fullPath.startsWith(cwd)) { + throw new Error("Path traversal"); + } + const raw = readFileSync(fullPath, "utf-8"); + const lines = raw.split("\n"); + const targetLine = lines[Math.max(0, line - 1)] ?? ""; + const normalized = targetLine.replace(/\s+/g, " ").trim(); + // Include file path as salt to prevent cross-file hash collisions + return crypto.createHash("sha256").update(`${file}:${normalized}`).digest("hex").slice(0, 16); } catch { const truncTitle = file.slice(0, 30) + String(line); return crypto.createHash("sha256").update(truncTitle).digest("hex").slice(0, 16); diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 8e67b8b..42d27bb 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -181,17 +181,18 @@ export class GithubIssueSource implements TaskSource { }, ); - return validated - .map((v) => { - const source = issuesJson.find((i) => String(i.number) === v.id); - if (!source) return null; - return { - ...v, - body: source.body, - labels: source.labels, - }; - }) - .filter((v): v is NonNullable => v !== null); + const issueMap = new Map(issuesJson.map((i) => [String(i.number), i])); + return validated + .map((v) => { + const source = issueMap.get(v.id); + if (!source) return null; + return { + ...v, + body: source.body, + labels: source.labels, + }; + }) + .filter((v): v is NonNullable => v !== null); } catch { console.error("Planner produced invalid JSON. Retrying."); return null; @@ -205,5 +206,5 @@ export class GithubIssueSource implements TaskSource { * @returns Sanitized text safe for prompt injection. */ function sanitizeForPrompt(text: string): string { - return text.replace(/<\/?(?:plan|findings[\w-]*|promise)[^>]*>/gi, ""); + return text.replace(/<\/?(?:plan|findings|promise)[^>]*>/gi, ""); } From badfc9231d089499d8258666fc7f85a9c6d8d3ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 16:34:48 +0200 Subject: [PATCH 15/31] fix(sandcastle): harden subprocess calls and reduce cyclomatic complexity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace all execSync template literals with execFileSync (array args) to eliminate shell injection vectors from LLM-controlled data - Strengthen path traversal guard: resolve() + sep instead of join() - Validate SHA format (/^[0-9a-f]{40}$/) before git reset --hard - Decompose finalizeTask (CC 15→5) into runValidation, attemptRebase, pushBranch, buildPrArgs, extractStderr - Decompose runRefinementLoop (CC 18→8) into executeRound, deduplicateFindings, checkQualityRatchet - Replace process.exit(0) with natural script termination - Add title validation (length ≤200, no control chars) in validatePlan - Add console.warn in previously silent catch blocks --- .sandcastle/concurrency-pool.ts | 8 +- .sandcastle/finalizer.ts | 201 +++++++++++++++++++----------- .sandcastle/main.ts | 84 ++++++------- .sandcastle/refinement-loop.ts | 213 ++++++++++++++++++++++---------- .sandcastle/task-source.ts | 28 +++-- 5 files changed, 340 insertions(+), 194 deletions(-) diff --git a/.sandcastle/concurrency-pool.ts b/.sandcastle/concurrency-pool.ts index 7ce4be2..e023a43 100644 --- a/.sandcastle/concurrency-pool.ts +++ b/.sandcastle/concurrency-pool.ts @@ -1,7 +1,7 @@ /** Internal node for the O(1) FIFO waiting queue. Not exported. */ interface QueueNode { + next: null | QueueNode; resolve: () => void; - next: QueueNode | null; } /** @@ -9,9 +9,9 @@ interface QueueNode { * Queue operations are O(1) amortized (singly-linked list). */ export class ConcurrencyPool { - private head: QueueNode | null = null; - private tail: QueueNode | null = null; + private head: null | QueueNode = null; private running = 0; + private tail: null | QueueNode = null; /** * @param max - Maximum number of concurrent tasks. Must be >= 1. @@ -42,7 +42,7 @@ export class ConcurrencyPool { return Promise.resolve(); } return new Promise((resolve) => { - const node: QueueNode = { resolve, next: null }; + const node: QueueNode = { next: null, resolve }; if (this.tail === null) { this.head = node; this.tail = node; diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 60527b9..e1fedff 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -1,5 +1,5 @@ import * as sandcastle from "@ai-hero/sandcastle"; -import { execFileSync, execSync } from "node:child_process"; +import { execFileSync } from "node:child_process"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; @@ -22,19 +22,9 @@ export async function finalizeTask( sandbox: SandboxInstance, cwd: string, ): Promise { - let validationPassed = false; - - try { - execSync(VALIDATION_COMMAND, { cwd, stdio: "pipe" }); - validationPassed = true; - } catch (err: unknown) { - const stderr = - err instanceof Error && "stderr" in err - ? String((err as { stderr: unknown }).stderr).slice(0, 500) - : ""; - console.warn(` #${spec.id}: Validation failed.${stderr ? `\n${stderr}` : ""}`); - } + let validationPassed = runValidation(cwd, spec); + // Retry one more round if validation failed and budget remains if (!validationPassed && loopResult.roundsCompleted < MAX_CRITIC_ROUNDS) { const retryBudget = ITERATION_BUDGET_PER_ROUND; console.log( @@ -66,7 +56,7 @@ export async function finalizeTask( } try { - execSync(VALIDATION_COMMAND, { cwd, stdio: "pipe" }); + execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); validationPassed = true; console.log(` #${spec.id}: Validation passed after retry round.`); } catch { @@ -75,71 +65,80 @@ export async function finalizeTask( } // Rebase on latest main - let rebaseSucceeded = false; - try { - execSync("git fetch origin main && git rebase origin/main", { - cwd, - stdio: "pipe", - }); - rebaseSucceeded = true; - if (validationPassed) { - try { - execSync(VALIDATION_COMMAND, { - cwd, - stdio: "pipe", - }); - } catch (postRebaseErr: unknown) { - const postRebaseStderr = - postRebaseErr instanceof Error && "stderr" in postRebaseErr - ? String((postRebaseErr as { stderr: unknown }).stderr).slice(0, 500) - : ""; - console.warn( - ` #${spec.id}: Post-rebase validation failed.${postRebaseStderr ? `\n${postRebaseStderr}` : ""}`, - ); - validationPassed = false; - } - } - } catch { - try { - execSync("git rebase --abort", { cwd, stdio: "pipe" }); - } catch { - /* empty */ - } + const rebaseSucceeded = attemptRebase(cwd, spec); + if (rebaseSucceeded && validationPassed) { try { - execSync("git push", { cwd, stdio: "pipe" }); - } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); - console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`); + execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); + } catch (postRebaseErr: unknown) { + const postRebaseStderr = extractStderr(postRebaseErr); + console.warn( + ` #${spec.id}: Post-rebase validation failed.${postRebaseStderr ? `\n${postRebaseStderr}` : ""}`, + ); + validationPassed = false; } } - if (rebaseSucceeded) { + // Push + pushBranch(cwd, spec, rebaseSucceeded); + + // Build PR arguments and create PR + const { isDraft, prArgs } = buildPrArgs(spec, loopResult, validationPassed); + + let prCreated = false; + try { + execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); + console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`); + prCreated = true; + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` #${spec.id}: PR creation failed: ${msg}`); + } + + return { isDraft, prCreated, validationPassed }; +} + +/** + * Fetches origin/main and rebases the current branch onto it. + * On failure, aborts the rebase cleanly. + * @param cwd - Working directory (worktree path). + * @param _spec - Task specification (unused, reserved for future logging). + * @returns `true` if rebase succeeded, `false` otherwise. + */ +function attemptRebase(cwd: string, _spec: TaskSpec): boolean { + try { + execFileSync("git", ["fetch", "origin", "main"], { cwd, stdio: "pipe" }); + execFileSync("git", ["rebase", "origin/main"], { cwd, stdio: "pipe" }); + return true; + } catch { try { - execSync("git push --force-with-lease", { cwd, stdio: "pipe" }); - } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); - try { - execSync(`git push origin HEAD:refs/heads/rescue/${spec.branch}-${String(Date.now())}`, { - cwd, - stdio: "pipe", - }); - console.warn(` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-...`); - } catch { - console.error( - ` #${spec.id}: Push failed and rescue failed. Commits will be lost on sandbox disposal: ${pushMsg}`, - ); - } + execFileSync("git", ["rebase", "--abort"], { cwd, stdio: "pipe" }); + } catch { + /* empty */ } + return false; } +} +/** + * Builds the PR title, body, and `gh pr create` argument list. + * @param spec - The task specification. + * @param loopResult - The result from the refinement loop. + * @param validationPassed - Whether the validation suite passed. + * @returns Object with `isDraft` flag and `prArgs` string array. + */ +function buildPrArgs( + spec: TaskSpec, + loopResult: LoopResult, + validationPassed: boolean, +): { isDraft: boolean; prArgs: string[] } { const converged = loopResult.status === "converged"; const isDraft = !converged || !validationPassed; const outstandingNote = !converged && loopResult.lastFindings.length > 0 - ? `\n\n⚠️ Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + ? `\n\n\u26a0\ufe0f Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` : ""; const validationNote = !validationPassed - ? "\n\n⚠️ Validation did not pass. Manual review required." + ? "\n\n\u26a0\ufe0f Validation did not pass. Manual review required." : ""; const validationCheck = validationPassed ? "- [x]" : "- [ ]"; @@ -148,7 +147,7 @@ export async function finalizeTask( : spec.labels.includes("bug") ? "fix" : "chore"; - const prTitle = `${commitPrefix}: resolve #${spec.id} — ${spec.title}`; + const prTitle = `${commitPrefix}: resolve #${spec.id} \u2014 ${spec.title}`; const typeOfChange = commitPrefix === "feat" ? "New feature (non-breaking change that adds functionality)" @@ -171,15 +170,69 @@ export async function finalizeTask( prBody, ]; - let prCreated = false; + return { isDraft, prArgs }; +} + +/** + * Extracts stderr from a caught error, truncated to 500 chars. + * @param err - The caught error value. + * @returns Stderr string or empty string if unavailable. + */ +function extractStderr(err: unknown): string { + return err instanceof Error && "stderr" in err + ? String((err as { stderr: unknown }).stderr).slice(0, 500) + : ""; +} + +/** + * Pushes the branch to origin. When rebase succeeded, uses force-with-lease + * with a rescue-branch fallback. When rebase was aborted, does a plain push. + * @param cwd - Working directory (worktree path). + * @param spec - The task specification. + * @param rebaseSucceeded - Whether the preceding rebase completed successfully. + */ +function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): void { + if (rebaseSucceeded) { + try { + execFileSync("git", ["push", "--force-with-lease"], { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + try { + execFileSync( + "git", + ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${String(Date.now())}`], + { cwd, stdio: "pipe" }, + ); + console.warn(` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-...`); + } catch { + console.error( + ` #${spec.id}: Push failed and rescue failed. Commits will be lost on sandbox disposal: ${pushMsg}`, + ); + } + } + } else { + try { + execFileSync("git", ["push"], { cwd, stdio: "pipe" }); + } catch (pushErr: unknown) { + const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`); + } + } +} + +/** + * Runs the full validation suite. + * @param cwd - Working directory (worktree path). + * @param spec - The task specification (used for logging). + * @returns `true` if validation passed, `false` otherwise. + */ +function runValidation(cwd: string, spec: TaskSpec): boolean { try { - execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); - console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`); - prCreated = true; + execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); + return true; } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); - console.error(` #${spec.id}: PR creation failed: ${msg}`); + const stderr = extractStderr(err); + console.warn(` #${spec.id}: Validation failed.${stderr ? `\n${stderr}` : ""}`); + return false; } - - return { isDraft, prCreated, validationPassed }; } diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 988d832..98bf998 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -21,55 +21,55 @@ const source = new GithubIssueSource({ const tasks = await source.discover(); if (tasks.length === 0) { - process.exit(0); -} - -const pool = new ConcurrencyPool(MAX_PARALLEL); + console.log("No tasks to process."); +} else { + const pool = new ConcurrencyPool(MAX_PARALLEL); -const settled = await Promise.allSettled( - tasks.map((spec) => - pool.run(async () => { - await using sandbox = await sandcastle.createSandbox({ - branch: spec.branch, - copyToWorktree: ["node_modules"], - hooks: { - sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, - }, - sandbox: docker({ imageName: DOCKER_IMAGE }), - }); + const settled = await Promise.allSettled( + tasks.map((spec) => + pool.run(async () => { + await using sandbox = await sandcastle.createSandbox({ + branch: spec.branch, + copyToWorktree: ["node_modules"], + hooks: { + sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, + }, + sandbox: docker({ imageName: DOCKER_IMAGE }), + }); - const loopResult = await runRefinementLoop(spec, sandbox, { - iterationBudget: ITERATION_BUDGET_PER_ROUND, - maxRounds: MAX_CRITIC_ROUNDS, - }); + const loopResult = await runRefinementLoop(spec, sandbox, { + iterationBudget: ITERATION_BUDGET_PER_ROUND, + maxRounds: MAX_CRITIC_ROUNDS, + }); - let prCreated = false; - if (loopResult.totalCommits > 0) { - const cwd = sandbox.worktreePath; - const result = await finalizeTask(spec, loopResult, sandbox, cwd); - prCreated = result.prCreated; - } + let prCreated = false; + if (loopResult.totalCommits > 0) { + const cwd = sandbox.worktreePath; + const result = await finalizeTask(spec, loopResult, sandbox, cwd); + prCreated = result.prCreated; + } - return { prCreated, spec }; - }), - ), -); + return { prCreated, spec }; + }), + ), + ); -const workCompleted = settled.some( - (outcome) => outcome.status === "fulfilled" && outcome.value.prCreated, -); + const workCompleted = settled.some( + (outcome) => outcome.status === "fulfilled" && outcome.value.prCreated, + ); -for (const [i, outcome] of settled.entries()) { - if (outcome.status === "rejected") { - const spec = tasks[i]; - const reason: unknown = outcome.reason; - const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); - console.error(` ✗ #${spec?.id ?? String(i)} failed: ${msg}`); + for (const [i, outcome] of settled.entries()) { + if (outcome.status === "rejected") { + const spec = tasks[i]; + const reason: unknown = outcome.reason; + const msg = reason instanceof Error ? (reason.stack ?? reason.message) : String(reason); + console.error(` ✗ #${spec?.id ?? String(i)} failed: ${msg}`); + } } -} -console.log("\nAll done."); + console.log("\nAll done."); -if (!workCompleted) { - process.exitCode = 1; + if (!workCompleted) { + process.exitCode = 1; + } } diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 918170f..baa06a9 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -1,8 +1,8 @@ import * as sandcastle from "@ai-hero/sandcastle"; -import { execSync } from "node:child_process"; +import { execFileSync } from "node:child_process"; import crypto from "node:crypto"; import { readFileSync } from "node:fs"; -import { join } from "node:path"; +import { resolve, sep } from "node:path"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; @@ -18,6 +18,16 @@ export interface RefinementLoopOptions { onRoundComplete?: (round: number, findings: Finding[]) => void; } +/** Result of a single implement↔critic round. */ +interface RoundResult { + /** SHA of HEAD before the implementer ran. */ + beforeSha: string; + /** Number of commits made by the implementer. */ + commits: number; + /** Parsed findings from the critic, or null on critic failure. */ + findings: Finding[] | null; +} + /** * Runs the implement↔critic refinement loop for a given task. * @param spec - The task specification. @@ -42,55 +52,25 @@ export async function runRefinementLoop( for (let round = 1; round <= maxRounds; round++) { roundsCompleted = round; - const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; console.log( ` #${spec.id} round ${String(round)}/${String(maxRounds)} (budget: ${String(budget)})`, ); - // Capture SHA before implementer runs (for quality ratchet rollback) - let beforeRoundSha = ""; - try { - beforeRoundSha = execSync("git rev-parse HEAD", { - cwd: sandbox.worktreePath, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); - } catch { - /* empty */ - } + const result = await executeRound(spec, sandbox, round, budget, lastFindings); - // Implementer - const impl = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: budget, - name: `Implementer #${spec.id} R${String(round)}`, - promptArgs: { - BRANCH: spec.branch, - FINDINGS: findingsArg, - ISSUE_BODY: spec.body, - ISSUE_TITLE: spec.title, - TASK_ID: spec.id, - }, - promptFile: "./.sandcastle/implement-prompt.md", - }); - - if (round === 1 && impl.commits.length === 0) { + if (round === 1 && result.commits === 0) { console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`); status = "skipped"; break; } - if (round > 1 && impl.commits.length === 0) { + if (round > 1 && result.commits === 0) { status = "exhausted"; break; } - // Critic - const nonce = crypto.randomBytes(4).toString("hex"); - const findings = await runCritic(sandbox, spec, round, nonce); - - if (findings === null) { + if (result.findings === null) { console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); status = "failed"; break; @@ -98,41 +78,34 @@ export async function runRefinementLoop( // Dedup via context hash const cwd = sandbox.worktreePath; - const newFindings = findings.filter( - (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd)), - ); - for (const f of newFindings) { - seenKeys.add(findingKey(f, cwd)); - } + const newFindings = deduplicateFindings(result.findings, seenKeys, cwd); console.log( - ` #${spec.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, + ` #${spec.id}: ${String(result.findings.length)} findings, ${String(newFindings.length)} new`, ); // Quality ratchet: rollback if findings increased (regression) - const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); - if (round > 1 && nonLowFindings.length > previousFindingsCount) { - try { - execSync(`git reset --hard ${beforeRoundSha}`, { - cwd: sandbox.worktreePath, - stdio: "pipe", - }); - console.warn( - ` #${spec.id} R${String(round)}: Regression detected (${String(previousFindingsCount)} → ${String(findings.length)}). Rolled back.`, - ); - } catch { - /* empty */ - } + const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); + if ( + checkQualityRatchet( + spec, + round, + nonLowFindings.length, + previousFindingsCount, + result.beforeSha, + cwd, + ) + ) { status = "exhausted"; break; } - totalCommits += impl.commits.length; + totalCommits += result.commits; previousFindingsCount = nonLowFindings.length; - opts?.onRoundComplete?.(round, findings); + opts?.onRoundComplete?.(round, result.findings); if (newFindings.length === 0) { - const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); + const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); if (nonLowFindings.length > 0) { lastFindings = nonLowFindings; status = "exhausted"; @@ -148,6 +121,118 @@ export async function runRefinementLoop( return { lastFindings, roundsCompleted, status, totalCommits }; } +/** + * Checks whether findings regressed compared to the previous round and rolls back if so. + * @param spec - The task specification. + * @param round - Current round number. + * @param findingsCount - Number of non-LOW findings this round. + * @param previousCount - Number of non-LOW findings from the previous round. + * @param beforeSha - SHA to reset to on regression. + * @param cwd - Working directory for git operations. + * @returns True if a regression was detected and rollback performed. + */ +function checkQualityRatchet( + spec: TaskSpec, + round: number, + findingsCount: number, + previousCount: number, + beforeSha: string, + cwd: string, +): boolean { + if (round <= 1 || findingsCount <= previousCount) { + return false; + } + + // Validate SHA format before passing to execFileSync + if (!/^[0-9a-f]{40}$/.test(beforeSha)) { + console.warn(` #${spec.id}: Invalid SHA for rollback, skipping reset.`); + return true; + } + + try { + execFileSync("git", ["reset", "--hard", beforeSha], { + cwd, + stdio: "pipe", + }); + console.warn( + ` #${spec.id} R${String(round)}: Regression detected (${String(previousCount)} → ${String(findingsCount)}). Rolled back.`, + ); + } catch { + console.warn(` #${spec.id}: Failed to reset to ${beforeSha} after regression.`); + } + + return true; +} + +/** + * Filters findings by confidence and deduplicates against previously seen keys. + * @param findings - Raw findings from the critic. + * @param seenKeys - Set of previously seen dedup keys (mutated: new keys are added). + * @param cwd - Working directory for context hashing. + * @returns Array of new, non-LOW-confidence findings. + */ +function deduplicateFindings(findings: Finding[], seenKeys: Set, cwd: string): Finding[] { + const newFindings = findings.filter( + (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd)), + ); + for (const f of newFindings) { + seenKeys.add(findingKey(f, cwd)); + } + return newFindings; +} + +/** + * Executes a single implement↔critic round. + * @param spec - The task specification. + * @param sandbox - The sandcastle sandbox instance. + * @param round - Current round number (1-indexed). + * @param budget - Iteration budget for the implementer. + * @param lastFindings - Findings from the previous round to feed to the implementer. + * @returns The round result containing commits, findings, and the pre-round SHA. + */ +async function executeRound( + spec: TaskSpec, + sandbox: SandboxInstance, + round: number, + budget: number, + lastFindings: Finding[], +): Promise { + const findingsArg = lastFindings.length > 0 ? JSON.stringify(lastFindings, null, 2) : ""; + + // Capture SHA before implementer runs (for quality ratchet rollback) + let beforeSha = ""; + try { + beforeSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: sandbox.worktreePath, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + console.warn(` #${spec.id}: Failed to capture HEAD SHA before round ${String(round)}.`); + } + + // Implementer + const impl = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: budget, + name: `Implementer #${spec.id} R${String(round)}`, + promptArgs: { + BRANCH: spec.branch, + FINDINGS: findingsArg, + ISSUE_BODY: spec.body, + ISSUE_TITLE: spec.title, + TASK_ID: spec.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + + // Critic + const nonce = crypto.randomBytes(4).toString("hex"); + const findings = await runCritic(sandbox, spec, round, nonce); + + return { beforeSha, commits: impl.commits.length, findings }; +} + /** * Computes a deduplication key for a finding using a context hash of surrounding lines. * @param f - Finding to compute a key for. @@ -161,7 +246,11 @@ function findingKey(f: Finding, cwd: string): string { .replace(/[^\w\s]/g, "") .replace(/\s+/g, " ") .trim(); - const titleHash = crypto.createHash("sha256").update(normalizedTitle).digest("hex").slice(0, 16); + const titleHash = crypto + .createHash("sha256") + .update(normalizedTitle) + .digest("hex") + .slice(0, 16); return `${f.file || "global"}::${f.category}::${titleHash}`; } const contextHash = hashContextLines(cwd, f.file, f.line, 3); @@ -178,8 +267,8 @@ function findingKey(f: Finding, cwd: string): string { */ function hashContextLines(cwd: string, file: string, line: number, _radius: number): string { try { - const fullPath = join(cwd, file); - if (!fullPath.startsWith(cwd)) { + const fullPath = resolve(cwd, file); + if (!fullPath.startsWith(resolve(cwd) + sep)) { throw new Error("Path traversal"); } const raw = readFileSync(fullPath, "utf-8"); diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 42d27bb..99e415c 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -177,22 +177,26 @@ export class GithubIssueSource implements TaskSource { if (typeof item.branch !== "string" || !this.branchPattern.test(item.branch)) return false; if (typeof item.title !== "string") return false; + if (item.title.length > 200) return false; + for (let ci = 0; ci < item.title.length; ci++) { + if (item.title.charCodeAt(ci) < 0x20) return false; + } return true; }, ); - const issueMap = new Map(issuesJson.map((i) => [String(i.number), i])); - return validated - .map((v) => { - const source = issueMap.get(v.id); - if (!source) return null; - return { - ...v, - body: source.body, - labels: source.labels, - }; - }) - .filter((v): v is NonNullable => v !== null); + const issueMap = new Map(issuesJson.map((i) => [String(i.number), i])); + return validated + .map((v) => { + const source = issueMap.get(v.id); + if (!source) return null; + return { + ...v, + body: source.body, + labels: source.labels, + }; + }) + .filter((v): v is NonNullable => v !== null); } catch { console.error("Planner produced invalid JSON. Retrying."); return null; From 3d2a46f8c5fff39923607b035835f563479cab97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 16:38:44 +0200 Subject: [PATCH 16/31] =?UTF-8?q?fix(sandcastle):=20address=20review=20fin?= =?UTF-8?q?dings=20=E2=80=94=20convergence,=20planner=20failure,=20rebase?= =?UTF-8?q?=20note?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Treat newFindings=0 as 'converged' regardless of repeated non-LOW findings (previously marked 'exhausted', forcing draft PRs) - Set process.exitCode=1 when planner exhausts retries (prevents masking a broken planner as 'no work to do') - Pass rebaseSucceeded to buildPrArgs, append rebase failure note to PR body when branch is not rebased onto main - Reword sanitizeForPrompt JSDoc to accurately describe scope --- .sandcastle/finalizer.ts | 13 +++++++++---- .sandcastle/refinement-loop.ts | 4 +--- .sandcastle/task-source.ts | 5 +++-- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index e1fedff..2574e90 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -82,7 +82,7 @@ export async function finalizeTask( pushBranch(cwd, spec, rebaseSucceeded); // Build PR arguments and create PR - const { isDraft, prArgs } = buildPrArgs(spec, loopResult, validationPassed); + const { isDraft, prArgs } = buildPrArgs(spec, loopResult, validationPassed, rebaseSucceeded); let prCreated = false; try { @@ -124,21 +124,26 @@ function attemptRebase(cwd: string, _spec: TaskSpec): boolean { * @param spec - The task specification. * @param loopResult - The result from the refinement loop. * @param validationPassed - Whether the validation suite passed. + * @param rebaseSucceeded - Whether the rebase onto main succeeded. * @returns Object with `isDraft` flag and `prArgs` string array. */ function buildPrArgs( spec: TaskSpec, loopResult: LoopResult, validationPassed: boolean, + rebaseSucceeded: boolean, ): { isDraft: boolean; prArgs: string[] } { const converged = loopResult.status === "converged"; const isDraft = !converged || !validationPassed; const outstandingNote = !converged && loopResult.lastFindings.length > 0 - ? `\n\n\u26a0\ufe0f Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + ? `\n\n⚠️ Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` : ""; const validationNote = !validationPassed - ? "\n\n\u26a0\ufe0f Validation did not pass. Manual review required." + ? "\n\n⚠️ Validation did not pass. Manual review required." + : ""; + const rebaseNote = !rebaseSucceeded + ? "\n\n⚠️ Rebase failed. Branch is not rebased onto main." : ""; const validationCheck = validationPassed ? "- [x]" : "- [ ]"; @@ -154,7 +159,7 @@ function buildPrArgs( : commitPrefix === "fix" ? "Bug fix (non-breaking change that fixes an issue)" : "Refactoring (no functional changes)"; - const prBody = `## Description\n\nAutomated ${commitPrefix} for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] ${typeOfChange}\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}`; + const prBody = `## Description\n\nAutomated ${commitPrefix} for #${spec.id}: ${spec.title}\n\n## Type of Change\n\n- [x] ${typeOfChange}\n\n## Checklist\n\n${validationCheck} I have run validation suite\n- [x] My changes follow the existing code style\n\n## Related Issues\n\nFixes #${spec.id}${outstandingNote}${validationNote}${rebaseNote}`; const prArgs = [ "pr", diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index baa06a9..f2755d8 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -108,10 +108,8 @@ export async function runRefinementLoop( const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); if (nonLowFindings.length > 0) { lastFindings = nonLowFindings; - status = "exhausted"; - } else { - status = "converged"; } + status = "converged"; break; } diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 99e415c..a0405d3 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -111,6 +111,7 @@ export class GithubIssueSource implements TaskSource { } console.warn("Planner failed to produce a valid plan after all retries."); + process.exitCode = 1; return []; } @@ -205,9 +206,9 @@ export class GithubIssueSource implements TaskSource { } /** - * Strips injection-prone tags from text. + * Strips agent-control tags from text to reduce prompt-injection risk. * @param text - Raw text to sanitize. - * @returns Sanitized text safe for prompt injection. + * @returns Text with plan/findings/promise tags removed. */ function sanitizeForPrompt(text: string): string { return text.replace(/<\/?(?:plan|findings|promise)[^>]*>/gi, ""); From bf8ca0485b67a4cbdec7a30ea1fc19ce6e63aa14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 17:31:24 +0200 Subject: [PATCH 17/31] fix(.sandcastle): address all algorithmic audit findings - types.ts: parseFindingsSafe with partial recovery (finding #18) - concurrency-pool.ts: guard NaN/float/Infinity in constructor (#1,#4), JSDoc re-entrant warning (#2) - task-source.ts: NFKC normalize + extended blocklist in sanitizeForPrompt (#17,#20), Map lookup in validatePlan (#7) - refinement-loop.ts: readFileSync replaces execSync/sed (shell injection #5), radius used (#12), fallback hash hardened (#13), line clamping (#14), realpathSync symlinks (#15), file cache Map (#11), parseFindingsSafe (#18), nonce validation (#16), trivial match skip (#19), implementer try/catch (#7), ratchet guard round<=2 (#6), no duplicate nonLowFindings (#9) - finalizer.ts: pushBranch returns boolean (#22), crypto rescue suffix (#21), rollback error level (#8) - main.ts: TASK_TIMEOUT_MS + Promise.race timeout (#3), flat budget JSDoc (#10) --- .sandcastle/concurrency-pool.ts | 7 +- .sandcastle/finalizer.ts | 25 +++++-- .sandcastle/main.ts | 52 ++++++++------ .sandcastle/refinement-loop.ts | 121 ++++++++++++++++++++------------ .sandcastle/task-source.ts | 6 +- .sandcastle/types.ts | 19 ++++- 6 files changed, 154 insertions(+), 76 deletions(-) diff --git a/.sandcastle/concurrency-pool.ts b/.sandcastle/concurrency-pool.ts index e023a43..a755e5a 100644 --- a/.sandcastle/concurrency-pool.ts +++ b/.sandcastle/concurrency-pool.ts @@ -14,11 +14,11 @@ export class ConcurrencyPool { private tail: null | QueueNode = null; /** - * @param max - Maximum number of concurrent tasks. Must be >= 1. + * @param max - Maximum number of concurrent tasks. Must be a positive integer >= 1. */ constructor(private readonly max: number) { - if (max < 1) { - throw new RangeError("ConcurrencyPool max must be >= 1"); + if (!Number.isInteger(max) || max < 1) { + throw new RangeError("ConcurrencyPool max must be a positive integer >= 1"); } } @@ -26,6 +26,7 @@ export class ConcurrencyPool { * Executes the given async function, waiting if the pool is at capacity. * @param fn - Async function to execute within the pool. * @returns The result of the function. + * @remarks Re-entrant calls using the same pool instance may deadlock when all slots are occupied. */ async run(fn: () => Promise): Promise { await this.acquire(); diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 2574e90..3832fd4 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -1,5 +1,6 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { execFileSync } from "node:child_process"; +import crypto from "node:crypto"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; @@ -79,7 +80,10 @@ export async function finalizeTask( } // Push - pushBranch(cwd, spec, rebaseSucceeded); + const pushSucceeded = pushBranch(cwd, spec, rebaseSucceeded); + if (!pushSucceeded) { + console.warn(` #${spec.id}: Push did not succeed; PR may reference unpushed commits.`); + } // Build PR arguments and create PR const { isDraft, prArgs } = buildPrArgs(spec, loopResult, validationPassed, rebaseSucceeded); @@ -195,32 +199,39 @@ function extractStderr(err: unknown): string { * @param cwd - Working directory (worktree path). * @param spec - The task specification. * @param rebaseSucceeded - Whether the preceding rebase completed successfully. + * @returns `true` if the primary push succeeded, `false` otherwise. */ -function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): void { +function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): boolean { if (rebaseSucceeded) { try { execFileSync("git", ["push", "--force-with-lease"], { cwd, stdio: "pipe" }); + return true; } catch (pushErr: unknown) { const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); try { - execFileSync( - "git", - ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${String(Date.now())}`], - { cwd, stdio: "pipe" }, + const suffix = crypto.randomBytes(4).toString("hex"); + execFileSync("git", ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${suffix}`], { + cwd, + stdio: "pipe", + }); + console.warn( + ` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-${suffix}`, ); - console.warn(` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-...`); } catch { console.error( ` #${spec.id}: Push failed and rescue failed. Commits will be lost on sandbox disposal: ${pushMsg}`, ); } + return false; } } else { try { execFileSync("git", ["push"], { cwd, stdio: "pipe" }); + return true; } catch (pushErr: unknown) { const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`); + return false; } } } diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 98bf998..dbf3286 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -11,6 +11,7 @@ const BRANCH_PREFIX = "agent/issue"; const ISSUE_LABEL = "sandcastle"; const MAX_PARALLEL = 3; const DOCKER_IMAGE = "sandcastle-sap-ai"; +const TASK_TIMEOUT_MS = 15 * 60 * 1000; const source = new GithubIssueSource({ branchPrefix: BRANCH_PREFIX, @@ -27,30 +28,39 @@ if (tasks.length === 0) { const settled = await Promise.allSettled( tasks.map((spec) => - pool.run(async () => { - await using sandbox = await sandcastle.createSandbox({ - branch: spec.branch, - copyToWorktree: ["node_modules"], - hooks: { - sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, - }, - sandbox: docker({ imageName: DOCKER_IMAGE }), - }); + pool.run(() => + Promise.race([ + (async () => { + await using sandbox = await sandcastle.createSandbox({ + branch: spec.branch, + copyToWorktree: ["node_modules"], + hooks: { + sandbox: { onSandboxReady: [{ command: "npm install && npm run build" }] }, + }, + sandbox: docker({ imageName: DOCKER_IMAGE }), + }); - const loopResult = await runRefinementLoop(spec, sandbox, { - iterationBudget: ITERATION_BUDGET_PER_ROUND, - maxRounds: MAX_CRITIC_ROUNDS, - }); + const loopResult = await runRefinementLoop(spec, sandbox, { + iterationBudget: ITERATION_BUDGET_PER_ROUND, + maxRounds: MAX_CRITIC_ROUNDS, + }); - let prCreated = false; - if (loopResult.totalCommits > 0) { - const cwd = sandbox.worktreePath; - const result = await finalizeTask(spec, loopResult, sandbox, cwd); - prCreated = result.prCreated; - } + let prCreated = false; + if (loopResult.totalCommits > 0) { + const cwd = sandbox.worktreePath; + const result = await finalizeTask(spec, loopResult, sandbox, cwd); + prCreated = result.prCreated; + } - return { prCreated, spec }; - }), + return { prCreated, spec }; + })(), + new Promise((_, reject) => { + setTimeout(() => { + reject(new Error(`Task #${spec.id} timed out after ${String(TASK_TIMEOUT_MS)}ms`)); + }, TASK_TIMEOUT_MS); + }), + ]), + ), ), ); diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index f2755d8..b3c5a93 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -1,12 +1,12 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { execFileSync } from "node:child_process"; import crypto from "node:crypto"; -import { readFileSync } from "node:fs"; -import { resolve, sep } from "node:path"; +import { readFileSync, realpathSync } from "node:fs"; +import { join, sep } from "node:path"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; -import { FindingsSchema, ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS } from "./types.js"; +import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS, parseFindingsSafe } from "./types.js"; /** Options for configuring the refinement loop. */ export interface RefinementLoopOptions { @@ -105,7 +105,6 @@ export async function runRefinementLoop( opts?.onRoundComplete?.(round, result.findings); if (newFindings.length === 0) { - const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); if (nonLowFindings.length > 0) { lastFindings = nonLowFindings; } @@ -137,7 +136,7 @@ function checkQualityRatchet( beforeSha: string, cwd: string, ): boolean { - if (round <= 1 || findingsCount <= previousCount) { + if (round <= 2 || findingsCount <= previousCount) { return false; } @@ -170,11 +169,12 @@ function checkQualityRatchet( * @returns Array of new, non-LOW-confidence findings. */ function deduplicateFindings(findings: Finding[], seenKeys: Set, cwd: string): Finding[] { + const fileCache = new Map(); const newFindings = findings.filter( - (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd)), + (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd, fileCache)), ); for (const f of newFindings) { - seenKeys.add(findingKey(f, cwd)); + seenKeys.add(findingKey(f, cwd, fileCache)); } return newFindings; } @@ -210,19 +210,26 @@ async function executeRound( } // Implementer - const impl = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), - maxIterations: budget, - name: `Implementer #${spec.id} R${String(round)}`, - promptArgs: { - BRANCH: spec.branch, - FINDINGS: findingsArg, - ISSUE_BODY: spec.body, - ISSUE_TITLE: spec.title, - TASK_ID: spec.id, - }, - promptFile: "./.sandcastle/implement-prompt.md", - }); + let impl: Awaited>; + try { + impl = await sandbox.run({ + agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + maxIterations: budget, + name: `Implementer #${spec.id} R${String(round)}`, + promptArgs: { + BRANCH: spec.branch, + FINDINGS: findingsArg, + ISSUE_BODY: spec.body, + ISSUE_TITLE: spec.title, + TASK_ID: spec.id, + }, + promptFile: "./.sandcastle/implement-prompt.md", + }); + } catch (err: unknown) { + const msg = err instanceof Error ? (err.stack ?? err.message) : String(err); + console.error(` #${spec.id} R${String(round)}: Implementer threw: ${msg}`); + return { beforeSha, commits: 0, findings: null }; + } // Critic const nonce = crypto.randomBytes(4).toString("hex"); @@ -235,9 +242,10 @@ async function executeRound( * Computes a deduplication key for a finding using a context hash of surrounding lines. * @param f - Finding to compute a key for. * @param cwd - Working directory (worktree path) for reading file context. + * @param fileCache - Optional cache of file contents keyed by resolved path. * @returns Composite dedup key. */ -function findingKey(f: Finding, cwd: string): string { +function findingKey(f: Finding, cwd: string, fileCache?: Map): string { if (!f.file || f.line == null) { const normalizedTitle = f.title .toLowerCase() @@ -251,33 +259,56 @@ function findingKey(f: Finding, cwd: string): string { .slice(0, 16); return `${f.file || "global"}::${f.category}::${titleHash}`; } - const contextHash = hashContextLines(cwd, f.file, f.line, 3); + const contextHash = hashContextLines(cwd, f.file, f.line, 3, fileCache); return `${f.file}::${f.category}::${contextHash}`; } /** - * Hashes the target line content for dedup stability. + * Hashes a window of lines around the finding for dedup stability. * @param cwd - Working directory. * @param file - Relative file path. * @param line - Line number of the finding. - * @param _radius - Unused; kept for call-site compatibility. + * @param radius - Number of lines above/below to include in the context window. + * @param fileCache - Optional cache of file contents keyed by resolved path. * @returns Truncated SHA-256 hex digest. */ -function hashContextLines(cwd: string, file: string, line: number, _radius: number): string { +function hashContextLines( + cwd: string, + file: string, + line: number, + radius: number, + fileCache?: Map, +): string { try { - const fullPath = resolve(cwd, file); - if (!fullPath.startsWith(resolve(cwd) + sep)) { + const fullPath = realpathSync(join(cwd, file)); + if (!fullPath.startsWith(realpathSync(cwd) + sep)) { throw new Error("Path traversal"); } - const raw = readFileSync(fullPath, "utf-8"); + let raw: string; + const cached = fileCache?.get(fullPath); + if (cached !== undefined) { + raw = cached; + } else { + raw = readFileSync(fullPath, "utf-8"); + if (fileCache) fileCache.set(fullPath, raw); + } const lines = raw.split("\n"); - const targetLine = lines[Math.max(0, line - 1)] ?? ""; - const normalized = targetLine.replace(/\s+/g, " ").trim(); - // Include file path as salt to prevent cross-file hash collisions - return crypto.createHash("sha256").update(`${file}:${normalized}`).digest("hex").slice(0, 16); + const idx = Math.min(Math.max(0, line - 1), lines.length - 1); + const start = Math.max(0, idx - radius); + const end = Math.min(lines.length - 1, idx + radius); + const window = lines.slice(start, end + 1).join("\n"); + const normalized = window.replace(/\s+/g, " ").trim(); + return crypto + .createHash("sha256") + .update(`${file}:${String(line)}:${normalized}`) + .digest("hex") + .slice(0, 16); } catch { - const truncTitle = file.slice(0, 30) + String(line); - return crypto.createHash("sha256").update(truncTitle).digest("hex").slice(0, 16); + return crypto + .createHash("sha256") + .update(`${file}:${String(line)}:fallback`) + .digest("hex") + .slice(0, 16); } } @@ -288,18 +319,22 @@ function hashContextLines(cwd: string, file: string, line: number, _radius: numb * @returns Parsed findings array or null on parse failure. */ function parseFindings(stdout: string, nonce: string): Finding[] | null { + if (!/^[0-9a-f]+$/.test(nonce)) return null; const tagPattern = new RegExp(`([\\s\\S]*?)<\\/findings-${nonce}>`, "g"); const matches = [...stdout.matchAll(tagPattern)]; - if (matches.length === 0) { - return null; - } - const raw = matches.at(-1)?.[1]?.trim() ?? "[]"; - const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); - try { - return FindingsSchema.parse(JSON.parse(cleaned)); - } catch { - return null; + if (matches.length === 0) return null; + // Find last non-trivial match + for (let i = matches.length - 1; i >= 0; i--) { + const raw = matches[i]?.[1]?.trim() ?? ""; + if (raw.length < 2) continue; + const cleaned = raw.replace(/^```(?:json)?\s*\n?/g, "").replace(/\n?```\s*$/g, ""); + try { + return parseFindingsSafe(JSON.parse(cleaned)); + } catch { + continue; + } } + return null; } /** diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index a0405d3..8773b0d 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -211,5 +211,9 @@ export class GithubIssueSource implements TaskSource { * @returns Text with plan/findings/promise tags removed. */ function sanitizeForPrompt(text: string): string { - return text.replace(/<\/?(?:plan|findings|promise)[^>]*>/gi, ""); + const normalized = text.normalize("NFKC"); + return normalized.replace( + /<\/?(?:plan|findings|promise|system|code|instructions|implement|review|tool_call)[^>]*>/gi, + "", + ); } diff --git a/.sandcastle/types.ts b/.sandcastle/types.ts index efea8f8..300df94 100644 --- a/.sandcastle/types.ts +++ b/.sandcastle/types.ts @@ -48,10 +48,27 @@ export type LoopStatus = "converged" | "exhausted" | "failed" | "skipped"; /** Type alias for a sandcastle sandbox instance. */ export type SandboxInstance = Awaited>; +/** + * Parses a findings array with partial recovery — invalid entries are discarded. + * @param data - Raw parsed JSON value to validate as a findings array. + * @returns Array of valid findings (may be empty). + */ +export function parseFindingsSafe(data: unknown): Finding[] { + if (!Array.isArray(data)) return []; + return data + .map((entry) => FindingSchema.safeParse(entry)) + .filter((r): r is z.ZodSafeParseSuccess => r.success) + .map((r) => r.data); +} + /** Maximum implement↔critic rounds before giving up. */ export const MAX_CRITIC_ROUNDS = 5; -/** Token budget applied uniformly to every implement round. */ +/** + * Flat iteration budget per round (intentionally constant, not decreasing). + * Evidence: ARCS (arXiv:2504.20434), SWE-Agent, AutoCodeRover all use flat budgets. + * Decreasing schedules penalize harder residual problems in later rounds. + */ export const ITERATION_BUDGET_PER_ROUND = 50; /** Specification for a task to be implemented. */ From d1db4e46e5524d66a597c23f9376462e105c924c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 17:59:11 +0200 Subject: [PATCH 18/31] fix: unref timeout timer to prevent process hang, catch critic throws --- .sandcastle/main.ts | 2 +- .sandcastle/refinement-loop.ts | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index dbf3286..7b61188 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -57,7 +57,7 @@ if (tasks.length === 0) { new Promise((_, reject) => { setTimeout(() => { reject(new Error(`Task #${spec.id} timed out after ${String(TASK_TIMEOUT_MS)}ms`)); - }, TASK_TIMEOUT_MS); + }, TASK_TIMEOUT_MS).unref(); }), ]), ), diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index b3c5a93..1f3ec8f 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -233,7 +233,14 @@ async function executeRound( // Critic const nonce = crypto.randomBytes(4).toString("hex"); - const findings = await runCritic(sandbox, spec, round, nonce); + let findings: Finding[] | null; + try { + findings = await runCritic(sandbox, spec, round, nonce); + } catch (err: unknown) { + const msg = err instanceof Error ? err.message : String(err); + console.error(` #${spec.id} R${String(round)}: Critic threw: ${msg}`); + findings = null; + } return { beforeSha, commits: impl.commits.length, findings }; } From 7f0a41ad2f6d7d6a98b3ebc14536166e894b0e91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:05:24 +0200 Subject: [PATCH 19/31] fix: count commits before break on critic failure (prevents work loss) --- .sandcastle/refinement-loop.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 1f3ec8f..2d4ca47 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -71,6 +71,7 @@ export async function runRefinementLoop( } if (result.findings === null) { + totalCommits += result.commits; console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); status = "failed"; break; From 5872f709541e0ac66de01a50a6f97161283ed7bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:09:19 +0200 Subject: [PATCH 20/31] fix: suppress unhandled rejection from timeout promise on task success --- .sandcastle/main.ts | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 7b61188..946138f 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -54,11 +54,17 @@ if (tasks.length === 0) { return { prCreated, spec }; })(), - new Promise((_, reject) => { - setTimeout(() => { - reject(new Error(`Task #${spec.id} timed out after ${String(TASK_TIMEOUT_MS)}ms`)); - }, TASK_TIMEOUT_MS).unref(); - }), + (() => { + const p = new Promise((_, reject) => { + setTimeout(() => { + reject(new Error(`Task #${spec.id} timed out after ${String(TASK_TIMEOUT_MS)}ms`)); + }, TASK_TIMEOUT_MS).unref(); + }); + p.catch(() => { + /* suppress unhandled rejection when task completes before timeout */ + }); + return p; + })(), ]), ), ), From 5881b920afbed8b7b5c8237efcb961204db1045b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:14:19 +0200 Subject: [PATCH 21/31] fix: force process exit after completion (prevents hang from timed-out sandboxes) --- .sandcastle/main.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 946138f..3ca2d55 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -89,3 +89,5 @@ if (tasks.length === 0) { process.exitCode = 1; } } + +process.exit(process.exitCode ?? 0); From 29c2ce6f67bafa2add5a217acceb015df7b8570b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:18:32 +0200 Subject: [PATCH 22/31] fix: check findings null before commits zero (correct status on implementer crash) --- .sandcastle/refinement-loop.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 2d4ca47..358153d 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -65,11 +65,6 @@ export async function runRefinementLoop( break; } - if (round > 1 && result.commits === 0) { - status = "exhausted"; - break; - } - if (result.findings === null) { totalCommits += result.commits; console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); @@ -77,6 +72,11 @@ export async function runRefinementLoop( break; } + if (round > 1 && result.commits === 0) { + status = "exhausted"; + break; + } + // Dedup via context hash const cwd = sandbox.worktreePath; const newFindings = deduplicateFindings(result.findings, seenKeys, cwd); From a4f323de9d30400b857b5a97e342765b38817c9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:29:58 +0200 Subject: [PATCH 23/31] fix: report known findings in PR body even when converged (prevents silent false convergence) --- .sandcastle/finalizer.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 3832fd4..1b5aa42 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -140,8 +140,8 @@ function buildPrArgs( const converged = loopResult.status === "converged"; const isDraft = !converged || !validationPassed; const outstandingNote = - !converged && loopResult.lastFindings.length > 0 - ? `\n\n⚠️ Outstanding findings:\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` + loopResult.lastFindings.length > 0 + ? `\n\n${converged ? "ℹ️ Known findings (not addressed):" : "⚠️ Outstanding findings:"}\n${loopResult.lastFindings.map((f) => `- [${f.severity}] ${f.file}: ${f.title}`).join("\n")}` : ""; const validationNote = !validationPassed ? "\n\n⚠️ Validation did not pass. Manual review required." From 41d09b47b611c965fd955b421eb1597871792e4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:42:25 +0200 Subject: [PATCH 24/31] feat: state-of-the-art convergence improvements (ARCS, SWE-Agent, OpenHands) - Validation in-loop (ARCS): deterministic convergence when tests pass mid-loop - Best-state checkpoint (SWE-Agent): reset to best SHA on non-convergence - Severity-weighted convergence (OpenHands): refuse convergence if CRITICAL/HIGH persist --- .sandcastle/refinement-loop.ts | 57 ++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 358153d..a31de28 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -8,6 +8,9 @@ import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS, parseFindingsSafe } from "./types.js"; +const VALIDATION_COMMAND = + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; + /** Options for configuring the refinement loop. */ export interface RefinementLoopOptions { /** Budget of iterations per round (flat constant applied to every round). */ @@ -49,6 +52,8 @@ export async function runRefinementLoop( let totalCommits = 0; let roundsCompleted = 0; let previousFindingsCount = Infinity; + let bestSha = ""; + let bestFindingsCount = Infinity; for (let round = 1; round <= maxRounds; round++) { roundsCompleted = round; @@ -77,6 +82,22 @@ export async function runRefinementLoop( break; } + // Fix 1: Validation in-loop (ARCS pattern) — deterministic convergence signal + if (result.commits > 0) { + try { + execFileSync("sh", ["-c", VALIDATION_COMMAND], { + cwd: sandbox.worktreePath, + stdio: "pipe", + }); + // Validation passed mid-loop — deterministic convergence + totalCommits += result.commits; + status = "converged"; + break; + } catch { + // Validation failed — continue to critic for feedback + } + } + // Dedup via context hash const cwd = sandbox.worktreePath; const newFindings = deduplicateFindings(result.findings, seenKeys, cwd); @@ -85,6 +106,20 @@ export async function runRefinementLoop( ` #${spec.id}: ${String(result.findings.length)} findings, ${String(newFindings.length)} new`, ); + // Best-state checkpoint (SWE-Agent pattern) + if (newFindings.length < bestFindingsCount) { + bestFindingsCount = newFindings.length; + try { + bestSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: sandbox.worktreePath, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + /* empty */ + } + } + // Quality ratchet: rollback if findings increased (regression) const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); if ( @@ -106,6 +141,16 @@ export async function runRefinementLoop( opts?.onRoundComplete?.(round, result.findings); if (newFindings.length === 0) { + // Severity-weighted convergence (OpenHands pattern): + // Don't converge if CRITICAL/HIGH findings persist, even if already seen + const criticalPersistent = result.findings.filter( + (f) => (f.severity === "CRITICAL" || f.severity === "HIGH") && f.confidence !== "LOW", + ); + if (criticalPersistent.length > 0) { + lastFindings = criticalPersistent; + status = "exhausted"; + break; + } if (nonLowFindings.length > 0) { lastFindings = nonLowFindings; } @@ -116,6 +161,18 @@ export async function runRefinementLoop( lastFindings = newFindings; } + // Best-state reset: if not converged, restore best intermediate state + if (status !== "converged" && /^[0-9a-f]{40}$/.test(bestSha)) { + try { + execFileSync("git", ["reset", "--hard", bestSha], { + cwd: sandbox.worktreePath, + stdio: "pipe", + }); + } catch { + /* empty */ + } + } + return { lastFindings, roundsCompleted, status, totalCommits }; } From 2f8cbf74a80d5de85e6077f61d2aa77ea483db0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:50:41 +0200 Subject: [PATCH 25/31] fix: move bestSha after ratchet, add validation timeout, fix severity/bestSha mismatch --- .sandcastle/refinement-loop.ts | 40 ++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index a31de28..0159152 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -88,6 +88,7 @@ export async function runRefinementLoop( execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd: sandbox.worktreePath, stdio: "pipe", + timeout: 120_000, }); // Validation passed mid-loop — deterministic convergence totalCommits += result.commits; @@ -106,20 +107,6 @@ export async function runRefinementLoop( ` #${spec.id}: ${String(result.findings.length)} findings, ${String(newFindings.length)} new`, ); - // Best-state checkpoint (SWE-Agent pattern) - if (newFindings.length < bestFindingsCount) { - bestFindingsCount = newFindings.length; - try { - bestSha = execFileSync("git", ["rev-parse", "HEAD"], { - cwd: sandbox.worktreePath, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); - } catch { - /* empty */ - } - } - // Quality ratchet: rollback if findings increased (regression) const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); if ( @@ -135,6 +122,21 @@ export async function runRefinementLoop( status = "exhausted"; break; } + + // Best-state checkpoint (SWE-Agent pattern) — after ratchet passes + if (newFindings.length < bestFindingsCount) { + bestFindingsCount = newFindings.length; + try { + bestSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: sandbox.worktreePath, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + /* empty */ + } + } + totalCommits += result.commits; previousFindingsCount = nonLowFindings.length; @@ -149,6 +151,16 @@ export async function runRefinementLoop( if (criticalPersistent.length > 0) { lastFindings = criticalPersistent; status = "exhausted"; + // Capture current HEAD so post-loop reset is a no-op (code matches findings) + try { + bestSha = execFileSync("git", ["rev-parse", "HEAD"], { + cwd: sandbox.worktreePath, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + /* empty */ + } break; } if (nonLowFindings.length > 0) { From 5ef372560d69cf755efc8484fc98d80d270a01d6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 18:56:46 +0200 Subject: [PATCH 26/31] fix: recount totalCommits from git after best-state reset (semantic correctness) --- .sandcastle/refinement-loop.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 0159152..0589ebc 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -180,6 +180,11 @@ export async function runRefinementLoop( cwd: sandbox.worktreePath, stdio: "pipe", }); + const commitCount = execFileSync("git", ["rev-list", "--count", "main..HEAD"], { + cwd: sandbox.worktreePath, + encoding: "utf-8", + }).trim(); + totalCommits = parseInt(commitCount, 10) || 0; } catch { /* empty */ } From 177ecbd7214590791126fe1ab2dd3a6e41f02c99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 20:19:27 +0200 Subject: [PATCH 27/31] refactor(.sandcastle): address all 45 quality audit findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - New constants.ts: shared constants (VALIDATION_COMMAND, timeouts, model names) + utilities (getHeadSha, toErrorMessage) - refinement-loop.ts: decompose runRefinementLoop (CC 17→≤10), RoundContext/HashInput param objects, computeFindingKey rename - finalizer.ts: add timeouts to all execFileSync, use runValidation helper consistently - task-source.ts: add timeout, replace char loop with regex, fix terse names - main.ts: extract withTimeout helper, use model constants - types.ts: unexport FindingsSchema (internal only) --- .sandcastle/constants.ts | 61 ++++++ .sandcastle/finalizer.ts | 64 ++++-- .sandcastle/main.ts | 38 ++-- .sandcastle/refinement-loop.ts | 388 +++++++++++++++++++++------------ .sandcastle/task-source.ts | 47 ++-- .sandcastle/types.ts | 23 +- 6 files changed, 417 insertions(+), 204 deletions(-) create mode 100644 .sandcastle/constants.ts diff --git a/.sandcastle/constants.ts b/.sandcastle/constants.ts new file mode 100644 index 0000000..23374bb --- /dev/null +++ b/.sandcastle/constants.ts @@ -0,0 +1,61 @@ +import { execFileSync } from "node:child_process"; + +/** Model identifier used for implementation and critic agents. */ +export const AGENT_MODEL = "github-copilot/claude-sonnet-4.6"; + +/** Number of context lines around a diff hunk used for hash computation. */ +export const CONTEXT_HASH_RADIUS = 3; + +/** Timeout in milliseconds for git operations. */ +export const GIT_TIMEOUT_MS = 30_000; + +/** Number of characters to retain from a SHA for display purposes. */ +export const HASH_PREFIX_LENGTH = 16; + +/** Maximum number of characters captured from stderr before truncation. */ +export const MAX_STDERR_CHARS = 500; + +/** Maximum number of characters allowed in a PR or commit title. */ +export const MAX_TITLE_LENGTH = 200; + +/** Model identifier used for planning and orchestration agents. */ +export const PLANNER_MODEL = "github-copilot/claude-opus-4.6"; + +/** Timeout in milliseconds for git push operations. */ +export const PUSH_TIMEOUT_MS = 60_000; + +/** Timeout in milliseconds for a single sandcastle task execution. */ +export const TASK_TIMEOUT_MS = 15 * 60 * 1000; + +/** Full validation command run after each implementation round. */ +export const VALIDATION_COMMAND = + "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; + +/** Timeout in milliseconds for the validation command. */ +export const VALIDATION_TIMEOUT_MS = 120_000; + +/** + * Returns the current HEAD commit SHA for the given working directory. + * @param cwd - Absolute path to the git repository root. + * @returns The full SHA string, or `null` if the command fails. + */ +export function getHeadSha(cwd: string): null | string { + try { + return execFileSync("git", ["rev-parse", "HEAD"], { + cwd, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + return null; + } +} + +/** + * Converts an unknown thrown value to a human-readable error message. + * @param err - The caught value (may be an `Error` or any other type). + * @returns The `message` property if `err` is an `Error`, otherwise `String(err)`. + */ +export function toErrorMessage(err: unknown): string { + return err instanceof Error ? err.message : String(err); +} diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index 1b5aa42..ab949d3 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -4,11 +4,17 @@ import crypto from "node:crypto"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; +import { + AGENT_MODEL, + GIT_TIMEOUT_MS, + MAX_STDERR_CHARS, + PUSH_TIMEOUT_MS, + toErrorMessage, + VALIDATION_COMMAND, + VALIDATION_TIMEOUT_MS, +} from "./constants.js"; import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS } from "./types.js"; -const VALIDATION_COMMAND = - "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; - /** * Finalizes a task after the refinement loop: validates, retries if needed, rebases, pushes, and creates a PR. * @param spec - The task specification. @@ -34,7 +40,7 @@ export async function finalizeTask( try { await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + agent: sandcastle.opencode(AGENT_MODEL), maxIterations: retryBudget, name: `Implementer #${spec.id} retry`, promptArgs: { @@ -50,14 +56,18 @@ export async function finalizeTask( promptFile: "./.sandcastle/implement-prompt.md", }); } catch (retryErr: unknown) { - const retryMsg = retryErr instanceof Error ? retryErr.message : String(retryErr); + const retryMsg = toErrorMessage(retryErr); console.warn( ` #${spec.id}: Implementer retry threw: ${retryMsg}. Falling through to PR creation.`, ); } try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); + execFileSync("sh", ["-c", VALIDATION_COMMAND], { + cwd, + stdio: "pipe", + timeout: VALIDATION_TIMEOUT_MS, + }); validationPassed = true; console.log(` #${spec.id}: Validation passed after retry round.`); } catch { @@ -66,10 +76,14 @@ export async function finalizeTask( } // Rebase on latest main - const rebaseSucceeded = attemptRebase(cwd, spec); + const rebaseSucceeded = attemptRebase(cwd); if (rebaseSucceeded && validationPassed) { try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); + execFileSync("sh", ["-c", VALIDATION_COMMAND], { + cwd, + stdio: "pipe", + timeout: VALIDATION_TIMEOUT_MS, + }); } catch (postRebaseErr: unknown) { const postRebaseStderr = extractStderr(postRebaseErr); console.warn( @@ -94,7 +108,7 @@ export async function finalizeTask( console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`); prCreated = true; } catch (err: unknown) { - const msg = err instanceof Error ? err.message : String(err); + const msg = toErrorMessage(err); console.error(` #${spec.id}: PR creation failed: ${msg}`); } @@ -105,13 +119,16 @@ export async function finalizeTask( * Fetches origin/main and rebases the current branch onto it. * On failure, aborts the rebase cleanly. * @param cwd - Working directory (worktree path). - * @param _spec - Task specification (unused, reserved for future logging). * @returns `true` if rebase succeeded, `false` otherwise. */ -function attemptRebase(cwd: string, _spec: TaskSpec): boolean { +function attemptRebase(cwd: string): boolean { try { - execFileSync("git", ["fetch", "origin", "main"], { cwd, stdio: "pipe" }); - execFileSync("git", ["rebase", "origin/main"], { cwd, stdio: "pipe" }); + execFileSync("git", ["fetch", "origin", "main"], { + cwd, + stdio: "pipe", + timeout: GIT_TIMEOUT_MS, + }); + execFileSync("git", ["rebase", "origin/main"], { cwd, stdio: "pipe", timeout: GIT_TIMEOUT_MS }); return true; } catch { try { @@ -189,7 +206,7 @@ function buildPrArgs( */ function extractStderr(err: unknown): string { return err instanceof Error && "stderr" in err - ? String((err as { stderr: unknown }).stderr).slice(0, 500) + ? String((err as { stderr: unknown }).stderr).slice(0, MAX_STDERR_CHARS) : ""; } @@ -204,15 +221,20 @@ function extractStderr(err: unknown): string { function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): boolean { if (rebaseSucceeded) { try { - execFileSync("git", ["push", "--force-with-lease"], { cwd, stdio: "pipe" }); + execFileSync("git", ["push", "--force-with-lease"], { + cwd, + stdio: "pipe", + timeout: PUSH_TIMEOUT_MS, + }); return true; } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + const pushMsg = toErrorMessage(pushErr); try { const suffix = crypto.randomBytes(4).toString("hex"); execFileSync("git", ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${suffix}`], { cwd, stdio: "pipe", + timeout: PUSH_TIMEOUT_MS, }); console.warn( ` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-${suffix}`, @@ -226,10 +248,10 @@ function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): bool } } else { try { - execFileSync("git", ["push"], { cwd, stdio: "pipe" }); + execFileSync("git", ["push"], { cwd, stdio: "pipe", timeout: PUSH_TIMEOUT_MS }); return true; } catch (pushErr: unknown) { - const pushMsg = pushErr instanceof Error ? pushErr.message : String(pushErr); + const pushMsg = toErrorMessage(pushErr); console.warn(` #${spec.id}: git push failed after rebase abort: ${pushMsg}`); return false; } @@ -244,7 +266,11 @@ function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): bool */ function runValidation(cwd: string, spec: TaskSpec): boolean { try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe" }); + execFileSync("sh", ["-c", VALIDATION_COMMAND], { + cwd, + stdio: "pipe", + timeout: VALIDATION_TIMEOUT_MS, + }); return true; } catch (err: unknown) { const stderr = extractStderr(err); diff --git a/.sandcastle/main.ts b/.sandcastle/main.ts index 3ca2d55..74122d9 100644 --- a/.sandcastle/main.ts +++ b/.sandcastle/main.ts @@ -2,6 +2,7 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; import { ConcurrencyPool } from "./concurrency-pool.js"; +import { TASK_TIMEOUT_MS } from "./constants.js"; import { finalizeTask } from "./finalizer.js"; import { runRefinementLoop } from "./refinement-loop.js"; import { GithubIssueSource } from "./task-source.js"; @@ -11,7 +12,25 @@ const BRANCH_PREFIX = "agent/issue"; const ISSUE_LABEL = "sandcastle"; const MAX_PARALLEL = 3; const DOCKER_IMAGE = "sandcastle-sap-ai"; -const TASK_TIMEOUT_MS = 15 * 60 * 1000; + +/** + * Races a promise against a timeout, rejecting with a descriptive error if the timeout fires first. + * @param promise - The promise to race against the timeout. + * @param ms - Timeout duration in milliseconds. + * @param label - Human-readable label used in the timeout error message. + * @returns The resolved value of the promise if it completes before the timeout. + */ +function withTimeout(promise: Promise, ms: number, label: string): Promise { + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => { + reject(new Error(`${label} timed out after ${String(ms)}ms`)); + }, ms).unref(); + }); + timeoutPromise.catch(() => { + /* suppress unhandled rejection when task completes before timeout */ + }); + return Promise.race([promise, timeoutPromise]); +} const source = new GithubIssueSource({ branchPrefix: BRANCH_PREFIX, @@ -29,7 +48,7 @@ if (tasks.length === 0) { const settled = await Promise.allSettled( tasks.map((spec) => pool.run(() => - Promise.race([ + withTimeout( (async () => { await using sandbox = await sandcastle.createSandbox({ branch: spec.branch, @@ -54,18 +73,9 @@ if (tasks.length === 0) { return { prCreated, spec }; })(), - (() => { - const p = new Promise((_, reject) => { - setTimeout(() => { - reject(new Error(`Task #${spec.id} timed out after ${String(TASK_TIMEOUT_MS)}ms`)); - }, TASK_TIMEOUT_MS).unref(); - }); - p.catch(() => { - /* suppress unhandled rejection when task completes before timeout */ - }); - return p; - })(), - ]), + TASK_TIMEOUT_MS, + `Task #${spec.id}`, + ), ), ), ); diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index 0589ebc..ddfc966 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -6,11 +6,9 @@ import { join, sep } from "node:path"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; +import { VALIDATION_COMMAND } from "./constants.js"; import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS, parseFindingsSafe } from "./types.js"; -const VALIDATION_COMMAND = - "npm run type-check && npm run test && npm run test:node && npm run test:edge && npm run prettier-check && npm run lint && npm run build && npm run check-build && npm run build:v2 && npm run check-build:v2"; - /** Options for configuring the refinement loop. */ export interface RefinementLoopOptions { /** Budget of iterations per round (flat constant applied to every round). */ @@ -21,6 +19,53 @@ export interface RefinementLoopOptions { onRoundComplete?: (round: number, findings: Finding[]) => void; } +/** Result of a convergence check. */ +interface ConvergenceResult { + /** Best SHA to restore (empty string = no update). */ + bestSha: string; + /** Updated last findings. */ + lastFindings: Finding[]; + /** New loop status. */ + status: LoopStatus; +} + +/** + * Input descriptor for hashing a window of source lines around a finding. + */ +interface HashInput { + /** Working directory (worktree path) for resolving the file. */ + readonly cwd: string; + /** Relative file path of the finding. */ + readonly file: string; + /** Line number of the finding (1-indexed). */ + readonly line: number; +} + +/** + * Context passed to the quality ratchet check. + * Groups the per-round identifiers needed for regression detection and rollback. + */ +interface RatchetContext { + /** SHA of HEAD before the implementer ran (used for rollback). */ + readonly beforeSha: string; + /** Working directory for git operations. */ + readonly cwd: string; + /** Current round number (1-indexed). */ + readonly round: number; + /** The task specification. */ + readonly spec: TaskSpec; +} + +/** Resolved loop options with defaults applied. */ +interface ResolvedLoopOptions { + /** Iteration budget per round. */ + budget: number; + /** Maximum number of rounds. */ + maxRounds: number; + /** Optional round-complete callback (no-op if not provided). */ + onRoundComplete: (round: number, findings: Finding[]) => void; +} + /** Result of a single implement↔critic round. */ interface RoundResult { /** SHA of HEAD before the implementer ran. */ @@ -43,8 +88,7 @@ export async function runRefinementLoop( sandbox: SandboxInstance, opts?: RefinementLoopOptions, ): Promise { - const maxRounds = opts?.maxRounds ?? MAX_CRITIC_ROUNDS; - const budget = opts?.iterationBudget ?? ITERATION_BUDGET_PER_ROUND; + const { budget, maxRounds, onRoundComplete } = resolveLoopOptions(opts); const seenKeys = new Set(); let lastFindings: Finding[] = []; @@ -64,153 +108,162 @@ export async function runRefinementLoop( const result = await executeRound(spec, sandbox, round, budget, lastFindings); - if (round === 1 && result.commits === 0) { - console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`); - status = "skipped"; + const earlyExit = checkEarlyExit(spec, round, result, totalCommits); + if (earlyExit !== null) { + totalCommits = earlyExit.totalCommits; + status = earlyExit.status; break; } - if (result.findings === null) { - totalCommits += result.commits; - console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); - status = "failed"; - break; - } + if (result.findings === null) break; + const findings: Finding[] = result.findings; - if (round > 1 && result.commits === 0) { - status = "exhausted"; + if (result.commits > 0 && runMidLoopValidation(sandbox.worktreePath)) { + totalCommits += result.commits; + status = "converged"; break; } - // Fix 1: Validation in-loop (ARCS pattern) — deterministic convergence signal - if (result.commits > 0) { - try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { - cwd: sandbox.worktreePath, - stdio: "pipe", - timeout: 120_000, - }); - // Validation passed mid-loop — deterministic convergence - totalCommits += result.commits; - status = "converged"; - break; - } catch { - // Validation failed — continue to critic for feedback - } - } - - // Dedup via context hash const cwd = sandbox.worktreePath; - const newFindings = deduplicateFindings(result.findings, seenKeys, cwd); + const newFindings = deduplicateFindings(findings, seenKeys, cwd); console.log( - ` #${spec.id}: ${String(result.findings.length)} findings, ${String(newFindings.length)} new`, + ` #${spec.id}: ${String(findings.length)} findings, ${String(newFindings.length)} new`, ); - // Quality ratchet: rollback if findings increased (regression) - const nonLowFindings = result.findings.filter((f) => f.confidence !== "LOW"); + const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); if ( checkQualityRatchet( - spec, - round, + { beforeSha: result.beforeSha, cwd, round, spec }, nonLowFindings.length, previousFindingsCount, - result.beforeSha, - cwd, ) ) { status = "exhausted"; break; } - // Best-state checkpoint (SWE-Agent pattern) — after ratchet passes if (newFindings.length < bestFindingsCount) { bestFindingsCount = newFindings.length; - try { - bestSha = execFileSync("git", ["rev-parse", "HEAD"], { - cwd: sandbox.worktreePath, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); - } catch { - /* empty */ - } + bestSha = captureHeadSha(cwd); } totalCommits += result.commits; previousFindingsCount = nonLowFindings.length; + onRoundComplete(round, findings); - opts?.onRoundComplete?.(round, result.findings); - - if (newFindings.length === 0) { - // Severity-weighted convergence (OpenHands pattern): - // Don't converge if CRITICAL/HIGH findings persist, even if already seen - const criticalPersistent = result.findings.filter( - (f) => (f.severity === "CRITICAL" || f.severity === "HIGH") && f.confidence !== "LOW", - ); - if (criticalPersistent.length > 0) { - lastFindings = criticalPersistent; - status = "exhausted"; - // Capture current HEAD so post-loop reset is a no-op (code matches findings) - try { - bestSha = execFileSync("git", ["rev-parse", "HEAD"], { - cwd: sandbox.worktreePath, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); - } catch { - /* empty */ - } - break; - } - if (nonLowFindings.length > 0) { - lastFindings = nonLowFindings; - } - status = "converged"; + const convergenceResult = checkConvergence(cwd, findings, newFindings, nonLowFindings); + if (convergenceResult !== null) { + lastFindings = convergenceResult.lastFindings; + status = convergenceResult.status; + bestSha = convergenceResult.bestSha; break; } lastFindings = newFindings; } - // Best-state reset: if not converged, restore best intermediate state - if (status !== "converged" && /^[0-9a-f]{40}$/.test(bestSha)) { - try { - execFileSync("git", ["reset", "--hard", bestSha], { - cwd: sandbox.worktreePath, - stdio: "pipe", - }); - const commitCount = execFileSync("git", ["rev-list", "--count", "main..HEAD"], { - cwd: sandbox.worktreePath, - encoding: "utf-8", - }).trim(); - totalCommits = parseInt(commitCount, 10) || 0; - } catch { - /* empty */ - } + if (shouldResetToBest(status, bestSha)) { + totalCommits = resetToBestState(sandbox.worktreePath, bestSha, totalCommits); } return { lastFindings, roundsCompleted, status, totalCommits }; } /** - * Checks whether findings regressed compared to the previous round and rolls back if so. + * Captures the current HEAD SHA, returning empty string on failure. + * @param cwd - Working directory for git operations. + * @returns The HEAD SHA or empty string. + */ +function captureHeadSha(cwd: string): string { + try { + return execFileSync("git", ["rev-parse", "HEAD"], { + cwd, + encoding: "utf-8", + stdio: ["pipe", "pipe", "pipe"], + }).trim(); + } catch { + return ""; + } +} + +/** + * Checks whether the current round converged (no new findings). + * @param cwd - Working directory for git operations. + * @param allFindings - All findings from the critic. + * @param newFindings - Deduplicated new findings. + * @param nonLowFindings - Non-LOW-confidence findings. + * @returns A ConvergenceResult if the loop should break, or null to continue. + */ +function checkConvergence( + cwd: string, + allFindings: Finding[], + newFindings: Finding[], + nonLowFindings: Finding[], +): ConvergenceResult | null { + if (newFindings.length !== 0) return null; + + // Severity-weighted convergence (OpenHands pattern): + // Don't converge if CRITICAL/HIGH findings persist, even if already seen + const criticalPersistent = allFindings.filter( + (f) => (f.severity === "CRITICAL" || f.severity === "HIGH") && f.confidence !== "LOW", + ); + if (criticalPersistent.length > 0) { + // Capture current HEAD so post-loop reset is a no-op (code matches findings) + return { + bestSha: captureHeadSha(cwd), + lastFindings: criticalPersistent, + status: "exhausted", + }; + } + + return { + bestSha: "", + lastFindings: nonLowFindings.length > 0 ? nonLowFindings : [], + status: "converged", + }; +} + +/** + * Checks whether the round result warrants an early exit from the loop. * @param spec - The task specification. * @param round - Current round number. + * @param result - The round result. + * @param totalCommits - Running total of commits before this round. + * @returns An object with updated status and totalCommits if early exit, or null to continue. + */ +function checkEarlyExit( + spec: TaskSpec, + round: number, + result: RoundResult, + totalCommits: number, +): null | { status: LoopStatus; totalCommits: number } { + if (round === 1 && result.commits === 0) { + console.warn(` #${spec.id}: 0 commits on round 1. Skipping.`); + return { status: "skipped", totalCommits }; + } + if (result.findings === null) { + console.warn(` #${spec.id}: Critic failed twice. Breaking (non-converged).`); + return { status: "failed", totalCommits: totalCommits + result.commits }; + } + if (round > 1 && result.commits === 0) { + return { status: "exhausted", totalCommits }; + } + return null; +} + +/** + * @param ctx - Ratchet context containing spec, round, beforeSha, and cwd. * @param findingsCount - Number of non-LOW findings this round. * @param previousCount - Number of non-LOW findings from the previous round. - * @param beforeSha - SHA to reset to on regression. - * @param cwd - Working directory for git operations. * @returns True if a regression was detected and rollback performed. */ function checkQualityRatchet( - spec: TaskSpec, - round: number, + ctx: RatchetContext, findingsCount: number, previousCount: number, - beforeSha: string, - cwd: string, ): boolean { + const { beforeSha, cwd, round, spec } = ctx; if (round <= 2 || findingsCount <= previousCount) { return false; } @@ -236,6 +289,31 @@ function checkQualityRatchet( return true; } +/** + * Computes a deduplication key for a finding using a context hash of surrounding lines. + * @param f - Finding to compute a key for. + * @param cwd - Working directory (worktree path) for reading file context. + * @param fileCache - Optional cache of file contents keyed by resolved path. + * @returns Composite dedup key. + */ +function computeFindingKey(f: Finding, cwd: string, fileCache?: Map): string { + if (!f.file || f.line == null) { + const normalizedTitle = f.title + .toLowerCase() + .replace(/[^\w\s]/g, "") + .replace(/\s+/g, " ") + .trim(); + const titleHash = crypto + .createHash("sha256") + .update(normalizedTitle) + .digest("hex") + .slice(0, 16); + return `${f.file || "global"}::${f.category}::${titleHash}`; + } + const contextHash = hashContextLines({ cwd, file: f.file, line: f.line }, 3, fileCache); + return `${f.file}::${f.category}::${contextHash}`; +} + /** * Filters findings by confidence and deduplicates against previously seen keys. * @param findings - Raw findings from the critic. @@ -246,10 +324,10 @@ function checkQualityRatchet( function deduplicateFindings(findings: Finding[], seenKeys: Set, cwd: string): Finding[] { const fileCache = new Map(); const newFindings = findings.filter( - (f) => f.confidence !== "LOW" && !seenKeys.has(findingKey(f, cwd, fileCache)), + (f) => f.confidence !== "LOW" && !seenKeys.has(computeFindingKey(f, cwd, fileCache)), ); for (const f of newFindings) { - seenKeys.add(findingKey(f, cwd, fileCache)); + seenKeys.add(computeFindingKey(f, cwd, fileCache)); } return newFindings; } @@ -285,9 +363,9 @@ async function executeRound( } // Implementer - let impl: Awaited>; + let implementerResult: Awaited>; try { - impl = await sandbox.run({ + implementerResult = await sandbox.run({ agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), maxIterations: budget, name: `Implementer #${spec.id} R${String(round)}`, @@ -317,50 +395,22 @@ async function executeRound( findings = null; } - return { beforeSha, commits: impl.commits.length, findings }; -} - -/** - * Computes a deduplication key for a finding using a context hash of surrounding lines. - * @param f - Finding to compute a key for. - * @param cwd - Working directory (worktree path) for reading file context. - * @param fileCache - Optional cache of file contents keyed by resolved path. - * @returns Composite dedup key. - */ -function findingKey(f: Finding, cwd: string, fileCache?: Map): string { - if (!f.file || f.line == null) { - const normalizedTitle = f.title - .toLowerCase() - .replace(/[^\w\s]/g, "") - .replace(/\s+/g, " ") - .trim(); - const titleHash = crypto - .createHash("sha256") - .update(normalizedTitle) - .digest("hex") - .slice(0, 16); - return `${f.file || "global"}::${f.category}::${titleHash}`; - } - const contextHash = hashContextLines(cwd, f.file, f.line, 3, fileCache); - return `${f.file}::${f.category}::${contextHash}`; + return { beforeSha, commits: implementerResult.commits.length, findings }; } /** * Hashes a window of lines around the finding for dedup stability. - * @param cwd - Working directory. - * @param file - Relative file path. - * @param line - Line number of the finding. + * @param input - Hash input containing cwd, file, and line. * @param radius - Number of lines above/below to include in the context window. * @param fileCache - Optional cache of file contents keyed by resolved path. * @returns Truncated SHA-256 hex digest. */ function hashContextLines( - cwd: string, - file: string, - line: number, + input: HashInput, radius: number, fileCache?: Map, ): string { + const { cwd, file, line } = input; try { const fullPath = realpathSync(join(cwd, file)); if (!fullPath.startsWith(realpathSync(cwd) + sep)) { @@ -419,6 +469,42 @@ function parseFindings(stdout: string, nonce: string): Finding[] | null { return null; } +/** + * Resets the worktree to the best intermediate state and recounts commits. + * @param cwd - Working directory for git operations. + * @param bestSha - The SHA to reset to. + * @param currentCommits - Current total commits (fallback if recount fails). + * @returns Updated total commit count. + */ +function resetToBestState(cwd: string, bestSha: string, currentCommits: number): number { + try { + execFileSync("git", ["reset", "--hard", bestSha], { + cwd, + stdio: "pipe", + }); + const commitCount = execFileSync("git", ["rev-list", "--count", "main..HEAD"], { + cwd, + encoding: "utf-8", + }).trim(); + return parseInt(commitCount, 10) || 0; + } catch { + return currentCommits; + } +} + +/** + * Resolves loop options, applying defaults for missing values. + * @param opts - Optional loop options. + * @returns Resolved options with all fields populated. + */ +function resolveLoopOptions(opts: RefinementLoopOptions | undefined): ResolvedLoopOptions { + return { + budget: opts?.iterationBudget ?? ITERATION_BUDGET_PER_ROUND, + maxRounds: opts?.maxRounds ?? MAX_CRITIC_ROUNDS, + onRoundComplete: opts?.onRoundComplete ?? (() => undefined), + }; +} + /** * Runs the critic agent, retrying once on parse failure. * @param sandbox - The sandcastle sandbox instance. @@ -463,3 +549,31 @@ async function runCritic( return findings; } + +/** + * Runs the mid-loop validation command (ARCS pattern). + * @param cwd - Working directory for the validation command. + * @returns True if validation passed (deterministic convergence), false otherwise. + */ +function runMidLoopValidation(cwd: string): boolean { + try { + execFileSync("sh", ["-c", VALIDATION_COMMAND], { + cwd, + stdio: "pipe", + timeout: 120_000, + }); + return true; + } catch { + return false; + } +} + +/** + * Returns true if the best-state reset should be applied after the loop. + * @param status - Final loop status. + * @param bestSha - Best intermediate SHA (empty string if none captured). + * @returns True if reset should be applied. + */ +function shouldResetToBest(status: LoopStatus, bestSha: string): boolean { + return status !== "converged" && /^[0-9a-f]{40}$/.test(bestSha); +} diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 8773b0d..c9f5ffa 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -5,6 +5,8 @@ import { z } from "zod"; import type { TaskSpec } from "./types.js"; +import { GIT_TIMEOUT_MS, MAX_TITLE_LENGTH, toErrorMessage } from "./constants.js"; + const RawIssueSchema = z.object({ body: z .string() @@ -137,26 +139,30 @@ export class GithubIssueSource implements TaskSource { "--label", this.label, ], - { encoding: "utf-8" }, + { encoding: "utf-8", timeout: GIT_TIMEOUT_MS }, + ); + } catch (err) { + console.error( + `Failed to fetch issues: ${toErrorMessage(err)}. Ensure gh is installed and authenticated.`, ); - } catch { - console.error("Failed to fetch issues. Ensure gh is installed and authenticated."); process.exit(1); } let rawIssues: z.infer; try { rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); - } catch { - console.error("Failed to parse issues JSON. Unexpected format from gh CLI."); + } catch (err) { + console.error( + `Failed to parse issues JSON: ${toErrorMessage(err)}. Unexpected format from gh CLI.`, + ); process.exit(1); } - return rawIssues.map((i) => ({ - body: sanitizeForPrompt(i.body), - labels: i.labels.map((l) => l.name), - number: i.number, - title: sanitizeForPrompt(i.title), + return rawIssues.map((issue) => ({ + body: sanitizeForPrompt(issue.body), + labels: issue.labels.map((label) => label.name), + number: issue.number, + title: sanitizeForPrompt(issue.title), })); } @@ -178,28 +184,27 @@ export class GithubIssueSource implements TaskSource { if (typeof item.branch !== "string" || !this.branchPattern.test(item.branch)) return false; if (typeof item.title !== "string") return false; - if (item.title.length > 200) return false; - for (let ci = 0; ci < item.title.length; ci++) { - if (item.title.charCodeAt(ci) < 0x20) return false; - } + if (item.title.length > MAX_TITLE_LENGTH) return false; + // eslint-disable-next-line no-control-regex + if (/[\x00-\x1f]/.test(item.title)) return false; return true; }, ); - const issueMap = new Map(issuesJson.map((i) => [String(i.number), i])); + const issueMap = new Map(issuesJson.map((issue) => [String(issue.number), issue])); return validated - .map((v) => { - const source = issueMap.get(v.id); + .map((entry) => { + const source = issueMap.get(entry.id); if (!source) return null; return { - ...v, + ...entry, body: source.body, labels: source.labels, }; }) - .filter((v): v is NonNullable => v !== null); - } catch { - console.error("Planner produced invalid JSON. Retrying."); + .filter((entry): entry is NonNullable => entry !== null); + } catch (err) { + console.error(`Planner produced invalid JSON: ${toErrorMessage(err)}. Retrying.`); return null; } } diff --git a/.sandcastle/types.ts b/.sandcastle/types.ts index 300df94..9ebc589 100644 --- a/.sandcastle/types.ts +++ b/.sandcastle/types.ts @@ -2,6 +2,16 @@ import type * as sandcastle from "@ai-hero/sandcastle"; import { z } from "zod"; +/** Result of post-loop finalization. */ +export interface FinalizeResult { + /** Whether the PR was marked as draft. */ + isDraft: boolean; + /** Whether a PR was successfully created. */ + prCreated: boolean; + /** Whether validation passed. */ + validationPassed: boolean; +} + /** Zod schema for a single critic finding. */ export const FindingSchema = z.object({ category: z.enum(["security", "logic", "performance", "architecture", "style"]), @@ -14,19 +24,6 @@ export const FindingSchema = z.object({ title: z.string(), }); -/** Zod schema for an array of critic findings. */ -export const FindingsSchema = z.array(FindingSchema); - -/** Result of post-loop finalization. */ -export interface FinalizeResult { - /** Whether the PR was marked as draft. */ - isDraft: boolean; - /** Whether a PR was successfully created. */ - prCreated: boolean; - /** Whether validation passed. */ - validationPassed: boolean; -} - /** A single critic finding parsed from agent output. */ export type Finding = z.infer; From 69ca4fa826a1320efaef48b5ae71d48ff20749c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 20:40:42 +0200 Subject: [PATCH 28/31] fix: use constants from constants.ts + add planner timeout (multi-agent audit findings) --- .sandcastle/refinement-loop.ts | 28 +++++++++++++++++++--------- .sandcastle/task-source.ts | 21 ++++++++++++++++++--- 2 files changed, 37 insertions(+), 12 deletions(-) diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index ddfc966..bb04f2c 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -6,7 +6,13 @@ import { join, sep } from "node:path"; import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from "./types.js"; -import { VALIDATION_COMMAND } from "./constants.js"; +import { + AGENT_MODEL, + CONTEXT_HASH_RADIUS, + HASH_PREFIX_LENGTH, + VALIDATION_COMMAND, + VALIDATION_TIMEOUT_MS, +} from "./constants.js"; import { ITERATION_BUDGET_PER_ROUND, MAX_CRITIC_ROUNDS, parseFindingsSafe } from "./types.js"; /** Options for configuring the refinement loop. */ @@ -307,10 +313,14 @@ function computeFindingKey(f: Finding, cwd: string, fileCache?: Map>; try { implementerResult = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + agent: sandcastle.opencode(AGENT_MODEL), maxIterations: budget, name: `Implementer #${spec.id} R${String(round)}`, promptArgs: { @@ -434,13 +444,13 @@ function hashContextLines( .createHash("sha256") .update(`${file}:${String(line)}:${normalized}`) .digest("hex") - .slice(0, 16); + .slice(0, HASH_PREFIX_LENGTH); } catch { return crypto .createHash("sha256") .update(`${file}:${String(line)}:fallback`) .digest("hex") - .slice(0, 16); + .slice(0, HASH_PREFIX_LENGTH); } } @@ -520,7 +530,7 @@ async function runCritic( nonce: string, ): Promise { let critic = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + agent: sandcastle.opencode(AGENT_MODEL), maxIterations: 1, name: `Critic #${spec.id} R${String(round)}`, promptArgs: { @@ -535,7 +545,7 @@ async function runCritic( if (findings === null) { console.warn(` #${spec.id}: Critic parse failed. Retrying.`); critic = await sandbox.run({ - agent: sandcastle.opencode("github-copilot/claude-sonnet-4.6"), + agent: sandcastle.opencode(AGENT_MODEL), maxIterations: 1, name: `Critic #${spec.id} R${String(round)} retry`, promptArgs: { @@ -560,7 +570,7 @@ function runMidLoopValidation(cwd: string): boolean { execFileSync("sh", ["-c", VALIDATION_COMMAND], { cwd, stdio: "pipe", - timeout: 120_000, + timeout: VALIDATION_TIMEOUT_MS, }); return true; } catch { diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index c9f5ffa..1f31f32 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -5,7 +5,13 @@ import { z } from "zod"; import type { TaskSpec } from "./types.js"; -import { GIT_TIMEOUT_MS, MAX_TITLE_LENGTH, toErrorMessage } from "./constants.js"; +import { + GIT_TIMEOUT_MS, + MAX_TITLE_LENGTH, + PLANNER_MODEL, + TASK_TIMEOUT_MS, + toErrorMessage, +} from "./constants.js"; const RawIssueSchema = z.object({ body: z @@ -74,8 +80,8 @@ export class GithubIssueSource implements TaskSource { for (let attempt = 1; attempt <= this.maxRetries; attempt++) { console.log(`\n=== Planner attempt ${String(attempt)}/${String(this.maxRetries)} ===\n`); - const plan = await sandcastle.run({ - agent: sandcastle.opencode("github-copilot/claude-opus-4.6"), + const planPromise = sandcastle.run({ + agent: sandcastle.opencode(PLANNER_MODEL), maxIterations: 1, name: "Planner", promptArgs: { @@ -85,6 +91,15 @@ export class GithubIssueSource implements TaskSource { promptFile: "./.sandcastle/plan-prompt.md", sandbox: docker({ imageName: this.dockerImage }), }); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => { + reject(new Error("Planner timed out")); + }, TASK_TIMEOUT_MS).unref(); + }); + timeoutPromise.catch(() => { + /* suppress unhandled rejection when planner completes before timeout */ + }); + const plan = await Promise.race([planPromise, timeoutPromise]); const planMatches = [...plan.stdout.matchAll(/([\s\S]*?)<\/plan>/g)]; const planMatch = planMatches.at(-1); From d8b3454a63090692f0d0b3eeb3665fc2f38982d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 21:51:06 +0200 Subject: [PATCH 29/31] =?UTF-8?q?refactor(.sandcastle):=20harden=20prompts?= =?UTF-8?q?=20=E2=80=94=20cap=20findings,=20add=20known=20decisions,=20sco?= =?UTF-8?q?pe=20preference?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .sandcastle/critic-prompt.md | 10 +++++++++- .sandcastle/plan-prompt.md | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/.sandcastle/critic-prompt.md b/.sandcastle/critic-prompt.md index 98ec518..fb6a997 100644 --- a/.sandcastle/critic-prompt.md +++ b/.sandcastle/critic-prompt.md @@ -6,7 +6,7 @@ Analyze the implementation on branch `{{BRANCH}}` and produce structured finding Run `git diff main...{{BRANCH}}` to see all changes. Examine the diff carefully. For each issue found, produce a structured finding. -Read `CONTRIBUTING.md` for the project's coding standards. +Read `AGENTS.md` and `CONTRIBUTING.md` for the project's coding standards. ## Output Format @@ -39,6 +39,8 @@ If no issues are found, output: ## Rules +- Report ≤5 findings. HIGH and CRITICAL only. Omit LOW/MEDIUM unless zero higher-severity issues exist. +- If >5 HIGH/CRITICAL issues exist, report the top 5 and add a summary note in the last finding's description. - Do NOT modify any files. Do NOT commit. Do NOT push. - Only report issues in the CHANGED code (not pre-existing issues). - Use HIGH confidence only when you've verified the issue by reading the relevant code. @@ -47,6 +49,12 @@ If no issues are found, output: - Focus on: logic errors, missing edge cases, security issues, type safety violations, test gaps. - Do NOT report formatting issues (prettier handles those). +## Known Design Decisions (do not flag) + +- Mid-loop validation convergence bypasses critic (ARCS pattern — deterministic tests > subjective review). +- `process.exit()` at script end kills timed-out sandboxes (no cooperative cancellation available in sandcastle). +- Content-addressed dedup hash includes line number (collision reduction tradeoff, bounded by hard cap). + ## Completion After outputting the findings, output: diff --git a/.sandcastle/plan-prompt.md b/.sandcastle/plan-prompt.md index 5d34138..27be837 100644 --- a/.sandcastle/plan-prompt.md +++ b/.sandcastle/plan-prompt.md @@ -31,6 +31,7 @@ Read `AGENTS.md` for project conventions. - Exclude issues labeled `wontfix`, `duplicate`, or `question`. - Exclude issues that depend on another open issue (mention "blocked by #N" or similar). +- Prefer issues where scope fits a single-file change over cross-cutting refactors. - If every issue is blocked, include the single highest-priority candidate (fewest/weakest dependencies). - If no actionable issues exist, output: From 189f6555a143c83f3861a8f8a6c319218bea37c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 22:26:43 +0200 Subject: [PATCH 30/31] perf(.sandcastle): convert execFileSync to async execFileAsync (unblock event loop) Replace all blocking execFileSync calls with util.promisify(execFile) to enable true parallelism between tasks during subprocess execution. - constants.ts: add execFileAsync export, convert getHeadSha to async - refinement-loop.ts: captureHeadSha, checkQualityRatchet, checkConvergence, runMidLoopValidation, resetToBestState all async - finalizer.ts: runValidation, attemptRebase, pushBranch all async - task-source.ts: fetchAndSanitizeIssues async readFileSync/realpathSync stay sync (<1ms local I/O, no benefit from async). maxBuffer: 8MB added to validation and gh issue list calls. --- .sandcastle/constants.ts | 15 +++++--- .sandcastle/finalizer.ts | 53 +++++++++++++------------- .sandcastle/refinement-loop.ts | 69 +++++++++++++++------------------- .sandcastle/task-source.ts | 23 +++++++----- 4 files changed, 79 insertions(+), 81 deletions(-) diff --git a/.sandcastle/constants.ts b/.sandcastle/constants.ts index 23374bb..c47bed5 100644 --- a/.sandcastle/constants.ts +++ b/.sandcastle/constants.ts @@ -1,4 +1,5 @@ -import { execFileSync } from "node:child_process"; +import { execFile } from "node:child_process"; +import util from "node:util"; /** Model identifier used for implementation and critic agents. */ export const AGENT_MODEL = "github-copilot/claude-sonnet-4.6"; @@ -6,6 +7,9 @@ export const AGENT_MODEL = "github-copilot/claude-sonnet-4.6"; /** Number of context lines around a diff hunk used for hash computation. */ export const CONTEXT_HASH_RADIUS = 3; +/** Async execFile — does not block the event loop. Same error shape as execFileSync. */ +export const execFileAsync = util.promisify(execFile); + /** Timeout in milliseconds for git operations. */ export const GIT_TIMEOUT_MS = 30_000; @@ -39,13 +43,12 @@ export const VALIDATION_TIMEOUT_MS = 120_000; * @param cwd - Absolute path to the git repository root. * @returns The full SHA string, or `null` if the command fails. */ -export function getHeadSha(cwd: string): null | string { +export async function getHeadSha(cwd: string): Promise { try { - return execFileSync("git", ["rev-parse", "HEAD"], { + const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], { cwd, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); + }); + return stdout.trim(); } catch { return null; } diff --git a/.sandcastle/finalizer.ts b/.sandcastle/finalizer.ts index ab949d3..a164565 100644 --- a/.sandcastle/finalizer.ts +++ b/.sandcastle/finalizer.ts @@ -1,11 +1,11 @@ import * as sandcastle from "@ai-hero/sandcastle"; -import { execFileSync } from "node:child_process"; import crypto from "node:crypto"; import type { FinalizeResult, LoopResult, SandboxInstance, TaskSpec } from "./types.js"; import { AGENT_MODEL, + execFileAsync, GIT_TIMEOUT_MS, MAX_STDERR_CHARS, PUSH_TIMEOUT_MS, @@ -29,7 +29,7 @@ export async function finalizeTask( sandbox: SandboxInstance, cwd: string, ): Promise { - let validationPassed = runValidation(cwd, spec); + let validationPassed = await runValidation(cwd, spec); // Retry one more round if validation failed and budget remains if (!validationPassed && loopResult.roundsCompleted < MAX_CRITIC_ROUNDS) { @@ -63,9 +63,9 @@ export async function finalizeTask( } try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { + await execFileAsync("sh", ["-c", VALIDATION_COMMAND], { cwd, - stdio: "pipe", + maxBuffer: 8 * 1024 * 1024, timeout: VALIDATION_TIMEOUT_MS, }); validationPassed = true; @@ -76,12 +76,12 @@ export async function finalizeTask( } // Rebase on latest main - const rebaseSucceeded = attemptRebase(cwd); + const rebaseSucceeded = await attemptRebase(cwd); if (rebaseSucceeded && validationPassed) { try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { + await execFileAsync("sh", ["-c", VALIDATION_COMMAND], { cwd, - stdio: "pipe", + maxBuffer: 8 * 1024 * 1024, timeout: VALIDATION_TIMEOUT_MS, }); } catch (postRebaseErr: unknown) { @@ -94,7 +94,7 @@ export async function finalizeTask( } // Push - const pushSucceeded = pushBranch(cwd, spec, rebaseSucceeded); + const pushSucceeded = await pushBranch(cwd, spec, rebaseSucceeded); if (!pushSucceeded) { console.warn(` #${spec.id}: Push did not succeed; PR may reference unpushed commits.`); } @@ -104,7 +104,7 @@ export async function finalizeTask( let prCreated = false; try { - execFileSync("gh", prArgs, { cwd, encoding: "utf-8", stdio: "pipe" }); + await execFileAsync("gh", prArgs, { cwd, maxBuffer: 8 * 1024 * 1024 }); console.log(` #${spec.id}: PR created${isDraft ? " (draft)" : ""}.`); prCreated = true; } catch (err: unknown) { @@ -121,18 +121,17 @@ export async function finalizeTask( * @param cwd - Working directory (worktree path). * @returns `true` if rebase succeeded, `false` otherwise. */ -function attemptRebase(cwd: string): boolean { +async function attemptRebase(cwd: string): Promise { try { - execFileSync("git", ["fetch", "origin", "main"], { + await execFileAsync("git", ["fetch", "origin", "main"], { cwd, - stdio: "pipe", timeout: GIT_TIMEOUT_MS, }); - execFileSync("git", ["rebase", "origin/main"], { cwd, stdio: "pipe", timeout: GIT_TIMEOUT_MS }); + await execFileAsync("git", ["rebase", "origin/main"], { cwd, timeout: GIT_TIMEOUT_MS }); return true; } catch { try { - execFileSync("git", ["rebase", "--abort"], { cwd, stdio: "pipe" }); + await execFileAsync("git", ["rebase", "--abort"], { cwd }); } catch { /* empty */ } @@ -218,12 +217,11 @@ function extractStderr(err: unknown): string { * @param rebaseSucceeded - Whether the preceding rebase completed successfully. * @returns `true` if the primary push succeeded, `false` otherwise. */ -function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): boolean { +async function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): Promise { if (rebaseSucceeded) { try { - execFileSync("git", ["push", "--force-with-lease"], { + await execFileAsync("git", ["push", "--force-with-lease"], { cwd, - stdio: "pipe", timeout: PUSH_TIMEOUT_MS, }); return true; @@ -231,11 +229,14 @@ function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): bool const pushMsg = toErrorMessage(pushErr); try { const suffix = crypto.randomBytes(4).toString("hex"); - execFileSync("git", ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${suffix}`], { - cwd, - stdio: "pipe", - timeout: PUSH_TIMEOUT_MS, - }); + await execFileAsync( + "git", + ["push", "origin", `HEAD:refs/heads/rescue/${spec.branch}-${suffix}`], + { + cwd, + timeout: PUSH_TIMEOUT_MS, + }, + ); console.warn( ` #${spec.id}: Push failed. Commits preserved at rescue/${spec.branch}-${suffix}`, ); @@ -248,7 +249,7 @@ function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): bool } } else { try { - execFileSync("git", ["push"], { cwd, stdio: "pipe", timeout: PUSH_TIMEOUT_MS }); + await execFileAsync("git", ["push"], { cwd, timeout: PUSH_TIMEOUT_MS }); return true; } catch (pushErr: unknown) { const pushMsg = toErrorMessage(pushErr); @@ -264,11 +265,11 @@ function pushBranch(cwd: string, spec: TaskSpec, rebaseSucceeded: boolean): bool * @param spec - The task specification (used for logging). * @returns `true` if validation passed, `false` otherwise. */ -function runValidation(cwd: string, spec: TaskSpec): boolean { +async function runValidation(cwd: string, spec: TaskSpec): Promise { try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { + await execFileAsync("sh", ["-c", VALIDATION_COMMAND], { cwd, - stdio: "pipe", + maxBuffer: 8 * 1024 * 1024, timeout: VALIDATION_TIMEOUT_MS, }); return true; diff --git a/.sandcastle/refinement-loop.ts b/.sandcastle/refinement-loop.ts index bb04f2c..3e852a3 100644 --- a/.sandcastle/refinement-loop.ts +++ b/.sandcastle/refinement-loop.ts @@ -1,5 +1,4 @@ import * as sandcastle from "@ai-hero/sandcastle"; -import { execFileSync } from "node:child_process"; import crypto from "node:crypto"; import { readFileSync, realpathSync } from "node:fs"; import { join, sep } from "node:path"; @@ -9,6 +8,7 @@ import type { Finding, LoopResult, LoopStatus, SandboxInstance, TaskSpec } from import { AGENT_MODEL, CONTEXT_HASH_RADIUS, + execFileAsync, HASH_PREFIX_LENGTH, VALIDATION_COMMAND, VALIDATION_TIMEOUT_MS, @@ -124,7 +124,7 @@ export async function runRefinementLoop( if (result.findings === null) break; const findings: Finding[] = result.findings; - if (result.commits > 0 && runMidLoopValidation(sandbox.worktreePath)) { + if (result.commits > 0 && (await runMidLoopValidation(sandbox.worktreePath))) { totalCommits += result.commits; status = "converged"; break; @@ -139,7 +139,7 @@ export async function runRefinementLoop( const nonLowFindings = findings.filter((f) => f.confidence !== "LOW"); if ( - checkQualityRatchet( + await checkQualityRatchet( { beforeSha: result.beforeSha, cwd, round, spec }, nonLowFindings.length, previousFindingsCount, @@ -151,14 +151,14 @@ export async function runRefinementLoop( if (newFindings.length < bestFindingsCount) { bestFindingsCount = newFindings.length; - bestSha = captureHeadSha(cwd); + bestSha = await captureHeadSha(cwd); } totalCommits += result.commits; previousFindingsCount = nonLowFindings.length; onRoundComplete(round, findings); - const convergenceResult = checkConvergence(cwd, findings, newFindings, nonLowFindings); + const convergenceResult = await checkConvergence(cwd, findings, newFindings, nonLowFindings); if (convergenceResult !== null) { lastFindings = convergenceResult.lastFindings; status = convergenceResult.status; @@ -170,7 +170,7 @@ export async function runRefinementLoop( } if (shouldResetToBest(status, bestSha)) { - totalCommits = resetToBestState(sandbox.worktreePath, bestSha, totalCommits); + totalCommits = await resetToBestState(sandbox.worktreePath, bestSha, totalCommits); } return { lastFindings, roundsCompleted, status, totalCommits }; @@ -181,13 +181,10 @@ export async function runRefinementLoop( * @param cwd - Working directory for git operations. * @returns The HEAD SHA or empty string. */ -function captureHeadSha(cwd: string): string { +async function captureHeadSha(cwd: string): Promise { try { - return execFileSync("git", ["rev-parse", "HEAD"], { - cwd, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); + const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], { cwd }); + return stdout.trim(); } catch { return ""; } @@ -201,12 +198,12 @@ function captureHeadSha(cwd: string): string { * @param nonLowFindings - Non-LOW-confidence findings. * @returns A ConvergenceResult if the loop should break, or null to continue. */ -function checkConvergence( +async function checkConvergence( cwd: string, allFindings: Finding[], newFindings: Finding[], nonLowFindings: Finding[], -): ConvergenceResult | null { +): Promise { if (newFindings.length !== 0) return null; // Severity-weighted convergence (OpenHands pattern): @@ -217,7 +214,7 @@ function checkConvergence( if (criticalPersistent.length > 0) { // Capture current HEAD so post-loop reset is a no-op (code matches findings) return { - bestSha: captureHeadSha(cwd), + bestSha: await captureHeadSha(cwd), lastFindings: criticalPersistent, status: "exhausted", }; @@ -264,27 +261,24 @@ function checkEarlyExit( * @param previousCount - Number of non-LOW findings from the previous round. * @returns True if a regression was detected and rollback performed. */ -function checkQualityRatchet( +async function checkQualityRatchet( ctx: RatchetContext, findingsCount: number, previousCount: number, -): boolean { +): Promise { const { beforeSha, cwd, round, spec } = ctx; if (round <= 2 || findingsCount <= previousCount) { return false; } - // Validate SHA format before passing to execFileSync + // Validate SHA format before passing to execFileAsync if (!/^[0-9a-f]{40}$/.test(beforeSha)) { console.warn(` #${spec.id}: Invalid SHA for rollback, skipping reset.`); return true; } try { - execFileSync("git", ["reset", "--hard", beforeSha], { - cwd, - stdio: "pipe", - }); + await execFileAsync("git", ["reset", "--hard", beforeSha], { cwd }); console.warn( ` #${spec.id} R${String(round)}: Regression detected (${String(previousCount)} → ${String(findingsCount)}). Rolled back.`, ); @@ -363,11 +357,10 @@ async function executeRound( // Capture SHA before implementer runs (for quality ratchet rollback) let beforeSha = ""; try { - beforeSha = execFileSync("git", ["rev-parse", "HEAD"], { + const { stdout } = await execFileAsync("git", ["rev-parse", "HEAD"], { cwd: sandbox.worktreePath, - encoding: "utf-8", - stdio: ["pipe", "pipe", "pipe"], - }).trim(); + }); + beforeSha = stdout.trim(); } catch { console.warn(` #${spec.id}: Failed to capture HEAD SHA before round ${String(round)}.`); } @@ -486,17 +479,15 @@ function parseFindings(stdout: string, nonce: string): Finding[] | null { * @param currentCommits - Current total commits (fallback if recount fails). * @returns Updated total commit count. */ -function resetToBestState(cwd: string, bestSha: string, currentCommits: number): number { +async function resetToBestState( + cwd: string, + bestSha: string, + currentCommits: number, +): Promise { try { - execFileSync("git", ["reset", "--hard", bestSha], { - cwd, - stdio: "pipe", - }); - const commitCount = execFileSync("git", ["rev-list", "--count", "main..HEAD"], { - cwd, - encoding: "utf-8", - }).trim(); - return parseInt(commitCount, 10) || 0; + await execFileAsync("git", ["reset", "--hard", bestSha], { cwd }); + const { stdout } = await execFileAsync("git", ["rev-list", "--count", "main..HEAD"], { cwd }); + return parseInt(stdout.trim(), 10) || 0; } catch { return currentCommits; } @@ -565,11 +556,11 @@ async function runCritic( * @param cwd - Working directory for the validation command. * @returns True if validation passed (deterministic convergence), false otherwise. */ -function runMidLoopValidation(cwd: string): boolean { +async function runMidLoopValidation(cwd: string): Promise { try { - execFileSync("sh", ["-c", VALIDATION_COMMAND], { + await execFileAsync("sh", ["-c", VALIDATION_COMMAND], { cwd, - stdio: "pipe", + maxBuffer: 8 * 1024 * 1024, timeout: VALIDATION_TIMEOUT_MS, }); return true; diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index 1f31f32..a59066d 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -1,11 +1,11 @@ import * as sandcastle from "@ai-hero/sandcastle"; import { docker } from "@ai-hero/sandcastle/sandboxes/docker"; -import { execFileSync } from "node:child_process"; import { z } from "zod"; import type { TaskSpec } from "./types.js"; import { + execFileAsync, GIT_TIMEOUT_MS, MAX_TITLE_LENGTH, PLANNER_MODEL, @@ -70,7 +70,7 @@ export class GithubIssueSource implements TaskSource { * @returns Array of task specifications to implement. */ async discover(): Promise { - const issuesJson = this.fetchAndSanitizeIssues(); + const issuesJson = await this.fetchAndSanitizeIssues(); if (issuesJson.length === 0) { console.log("No issues with label '%s'. Exiting.", this.label); @@ -132,15 +132,17 @@ export class GithubIssueSource implements TaskSource { return []; } - private fetchAndSanitizeIssues(): { - body: string; - labels: string[]; - number: number; - title: string; - }[] { + private async fetchAndSanitizeIssues(): Promise< + { + body: string; + labels: string[]; + number: number; + title: string; + }[] + > { let rawIssuesJson: string; try { - rawIssuesJson = execFileSync( + const { stdout } = await execFileAsync( "gh", [ "issue", @@ -154,8 +156,9 @@ export class GithubIssueSource implements TaskSource { "--label", this.label, ], - { encoding: "utf-8", timeout: GIT_TIMEOUT_MS }, + { encoding: "utf-8", maxBuffer: 8 * 1024 * 1024, timeout: GIT_TIMEOUT_MS }, ); + rawIssuesJson = stdout; } catch (err) { console.error( `Failed to fetch issues: ${toErrorMessage(err)}. Ensure gh is installed and authenticated.`, From 96bbd805ca05d3b4c38446baa286da1242c4afd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Benoit?= Date: Tue, 5 May 2026 22:35:12 +0200 Subject: [PATCH 31/31] fix: catch planner timeout rejection (retry instead of crash) + add catch type annotations --- .sandcastle/task-source.ts | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/.sandcastle/task-source.ts b/.sandcastle/task-source.ts index a59066d..358d73c 100644 --- a/.sandcastle/task-source.ts +++ b/.sandcastle/task-source.ts @@ -99,7 +99,13 @@ export class GithubIssueSource implements TaskSource { timeoutPromise.catch(() => { /* suppress unhandled rejection when planner completes before timeout */ }); - const plan = await Promise.race([planPromise, timeoutPromise]); + let plan: Awaited>; + try { + plan = await Promise.race([planPromise, timeoutPromise]); + } catch { + console.error("Planner timed out or failed. Retrying."); + continue; + } const planMatches = [...plan.stdout.matchAll(/([\s\S]*?)<\/plan>/g)]; const planMatch = planMatches.at(-1); @@ -159,7 +165,7 @@ export class GithubIssueSource implements TaskSource { { encoding: "utf-8", maxBuffer: 8 * 1024 * 1024, timeout: GIT_TIMEOUT_MS }, ); rawIssuesJson = stdout; - } catch (err) { + } catch (err: unknown) { console.error( `Failed to fetch issues: ${toErrorMessage(err)}. Ensure gh is installed and authenticated.`, ); @@ -169,7 +175,7 @@ export class GithubIssueSource implements TaskSource { let rawIssues: z.infer; try { rawIssues = RawIssuesSchema.parse(JSON.parse(rawIssuesJson)); - } catch (err) { + } catch (err: unknown) { console.error( `Failed to parse issues JSON: ${toErrorMessage(err)}. Unexpected format from gh CLI.`, ); @@ -221,7 +227,7 @@ export class GithubIssueSource implements TaskSource { }; }) .filter((entry): entry is NonNullable => entry !== null); - } catch (err) { + } catch (err: unknown) { console.error(`Planner produced invalid JSON: ${toErrorMessage(err)}. Retrying.`); return null; }