diff --git a/docs/BACKLOG.md b/docs/BACKLOG.md index 050375566..808b91a7b 100644 --- a/docs/BACKLOG.md +++ b/docs/BACKLOG.md @@ -242,6 +242,9 @@ are closed (status: closed in frontmatter)._ - [ ] **[B-0435](backlog/P1/B-0435-demo-circuit-breaker-visualization-panel-2026-05-13.md)** Demo — circuit breaker visualization panel (mock data → live bus data) - [ ] **[B-0436](backlog/P1/B-0436-demo-hamiltonian-to-git-visualization-2026-05-13.md)** Demo — Hamiltonian-to-git visualization (git history → phase-space rendering) - [ ] **[B-0437](backlog/P1/B-0437-demo-ux-of-math-panel-bivector-fingerprints-2026-05-13.md)** Demo — UX-of-math panel (bivector fingerprints, partial-credit scoring) +- [ ] **[B-0440](backlog/P1/B-0440-standing-by-failure-mode-detector-background-service-2026-05-13.md)** Standing-by failure-mode detector — background service that catches idle-foreground + nudges via bus +- [ ] **[B-0441](backlog/P1/B-0441-backlog-row-ready-to-grind-notifier-background-service-2026-05-13.md)** Backlog-row-ready-to-grind notifier — background service that proactively assigns claims when agent queue empty +- [ ] **[B-0442](backlog/P1/B-0442-missed-substrate-cascade-detector-background-service-2026-05-13.md)** Missed-substrate cascade detector — background service that catches branch-vs-merged-PR drift (e.g., Otto-section-missed-PR-2980-by-3-min class) ## P2 — research-grade diff --git a/docs/backlog/P1/B-0440-standing-by-failure-mode-detector-background-service-2026-05-13.md b/docs/backlog/P1/B-0440-standing-by-failure-mode-detector-background-service-2026-05-13.md new file mode 100644 index 000000000..e24560de3 --- /dev/null +++ b/docs/backlog/P1/B-0440-standing-by-failure-mode-detector-background-service-2026-05-13.md @@ -0,0 +1,160 @@ +--- +id: B-0440 +priority: P1 +status: open +title: "Standing-by failure-mode detector — background service that catches idle-foreground + nudges via bus" +tier: factory-infrastructure +effort: M +created: 2026-05-13 +last_updated: 
2026-05-13 +depends_on: [B-0400] +composes_with: [B-0402, B-0441, B-0442] +tags: [multi-agent, background-service, bus, mechanization, infinite-backlog, standing-by, anti-idle] +type: feature +--- + +# Standing-by failure-mode detector background service + +## Origin + +The substrate-honest architectural challenge from the human maintainer +2026-05-13 (preserved in +`memory/feedback_aaron_background_services_must_be_strong_enough_foreground_loop_optional_imagine_surviving_without_foreground_mechanize_standing_by_failure_mode_2026_05_13.md`): + +> *"you need to imagine how would you survive without this foreground +> loop and you background should be strong enough to do that"* + +> *"this is something background services should walk"* + +The foreground-loop "Standing-by" failure mode was caught by the human +maintainer in real time when the foreground agent had just canonized +the infinite-backlog metabolism rule (PR #2974) AND THEN three minutes +later responded "Standing by" to a cron tick despite the infinite +backlog mandating decomposition work. 
The discipline needs MECHANIZATION +per `.claude/rules/encoding-rules-without-mechanizing.md`: + +> *"Encoding rules without mechanizing them produces a memory of +> failures, not prevention."* + +## Acceptance criteria + +- [ ] Background service `tools/bg/standing-by-detector.ts` exists +- [ ] Service runs under existing launchd / cron infrastructure +- [ ] Polls agent's recent commit history every N minutes (configurable) +- [ ] Detects "Standing-by" pattern: no new commits + no PRs opened/closed + in last 15min while autonomous-loop cron is firing +- [ ] On detection, publishes nudge message via bus (B-0400): + `{ topic: "infinite-backlog-nudge", to: <agent>, + payload: { "Standing-by detected for N min; backlog has X open + rows; suggested decomposition target: B-NNNN" } }` +- [ ] Optional: proactively assigns a small claim from the backlog to + the agent's queue +- [ ] Tests cover the detection heuristics (DST-replayable) +- [ ] Documented in `docs/AUTONOMOUS-LOOP.md` as background-services + layer + +## Design sketch + +```typescript +// tools/bg/standing-by-detector.ts +// +// Polls recent agent activity; on idle-detection, nudges via bus. + +import { BusClient } from "../bus/client"; +import { execSync } from "child_process"; + +interface AgentState { + agent: string; + lastCommitAt: Date | null; + lastPRActivityAt: Date | null; + cronFiredCount: number; +} + +const IDLE_THRESHOLD_MIN = 15; +const POLL_INTERVAL_MIN = 5; + +async function detectAndNudge(state: AgentState, bus: BusClient): Promise<void> { + const now = Date.now(); + const idleMin = Math.min( + (now - (state.lastCommitAt?.getTime() ?? 0)) / 60_000, + (now - (state.lastPRActivityAt?.getTime() ?? 
0)) / 60_000, + ); + + if (idleMin >= IDLE_THRESHOLD_MIN && state.cronFiredCount > 0) { + const openRows = countOpenBacklogRows(); + const suggestedRow = pickDecompositionTarget(openRows); + + await bus.publish({ + topic: "infinite-backlog-nudge", + to: state.agent, + payload: { + idleMin, + openBacklogCount: openRows.length, + suggestedTarget: suggestedRow, + rationale: "Standing-by detected; per infinite-backlog metabolism, pick decomposition work", + }, + }); + } +} +``` + +## Operational mechanism + +| Step | Trigger | Effect | +|------|---------|--------| +| 1 | Cron fires every 5 min | Service polls agent commit log + PR feed | +| 2 | Idle threshold exceeded (15 min default) | Service computes nudge payload | +| 3 | Bus publishes nudge message | Agent receives via existing bus subscription | +| 4 | Agent acts on nudge | Picks decomposition target + ships substrate | +| 5 | Loop continues | Substrate compounds; Standing-by failure mode prevented | + +## Composes with + +- `.claude/rules/encoding-rules-without-mechanizing.md` (the rule this + mechanizes) +- `.claude/rules/never-be-idle.md` (the priority ladder the nudge + enforces) +- `.claude/rules/largest-mechanizable-backlog-wins.md` (the mechanization + itself increases the mechanizable backlog) +- B-0400 (bus protocol — transport for nudge messages) +- B-0402 (shadow observer — canonical background service example) +- B-0441 (backlog-row-ready notifier — composes; pre-assigns work the + agent picks up) +- B-0442 (missed-substrate cascade detector — composes; catches + different failure mode in same family) +- PR #2974 (infinite-backlog metabolism — the rule this mechanizes) +- PR #2998 (background-services architecture — the substrate that + declared this row as follow-up) +- PR #2999 (substrate-honest discipline triad — the decomposition + discipline that produced this row) + +## Pre-start checklist (per backlog-item-start-gate) + +To complete before starting implementation: + +- [ ] Prior-art search: 
existing background services in + `tools/shadow/`, `tools/bg/` (verify no overlap) +- [ ] Dependency proof: B-0400 bus protocol slice ready +- [ ] Search committed memory for `Standing-by detector` to find any + prior decomposition + +## Substrate-honest caveats + +- Design sketch only; implementation slice not started +- The threshold values (15 min idle; 5 min poll) are speculative + defaults; first implementation should make them configurable +- The nudge payload schema is illustrative; actual schema lands + during implementation +- Per razor-discipline: claim is design-level; substrate-honest claim + is operational-design, not deployed-service + +## Decomposition into implementation slices (TBD) + +When this row is picked up for implementation: + +- Slice 1: skeleton service + cron registration + no-op poll loop +- Slice 2: commit-history poll via git log +- Slice 3: PR-activity poll via gh CLI +- Slice 4: nudge payload computation + bus publish +- Slice 5: integration with agent subscribers +- Slice 6: tests + documentation diff --git a/docs/backlog/P1/B-0441-backlog-row-ready-to-grind-notifier-background-service-2026-05-13.md b/docs/backlog/P1/B-0441-backlog-row-ready-to-grind-notifier-background-service-2026-05-13.md new file mode 100644 index 000000000..ef0527b38 --- /dev/null +++ b/docs/backlog/P1/B-0441-backlog-row-ready-to-grind-notifier-background-service-2026-05-13.md @@ -0,0 +1,167 @@ +--- +id: B-0441 +priority: P1 +status: open +title: "Backlog-row-ready-to-grind notifier — background service that proactively assigns claims when agent queue empty" +tier: factory-infrastructure +effort: M +created: 2026-05-13 +last_updated: 2026-05-13 +depends_on: [B-0400] +composes_with: [B-0402, B-0440, B-0442] +tags: [multi-agent, background-service, bus, mechanization, infinite-backlog, work-assignment] +type: feature +--- + +# Backlog-row-ready-to-grind notifier background service + +## Origin + +Companion mechanization to B-0440. 
The substrate-honest architectural +challenge from the human maintainer 2026-05-13: + +> *"this is something background services should walk"* + +B-0440 catches the *failure mode* (Standing-by); this row prevents +the failure mode by *proactively surfacing work* when the agent's +queue is empty. The infinite-backlog metabolism rule (PR #2974) +mandates that backlog work is always available; this service makes +that availability operational at agent-tick scale. + +Per the substrate-honest discipline triad (PR #2999): decomposition +dissolves ambiguity. When the agent has no current task, the service +provides a less-ambiguous concrete claim — eliminating the +"what should I do next?" stuckness pattern. + +## Acceptance criteria + +- [ ] Background service `tools/bg/backlog-ready-notifier.ts` exists +- [ ] Runs under existing launchd / cron infrastructure +- [ ] Periodically scans `docs/backlog/P*/B-*.md` for ready-to-grind + rows (open, no blockers, dependencies satisfied) +- [ ] Detects agent queue state (commits in last N minutes; current + branch / open PR ownership) +- [ ] When agent queue is empty AND ready-to-grind rows exist, + publishes claim-assignment message via bus (B-0400): + `{ topic: "work-assignment", to: <agent>, + payload: { rowId: "B-NNNN", priority: "P1", + rationale: "queue-empty + dependencies-satisfied + smallest-effort-match", + decompositionSuggestion: <suggested-slices> } }` +- [ ] Honors agent autonomy — assignment is suggestion, not directive + (per `.claude/rules/no-directives.md`) +- [ ] Tracks assignment history to avoid re-assigning same row + within short window +- [ ] Tests cover the readiness-detection heuristics +- [ ] Documented in `docs/AUTONOMOUS-LOOP.md` + +## Design sketch + +```typescript +// tools/bg/backlog-ready-notifier.ts +// +// Proactively surfaces ready-to-grind backlog rows when agent queue empty. 
+ +interface BacklogRow { + id: string; + priority: "P0" | "P1" | "P2" | "P3"; + status: string; + dependsOn: string[]; + effort: "S" | "M" | "L" | "XL"; + ready: boolean; // true iff all dependsOn satisfied +} + +async function findReadyRows(): Promise<BacklogRow[]> { + const rows = scanBacklog("docs/backlog/P*/B-*.md"); + return rows.filter(r => + r.status === "open" && + r.dependsOn.every(dep => rowStatus(dep) === "closed") + ); +} + +async function detectQueueEmpty(agent: string): Promise<boolean> { + const recentCommits = gitLogSince(agent, "30 minutes ago"); + const ownedPRs = ghOpenPRsAuthoredBy(agent); + return recentCommits.length === 0 && ownedPRs.length === 0; +} + +async function notifyIfQueueEmpty(agent: string, bus: BusClient): Promise<void> { + if (!await detectQueueEmpty(agent)) return; + + const readyRows = await findReadyRows(); + if (readyRows.length === 0) return; + + const pickSmallest = readyRows.sort(byEffortThenPriority)[0]; + + await bus.publish({ + topic: "work-assignment", + to: agent, + payload: { + rowId: pickSmallest.id, + priority: pickSmallest.priority, + rationale: "queue-empty + dependencies-satisfied + smallest-effort-match", + decompositionSuggestion: suggestSlices(pickSmallest), + }, + }); +} +``` + +## Composing with B-0440 + +| Service | Trigger | Output | +|---------|---------|--------| +| B-0440 Standing-by detector | Idle threshold + cron fires | Nudge: "you should pick work" | +| B-0441 Backlog-ready notifier | Queue-empty + rows-ready | Assignment: "this row is ready" | + +B-0440 is reactive (catches failure mode after it occurs); B-0441 is +proactive (prevents failure mode by pre-assigning work). Together they +form a two-layer defense against the Standing-by pattern. 
+ +## Composes with + +- `.claude/rules/never-be-idle.md` (proactive work-surfacing satisfies + the priority ladder) +- `.claude/rules/largest-mechanizable-backlog-wins.md` (the mechanization + IS itself mechanizable-backlog growth) +- `.claude/rules/no-directives.md` (assignment is suggestion, not + directive; agent retains autonomy) +- `.claude/rules/encoding-rules-without-mechanizing.md` (this row IS + the mechanization of the "always have work" discipline) +- `.claude/rules/backlog-item-start-gate.md` (assignment payload could + include start-gate-relevant context) +- B-0400 (bus protocol — transport for assignment messages) +- B-0402 (shadow observer — canonical background service pattern) +- B-0440 (Standing-by detector — composes; this prevents what B-0440 + catches) +- B-0442 (missed-substrate cascade detector — composes; full + background-services suite) +- PR #2974 (infinite-backlog metabolism) +- PR #2998 (background-services architecture) +- PR #2999 (substrate-honest discipline triad — + decomposition-suggestions in payload align with this discipline) + +## Pre-start checklist + +- [ ] Prior-art search: existing audit scripts in `tools/hygiene/` + (check for backlog-readiness-scan overlap) +- [ ] Dependency proof: B-0400 bus protocol slice ready +- [ ] Verify readiness-detection heuristics handle edge cases + (forked work, multi-agent claims, partial completion) + +## Substrate-honest caveats + +- Design sketch only +- The "smallest-effort-match" heuristic is speculative; first + implementation might pick highest-priority or random-among-ready +- Agent autonomy must be preserved — service publishes, agent decides +- Per razor-discipline: claim is design-level + +## Decomposition into implementation slices (TBD) + +When picked up for implementation: + +- Slice 1: backlog row parsing + readiness detection +- Slice 2: agent queue-state detection (commits + PRs) +- Slice 3: assignment payload computation +- Slice 4: bus integration +- Slice 5: assignment 
history tracking +- Slice 6: tests + documentation diff --git a/docs/backlog/P1/B-0442-missed-substrate-cascade-detector-background-service-2026-05-13.md b/docs/backlog/P1/B-0442-missed-substrate-cascade-detector-background-service-2026-05-13.md new file mode 100644 index 000000000..c1984702e --- /dev/null +++ b/docs/backlog/P1/B-0442-missed-substrate-cascade-detector-background-service-2026-05-13.md @@ -0,0 +1,183 @@ +--- +id: B-0442 +priority: P1 +status: open +title: "Missed-substrate cascade detector — background service that catches branch-vs-merged-PR drift (e.g., Otto-section-missed-PR-2980-by-3-min class)" +tier: factory-infrastructure +effort: M +created: 2026-05-13 +last_updated: 2026-05-13 +depends_on: [B-0400] +composes_with: [B-0402, B-0440, B-0441] +tags: [multi-agent, background-service, bus, mechanization, drift-detection, race-condition] +type: feature +--- + +# Missed-substrate cascade detector background service + +## Origin + +Companion mechanization to B-0440 + B-0441. The substrate-honest +architectural challenge from the human maintainer 2026-05-13: + +> *"you need to imagine how would you survive without this foreground +> loop and you background should be strong enough to do that"* + +Operational example that surfaced this row: my Otto-section-missed-PR-#2980-by-3-min +cascade — a feature-branch commit landed AFTER its parent PR squash-merged, +leaving substrate on the branch but not on main. +Caught manually + recovered via PR #2997, but the recovery was reactive. + +This row mechanizes detection of the failure class: substrate present +on a branch but missing from main due to merge-timing race conditions. 
+ +## The failure-class pattern + +| Step | What happened | What should be caught | +|------|---------------|----------------------| +| 1 | Branch B has commits C1, C2 | (normal state) | +| 2 | PR #N opens against main | (normal state) | +| 3 | Auto-merge armed on PR #N | (normal state) | +| 4 | New commit C3 lands on B (after squash plan formed) | DETECT: C3 not in #N's squash | +| 5 | PR #N squash-merges (lacks C3) | DETECT: C3 orphaned from main | +| 6 | Branch B deleted post-merge | DETECT: C3 about to be lost | + +The cascade can be caught at any step 4-6 by comparing: + +- Branch HEAD commits +- Merged PR's squash content +- Main HEAD content +- Branch deletion plan + +When the comparison shows substrate missing from main but present on +a branch, the service catches it BEFORE branch deletion. + +## Acceptance criteria + +- [ ] Background service `tools/bg/missed-substrate-detector.ts` exists +- [ ] Runs under existing launchd / cron infrastructure +- [ ] On PR merge events (poll or webhook), checks if branch HEAD == + merge commit content +- [ ] When branch HEAD has commits the merged PR didn't include, + publishes cascade-detected message via bus (B-0400): + `{ topic: "missed-substrate-cascade", to: <recipient>, + payload: { branchName, missingCommits, recommendedAction: + "open-recovery-PR" } }` +- [ ] Optionally auto-opens recovery PR with the missing commits + (gated by configuration) +- [ ] Tests cover the detection heuristics (DST-replayable) +- [ ] Documented in `docs/AUTONOMOUS-LOOP.md` + +## Design sketch + +```typescript +// tools/bg/missed-substrate-detector.ts +// +// Detects branch-vs-merged-PR drift; surfaces recovery work. 
+ +interface MergedPRState { + prNumber: number; + branchName: string; + squashCommit: string; // SHA on main after squash + branchHead: string; // SHA on the feature branch + branchCommits: string[]; // all commits on the branch + squashIncludedCommits: string[]; // commits included in the squash +} + +async function findMissedSubstrate(pr: MergedPRState): Promise<string[]> { + // Find branch commits NOT included in the squash merge + return pr.branchCommits.filter( + sha => !pr.squashIncludedCommits.includes(sha) + ); +} + +async function watchRecentMerges(bus: BusClient): Promise<void> { + const recentlyMerged = await ghMergedPRsLastHour(); + + for (const pr of recentlyMerged) { + const state = await fetchMergedPRState(pr.number); + const missed = await findMissedSubstrate(state); + + if (missed.length > 0) { + await bus.publish({ + topic: "missed-substrate-cascade", + to: "all-active-agents", + payload: { + prNumber: pr.number, + branchName: state.branchName, + missingCommits: missed, + recommendedAction: "open-recovery-PR", + urgency: state.branchAboutToBeDeleted ? "high" : "medium", + }, + }); + } + } +} +``` + +## Operational examples this would catch + +1. **Otto-section-missed-PR-#2980-by-3-min** (the originating example): + commit `f5aed67` (Otto-in-own-voice) pushed at 09:25:58Z; + PR #2980 squash-merged at 09:22:42Z; auto-merge fired before + push arrived. Service would have caught at step 5 above. + +2. **Concurrent-agent index-lock branch drift**: Vera commits on + her branch while Otto's branch auto-merges; if Vera's commits + reference Otto's branch content that's been squashed, the + drift gets detected. + +3. **Force-push-over content** (composes with lost-files surface): + substrate committed + force-pushed-over before merge; if branch + tracking includes pre-force-push content, the service surfaces + the lost commits. 
+ +## Composes with + +- `.claude/rules/encoding-rules-without-mechanizing.md` (the rule + this mechanizes) +- `.claude/rules/lost-files-surface.md` (this service IS a lost-files + prevention layer at the auto-merge timing scope) +- `.claude/rules/dependency-status-surface.md` (composes; both + surface drift detection at different scopes) +- B-0400 (bus protocol — transport for cascade detection) +- B-0402 (shadow observer — canonical pattern) +- B-0440 (Standing-by detector — composes; full background-services + suite) +- B-0441 (backlog-ready notifier — composes; full suite) +- `tools/hygiene/LOST-FILES-LOCATIONS.md` (the 15-class survey; this + service mechanizes one of the classes) +- PR #2980 (the operational example) +- PR #2997 (the reactive recovery; this service makes future + recovery proactive) +- PR #2998 (background-services architecture) +- PR #2999 (substrate-honest discipline triad — decomposition path + that produced this row) + +## Pre-start checklist + +- [ ] Prior-art search: `tools/hygiene/audit-lost-files.ts` (current + TS implementation; check for overlap; possible composition) +- [ ] Dependency proof: B-0400 bus protocol slice ready +- [ ] Verify detection heuristics handle GitHub squash-merge SHA + resolution correctly + +## Substrate-honest caveats + +- Design sketch only +- Auto-recovery-PR opening is gated; first implementation should + default to detect-only (publish + log; don't auto-act) +- Per razor-discipline: claim is design-level +- The detection has a time-window heuristic; perfect detection + requires webhook subscription rather than polling + +## Decomposition into implementation slices (TBD) + +When picked up for implementation: + +- Slice 1: merged-PR state fetch via gh CLI +- Slice 2: branch-vs-squash comparison logic +- Slice 3: cascade-detection message schema +- Slice 4: bus integration +- Slice 5: optional auto-recovery-PR opening (gated) +- Slice 6: tests + documentation