Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 59 additions & 19 deletions .agents/scripts/supervisor/pulse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -472,16 +472,22 @@ cmd_pulse() {
fi

log_success " $tid: COMPLETE ($outcome_detail)"
# t1183 Bug 2: Persist transition IMMEDIATELY after quality gate passes.
# This is the crash-safe checkpoint — if the pulse is killed after this
# line, the task is safely in 'complete' state and Phase 3 will pick it up.
# All subsequent operations (proof-log, patterns, notifications) are
# non-critical and can be lost without affecting correctness.
cmd_transition "$tid" "complete" --pr-url "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
completed_count=$((completed_count + 1))
# Clean up worker process tree and PID file (t128.7)
cleanup_worker_processes "$tid"
# --- Non-critical post-processing below (safe to lose on kill) ---
# Proof-log: task completion (t218)
write_proof_log --task "$tid" --event "complete" --stage "evaluate" \
--decision "complete:$outcome_detail" \
--evidence "gate=$gate_result" \
--maker "pulse:phase1" \
--pr-url "$outcome_detail" 2>/dev/null || true
cmd_transition "$tid" "complete" --pr-url "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
completed_count=$((completed_count + 1))
# Clean up worker process tree and PID file (t128.7)
cleanup_worker_processes "$tid"
# Auto-update TODO.md and send notification (t128.4)
update_todo_on_complete "$tid" 2>>"$SUPERVISOR_LOG" || true
send_task_notification "$tid" "complete" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
Expand Down Expand Up @@ -722,22 +728,34 @@ cmd_pulse() {
continue
fi

log_warn " Phase 1c: $stuck_id stuck in evaluating since $stuck_updated (worker dead) — force-transitioning to failed"

# Check retry count
local stuck_retries stuck_max_retries
stuck_retries=$(db "$SUPERVISOR_DB" "SELECT retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 0)
stuck_max_retries=$(db "$SUPERVISOR_DB" "SELECT max_retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 3)

if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
# Transition to retrying so it gets re-dispatched
cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
log_warn " Phase 1c: $stuck_id stuck in evaluating since $stuck_updated (worker dead)"

# t1183 Bug 1: Check if task already has a PR before retrying.
# If a PR exists, the worker succeeded but evaluation was lost —
# send to pr_review (Phase 3) instead of retrying (which would
# dispatch a duplicate worker and potentially create a duplicate PR).
local stuck_pr_url
stuck_pr_url=$(db "$SUPERVISOR_DB" "SELECT pr_url FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo "")
if [[ -n "$stuck_pr_url" && "$stuck_pr_url" != "no_pr" && "$stuck_pr_url" != "task_only" ]]; then
cmd_transition "$stuck_id" "pr_review" 2>>"$SUPERVISOR_LOG" || true
log_info " Phase 1c: $stuck_id → pr_review (has PR: $stuck_pr_url, skipping retry)"
Comment on lines +739 to +741
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Script executed:

# First, examine the exact lines in pulse.sh at 739-741
echo "=== Lines 739-741 in pulse.sh ==="
sed -n '739,741p' .agents/scripts/supervisor/pulse.sh

echo ""
echo "=== Context around lines 739-741 (735-745) ==="
sed -n '735,745p' .agents/scripts/supervisor/pulse.sh

Repository: marcusquinn/aidevops

Length of output: 1077


🏁 Script executed:

# Search for verified_complete sentinel usage across the codebase
echo "=== All verified_complete references in pulse.sh ==="
rg -n "verified_complete" .agents/scripts/supervisor/pulse.sh -B2 -A2

echo ""
echo "=== Compare with no_pr and task_only usage ==="
rg -n 'no_pr.*task_only|task_only.*no_pr' .agents/scripts/supervisor/pulse.sh -B1 -A1

Repository: marcusquinn/aidevops

Length of output: 2166


🏁 Script executed:

# Find Phase 0.7 and the transition at line 289
echo "=== Phase 0.7 context (around line 289) ==="
sed -n '280,300p' .agents/scripts/supervisor/pulse.sh

echo ""
echo "=== cmd_transition calls with pr_review in pulse.sh ==="
rg -n 'cmd_transition.*pr_review' .agents/scripts/supervisor/pulse.sh -B2 -A2

Repository: marcusquinn/aidevops

Length of output: 3180


🏁 Script executed:

# Check state.sh for cmd_transition definition and flag handling
echo "=== cmd_transition implementation in state.sh ==="
if [ -f ".agents/scripts/supervisor/state.sh" ]; then
  rg -n "cmd_transition" .agents/scripts/supervisor/state.sh | head -5
  # Look for the function definition
  sed -n '1,200p' .agents/scripts/supervisor/state.sh | rg -n "cmd_transition|--pr-url|--error" -B2 -A2
else
  echo "state.sh not found"
fi

Repository: marcusquinn/aidevops

Length of output: 862


🏁 Script executed:

# Verify all sentinel checks in conditional statements
echo "=== All conditional checks for pr_url sentinels ==="
rg -n 'if.*\$\{.*pr.*url.*!=' .agents/scripts/supervisor/pulse.sh -B1 -A1 | head -60

Repository: marcusquinn/aidevops

Length of output: 115


Add missing verified_complete sentinel check and audit trail parity with Phase 0.7.

Line 739 filters no_pr and task_only but omits verified_complete, which is explicitly checked in Phase 3b2 (line 960), Phase 3 (line 2673), and other pr_url sentinel validations (line 1144). If a task has pr_url == "verified_complete", the condition evaluates true and incorrectly routes to pr_review.

Additionally, Phase 0.7 (line 289) passes --pr-url and --error flags to cmd_transition for the same type of transition; Phase 1c (line 740) omits both, losing audit trail and reducing consistency.

Proposed fix
-			if [[ -n "$stuck_pr_url" && "$stuck_pr_url" != "no_pr" && "$stuck_pr_url" != "task_only" ]]; then
-				cmd_transition "$stuck_id" "pr_review" 2>>"$SUPERVISOR_LOG" || true
+			if [[ -n "$stuck_pr_url" && "$stuck_pr_url" != "no_pr" && "$stuck_pr_url" != "task_only" && "$stuck_pr_url" != "verified_complete" ]]; then
+				cmd_transition "$stuck_id" "pr_review" --pr-url "$stuck_pr_url" --error "Stuck evaluating recovery (Phase 1c/t1183): worker dead, PR exists" 2>>"$SUPERVISOR_LOG" || true
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if [[ -n "$stuck_pr_url" && "$stuck_pr_url" != "no_pr" && "$stuck_pr_url" != "task_only" ]]; then
cmd_transition "$stuck_id" "pr_review" 2>>"$SUPERVISOR_LOG" || true
log_info " Phase 1c: $stuck_id → pr_review (has PR: $stuck_pr_url, skipping retry)"
if [[ -n "$stuck_pr_url" && "$stuck_pr_url" != "no_pr" && "$stuck_pr_url" != "task_only" && "$stuck_pr_url" != "verified_complete" ]]; then
cmd_transition "$stuck_id" "pr_review" --pr-url "$stuck_pr_url" --error "Stuck evaluating recovery (Phase 1c/t1183): worker dead, PR exists" 2>>"$SUPERVISOR_LOG" || true
log_info " Phase 1c: $stuck_id → pr_review (has PR: $stuck_pr_url, skipping retry)"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.agents/scripts/supervisor/pulse.sh around lines 739 - 741, The condition
that routes tasks to pr_review should exclude the "verified_complete" sentinel
and also preserve the audit flags passed in Phase 0.7; update the if-check that
uses stuck_pr_url to add a check that "$stuck_pr_url" != "verified_complete",
and change the cmd_transition call (the invocation of cmd_transition "$stuck_id"
"pr_review") to include the same --pr-url "$stuck_pr_url" and --error
"$stuck_error" arguments (and keep the 2>>"$SUPERVISOR_LOG" || true redirection)
so the transition and log_info call maintain parity with Phase 0.7 behavior and
existing sentinel handling.

else
# Max retries exhausted — mark as failed
cmd_transition "$stuck_id" "failed" --error "Auto-reaped: stuck in evaluating >10min, max retries exhausted (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
log_warn " Phase 1c: $stuck_id → failed (max retries exhausted)"
# No PR — use original retry/fail logic
# Check retry count
local stuck_retries stuck_max_retries
stuck_retries=$(db "$SUPERVISOR_DB" "SELECT retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 0)
stuck_max_retries=$(db "$SUPERVISOR_DB" "SELECT max_retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 3)

if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
# Transition to retrying so it gets re-dispatched
cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
Comment on lines +749 to +753
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Verify cmd_transition auto-increment behavior and identify redundant increments

echo "=== Checking state.sh for 'retrying' state auto-increment logic ==="
fd state.sh --type f | head -5

Repository: marcusquinn/aidevops

Length of output: 167


🏁 Script executed:

#!/bin/bash
# Search for retrying state increment pattern
rg -n "retrying" --type=sh -B3 -A3 | head -80

Repository: marcusquinn/aidevops

Length of output: 5627


🏁 Script executed:

#!/bin/bash
# Find all retry increment patterns in pulse.sh
fd pulse.sh --type f

Repository: marcusquinn/aidevops

Length of output: 137


🏁 Script executed:

#!/bin/bash
# Get state.sh content around retrying logic
cat -n .agents/scripts/supervisor/state.sh | grep -A10 -B10 "retrying"

Repository: marcusquinn/aidevops

Length of output: 5418


🏁 Script executed:

#!/bin/bash
# Get pulse.sh content around lines 749-753
sed -n '740,760p' .agents/scripts/supervisor/pulse.sh | cat -n

Repository: marcusquinn/aidevops

Length of output: 1597


🏁 Script executed:

#!/bin/bash
# Find cmd_transition definition and check its behavior with retrying
rg -n "cmd_transition" .agents/scripts/supervisor/ --type=sh -A15 | head -100

Repository: marcusquinn/aidevops

Length of output: 7774


🏁 Script executed:

#!/bin/bash
# Verify Phase 0.7 pattern (lines ~290-310)
sed -n '280,320p' .agents/scripts/supervisor/todo-sync.sh | cat -n

Repository: marcusquinn/aidevops

Length of output: 1730


🏁 Script executed:

#!/bin/bash
# Search for other manual retry increments in pulse.sh to see if this is a pattern
rg -n "retries = retries \+ 1|retries \+= 1" .agents/scripts/supervisor/pulse.sh -C3

Repository: marcusquinn/aidevops

Length of output: 689


🏁 Script executed:

#!/bin/bash
# Search for Phase 0.7 in pulse.sh to verify the comparison
sed -n '290,310p' .agents/scripts/supervisor/pulse.sh | cat -n

Repository: marcusquinn/aidevops

Length of output: 1583


🏁 Script executed:

#!/bin/bash
# Look for all Phase patterns and retry logic in pulse.sh
rg -n "Phase 0\.|Phase 1" .agents/scripts/supervisor/pulse.sh -B2 -A8 | head -150

Repository: marcusquinn/aidevops

Length of output: 8105


🏁 Script executed:

#!/bin/bash
# Find cmd_transition function definition to verify it executes the update_parts
sed -n '1,300p' .agents/scripts/supervisor/state.sh | grep -A30 "^cmd_transition()"

Repository: marcusquinn/aidevops

Length of output: 648


🏁 Script executed:

#!/bin/bash
# Check if db function is being called to execute the UPDATE in cmd_transition
sed -n '1,250p' .agents/scripts/supervisor/state.sh | tail -100

Repository: marcusquinn/aidevops

Length of output: 3714


Double retry increment — each Phase 1c recovery consumes two retries instead of one.

When cmd_transition "$stuck_id" "retrying" executes (line 751), state.sh appends retries = retries + 1 to the UPDATE statement (lines 182–184). The subsequent manual UPDATE tasks SET retries = retries + 1 on line 752 increments it a second time. Tasks exhaust their retry budget at 2× the intended rate.

Compare with Phase 0.7 (lines 298–301), which correctly sets an absolute value (retries = $new_retries) before calling cmd_transition to "queued" — a state that does not auto-increment retries in cmd_transition.

Proposed fix: remove the redundant increment
 			if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
 				# Transition to retrying so it gets re-dispatched
 				cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
-				db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
 				log_info "  Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
# Transition to retrying so it gets re-dispatched
cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
# Transition to retrying so it gets re-dispatched
cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In @.agents/scripts/supervisor/pulse.sh around lines 749 - 753, The Phase 1c
branch double-increments task retries: cmd_transition "$stuck_id" "retrying"
already causes state.sh to increment retries, and the following manual db UPDATE
(UPDATE tasks SET retries = retries + 1 ...) adds a second increment; remove the
redundant manual increment line and keep the cmd_transition call (or replace
cmd_transition with an explicit set to the intended absolute value if you truly
need to control retries), i.e. delete or comment out the db "$SUPERVISOR_DB"
"UPDATE tasks SET retries = retries + 1..." statement so retries increase only
once when cmd_transition is invoked.

else
# Max retries exhausted — mark as failed
cmd_transition "$stuck_id" "failed" --error "Auto-reaped: stuck in evaluating >10min, max retries exhausted (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
log_warn " Phase 1c: $stuck_id → failed (max retries exhausted)"
fi
fi

# Clean up PID file
Expand Down Expand Up @@ -2629,12 +2647,24 @@ process_post_pr_lifecycle() {
local deployed_count=0
local deferred_count=0

# t1183 Bug 3: Cap merges per pulse to prevent runaway merge loops.
# Each merge dirties remaining PRs (main moves forward), so unlimited
# merges can cause cascading rebase failures. Default: 5 per pulse.
local max_merges_per_pulse="${SUPERVISOR_MAX_MERGES_PER_PULSE:-5}"

# t225: Track which parent IDs have already had a sibling merge in this pulse.
# Only one sibling per parent group is allowed to merge per cycle.
# Use a simple string list (bash 3.2 compatible — no associative arrays).
local merged_parents=""

while IFS='|' read -r tid tstatus tpr; do
# t1183 Bug 3: Stop processing if we've hit the merge cap.
# Remaining tasks will be picked up in the next pulse cycle.
if [[ "$merged_count" -ge "$max_merges_per_pulse" ]]; then
log_info " Phase 3: reached max merges per pulse ($max_merges_per_pulse), deferring rest to next cycle"
break
fi

# Skip tasks without PRs that are already complete
# t1030: Defense-in-depth — cmd_transition() also guards complete->deployed
# when a real PR URL exists, but this fast path should only fire for genuinely
Expand Down Expand Up @@ -2674,6 +2704,16 @@ process_post_pr_lifecycle() {
if [[ -n "$parent_id" ]]; then
merged_parents="${merged_parents}|${parent_id}|"
fi
# t1183 Bug 3: After a successful merge, pull main in the task's
# repo so subsequent PRs in this pulse can rebase cleanly against
# the updated main. Without this, each merge dirties remaining PRs
# and they fail rebase until the next pulse.
local merge_repo
merge_repo=$(db "$SUPERVISOR_DB" "SELECT repo FROM tasks WHERE id = '$(sql_escape "$tid")';" 2>/dev/null || echo "")
if [[ -n "$merge_repo" && -d "$merge_repo" ]]; then
git -C "$merge_repo" pull --rebase origin main 2>>"$SUPERVISOR_LOG" ||
log_warn " $tid: failed to pull main in $merge_repo after merge (non-blocking)"
fi
;;
esac
if [[ "$new_status" == "deployed" ]]; then
Expand Down
Loading