Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 70 additions & 0 deletions .agents/scripts/supervisor-helper.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7222,6 +7222,49 @@ cmd_pr_lifecycle() {
stage_timings="${stage_timings}deploying:$((stage_end - stage_start))s,"
fi

# Step 4b: Auto-recover stuck deploying state (t222)
# If a task is already in 'deploying' (from a prior pulse where the deploy
# succeeded but the transition to 'deployed' failed), re-attempt the
# transition and housekeeping steps. The deploy itself already completed
# successfully — only the state transition was lost.
if [[ "$tstatus" == "deploying" ]]; then
local stage_start
stage_start=$(date +%s)

log_warn "Task $task_id stuck in deploying state — attempting auto-recovery (t222)"

if [[ "$dry_run" == "false" ]]; then
# Re-run housekeeping that may have been skipped when the prior
# transition failed (all non-blocking, best-effort)
cleanup_after_merge "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "Worktree cleanup issue for $task_id during recovery (non-blocking)"
update_todo_on_complete "$task_id" 2>>"$SUPERVISOR_LOG" || log_warn "TODO.md update issue for $task_id during recovery (non-blocking)"
populate_verify_queue "$task_id" "$tpr" "$trepo" 2>>"$SUPERVISOR_LOG" || log_warn "Verify queue population issue for $task_id during recovery (non-blocking)"

# Attempt the transition that previously failed
if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then
log_success "Auto-recovered $task_id: deploying -> deployed (t222)"
send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state" 2>>"$SUPERVISOR_LOG" || true
store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true
write_proof_log --task "$task_id" --event "auto_recover" --stage "deploying" \
--decision "deploying->deployed" --evidence "stuck_state_recovery" \
--maker "pr_lifecycle:t222" 2>/dev/null || true

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Error suppression with 2>/dev/null violates the repository style guide. Rule #50 states that 2>/dev/null is only acceptable when redirecting to log files, not for blanket suppression. Errors from write_proof_log should be captured for debugging purposes.

Suggested change
--maker "pr_lifecycle:t222" 2>/dev/null || true
--maker "pr_lifecycle:t222" 2>>"$SUPERVISOR_LOG" || true
References
  1. Rule #50: 2>/dev/null is acceptable ONLY when redirecting to log files, not for blanket suppression. This change discards stderr instead of redirecting it to a log file, which can hide important errors. (link)

else
log_error "Auto-recovery failed for $task_id — transition to deployed rejected"
# If the transition itself is invalid, something is deeply wrong.
# Transition to failed so the task doesn't stay stuck forever.
cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true
send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true
Comment on lines +7244 to +7256
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Avoid failing tasks when recovery races with a concurrent transition.

Line 7244: If cmd_transition fails because another process already moved the task out of deploying, this block marks the task failed, which corrupts state. Re-check the current status and only fail if it’s still deploying.

Suggested fix
-            if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then
+            if cmd_transition "$task_id" "deployed" 2>>"$SUPERVISOR_LOG"; then
                 log_success "Auto-recovered $task_id: deploying -> deployed (t222)"
                 send_task_notification "$task_id" "deployed" "Auto-recovered from stuck deploying state" 2>>"$SUPERVISOR_LOG" || true
                 store_success_pattern "$task_id" "deployed" "" 2>>"$SUPERVISOR_LOG" || true
                 write_proof_log --task "$task_id" --event "auto_recover" --stage "deploying" \
                     --decision "deploying->deployed" --evidence "stuck_state_recovery" \
                     --maker "pr_lifecycle:t222" 2>/dev/null || true
             else
-                log_error "Auto-recovery failed for $task_id — transition to deployed rejected"
-                # If the transition itself is invalid, something is deeply wrong.
-                # Transition to failed so the task doesn't stay stuck forever.
-                cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true
-                send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true
+                # Re-check state to avoid clobbering a concurrent transition
+                local current_state
+                current_state=$(db "$SUPERVISOR_DB" "SELECT status FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "")
+                if [[ "$current_state" == "deploying" ]]; then
+                    log_error "Auto-recovery failed for $task_id — transition to deployed rejected"
+                    cmd_transition "$task_id" "failed" --error "Auto-recovery failed: deploying->deployed transition rejected (t222)" 2>>"$SUPERVISOR_LOG" || true
+                    send_task_notification "$task_id" "failed" "Stuck in deploying, auto-recovery failed" 2>>"$SUPERVISOR_LOG" || true
+                else
+                    log_info "Auto-recovery skipped: $task_id already $current_state"
+                fi
             fi
🤖 Prompt for AI Agents
In @.agents/scripts/supervisor-helper.sh around lines 7244 - 7256, The current
else branch blindly marks the task "failed" when cmd_transition "$task_id"
"deployed" fails, which can corrupt state if another process already moved the
task; change this to re-query the task's current status (using the project’s
existing status-check command for task_id) before forcing failure and only call
cmd_transition "$task_id" "failed" and send_task_notification if the re-queried
status is still "deploying"; otherwise log the observed status with
log_error/log_success and optionally call write_proof_log/send_task_notification
reflecting the actual state, ensuring all commands still redirect to
$SUPERVISOR_LOG as in the original block.

fi
else
log_info "[dry-run] Would auto-recover $task_id from deploying to deployed"
fi

# t222: Record recovery timing
local stage_end
stage_end=$(date +%s)
stage_timings="${stage_timings}deploying_recovery:$((stage_end - stage_start))s,"
fi

# t219: Record total lifecycle timing and log to proof-log
local lifecycle_end_time
lifecycle_end_time=$(date +%s)
Expand Down Expand Up @@ -7774,6 +7817,33 @@ cmd_pulse() {
done <<< "$stale_diags"
fi

# Phase 4d: Auto-recover stuck deploying tasks (t222)
# Tasks can get stuck in 'deploying' if the deploy succeeds but the
# transition to 'deployed' fails (e.g., DB write error, process killed
# mid-transition). Detect tasks that have been in 'deploying' for longer
# than the deploy timeout and auto-recover them by calling cmd_pr_lifecycle
# directly (its Step 4b now handles the deploying state).
local deploying_timeout_seconds="${SUPERVISOR_DEPLOY_TIMEOUT:-600}" # 10 min default
local stuck_deploying
stuck_deploying=$(db "$SUPERVISOR_DB" "
SELECT id, updated_at FROM tasks
WHERE status = 'deploying'
AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${deploying_timeout_seconds} seconds');
" 2>/dev/null || echo "")

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Error suppression with 2>/dev/null violates the repository style guide. Rule #50 states that 2>/dev/null is only acceptable for redirecting to log files, not for blanket suppression. If the database query fails for reasons other than returning no results (e.g., locked database, syntax error), the error will be silently ignored. It's safer to redirect stderr to the supervisor log.

Suggested change
" 2>/dev/null || echo "")
" 2>>"$SUPERVISOR_LOG" || echo "")
References
  1. Rule #50: 2>/dev/null is acceptable ONLY when redirecting to log files, not for blanket suppression. This change suppresses stderr from the database command, which can hide critical errors. (link)


if [[ -n "$stuck_deploying" ]]; then
while IFS='|' read -r stuck_id stuck_updated; do
[[ -n "$stuck_id" ]] || continue
log_warn " Stuck deploying: $stuck_id (last updated: ${stuck_updated:-unknown}, timeout: ${deploying_timeout_seconds}s) — triggering recovery (t222)"
# Call cmd_pr_lifecycle directly; its Step 4b handles tasks that are
# already in the deploying state
cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
log_error " Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
Comment on lines +7841 to +7842

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

The fallback logic here seems incorrect. If cmd_pr_lifecycle fails, it indicates a more severe problem than a simple transition failure (which is already handled inside cmd_pr_lifecycle by transitioning to failed). Forcing the task to deployed in this case is risky, as the deployment state is unknown and the task might not be deployed at all. A safer fallback would be to transition the task to failed to signal that manual intervention is required.

Suggested change
log_error " Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
log_error " Recovery failed for stuck deploying task $stuck_id — forcing to failed for manual review"
cmd_transition "$stuck_id" "failed" --error "Catastrophic recovery failure for stuck deploying task (t222)" 2>>"$SUPERVISOR_LOG" || true

}
Comment on lines +7840 to +7843
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Guard the force-deploy fallback with a state re-check.

Line 7840: The fallback forces deployed on any cmd_pr_lifecycle error. If the task was transitioned to failed/blocked/cancelled between the query and recovery, this overwrites the real outcome. Re-read current status and only force when still deploying.

Suggested fix
-            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
-                log_error "  Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
-                cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
-            }
+            cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
+                log_error "  Recovery failed for stuck deploying task $stuck_id — evaluating fallback"
+                local current_state
+                current_state=$(db "$SUPERVISOR_DB" "SELECT status FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo "")
+                if [[ "$current_state" == "deploying" ]]; then
+                    cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
+                else
+                    log_warn "  Skipping force-recovery for $stuck_id (state now $current_state)"
+                fi
+            }
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
log_error " Recovery failed for stuck deploying task $stuck_id — forcing to deployed"
cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
}
cmd_pr_lifecycle "$stuck_id" 2>>"$SUPERVISOR_LOG" || {
log_error " Recovery failed for stuck deploying task $stuck_id — evaluating fallback"
local current_state
current_state=$(db "$SUPERVISOR_DB" "SELECT status FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo "")
if [[ "$current_state" == "deploying" ]]; then
cmd_transition "$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" 2>>"$SUPERVISOR_LOG" || true
else
log_warn " Skipping force-recovery for $stuck_id (state now $current_state)"
fi
}
🤖 Prompt for AI Agents
In @.agents/scripts/supervisor-helper.sh around lines 7840 - 7843, The fallback
unconditionally forces "deployed" on any cmd_pr_lifecycle error; change it to
re-check the current task state for stuck_id before forcing: after
cmd_pr_lifecycle "$stuck_id" fails, invoke a safe read of the task status (e.g.,
call cmd_pr_lifecycle or a status query for "$stuck_id" and capture/parse its
output), log that status to SUPERVISOR_LOG, and only call cmd_transition
"$stuck_id" "deployed" --error "Force-recovered from stuck deploying (t222)" if
the re-read shows the task is still in "deploying"; if the state is "failed",
"blocked", "cancelled" or any other non-deploying value, skip the
force-transition and preserve the real outcome (ensure all commands and their
errors are redirected to "$SUPERVISOR_LOG" and failures of the re-check are
handled without blindly forcing).

done <<< "$stuck_deploying"
fi

# Phase 5: Summary
local total_running
total_running=$(cmd_running_count "${batch_id:-}")
Expand Down
112 changes: 112 additions & 0 deletions tests/test-supervisor-state-machine.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1575,6 +1575,118 @@ fi

rm -rf "$CLAIM_TEST_DIR"

# ============================================================
# SECTION: Stuck Deploying Auto-Recovery (t222)
# ============================================================
section "Stuck Deploying Auto-Recovery (t222)"

# Prerequisite: a task that has reached 'deploying' can be moved on to
# 'deployed'. Create the task, then advance it one state at a time.
sup add test-t222a --repo /tmp/test --description "Deploying recovery test" >/dev/null
for _t222a_state in dispatched running evaluating complete pr_review merging merged deploying; do
    sup transition test-t222a "$_t222a_state" >/dev/null
done

# After the walk above the task must report 'deploying'.
case "$(get_status test-t222a)" in
    deploying)
        pass "Task reaches deploying state correctly"
        ;;
    *)
        fail "Task should be in deploying state: $(get_status test-t222a)"
        ;;
esac

# Simulated recovery step: deploying -> deployed must be accepted.
sup transition test-t222a deployed >/dev/null
case "$(get_status test-t222a)" in
    deployed)
        pass "deploying -> deployed recovery transition succeeds (t222)"
        ;;
    *)
        fail "deploying -> deployed recovery failed: $(get_status test-t222a)"
        ;;
esac

# Deploy-failure path: a task in 'deploying' may also be moved to 'failed'.
sup add test-t222b --repo /tmp/test --description "Deploying failure test" >/dev/null
# Advance the fresh task one state at a time until it is in 'deploying'.
for _t222b_state in dispatched running evaluating complete pr_review merging merged deploying; do
    sup transition test-t222b "$_t222b_state" >/dev/null
done
sup transition test-t222b failed --error "Deploy failed during recovery" >/dev/null

# The task must now report 'failed'.
case "$(get_status test-t222b)" in
    failed)
        pass "deploying -> failed transition succeeds (deploy failure path)"
        ;;
    *)
        fail "deploying -> failed transition failed: $(get_status test-t222b)"
        ;;
esac

# The state_log table should contain the deploying -> deployed transition
# performed on test-t222a. Each row is rendered as "from->to".
log_entries=$(test_db "SELECT from_state || '->' || to_state FROM state_log WHERE task_id = 'test-t222a' AND from_state = 'deploying';")
case "$log_entries" in
    *"deploying->deployed"*)
        pass "State log records deploying -> deployed recovery (t222)"
        ;;
    *)
        fail "State log missing deploying -> deployed entry" "Got: $log_entries"
        ;;
esac

# End-to-end check: source the supervisor script and call cmd_pr_lifecycle
# on a task that is sitting in 'deploying'; it should leave that state.
sup add test-t222c --repo /tmp/test --description "PR lifecycle deploying recovery" >/dev/null
# Advance the fresh task one state at a time until it is in 'deploying'.
for _t222c_state in dispatched running evaluating complete pr_review merging merged deploying; do
    sup transition test-t222c "$_t222c_state" >/dev/null
done

# Run the lifecycle in a child bash so sourcing the script cannot affect this
# test shell. Output (stdout+stderr) is kept for the failure message, and a
# non-zero exit is tolerated because the verdict is based on the final status.
lifecycle_output=$(bash -c "
export AIDEVOPS_SUPERVISOR_DIR='$TEST_DIR'
set -- init
source '$SUPERVISOR_SCRIPT' >/dev/null
cmd_pr_lifecycle 'test-t222c'
" 2>&1 || true)

recovered_status=$(get_status test-t222c)
case "$recovered_status" in
    deployed)
        pass "cmd_pr_lifecycle auto-recovers stuck deploying -> deployed (t222)"
        ;;
    failed)
        # Also accepted: the recovery transition was rejected and the task
        # was moved to 'failed' instead of staying stuck.
        pass "cmd_pr_lifecycle handles stuck deploying (transitioned to failed)"
        ;;
    *)
        fail "cmd_pr_lifecycle did not recover stuck deploying task" "Status: $recovered_status, Output: $(echo "$lifecycle_output" | tail -3)"
        ;;
esac

# Negative case: deploying -> queued is not a legal transition and must be
# rejected without changing the task's state.
sup add test-t222d --repo /tmp/test --description "Invalid deploying transition" >/dev/null
# Advance the fresh task one state at a time until it is in 'deploying'.
for _t222d_state in dispatched running evaluating complete pr_review merging merged deploying; do
    sup transition test-t222d "$_t222d_state" >/dev/null
done

# Capture stdout+stderr of the rejected transition; the command is expected
# to fail, so its exit status is deliberately ignored.
invalid_deploying=$(sup transition test-t222d queued 2>&1 || true)
if grep -qi "invalid transition" <<<"$invalid_deploying"; then
    pass "deploying -> queued rejected (invalid transition)"
else
    fail "deploying -> queued should be rejected" "$invalid_deploying"
fi

# The rejected transition must have left the task in 'deploying'.
case "$(get_status test-t222d)" in
    deploying)
        pass "State unchanged after invalid deploying transition (t222)"
        ;;
    *)
        fail "State changed despite invalid transition: $(get_status test-t222d)"
        ;;
esac

# ============================================================
# SUMMARY
# ============================================================
Expand Down
Loading