-
Notifications
You must be signed in to change notification settings - Fork 35
t1075: Fix verification infinite loop, stuck verifying recovery, and bot review dismissal #1566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -598,18 +598,41 @@ cmd_pulse() { | |
| # verify_failed: PR was already merged and deployed, but | ||
| # post-merge verification failed. Put back to 'deployed' | ||
| # so Phase 3b (verify queue) can re-run verification. | ||
| # Do NOT mark complete — the quality check still needs to pass. | ||
| log_info " Phase 3b2: $stale_id (verify_failed) — PR #$pr_number merged, resetting to deployed for re-verification" | ||
| db "$SUPERVISOR_DB" "UPDATE tasks SET | ||
| status = 'deployed', | ||
| error = NULL, | ||
| updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') | ||
| WHERE id = '$escaped_stale_id';" 2>/dev/null || true | ||
| db "$SUPERVISOR_DB" "INSERT INTO state_log (task_id, from_state, to_state, timestamp, reason) | ||
| VALUES ('$escaped_stale_id', 'verify_failed', 'deployed', | ||
| strftime('%Y-%m-%dT%H:%M:%SZ','now'), | ||
| 'Phase 3b2: reset for re-verification (PR #$pr_number merged)');" 2>/dev/null || true | ||
| sync_issue_status_label "$stale_id" "deployed" "phase_3b2" 2>>"$SUPERVISOR_LOG" || true | ||
| # Cap retries to prevent infinite deployed→verify_failed loop (t1075). | ||
| local verify_reset_count | ||
| verify_reset_count=$(db "$SUPERVISOR_DB" " | ||
| SELECT COUNT(*) FROM state_log | ||
| WHERE task_id = '$escaped_stale_id' | ||
| AND from_state = 'verify_failed' | ||
| AND to_state = 'deployed';" 2>/dev/null || echo "0") | ||
| local max_verify_retries=3 | ||
|
|
||
| if [[ "$verify_reset_count" -ge "$max_verify_retries" ]]; then | ||
| # Exhausted verification retries — mark permanently failed | ||
| log_error " Phase 3b2: $stale_id exhausted $max_verify_retries verification retries — marking failed" | ||
| db "$SUPERVISOR_DB" "UPDATE tasks SET | ||
| status = 'failed', | ||
| error = 'Verification failed after $max_verify_retries retries — manual fix needed', | ||
| updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') | ||
| WHERE id = '$escaped_stale_id';" 2>/dev/null || true | ||
| db "$SUPERVISOR_DB" "INSERT INTO state_log (task_id, from_state, to_state, timestamp, reason) | ||
| VALUES ('$escaped_stale_id', 'verify_failed', 'failed', | ||
| strftime('%Y-%m-%dT%H:%M:%SZ','now'), | ||
| 'Phase 3b2: verification exhausted ($verify_reset_count/$max_verify_retries retries)');" 2>/dev/null || true | ||
| sync_issue_status_label "$stale_id" "failed" "phase_3b2" 2>>"$SUPERVISOR_LOG" || true | ||
| else | ||
| log_info " Phase 3b2: $stale_id (verify_failed) — PR #$pr_number merged, resetting to deployed for re-verification (attempt $((verify_reset_count + 1))/$max_verify_retries)" | ||
| db "$SUPERVISOR_DB" "UPDATE tasks SET | ||
| status = 'deployed', | ||
| error = NULL, | ||
| updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') | ||
| WHERE id = '$escaped_stale_id';" 2>/dev/null || true | ||
| db "$SUPERVISOR_DB" "INSERT INTO state_log (task_id, from_state, to_state, timestamp, reason) | ||
| VALUES ('$escaped_stale_id', 'verify_failed', 'deployed', | ||
| strftime('%Y-%m-%dT%H:%M:%SZ','now'), | ||
| 'Phase 3b2: reset for re-verification attempt $((verify_reset_count + 1))/$max_verify_retries (PR #$pr_number merged)');" 2>/dev/null || true | ||
| sync_issue_status_label "$stale_id" "deployed" "phase_3b2" 2>>"$SUPERVISOR_LOG" || true | ||
| fi | ||
|
Comment on lines
+613
to
+635
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The style guide (line 50) states that References
|
||
| else | ||
| # blocked: PR merged but task stuck in blocked state. | ||
| # Advance to deployed and mark complete. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -313,6 +313,35 @@ process_verify_queue() { | |
|
|
||
| ensure_db | ||
|
|
||
| # Recover tasks stuck in 'verifying' state (t1075) | ||
| # If a pulse crashes mid-verification, tasks stay in 'verifying' forever. | ||
| # Reset any task that has been in 'verifying' for more than 5 minutes back to 'deployed'. | ||
| local stuck_verifying | ||
| stuck_verifying=$(db -separator '|' "$SUPERVISOR_DB" " | ||
| SELECT id FROM tasks | ||
| WHERE status = 'verifying' | ||
| AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-5 minutes') | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| ORDER BY id; | ||
| " 2>/dev/null || echo "") | ||
|
|
||
| if [[ -n "$stuck_verifying" ]]; then | ||
| while IFS='|' read -r stuck_id; do | ||
| [[ -z "$stuck_id" ]] && continue | ||
| log_warn " $stuck_id: stuck in 'verifying' for >5min — resetting to 'deployed'" | ||
| local escaped_stuck_id | ||
| escaped_stuck_id=$(sql_escape "$stuck_id") | ||
| db "$SUPERVISOR_DB" "UPDATE tasks SET | ||
| status = 'deployed', | ||
| error = NULL, | ||
| updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') | ||
| WHERE id = '$escaped_stuck_id';" 2>/dev/null || true | ||
| db "$SUPERVISOR_DB" "INSERT INTO state_log (task_id, from_state, to_state, timestamp, reason) | ||
| VALUES ('$escaped_stuck_id', 'verifying', 'deployed', | ||
| strftime('%Y-%m-%dT%H:%M:%SZ','now'), | ||
| 'process_verify_queue: recovered from stuck verifying state (>5min timeout)');" 2>/dev/null || true | ||
|
Comment on lines
+320
to
+341
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The style guide (line 50) states that References
|
||
| done <<<"$stuck_verifying" | ||
| fi | ||
|
|
||
| # Find deployed tasks that need verification | ||
| local deployed_tasks | ||
| local where_clause="t.status = 'deployed'" | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
For better maintainability and clarity, it's best to define magic numbers like
3as a script-level constant. Since this requires a change outside the diff, I can't provide a direct suggestion, but please consider definingreadonly MAX_VERIFY_RETRIES=3at the top of the script and using$MAX_VERIFY_RETRIEShere. This also aligns with the style guide's naming convention for constants (line 39).References