From 6c92ad51851de90b2389fa399d0374aaea40dc75 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 09:09:56 +0000 Subject: [PATCH] fix: reduce stale-evaluating rate via heartbeat, configurable timeout, and PR fast-path (t1251) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root causes identified (73% stale rate): 1. evaluate_with_ai() had hardcoded 60s timeout — too short under load, causing silent timeouts that left tasks stuck in 'evaluating' state for 120s grace period 2. No heartbeat during AI eval — Phase 0.7 couldn't distinguish 'eval in progress' from 'eval died', triggering unnecessary recovery for active evaluations 3. Phase 1 called AI eval even when PR was already in DB (t1245 early-persist), wasting 60-90s on an AI call whose answer was already known Fixes: - evaluate_with_ai(): configurable timeout via SUPERVISOR_EVAL_TIMEOUT (default 90s) - evaluate_with_ai(): heartbeat — updates updated_at at eval start so Phase 0.7 grace window doesn't fire while evaluation is actively running - evaluate_with_ai(): logs eval start for better stale diagnosis - _diagnose_stale_root_cause(): detects eval_in_progress_heartbeat_Ns when updated_at is within eval_timeout+30s window - Phase 0.7 + Phase 1c: skip recovery when root_cause=eval_in_progress_heartbeat_* - Phase 1: skip AI eval when PR already in DB (t1251 fast-path, saves 60-90s) --- .agents/scripts/supervisor/evaluate.sh | 14 ++++++- .agents/scripts/supervisor/pulse.sh | 58 +++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 3 deletions(-) diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index 83a81edafb..fc54295a5a 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -1417,10 +1417,22 @@ Analyze the log and determine the outcome. Look for: Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_limited:FMODE:TRANSIENT:QUALITY:1" local ai_result="" - local eval_timeout=60 + # t1251: Make eval timeout configurable — hardcoded 60s was too short under load, + # causing silent timeouts that left tasks stuck in 'evaluating' state. + # Default 90s gives more headroom; set SUPERVISOR_EVAL_TIMEOUT to override. + local eval_timeout="${SUPERVISOR_EVAL_TIMEOUT:-90}" local eval_model eval_model=$(resolve_model "eval" "$ai_cli") + # t1251: Log evaluation start so _diagnose_stale_root_cause can detect + # "eval in progress" vs "eval died" — reduces false stale-evaluating recoveries. + log_info "evaluate_with_ai: starting AI eval for $task_id (timeout=${eval_timeout}s, model=$eval_model)" + + # t1251: Heartbeat — touch updated_at so Phase 0.7's grace period doesn't + # fire while evaluation is actively running. Without this, a 60s+ eval that + # started just before the grace window expires triggers unnecessary recovery. + db "$SUPERVISOR_DB" "UPDATE tasks SET updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || true + if [[ "$ai_cli" == "opencode" ]]; then ai_result=$(timeout "$eval_timeout" opencode run \ -m "$eval_model" \ diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index c861ff9079..14dc34a4c4 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -192,7 +192,29 @@ _diagnose_stale_root_cause() { return 0 fi - # Check if there's an AI eval timeout indicator + # t1251: Check if evaluation is actively in progress via heartbeat. + # evaluate_with_ai() updates updated_at at the start of each AI eval call. + # If updated_at is very recent (< eval_timeout + 30s buffer), the eval is + # likely still running — not stuck. Return a distinct cause so callers can + # skip recovery for actively-evaluating tasks. + local eval_timeout_cfg="${SUPERVISOR_EVAL_TIMEOUT:-90}" + local heartbeat_window=$((eval_timeout_cfg + 30)) + local db_updated_at + db_updated_at=$(db "$SUPERVISOR_DB" "SELECT updated_at FROM tasks WHERE id = '$escaped_id';" 2>/dev/null || echo "") + if [[ -n "$db_updated_at" ]]; then + local updated_epoch now_epoch secs_since_update + updated_epoch=$(_iso_to_epoch "$db_updated_at") + now_epoch=$(date +%s 2>/dev/null || echo 0) + if [[ "$updated_epoch" -gt 0 && "$now_epoch" -gt 0 ]]; then + secs_since_update=$((now_epoch - updated_epoch)) + if [[ "$secs_since_update" -lt "$heartbeat_window" ]]; then + echo "eval_in_progress_heartbeat_${secs_since_update}s" + return 0 + fi + fi + fi + + # Check if there's an AI eval timeout indicator in the supervisor log if grep -q 'evaluate_with_ai\|AI evaluation' "$log_file" 2>/dev/null; then echo "ai_eval_timeout" return 0 @@ -881,6 +903,17 @@ cmd_pulse() { local had_pr_flag=0 [[ -n "$stale_pr_url" && "$stale_pr_url" != "no_pr" && "$stale_pr_url" != "task_only" ]] && had_pr_flag=1 + # t1251: Skip recovery if evaluation is actively in progress (heartbeat check). + # evaluate_with_ai() updates updated_at at the start of each AI eval call. + # If the root cause is eval_in_progress_heartbeat_*, the task is not stuck — + # it's actively evaluating. Recovering it would interrupt a live eval and + # cause unnecessary re-queuing (the primary driver of the 73% stale rate). + if [[ "$root_cause" == eval_in_progress_heartbeat_* ]]; then + log_info " Phase 0.7: $stale_id skipped — AI eval actively in progress ($root_cause)" + stale_skipped=$((stale_skipped + 1)) + continue + fi + # No live worker — this is stale state. Transition based on retry eligibility. # Effective grace: fast-path (evaluating+PR) < evaluating < running/dispatched local effective_grace="$stale_grace_seconds" @@ -1165,10 +1198,22 @@ cmd_pulse() { tid_model=$(db "$SUPERVISOR_DB" "SELECT model FROM tasks WHERE id = '$(sql_escape "$tid")';" 2>/dev/null || echo "") tid_repo=$(db "$SUPERVISOR_DB" "SELECT repo FROM tasks WHERE id = '$(sql_escape "$tid")';" 2>/dev/null || echo "") - # Skip AI eval for stuck tasks (it already timed out once) + # t1251: Fast-path evaluation for tasks with PR already in DB (t1245 early-persist). + # If the PR URL is already persisted, evaluate_worker() will find it immediately + # via the heuristic tiers and return complete: without needing AI eval. + # Skip AI eval proactively to avoid the 60-90s AI eval window that causes + # tasks to appear stuck in 'evaluating' and trigger Phase 0.7 recovery. + local tid_existing_pr + tid_existing_pr=$(db "$SUPERVISOR_DB" "SELECT pr_url FROM tasks WHERE id = '$(sql_escape "$tid")';" 2>/dev/null || echo "") local skip_ai="false" if [[ "$current_task_state" == "evaluating" ]]; then + # Already stuck in evaluating — skip AI (it already timed out once) + skip_ai="true" + elif [[ -n "$tid_existing_pr" && "$tid_existing_pr" != "no_pr" && "$tid_existing_pr" != "task_only" && "$tid_existing_pr" != "task_obsolete" ]]; then + # PR already in DB from t1245 early-persist — heuristic tiers will find it, + # no need for AI eval. Skipping saves 60-90s and prevents stale-evaluating. skip_ai="true" + log_info " $tid: PR already in DB ($tid_existing_pr) — skipping AI eval (t1251 fast-path)" fi local outcome @@ -1525,6 +1570,15 @@ cmd_pulse() { local stuck_eval_started_at="${_DIAG_EVAL_STARTED_AT:-}" local stuck_eval_lag_secs="${_DIAG_EVAL_LAG_SECS:-NULL}" + # t1251: Skip recovery if evaluation is actively in progress (heartbeat check). + # The heartbeat in evaluate_with_ai() updates updated_at at eval start. + # Phase 1c's grace query uses updated_at, so a heartbeat-refreshed task + # won't appear here — but check anyway as a safety net. + if [[ "$stuck_root_cause" == eval_in_progress_heartbeat_* ]]; then + log_info " Phase 1c: $stuck_id skipped — AI eval actively in progress ($stuck_root_cause)" + continue + fi + # Calculate stale duration local stuck_stale_secs=0 if [[ -n "$stuck_updated" ]]; then