diff --git a/.agents/scripts/supervisor/database.sh b/.agents/scripts/supervisor/database.sh index 75d30248..b32f6a63 100755 --- a/.agents/scripts/supervisor/database.sh +++ b/.agents/scripts/supervisor/database.sh @@ -727,6 +727,17 @@ CONTEST_SQL log_success "Added timing columns to stale_recovery_log (t1249)" fi + # Migrate: add eval_duration_secs column to tasks if missing (t1252) + # Records how long the AI evaluation step took — enables trend analysis and + # early detection of evaluation hangs before the full stale timeout fires. + local has_eval_duration_secs + has_eval_duration_secs=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM pragma_table_info('tasks') WHERE name='eval_duration_secs';" 2>/dev/null || echo "0") + if [[ "$has_eval_duration_secs" -eq 0 ]]; then + log_info "Migrating tasks table: adding eval_duration_secs column (t1252)..." + db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN eval_duration_secs INTEGER DEFAULT NULL;" 2>/dev/null || true + log_success "Added eval_duration_secs column to tasks (t1252)" + fi + # Prune old action_dedup_log entries (keep last 7 days) db "$SUPERVISOR_DB" "DELETE FROM action_dedup_log WHERE created_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-7 days');" 2>/dev/null || true @@ -783,6 +794,9 @@ CREATE TABLE IF NOT EXISTS tasks ( -- t1249: timestamp set when task transitions to 'evaluating' state -- enables measurement of lag between worker completion and evaluation start evaluating_started_at TEXT, + -- t1252: duration of the AI evaluation step in seconds + -- enables trend analysis and early detection of evaluation hangs + eval_duration_secs INTEGER DEFAULT NULL, created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')), started_at TEXT, completed_at TEXT, diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index fc54295a..e8b59139 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -864,11 +864,18 @@ 
record_evaluation_metadata() { tier_delta_tag=",tier_delta:${requested_tier}->${actual_tier}" fi + # t1252: Look up eval_duration_secs from DB for inclusion in pattern record + local eval_duration_secs="" + if [[ -n "${SUPERVISOR_DB:-}" ]]; then + eval_duration_secs=$(db "$SUPERVISOR_DB" "SELECT eval_duration_secs FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "") + fi + # Build description local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]" [[ -n "$requested_tier" ]] && description="${description} [req:${requested_tier}]" [[ -n "$actual_tier" ]] && description="${description} [act:${actual_tier}]" [[ -n "$tokens_in" || -n "$tokens_out" ]] && description="${description} [tokens:${tokens_in:-0}+${tokens_out:-0}]" + [[ -n "$eval_duration_secs" ]] && description="${description} [eval:${eval_duration_secs}s]" # Build record args — add tier and token counts when available (t1114, t1117) local record_args=( @@ -876,7 +883,7 @@ record_evaluation_metadata() { --task-type "$task_type" --task-id "$task_id" --description "$description" - --tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}${requested_tier:+,requested_tier:${requested_tier}}${actual_tier:+,actual_tier:${actual_tier}}${tier_delta_tag}" + --tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}${requested_tier:+,requested_tier:${requested_tier}}${actual_tier:+,actual_tier:${actual_tier}}${tier_delta_tag}${eval_duration_secs:+,eval_duration:${eval_duration_secs}s}" ) [[ -n "$model_tier" ]] && record_args+=(--model "$model_tier") [[ -n "$tokens_in" ]] && record_args+=(--tokens-in "$tokens_in") @@ -1424,15 +1431,49 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li local eval_model eval_model=$(resolve_model "eval" "$ai_cli") + # t1252: Record eval start timestamp for duration tracking. 
+ local eval_start_epoch + eval_start_epoch=$(date +%s 2>/dev/null || echo 0) + # t1251: Log evaluation start so _diagnose_stale_root_cause can detect # "eval in progress" vs "eval died" — reduces false stale-evaluating recoveries. - log_info "evaluate_with_ai: starting AI eval for $task_id (timeout=${eval_timeout}s, model=$eval_model)" + log_info "evaluate_with_ai: starting AI eval for $task_id (timeout=${eval_timeout}s, model=$eval_model, started=$(date -u '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo 'unknown'))" # t1251: Heartbeat — touch updated_at so Phase 0.7's grace period doesn't # fire while evaluation is actively running. Without this, a 60s+ eval that # started just before the grace window expires triggers unnecessary recovery. db "$SUPERVISOR_DB" "UPDATE tasks SET updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || true + # t1252: Watchdog timer — detect evaluation hangs early instead of waiting + # for the full stale timeout (120s grace + eval_timeout). The watchdog + # fires a warning log after SUPERVISOR_EVAL_WATCHDOG seconds (default 60) + # so hang trends are visible in logs before the eval_timeout kill fires. + # Uses a background subshell that self-terminates when the eval completes. + # Cleanup scope ensures sentinel file and watchdog process are released on + # any exit path (normal return, early return, or unexpected signal). 
+ _save_cleanup_scope + trap '_run_cleanups' RETURN + local watchdog_pid="" + local watchdog_sentinel + watchdog_sentinel=$(mktemp 2>/dev/null || echo "") + if [[ -n "$watchdog_sentinel" ]]; then + # Push sentinel removal first (LIFO: runs last, after process kill) + push_cleanup "rm -f '${watchdog_sentinel}' 2>/dev/null || true" + ( + local watchdog_delay="${SUPERVISOR_EVAL_WATCHDOG:-60}" + sleep "$watchdog_delay" + # Only fire if sentinel still exists (eval still running) + if [[ -f "$watchdog_sentinel" ]]; then + log_warn "evaluate_with_ai: WATCHDOG — AI eval for $task_id has been running for ${watchdog_delay}s (timeout=${eval_timeout}s) — possible hang detected" + # Emit watchdog metric to supervisor log for trend analysis + log_info "EVAL_WATCHDOG_FIRED task=$task_id elapsed=${watchdog_delay}s timeout=${eval_timeout}s model=$eval_model" + fi + ) & + watchdog_pid=$! + # Push process cleanup second (LIFO: runs first, before sentinel removal) + push_cleanup "kill '${watchdog_pid}' 2>/dev/null; wait '${watchdog_pid}' 2>/dev/null || true" + fi + if [[ "$ai_cli" == "opencode" ]]; then ai_result=$(timeout "$eval_timeout" opencode run \ -m "$eval_model" \ @@ -1448,6 +1489,28 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li --output-format text 2>/dev/null || echo "") fi + # t1252: Cancel watchdog — eval completed (or timed out), no longer needed. + # Remove sentinel first so watchdog subshell won't fire if it wakes up late. + if [[ -n "$watchdog_sentinel" ]]; then + rm -f "$watchdog_sentinel" 2>/dev/null || true + fi + if [[ -n "$watchdog_pid" ]]; then + kill "$watchdog_pid" 2>/dev/null || true + wait "$watchdog_pid" 2>/dev/null || true + fi + + # t1252: Calculate and record eval duration. 
+ local eval_end_epoch eval_duration_secs="" + eval_end_epoch=$(date +%s 2>/dev/null || echo 0) + if [[ "$eval_start_epoch" -gt 0 && "$eval_end_epoch" -gt 0 ]]; then + eval_duration_secs=$((eval_end_epoch - eval_start_epoch)) + log_info "evaluate_with_ai: AI eval for $task_id completed in ${eval_duration_secs}s" + # Persist duration to DB for trend analysis (t1252) + db "$SUPERVISOR_DB" "UPDATE tasks SET eval_duration_secs = ${eval_duration_secs} WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || true + # Emit structured metric line for log-based trend analysis + log_info "EVAL_DURATION task=$task_id duration_secs=${eval_duration_secs} timeout=${eval_timeout}s model=$eval_model" + fi + # Parse extended VERDICT line (t1096): try full format first, fall back to basic # Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n # Basic format: VERDICT:type:detail (legacy / fallback) @@ -1466,13 +1529,13 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li # Extract quality (after :QUALITY:) local ai_quality="${fmode_part##*:QUALITY:}" - log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]" + log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]${eval_duration_secs:+ [duration:${eval_duration_secs}s]}" # Store AI evaluation in state log for audit trail db "$SUPERVISOR_DB" " INSERT INTO state_log (task_id, from_state, to_state, reason) VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating', - 'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}'); + 'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}${eval_duration_secs:+ duration:${eval_duration_secs}s}'); " 2>/dev/null || true # Encode AI-derived fields in stdout so they survive subshell capture. @@ -1488,13 +1551,13 @@ Respond with ONLY the verdict line, nothing else. 
Example: VERDICT:retry:rate_li if [[ -n "$basic_verdict_line" ]]; then local verdict="${basic_verdict_line#VERDICT:}" - log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)" + log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)${eval_duration_secs:+ [duration:${eval_duration_secs}s]}" # Store AI evaluation in state log for audit trail db "$SUPERVISOR_DB" " INSERT INTO state_log (task_id, from_state, to_state, reason) VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating', - 'AI eval verdict: $verdict'); + 'AI eval verdict: $verdict${eval_duration_secs:+ duration:${eval_duration_secs}s}'); " 2>/dev/null || true echo "$verdict" @@ -1502,7 +1565,7 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li fi # AI didn't return a parseable verdict - log_warn "AI eval for $task_id returned unparseable result" + log_warn "AI eval for $task_id returned unparseable result${eval_duration_secs:+ (after ${eval_duration_secs}s)}" return 1 } diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index 14dc34a4..abc7ed27 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -422,6 +422,30 @@ cmd_stale_gc_report() { echo "" fi + # t1252: Eval duration analysis — how long AI evaluation actually takes + # Only shown when eval_duration_secs data is available (requires t1252 migration) + local has_eval_duration_data + has_eval_duration_data=$(db "$SUPERVISOR_DB" " + SELECT count(*) FROM tasks + WHERE eval_duration_secs IS NOT NULL + AND updated_at >= strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${days} days'); + " 2>/dev/null || echo "0") + if [[ "$has_eval_duration_data" -gt 0 ]]; then + echo "--- Eval Duration Analysis (t1252: AI evaluation step timing) ---" + db -column -header "$SUPERVISOR_DB" " + SELECT + count(*) AS Evals, + printf('%.1f', avg(eval_duration_secs)) AS 'Avg Duration (s)', + max(eval_duration_secs) AS 'Max Duration (s)', 
+ min(eval_duration_secs) AS 'Min Duration (s)', + sum(CASE WHEN eval_duration_secs >= 60 THEN 1 ELSE 0 END) AS 'Watchdog Threshold Exceeded' + FROM tasks + WHERE eval_duration_secs IS NOT NULL + AND updated_at >= strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${days} days'); + " 2>/dev/null || echo "(no data)" + echo "" + fi + # Daily trend echo "--- Daily Trend ---" db -column -header "$SUPERVISOR_DB" "