Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions .agents/scripts/supervisor/database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,17 @@ CONTEST_SQL
log_success "Added timing columns to stale_recovery_log (t1249)"
fi

# Migrate: add eval_duration_secs column to tasks if missing (t1252)
# Records how long the AI evaluation step took — enables trend analysis and
# early detection of evaluation hangs before the full stale timeout fires.
local has_eval_duration_secs
has_eval_duration_secs=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM pragma_table_info('tasks') WHERE name='eval_duration_secs';" 2>/dev/null || echo "0")
if [[ "$has_eval_duration_secs" -eq 0 ]]; then
log_info "Migrating tasks table: adding eval_duration_secs column (t1252)..."
db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN eval_duration_secs INTEGER DEFAULT NULL;" 2>/dev/null || true
log_success "Added eval_duration_secs column to tasks (t1252)"
fi

# Prune old action_dedup_log entries (keep last 7 days)
db "$SUPERVISOR_DB" "DELETE FROM action_dedup_log WHERE created_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-7 days');" 2>/dev/null || true

Expand Down Expand Up @@ -783,6 +794,9 @@ CREATE TABLE IF NOT EXISTS tasks (
-- t1249: timestamp set when task transitions to 'evaluating' state
-- enables measurement of lag between worker completion and evaluation start
evaluating_started_at TEXT,
-- t1252: duration of the AI evaluation step in seconds
-- enables trend analysis and early detection of evaluation hangs
eval_duration_secs INTEGER DEFAULT NULL,
created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')),
started_at TEXT,
completed_at TEXT,
Expand Down
77 changes: 70 additions & 7 deletions .agents/scripts/supervisor/evaluate.sh
Original file line number Diff line number Diff line change
Expand Up @@ -864,19 +864,26 @@ record_evaluation_metadata() {
tier_delta_tag=",tier_delta:${requested_tier}->${actual_tier}"
fi

# t1252: Look up eval_duration_secs from DB for inclusion in pattern record
local eval_duration_secs=""
if [[ -n "${SUPERVISOR_DB:-}" ]]; then
eval_duration_secs=$(db "$SUPERVISOR_DB" "SELECT eval_duration_secs FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "")
fi

# Build description
local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]"
[[ -n "$requested_tier" ]] && description="${description} [req:${requested_tier}]"
[[ -n "$actual_tier" ]] && description="${description} [act:${actual_tier}]"
[[ -n "$tokens_in" || -n "$tokens_out" ]] && description="${description} [tokens:${tokens_in:-0}+${tokens_out:-0}]"
[[ -n "$eval_duration_secs" ]] && description="${description} [eval:${eval_duration_secs}s]"

# Build record args — add tier and token counts when available (t1114, t1117)
local record_args=(
--outcome "$pt_outcome"
--task-type "$task_type"
--task-id "$task_id"
--description "$description"
--tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}${requested_tier:+,requested_tier:${requested_tier}}${actual_tier:+,actual_tier:${actual_tier}}${tier_delta_tag}"
--tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}${requested_tier:+,requested_tier:${requested_tier}}${actual_tier:+,actual_tier:${actual_tier}}${tier_delta_tag}${eval_duration_secs:+,eval_duration:${eval_duration_secs}s}"
)
[[ -n "$model_tier" ]] && record_args+=(--model "$model_tier")
[[ -n "$tokens_in" ]] && record_args+=(--tokens-in "$tokens_in")
Expand Down Expand Up @@ -1424,15 +1431,49 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
local eval_model
eval_model=$(resolve_model "eval" "$ai_cli")

# t1252: Record eval start timestamp for duration tracking.
local eval_start_epoch
eval_start_epoch=$(date +%s 2>/dev/null || echo 0)

# t1251: Log evaluation start so _diagnose_stale_root_cause can detect
# "eval in progress" vs "eval died" — reduces false stale-evaluating recoveries.
log_info "evaluate_with_ai: starting AI eval for $task_id (timeout=${eval_timeout}s, model=$eval_model)"
log_info "evaluate_with_ai: starting AI eval for $task_id (timeout=${eval_timeout}s, model=$eval_model, started=$(date -u '+%Y-%m-%dT%H:%M:%SZ' 2>/dev/null || echo 'unknown'))"

# t1251: Heartbeat — touch updated_at so Phase 0.7's grace period doesn't
# fire while evaluation is actively running. Without this, a 60s+ eval that
# started just before the grace window expires triggers unnecessary recovery.
db "$SUPERVISOR_DB" "UPDATE tasks SET updated_at = strftime('%Y-%m-%dT%H:%M:%SZ','now') WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || true

# t1252: Watchdog timer — detect evaluation hangs within 60s instead of
# waiting for the full stale timeout (120s grace + eval_timeout).
# The watchdog fires a warning log at 60s so trends are visible in logs
# before the timeout kills the eval at eval_timeout seconds.
# Uses a background subshell that self-terminates when the eval completes.
# Cleanup scope ensures sentinel file and watchdog process are released on
# any exit path (normal return, early return, or unexpected signal).
_save_cleanup_scope
trap '_run_cleanups' RETURN
local watchdog_pid=""
local watchdog_sentinel
watchdog_sentinel=$(mktemp 2>/dev/null || echo "")
if [[ -n "$watchdog_sentinel" ]]; then
# Push sentinel removal first (LIFO: runs last, after process kill)
push_cleanup "rm -f '${watchdog_sentinel}' 2>/dev/null || true"
(
local watchdog_delay="${SUPERVISOR_EVAL_WATCHDOG:-60}"
sleep "$watchdog_delay"
# Only fire if sentinel still exists (eval still running)
if [[ -f "$watchdog_sentinel" ]]; then
log_warn "evaluate_with_ai: WATCHDOG — AI eval for $task_id has been running for ${watchdog_delay}s (timeout=${eval_timeout}s) — possible hang detected"
# Emit watchdog metric to supervisor log for trend analysis
log_info "EVAL_WATCHDOG_FIRED task=$task_id elapsed=${watchdog_delay}s timeout=${eval_timeout}s model=$eval_model"
fi
) &
watchdog_pid=$!
# Push process cleanup second (LIFO: runs first, before sentinel removal)
push_cleanup "kill '${watchdog_pid}' 2>/dev/null; wait '${watchdog_pid}' 2>/dev/null || true"
fi

if [[ "$ai_cli" == "opencode" ]]; then
ai_result=$(timeout "$eval_timeout" opencode run \
-m "$eval_model" \
Expand All @@ -1448,6 +1489,28 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
--output-format text 2>/dev/null || echo "")
fi

# t1252: Cancel watchdog — eval completed (or timed out), no longer needed.
# Remove sentinel first so watchdog subshell won't fire if it wakes up late.
if [[ -n "$watchdog_sentinel" ]]; then
rm -f "$watchdog_sentinel" 2>/dev/null || true
fi
if [[ -n "$watchdog_pid" ]]; then
kill "$watchdog_pid" 2>/dev/null || true
wait "$watchdog_pid" 2>/dev/null || true
fi

# t1252: Calculate and record eval duration.
local eval_end_epoch eval_duration_secs=""
eval_end_epoch=$(date +%s 2>/dev/null || echo 0)
if [[ "$eval_start_epoch" -gt 0 && "$eval_end_epoch" -gt 0 ]]; then
eval_duration_secs=$((eval_end_epoch - eval_start_epoch))
log_info "evaluate_with_ai: AI eval for $task_id completed in ${eval_duration_secs}s"
# Persist duration to DB for trend analysis (t1252)
db "$SUPERVISOR_DB" "UPDATE tasks SET eval_duration_secs = ${eval_duration_secs} WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || true
# Emit structured metric line for log-based trend analysis
log_info "EVAL_DURATION task=$task_id duration_secs=${eval_duration_secs} timeout=${eval_timeout}s model=$eval_model"
fi

# Parse extended VERDICT line (t1096): try full format first, fall back to basic
# Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n
# Basic format: VERDICT:type:detail (legacy / fallback)
Expand All @@ -1466,13 +1529,13 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
# Extract quality (after :QUALITY:)
local ai_quality="${fmode_part##*:QUALITY:}"

log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]"
log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]${eval_duration_secs:+ [duration:${eval_duration_secs}s]}"

# Store AI evaluation in state log for audit trail
db "$SUPERVISOR_DB" "
INSERT INTO state_log (task_id, from_state, to_state, reason)
VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating',
'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}');
'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}${eval_duration_secs:+ duration:${eval_duration_secs}s}');
" 2>/dev/null || true

# Encode AI-derived fields in stdout so they survive subshell capture.
Expand All @@ -1488,21 +1551,21 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li

if [[ -n "$basic_verdict_line" ]]; then
local verdict="${basic_verdict_line#VERDICT:}"
log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)"
log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)${eval_duration_secs:+ [duration:${eval_duration_secs}s]}"

# Store AI evaluation in state log for audit trail
db "$SUPERVISOR_DB" "
INSERT INTO state_log (task_id, from_state, to_state, reason)
VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating',
'AI eval verdict: $verdict');
'AI eval verdict: $verdict${eval_duration_secs:+ duration:${eval_duration_secs}s}');
" 2>/dev/null || true

echo "$verdict"
return 0
fi

# AI didn't return a parseable verdict
log_warn "AI eval for $task_id returned unparseable result"
log_warn "AI eval for $task_id returned unparseable result${eval_duration_secs:+ (after ${eval_duration_secs}s)}"
return 1
}

Expand Down
24 changes: 24 additions & 0 deletions .agents/scripts/supervisor/pulse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,30 @@ cmd_stale_gc_report() {
echo ""
fi

# t1252: Eval duration analysis — how long AI evaluation actually takes
# Only shown when eval_duration_secs data is available (requires t1252 migration)
local has_eval_duration_data
has_eval_duration_data=$(db "$SUPERVISOR_DB" "
SELECT count(*) FROM tasks
WHERE eval_duration_secs IS NOT NULL
AND updated_at >= strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${days} days');
" 2>/dev/null || echo "0")
if [[ "$has_eval_duration_data" -gt 0 ]]; then
echo "--- Eval Duration Analysis (t1252: AI evaluation step timing) ---"
db -column -header "$SUPERVISOR_DB" "
SELECT
count(*) AS Evals,
printf('%.1f', avg(eval_duration_secs)) AS 'Avg Duration (s)',
max(eval_duration_secs) AS 'Max Duration (s)',
min(eval_duration_secs) AS 'Min Duration (s)',
sum(CASE WHEN eval_duration_secs >= 60 THEN 1 ELSE 0 END) AS 'Watchdog Threshold Exceeded'
FROM tasks
WHERE eval_duration_secs IS NOT NULL
AND updated_at >= strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-${days} days');
" 2>/dev/null || echo "(no data)"
echo ""
fi

# Daily trend
echo "--- Daily Trend ---"
db -column -header "$SUPERVISOR_DB" "
Expand Down
Loading