diff --git a/.agents/scripts/supervisor/database.sh b/.agents/scripts/supervisor/database.sh index b0eea6c02..fbdf0ec10 100755 --- a/.agents/scripts/supervisor/database.sh +++ b/.agents/scripts/supervisor/database.sh @@ -651,6 +651,20 @@ CONTEST_SQL log_success "Added requested_tier and actual_tier columns to tasks (t1117)" fi + # Migrate: add dispatch deduplication guard columns to tasks (t1206) + # last_failure_at: ISO timestamp of the most recent failure (retry/blocked/failed) + # consecutive_failure_count: number of consecutive failures with the same error code + # Used by check_dispatch_dedup_guard() to enforce 10-min cooldown and block after + # 2 identical consecutive failures (prevents token waste on repeating failures). + local has_last_failure_at + has_last_failure_at=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM pragma_table_info('tasks') WHERE name='last_failure_at';" 2>/dev/null || echo "0") + if [[ "$has_last_failure_at" -eq 0 ]]; then + log_info "Migrating tasks table: adding dispatch dedup guard columns (t1206)..." 
+ db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN last_failure_at TEXT;" 2>/dev/null || true + db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN consecutive_failure_count INTEGER NOT NULL DEFAULT 0;" 2>/dev/null || true + log_success "Added last_failure_at and consecutive_failure_count columns to tasks (t1206)" + fi + # Migrate: create stale_recovery_log table if missing (t1202) local has_stale_recovery_log has_stale_recovery_log=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='stale_recovery_log';" 2>/dev/null || echo "0") @@ -731,6 +745,8 @@ CREATE TABLE IF NOT EXISTS tasks ( prompt_repeat_done INTEGER NOT NULL DEFAULT 0, requested_tier TEXT, actual_tier TEXT, + last_failure_at TEXT, + consecutive_failure_count INTEGER NOT NULL DEFAULT 0, created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')), started_at TEXT, completed_at TEXT, diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh index 0d45e2c74..4dc127c2d 100755 --- a/.agents/scripts/supervisor/dispatch.sh +++ b/.agents/scripts/supervisor/dispatch.sh @@ -1349,6 +1349,160 @@ run_quality_gate() { return 0 } +####################################### +# Dispatch deduplication guard (t1206) +# Prevents re-dispatching tasks that failed with the same error in a short window. +# Guards against token waste from repeated identical failures (e.g., t1032.1 failed +# twice within 2 minutes with the same error; t1030 failed twice within 22 minutes). +# +# Rules enforced: +# 1. 10-minute cooldown after any failure before re-dispatch of the same task +# 2. After 2 consecutive identical failures, move task to 'blocked' with diagnostic note +# 3. 
#      Log a warning when the same task fails with the same error code twice in succession
#
# Usage: check_dispatch_dedup_guard <task_id>
# Arguments:
#   $1 - id of the task about to be dispatched
# Globals (read):
#   SUPERVISOR_DB                        - database passed to db()
#   SUPERVISOR_FAILURE_COOLDOWN_SECS     - cooldown in seconds (default 600);
#                                          must be a non-negative integer
#   SUPERVISOR_MAX_CONSECUTIVE_FAILURES  - block threshold (default 2);
#                                          must be an integer >= 1
# Returns:
#   0 = proceed with dispatch
#   1 = blocked (task transitioned to blocked state, caller should return 1)
#   2 = cooldown active (defer dispatch, caller should return 3 to pulse)
#######################################
check_dispatch_dedup_guard() {
	local task_id="$1"
	local escaped_id
	escaped_id=$(sql_escape "$task_id")

	# Fetch dedup guard fields from DB. A failed query is treated like
	# "no row": the guard never vetoes a dispatch it cannot justify.
	local guard_row
	guard_row=$(db -separator '|' "$SUPERVISOR_DB" "
		SELECT COALESCE(last_failure_at, ''),
		       COALESCE(consecutive_failure_count, 0),
		       COALESCE(error, '')
		FROM tasks WHERE id = '$escaped_id';
	" 2>/dev/null) || guard_row=""

	if [[ -z "$guard_row" ]]; then
		return 0
	fi

	# The error text is the last field so that any '|' it contains stays in it.
	local last_failure_at consecutive_count last_error
	IFS='|' read -r last_failure_at consecutive_count last_error <<<"$guard_row"

	# No prior failure recorded — proceed
	if [[ -z "$last_failure_at" ]]; then
		return 0
	fi

	# Sanitise every number before it reaches an arithmetic context. Inside
	# [[ -ge ]] / (( )) a non-numeric string is evaluated as a variable name
	# (usually 0), so a typo in SUPERVISOR_MAX_CONSECUTIVE_FAILURES would
	# otherwise make "count >= 0" true and block every task that ever failed.
	if [[ "$consecutive_count" =~ ^[0-9]+$ ]]; then
		consecutive_count=$((10#$consecutive_count))
	else
		consecutive_count=0
	fi

	local cooldown_secs="${SUPERVISOR_FAILURE_COOLDOWN_SECS:-600}" # 10 minutes default
	if [[ "$cooldown_secs" =~ ^[0-9]+$ ]]; then
		cooldown_secs=$((10#$cooldown_secs))
	else
		log_warn " $task_id: ignoring invalid SUPERVISOR_FAILURE_COOLDOWN_SECS='$cooldown_secs' — using 600 (t1206)"
		cooldown_secs=600
	fi

	local max_consecutive="${SUPERVISOR_MAX_CONSECUTIVE_FAILURES:-2}"
	if [[ "$max_consecutive" =~ ^[0-9]+$ ]] && ((10#$max_consecutive >= 1)); then
		max_consecutive=$((10#$max_consecutive))
	else
		log_warn " $task_id: ignoring invalid SUPERVISOR_MAX_CONSECUTIVE_FAILURES='$max_consecutive' — using 2 (t1206)"
		max_consecutive=2
	fi

	# Rule 2: Block after max_consecutive identical failures.
	# NOTE(review): nothing in this function clears consecutive_failure_count
	# when the task is blocked; confirm that whatever un-blocks a task also
	# resets it, otherwise a manually re-queued task is re-blocked here.
	if ((consecutive_count >= max_consecutive)); then
		local block_reason="Dispatch dedup guard: $consecutive_count consecutive identical failures (error: ${last_error:-unknown}) — manual intervention required (t1206)"
		log_warn " $task_id: BLOCKED by dedup guard — $consecutive_count consecutive identical failures with error '${last_error:-unknown}'"
		# Best-effort side effects: none of them may keep the guard from
		# reporting "blocked" to the caller.
		cmd_transition "$task_id" "blocked" --error "$block_reason" 2>/dev/null || true
		update_todo_on_blocked "$task_id" "$block_reason" 2>/dev/null || true
		send_task_notification "$task_id" "blocked" "$block_reason" 2>/dev/null || true
		store_failure_pattern "$task_id" "blocked" "$block_reason" "dispatch-dedup-guard" 2>/dev/null || true
		return 1
	fi

	# Rule 1: Enforce cooldown window.
	# Convert the ISO timestamp to epoch seconds: BSD/macOS date first
	# (-j -f), then GNU date (-d). Empty output means "could not parse".
	local now_epoch last_failure_epoch
	now_epoch=$(date -u +%s 2>/dev/null) || now_epoch=""
	last_failure_epoch=$(date -u -j -f '%Y-%m-%dT%H:%M:%SZ' "$last_failure_at" '+%s' 2>/dev/null ||
		date -u -d "$last_failure_at" '+%s' 2>/dev/null ||
		echo "")

	# Without two usable epochs the cooldown cannot be evaluated. Fail open,
	# but say so: a silent fallback to epoch 0 would either disable the
	# cooldown unnoticed or (for the current time) defer the task forever.
	if [[ ! "$now_epoch" =~ ^[0-9]+$ || ! "$last_failure_epoch" =~ ^[0-9]+$ ]]; then
		log_warn " $task_id: dedup guard cannot evaluate cooldown (last_failure_at='$last_failure_at') — proceeding with dispatch (t1206)"
		return 0
	fi

	# A timestamp in the future (clock skew) counts as "failed just now" so
	# the reported remaining time never exceeds the cooldown itself.
	local elapsed_secs=$((now_epoch - last_failure_epoch))
	if ((elapsed_secs < 0)); then
		elapsed_secs=0
	fi

	if ((elapsed_secs < cooldown_secs)); then
		local remaining=$((cooldown_secs - elapsed_secs))
		log_warn " $task_id: dispatch dedup cooldown active — last failure ${elapsed_secs}s ago (cooldown: ${cooldown_secs}s, ${remaining}s remaining, error: ${last_error:-unknown}) (t1206)"
		return 2
	fi

	return 0
}

#######################################
# Update dispatch dedup guard fields after a failure (t1206)
# Called from pulse.sh retry handler to track failure timestamps and counts.
# Increments consecutive_failure_count if error matches previous error,
# resets to 1 if error changed (different failure mode = fresh start).
#
# Usage: update_failure_dedup_state <task_id> [error_detail]
# Arguments:
#   $1 - id of the task that just failed
#   $2 - error detail of this failure (optional, default empty); only the
#        text before the first ':' is used to decide "same error"
# Globals (read):
#   SUPERVISOR_DB                        - database passed to db()
#   SUPERVISOR_MAX_CONSECUTIVE_FAILURES  - threshold used for the warning
#                                          (default 2)
# Returns:
#   0 always — this is best-effort bookkeeping; a failed write is logged
#   through log_warn instead of being propagated.
#######################################
update_failure_dedup_state() {
	local task_id="${1:-}"
	local error_detail="${2:-}"

	if [[ -z "$task_id" ]]; then
		log_warn "update_failure_dedup_state: missing task id — dedup state not recorded (t1206)"
		return 0
	fi

	local escaped_id
	escaped_id=$(sql_escape "$task_id")

	# Fetch current state. The error column still holds the PREVIOUS failure
	# here; a failed query falls back to "no streak, no previous error".
	local current_row
	current_row=$(db -separator '|' "$SUPERVISOR_DB" "
		SELECT COALESCE(consecutive_failure_count, 0),
		       COALESCE(error, '')
		FROM tasks WHERE id = '$escaped_id';
	" 2>/dev/null) || current_row="0|"

	# The error text is the last field so that any '|' it contains stays in it.
	local current_count current_error
	IFS='|' read -r current_count current_error <<<"$current_row"

	# Never feed an unchecked string to $(( )): a malformed value would abort
	# the expansion and leave a syntactically broken UPDATE behind.
	if [[ "$current_count" =~ ^[0-9]+$ ]]; then
		current_count=$((10#$current_count))
	else
		current_count=0
	fi

	local max_consecutive="${SUPERVISOR_MAX_CONSECUTIVE_FAILURES:-2}"
	if [[ "$max_consecutive" =~ ^[0-9]+$ ]] && ((10#$max_consecutive >= 1)); then
		max_consecutive=$((10#$max_consecutive))
	else
		max_consecutive=2
	fi

	# Normalise error strings for comparison (strip trailing detail after first colon)
	local new_error_key="${error_detail%%:*}"
	local current_error_key="${current_error%%:*}"

	# Different (or no previous) error — the streak starts over at 1.
	local new_count=1
	if [[ -n "$current_error_key" && "$new_error_key" == "$current_error_key" ]]; then
		# Same error type — increment consecutive count
		new_count=$((current_count + 1))
		if ((new_count >= max_consecutive)); then
			log_warn " $task_id: consecutive failure #${new_count} with same error '${new_error_key}' — dedup guard will block next dispatch (threshold: $max_consecutive) (t1206)"
		fi
	fi

	# The timestamp is produced by SQLite with the same strftime() expression
	# the tasks table uses for created_at, so it cannot end up empty and it is
	# written in the same statement as the counter.
	if ! db "$SUPERVISOR_DB" "
		UPDATE tasks
		SET last_failure_at = strftime('%Y-%m-%dT%H:%M:%SZ','now'),
		    consecutive_failure_count = $new_count
		WHERE id = '$escaped_id';
	" 2>/dev/null; then
		# Without this row the guard sees no failure at all, so make the
		# loss visible instead of discarding it.
		log_warn " $task_id: failed to record dedup guard state (consecutive_failure_count=$new_count) — cooldown/blocking may not apply (t1206)"
	fi

	return 0
}

#######################################
# Reset dispatch dedup guard state after successful task completion (t1206)
# Clears last_failure_at and consecutive_failure_count so a re-queued task
# is not deferred by a stale cooldown from a pre-success failure.
+# +# Usage: reset_failure_dedup_state +####################################### +reset_failure_dedup_state() { + local task_id="$1" + local escaped_id + escaped_id=$(sql_escape "$task_id") + + db "$SUPERVISOR_DB" " + UPDATE tasks + SET last_failure_at = NULL, + consecutive_failure_count = 0 + WHERE id = '$escaped_id'; + " 2>/dev/null || true + + return 0 +} + ####################################### # Pre-dispatch model health check (t132.3, t233) # Two-tier probe strategy: @@ -2168,6 +2322,18 @@ cmd_dispatch() { return 1 fi + # Dispatch deduplication guard (t1206): prevent re-dispatch of tasks that failed + # with the same error within a short window. Avoids token waste on repeating failures. + local dedup_rc=0 + check_dispatch_dedup_guard "$task_id" || dedup_rc=$? + if [[ "$dedup_rc" -eq 1 ]]; then + # Task was transitioned to blocked by the guard — abort dispatch + return 1 + elif [[ "$dedup_rc" -eq 2 ]]; then + # Cooldown active — defer to next pulse (return 3 = provider-style deferral) + return 3 + fi + # Resolve AI CLI local ai_cli ai_cli=$(resolve_ai_cli) || return 1 diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index 3038acb7c..2e2e0db76 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -1043,6 +1043,10 @@ ${stale_other_tasks}" completed_count=$((completed_count + 1)) # Clean up worker process tree and PID file (t128.7) cleanup_worker_processes "$tid" + # Reset dispatch dedup guard state on success (t1206): clear last_failure_at + # and consecutive_failure_count so a re-queued task is not deferred by a + # stale cooldown from a pre-success failure. 
+ reset_failure_dedup_state "$tid" 2>>"$SUPERVISOR_LOG" || true # --- Non-critical post-processing below (safe to lose on kill) --- # Proof-log: task completion (t218) write_proof_log --task "$tid" --event "complete" --stage "evaluate" \ @@ -1079,6 +1083,12 @@ ${stale_other_tasks}" write_proof_log --task "$tid" --event "retry" --stage "evaluate" \ --decision "retry:$outcome_detail" \ --maker "pulse:phase1" 2>/dev/null || true + # Update dispatch dedup guard state (t1206): track failure timestamp and + # consecutive count so check_dispatch_dedup_guard() can enforce cooldown + # and block tasks that fail identically 2+ times in succession. + # NOTE: must run BEFORE cmd_transition so the DB error column still holds + # the *previous* failure's error for accurate streak comparison. + update_failure_dedup_state "$tid" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true cmd_transition "$tid" "retrying" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true # Clean up worker process tree before re-prompt (t128.7) cleanup_worker_processes "$tid"