Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions .agents/scripts/supervisor/database.sh
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,20 @@ CONTEST_SQL
log_success "Added requested_tier and actual_tier columns to tasks (t1117)"
fi

# Migrate: add dispatch deduplication guard columns to tasks (t1206)
# last_failure_at: ISO timestamp of the most recent failure (retry/blocked/failed)
# consecutive_failure_count: number of consecutive failures with the same error code
# Used by check_dispatch_dedup_guard() to enforce 10-min cooldown and block after
# 2 identical consecutive failures (prevents token waste on repeating failures).
local has_last_failure_at
has_last_failure_at=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM pragma_table_info('tasks') WHERE name='last_failure_at';" 2>/dev/null || echo "0")
if [[ "$has_last_failure_at" -eq 0 ]]; then
log_info "Migrating tasks table: adding dispatch dedup guard columns (t1206)..."
db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN last_failure_at TEXT;" 2>/dev/null || true
db "$SUPERVISOR_DB" "ALTER TABLE tasks ADD COLUMN consecutive_failure_count INTEGER NOT NULL DEFAULT 0;" 2>/dev/null || true
log_success "Added last_failure_at and consecutive_failure_count columns to tasks (t1206)"
fi

# Migrate: create stale_recovery_log table if missing (t1202)
local has_stale_recovery_log
has_stale_recovery_log=$(db "$SUPERVISOR_DB" "SELECT count(*) FROM sqlite_master WHERE type='table' AND name='stale_recovery_log';" 2>/dev/null || echo "0")
Expand Down Expand Up @@ -731,6 +745,8 @@ CREATE TABLE IF NOT EXISTS tasks (
prompt_repeat_done INTEGER NOT NULL DEFAULT 0,
requested_tier TEXT,
actual_tier TEXT,
last_failure_at TEXT,
consecutive_failure_count INTEGER NOT NULL DEFAULT 0,
created_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ','now')),
started_at TEXT,
completed_at TEXT,
Expand Down
166 changes: 166 additions & 0 deletions .agents/scripts/supervisor/dispatch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1349,6 +1349,160 @@ run_quality_gate() {
return 0
}

#######################################
# Dispatch deduplication guard (t1206)
# Prevents re-dispatching tasks that failed with the same error in a short window.
# Guards against token waste from repeated identical failures (e.g., t1032.1 failed
# twice within 2 minutes with the same error; t1030 failed twice within 22 minutes).
#
# Rules enforced (evaluated in this order):
#   1. Once consecutive_failure_count reaches SUPERVISOR_MAX_CONSECUTIVE_FAILURES
#      (default 2), move the task to 'blocked' with a diagnostic note
#   2. Otherwise enforce a cooldown of SUPERVISOR_FAILURE_COOLDOWN_SECS
#      (default 600 = 10 minutes) measured from last_failure_at
#
# The guard fails open (returns 0) when it has nothing reliable to act on:
# the task row cannot be read, no failure has been recorded, or the stored
# timestamp cannot be converted to an epoch (a warning is logged in that case).
# Non-numeric threshold overrides are ignored with a warning and the defaults
# are used instead, so a typo such as "10m" cannot silently disable the guard.
#
# Globals:
#   SUPERVISOR_DB (read)
#   SUPERVISOR_FAILURE_COOLDOWN_SECS, SUPERVISOR_MAX_CONSECUTIVE_FAILURES (read, optional)
# Arguments:
#   $1 - task id
# Usage: check_dispatch_dedup_guard <task_id>
# Returns:
#   0 = proceed with dispatch
#   1 = blocked (task transitioned to blocked state, caller should return 1)
#   2 = cooldown active (defer dispatch, caller should return 3 to pulse)
#######################################
check_dispatch_dedup_guard() {
	local task_id="$1"
	local escaped_id
	escaped_id=$(sql_escape "$task_id")

	# Fetch dedup guard fields from DB. The error text is selected last so that
	# any '|' it contains stays inside the final field when the row is split.
	local guard_row
	guard_row=$(db -separator '|' "$SUPERVISOR_DB" "
		SELECT COALESCE(last_failure_at, ''),
		COALESCE(consecutive_failure_count, 0),
		COALESCE(error, '')
		FROM tasks WHERE id = '$escaped_id';
	" 2>/dev/null) || guard_row=""

	# No row (or the query failed) — nothing to guard against
	if [[ -z "$guard_row" ]]; then
		return 0
	fi

	local last_failure_at consecutive_count last_error
	IFS='|' read -r last_failure_at consecutive_count last_error <<<"$guard_row"

	# No prior failure recorded — proceed
	if [[ -z "$last_failure_at" ]]; then
		return 0
	fi

	# Thresholds are compared numerically below. [[ -ge/-lt ]] evaluates its
	# operands as arithmetic expressions, so anything that is not a plain
	# decimal integer must be rejected here rather than reach the comparison.
	local cooldown_secs="${SUPERVISOR_FAILURE_COOLDOWN_SECS:-600}" # 10 minutes default
	local max_consecutive="${SUPERVISOR_MAX_CONSECUTIVE_FAILURES:-2}"
	if [[ ! "$cooldown_secs" =~ ^[0-9]+$ ]]; then
		log_warn " $task_id: ignoring non-numeric SUPERVISOR_FAILURE_COOLDOWN_SECS '$cooldown_secs' — using default 600 (t1206)"
		cooldown_secs=600
	fi
	if [[ ! "$max_consecutive" =~ ^[0-9]+$ ]]; then
		log_warn " $task_id: ignoring non-numeric SUPERVISOR_MAX_CONSECUTIVE_FAILURES '$max_consecutive' — using default 2 (t1206)"
		max_consecutive=2
	fi
	if [[ ! "$consecutive_count" =~ ^[0-9]+$ ]]; then
		consecutive_count=0
	fi
	# Force base 10 so values with leading zeros are not read as octal
	cooldown_secs=$((10#$cooldown_secs))
	max_consecutive=$((10#$max_consecutive))
	consecutive_count=$((10#$consecutive_count))

	# Block after max_consecutive identical failures (takes precedence over cooldown)
	if [[ "$consecutive_count" -ge "$max_consecutive" ]]; then
		local block_reason="Dispatch dedup guard: $consecutive_count consecutive identical failures (error: ${last_error:-unknown}) — manual intervention required (t1206)"
		log_warn " $task_id: BLOCKED by dedup guard — $consecutive_count consecutive identical failures with error '${last_error:-unknown}'"
		# Bookkeeping is best-effort: the task must be reported as blocked to the
		# caller even if one of these helpers fails.
		cmd_transition "$task_id" "blocked" --error "$block_reason" 2>/dev/null || true
		update_todo_on_blocked "$task_id" "$block_reason" 2>/dev/null || true
		send_task_notification "$task_id" "blocked" "$block_reason" 2>/dev/null || true
		store_failure_pattern "$task_id" "blocked" "$block_reason" "dispatch-dedup-guard" 2>/dev/null || true
		return 1
	fi

	# Calculate seconds since last failure
	local now_epoch last_failure_epoch
	now_epoch=$(date -u +%s 2>/dev/null) || now_epoch=0
	# Convert ISO timestamp to epoch: BSD/macOS syntax first, then GNU syntax
	last_failure_epoch=$(date -u -j -f '%Y-%m-%dT%H:%M:%SZ' "$last_failure_at" '+%s' 2>/dev/null ||
		date -u -d "$last_failure_at" '+%s' 2>/dev/null ||
		echo 0)

	# Without two usable epochs the cooldown cannot be evaluated. Fail open,
	# but say so instead of silently skipping the cooldown.
	if [[ ! "$now_epoch" =~ ^[1-9][0-9]*$ || ! "$last_failure_epoch" =~ ^[1-9][0-9]*$ ]]; then
		log_warn " $task_id: dedup guard cannot evaluate cooldown (last_failure_at: '$last_failure_at') — proceeding with dispatch (t1206)"
		return 0
	fi

	local elapsed_secs=$((now_epoch - last_failure_epoch))
	# A timestamp in the future (clock skew) counts as "just failed"
	if ((elapsed_secs < 0)); then
		elapsed_secs=0
	fi

	# Enforce cooldown window
	if [[ "$elapsed_secs" -lt "$cooldown_secs" ]]; then
		local remaining=$((cooldown_secs - elapsed_secs))
		log_warn " $task_id: dispatch dedup cooldown active — last failure ${elapsed_secs}s ago (cooldown: ${cooldown_secs}s, ${remaining}s remaining, error: ${last_error:-unknown}) (t1206)"
		return 2
	fi

	return 0
}

#######################################
# Record a task failure for the dispatch dedup guard (t1206)
# Stamps last_failure_at with the current UTC time and maintains the
# consecutive_failure_count streak for the task.
#
# Errors are compared by "key": the text before the first ':' of the message.
# When the key of the new error equals the key of the error currently stored
# on the task (and that stored key is non-empty), the streak grows by one;
# in every other case the streak restarts at 1.
# A warning is logged once the streak reaches
# SUPERVISOR_MAX_CONSECUTIVE_FAILURES (default 2).
#
# Globals:
#   SUPERVISOR_DB (read)
#   SUPERVISOR_MAX_CONSECUTIVE_FAILURES (read, optional)
# Arguments:
#   $1 - task id
#   $2 - error detail of the failure being recorded (optional)
# Usage: update_failure_dedup_state <task_id> <error_detail>
# Returns: always 0 (DB errors are deliberately ignored — best effort)
#######################################
update_failure_dedup_state() {
	local task_id="$1"
	local error_detail="${2:-}"
	local escaped_id
	escaped_id=$(sql_escape "$task_id")

	# Read the streak and the error recorded for the previous failure
	local state_row
	state_row=$(db -separator '|' "$SUPERVISOR_DB" "
		SELECT COALESCE(consecutive_failure_count, 0),
		COALESCE(error, '')
		FROM tasks WHERE id = '$escaped_id';
	" 2>/dev/null) || state_row="0|"

	local prior_count prior_error
	IFS='|' read -r prior_count prior_error <<<"$state_row"

	# Compare only the part before the first colon; trailing detail may vary
	local incoming_key="${error_detail%%:*}"
	local stored_key="${prior_error%%:*}"
	local threshold="${SUPERVISOR_MAX_CONSECUTIVE_FAILURES:-2}"

	# A new failure mode (or no stored error) starts a fresh streak
	local streak=1
	if [[ -n "$stored_key" && "$incoming_key" == "$stored_key" ]]; then
		streak=$((prior_count + 1))
		if [[ "$streak" -ge "$threshold" ]]; then
			log_warn " $task_id: consecutive failure #${streak} with same error '${incoming_key}' — dedup guard will block next dispatch (threshold: $threshold) (t1206)"
		fi
	fi

	local stamp escaped_stamp
	stamp=$(date -u +%Y-%m-%dT%H:%M:%SZ)
	escaped_stamp=$(sql_escape "$stamp")

	db "$SUPERVISOR_DB" "
		UPDATE tasks
		SET last_failure_at = '$escaped_stamp',
		consecutive_failure_count = $streak
		WHERE id = '$escaped_id';
	" 2>/dev/null || true

	return 0
}

#######################################
# Clear the dispatch dedup guard state of a task (t1206)
# Sets last_failure_at back to NULL and consecutive_failure_count back to 0,
# so that a task which completed successfully and is later re-queued is not
# held back by a cooldown or streak left over from an earlier failure.
#
# Globals:
#   SUPERVISOR_DB (read)
# Arguments:
#   $1 - task id
# Usage: reset_failure_dedup_state <task_id>
# Returns: always 0 (DB errors are deliberately ignored — best effort)
#######################################
reset_failure_dedup_state() {
	local task_id="$1"
	local escaped_id
	escaped_id=$(sql_escape "$task_id")

	# Build the statement first so the db call itself stays on one line
	local reset_sql="
		UPDATE tasks
		SET last_failure_at = NULL,
		consecutive_failure_count = 0
		WHERE id = '$escaped_id';
	"

	db "$SUPERVISOR_DB" "$reset_sql" 2>/dev/null || true

	return 0
}

#######################################
# Pre-dispatch model health check (t132.3, t233)
# Two-tier probe strategy:
Expand Down Expand Up @@ -2168,6 +2322,18 @@ cmd_dispatch() {
return 1
fi

# Dispatch deduplication guard (t1206): prevent re-dispatch of tasks that failed
# with the same error within a short window. Avoids token waste on repeating failures.
local dedup_rc=0
check_dispatch_dedup_guard "$task_id" || dedup_rc=$?
if [[ "$dedup_rc" -eq 1 ]]; then
# Task was transitioned to blocked by the guard — abort dispatch
return 1
elif [[ "$dedup_rc" -eq 2 ]]; then
# Cooldown active — defer to next pulse (return 3 = provider-style deferral)
return 3
fi

# Resolve AI CLI
local ai_cli
ai_cli=$(resolve_ai_cli) || return 1
Expand Down
10 changes: 10 additions & 0 deletions .agents/scripts/supervisor/pulse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,10 @@ ${stale_other_tasks}"
completed_count=$((completed_count + 1))
# Clean up worker process tree and PID file (t128.7)
cleanup_worker_processes "$tid"
# Reset dispatch dedup guard state on success (t1206): clear last_failure_at
# and consecutive_failure_count so a re-queued task is not deferred by a
# stale cooldown from a pre-success failure.
reset_failure_dedup_state "$tid" 2>>"$SUPERVISOR_LOG" || true
# --- Non-critical post-processing below (safe to lose on kill) ---
# Proof-log: task completion (t218)
write_proof_log --task "$tid" --event "complete" --stage "evaluate" \
Expand Down Expand Up @@ -1079,6 +1083,12 @@ ${stale_other_tasks}"
write_proof_log --task "$tid" --event "retry" --stage "evaluate" \
--decision "retry:$outcome_detail" \
--maker "pulse:phase1" 2>/dev/null || true
# Update dispatch dedup guard state (t1206): track failure timestamp and
# consecutive count so check_dispatch_dedup_guard() can enforce cooldown
# and block tasks that fail identically 2+ times in succession.
# NOTE: must run BEFORE cmd_transition so the DB error column still holds
# the *previous* failure's error for accurate streak comparison.
update_failure_dedup_state "$tid" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
cmd_transition "$tid" "retrying" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
# Clean up worker process tree before re-prompt (t128.7)
cleanup_worker_processes "$tid"
Expand Down
Loading