Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions .agents/scripts/supervisor/ai-actions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,8 @@ validate_action_fields() {
echo "missing required field: task_id"
return 0
fi
if [[ -z "$new_priority" ]]; then
echo "missing required field: new_priority"
return 0
fi
# new_priority is no longer strictly required — the executor infers it
# from reasoning text if missing (see _exec_adjust_priority)
;;
close_verified)
local issue_number pr_number
Expand Down Expand Up @@ -715,9 +713,25 @@ _exec_adjust_priority() {

local task_id new_priority reasoning
task_id=$(printf '%s' "$action" | jq -r '.task_id')
new_priority=$(printf '%s' "$action" | jq -r '.new_priority')
new_priority=$(printf '%s' "$action" | jq -r '.new_priority // empty')
reasoning=$(printf '%s' "$action" | jq -r '.reasoning // "No reasoning provided"')

# Infer priority from reasoning if the AI omitted the field (common pattern —
# the AI has omitted new_priority in 13+ actions across 5+ cycles).
# Keyword groups are tested from most to least urgent; the first match wins.
if [[ -z "$new_priority" || "$new_priority" == "null" ]]; then
	# grep -E is used because "|" alternation is POSIX in extended regex,
	# whereas "\|" in basic regex is a GNU extension.
	if grep -qiE 'critical|urgent|blocker|blocking' <<<"$reasoning"; then
		new_priority="critical"
	# "important" must not be preceded by a letter, so "unimportant" is not
	# read as an escalation. "prioriti" stays a stem (prioritize/prioritise).
	elif grep -qiE 'high|(^|[^[:alpha:]])important|prioriti' <<<"$reasoning"; then
		new_priority="high"
	# "low" must start a word: still matches low/lower/lowest, but no longer
	# fires on follow, allow, below, slow, etc.
	elif grep -qiE '(^|[^[:alpha:]])low|minor|defer' <<<"$reasoning"; then
		new_priority="low"
	else
		# Default to high — the AI is recommending a change, usually an escalation
		new_priority="high"
	fi
	log_warn "AI Actions: adjust_priority inferred new_priority='$new_priority' from reasoning (field was missing)"
fi

# Find the task's GitHub issue number
local issue_number=""
if declare -f find_task_issue_number &>/dev/null; then
Expand Down
36 changes: 36 additions & 0 deletions .agents/scripts/supervisor/ai-reason.sh
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,22 @@ ${user_prompt}"

if [[ -z "$action_plan" || "$action_plan" == "null" ]]; then
log_warn "AI Reasoning: no parseable action plan in response"
# Debug diagnostics for intermittent parse failures: capture the size of the
# raw AI response, how many fenced json blocks it opens, and its head/tail.
local response_len json_block_count first_bytes last_bytes
response_len=$(printf '%s' "$ai_result" | wc -c | tr -d ' ')
# grep -c prints "0" itself when nothing matches but exits 1, so an
# "|| echo 0" fallback would append a second "0" to the captured value.
json_block_count=$(printf '%s' "$ai_result" | grep -c '^```json' || true)
# Covers the case where grep produced no output at all (e.g. failed to run).
json_block_count=${json_block_count:-0}
# Newlines are flattened to spaces so each excerpt stays on one line.
first_bytes=$(printf '%s' "$ai_result" | head -c 100 | tr '\n' ' ')
last_bytes=$(printf '%s' "$ai_result" | tail -c 100 | tr '\n' ' ')
{
echo "## Parsing Result"
echo ""
echo "Status: FAILED - no parseable JSON action plan"
echo ""
echo "### Debug Diagnostics"
echo "- Response length: $response_len bytes"
echo "- \`\`\`json blocks found: $json_block_count"
echo "- First 100 bytes: \`$first_bytes\`"
echo "- Last 100 bytes: \`$last_bytes\`"
} >>"$reason_log"
echo '{"error":"no_action_plan","actions":[]}'
_release_ai_lock
Expand Down Expand Up @@ -517,6 +529,30 @@ extract_action_plan() {
fi
fi

# Try 5: Write response to temp file and parse from file
# Intended for edge cases where extraction from the shell variable failed
# (e.g., null bytes, very long lines, or subshell truncation).
# The awk program keeps the last ```json block; a block that is still open
# at end of input takes precedence over the last properly closed one.
local tmpfile
# Skip this attempt entirely when no temp file can be created.
if tmpfile=$(mktemp "${TMPDIR:-/tmp}/ai-parse-XXXXXX"); then
	json_block=$(
		# The EXIT trap is scoped to this subshell, so the temp file is removed
		# on every exit path without clobbering traps set by the caller.
		trap 'rm -f -- "$tmpfile"' EXIT
		printf '%s' "$response" >"$tmpfile" || exit 1
		awk '
			/^```json/ { capture=1; block=""; next }
			/^```$/ && capture { capture=0; last_block=block; next }
			capture { block = block (block ? "\n" : "") $0 }
			END { if (capture && block) print block; else if (last_block) print last_block }
		' "$tmpfile"
	)

	# Only accept the block if jq can parse it and produces output.
	if [[ -n "$json_block" ]] && parsed=$(printf '%s' "$json_block" | jq '.' 2>/dev/null) && [[ -n "$parsed" ]]; then
		printf '%s' "$parsed"
		return 0
	fi
fi
Comment on lines +535 to +554
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

This new logic for file-based parsing introduces a temporary file but doesn't use a `trap` to ensure its cleanup. This violates the repository style guide, which requires a `trap` for all temporary files so that they are not left behind when the script errors out or is interrupted.

Suggested change
local tmpfile
tmpfile=$(mktemp "${TMPDIR:-/tmp}/ai-parse-XXXXXX")
printf '%s' "$response" >"$tmpfile"
# Try file-based extraction of last ```json block
json_block=$(awk '
/^```json/ { capture=1; block=""; next }
/^```$/ && capture { capture=0; last_block=block; next }
capture { block = block (block ? "\n" : "") $0 }
END { if (capture && block) print block; else if (last_block) print last_block }
' "$tmpfile")
rm -f "$tmpfile"
if [[ -n "$json_block" ]]; then
parsed=$(printf '%s' "$json_block" | jq '.' 2>/dev/null)
if [[ $? -eq 0 && -n "$parsed" ]]; then
printf '%s' "$parsed"
return 0
fi
fi
if parsed=$( (
local tmpfile
tmpfile=$(mktemp "${TMPDIR:-/tmp}/ai-parse-XXXXXX")
trap 'rm -f "$tmpfile"' EXIT
printf '%s' "$response" >"$tmpfile"
awk '
/^```json/ { capture=1; block=""; next }
/^```$/ && capture { capture=0; last_block=block; next }
capture { block = block (block ? "\n" : "") $0 }
END { if (capture && block) print block; else if (last_block) print last_block }
' "$tmpfile" | jq '.' 2>/dev/null
) ); then
if [[ -n "$parsed" ]]; then
printf '%s' "$parsed"
return 0
fi
fi
References
  1. The style guide mandates that temporary files must have a trap for cleanup to ensure they are removed even if the script exits unexpectedly. This change adds a temporary file without a corresponding trap. (link)


# Failed to parse
echo ""
return 0
Expand Down
112 changes: 112 additions & 0 deletions .agents/scripts/supervisor/pulse.sh
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,65 @@ cmd_pulse() {
done <<<"$retrying_tasks"
fi

# Phase 1c: Auto-reap stuck evaluating tasks (self-healing)
# Tasks can get stuck in 'evaluating' when the worker dies but evaluation
# fails or times out. Phase 1 handles tasks with dead workers that it finds
# in the running_tasks query, but tasks can also get stuck if:
# - The evaluation itself crashed (jq error, timeout, etc.)
# - The task was left in evaluating from a previous pulse that was killed
# This phase catches any evaluating task older than 10 minutes with no
# live worker process, and force-transitions it to failed for retry.
local stuck_evaluating
stuck_evaluating=$(db -separator '|' "$SUPERVISOR_DB" "
SELECT id, updated_at FROM tasks
WHERE status = 'evaluating'
AND updated_at < strftime('%Y-%m-%dT%H:%M:%SZ', 'now', '-10 minutes')
ORDER BY updated_at ASC;
" 2>/dev/null || echo "")

if [[ -n "$stuck_evaluating" ]]; then
while IFS='|' read -r stuck_id stuck_updated; do
[[ -z "$stuck_id" ]] && continue

# Double-check: is the worker actually dead?
local stuck_pid_file="$SUPERVISOR_DIR/pids/${stuck_id}.pid"
local stuck_alive=false
if [[ -f "$stuck_pid_file" ]]; then
local stuck_pid
stuck_pid=$(cat "$stuck_pid_file" 2>/dev/null || echo "")
if [[ -n "$stuck_pid" ]] && kill -0 "$stuck_pid" 2>/dev/null; then
stuck_alive=true
fi
fi

if [[ "$stuck_alive" == "true" ]]; then
log_info " Phase 1c: $stuck_id evaluating since $stuck_updated but worker still alive — skipping"
continue
fi

log_warn " Phase 1c: $stuck_id stuck in evaluating since $stuck_updated (worker dead) — force-transitioning to failed"

# Check retry count
local stuck_retries stuck_max_retries
stuck_retries=$(db "$SUPERVISOR_DB" "SELECT retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 0)
stuck_max_retries=$(db "$SUPERVISOR_DB" "SELECT max_retries FROM tasks WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || echo 3)
Comment on lines +540 to +541
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

These database queries are constructed by interpolating variables directly into the SQL string. Although `sql_escape` is applied, string interpolation is more fragile than bound parameters and becomes an SQL injection vulnerability if the escaping is ever incomplete or omitted. The repository style guide recommends using parameterized queries where possible to prevent such issues. This pattern is used multiple times in the new Phase 1c and Phase 2b logic.

Consider refactoring the db helper function to support parameterized queries. A hypothetical usage might look like this:

db "$SUPERVISOR_DB" "SELECT retries FROM tasks WHERE id = ?;" --param "$stuck_id"
References
  1. The style guide states that parameterized queries should be used where possible for database interactions to enhance security. (link)
  2. To prevent SQL injection in shell scripts using sqlite3, create a helper function that uses .param set for safe parameterized bindings instead of direct string interpolation.


if [[ "$stuck_retries" -lt "$stuck_max_retries" ]]; then
# Transition to retrying so it gets re-dispatched
cmd_transition "$stuck_id" "retrying" --error "Auto-reaped: stuck in evaluating >10min with dead worker (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
db "$SUPERVISOR_DB" "UPDATE tasks SET retries = retries + 1, updated_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now') WHERE id = '$(sql_escape "$stuck_id")';" 2>/dev/null || true
log_info " Phase 1c: $stuck_id → retrying (retry $((stuck_retries + 1))/$stuck_max_retries)"
else
# Max retries exhausted — mark as failed
cmd_transition "$stuck_id" "failed" --error "Auto-reaped: stuck in evaluating >10min, max retries exhausted (Phase 1c)" 2>>"$SUPERVISOR_LOG" || true
log_warn " Phase 1c: $stuck_id → failed (max retries exhausted)"
fi

# Clean up PID file
cleanup_worker_processes "$stuck_id" 2>>"$SUPERVISOR_LOG" || true
done <<<"$stuck_evaluating"
fi

# Phase 2: Dispatch queued tasks up to concurrency limit

if [[ -n "$batch_id" ]]; then
Expand Down Expand Up @@ -555,6 +614,59 @@ cmd_pulse() {
fi
fi

# Phase 2b: Dispatch stall detection and auto-recovery
# If there are queued tasks but nothing was dispatched and nothing is running,
# the pipeline is stalled. Common causes:
# - No active batch (auto-pickup creates batches, but may have failed)
# - All tasks stuck in non-dispatchable states (evaluating, blocked)
# - Provider unavailable for extended period
# - Concurrency limit misconfigured to 0
if [[ "$dispatched_count" -eq 0 ]]; then
local queued_count running_count
queued_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM tasks WHERE status = 'queued';" 2>/dev/null || echo 0)
running_count=$(db "$SUPERVISOR_DB" "SELECT COUNT(*) FROM tasks WHERE status IN ('running', 'dispatched');" 2>/dev/null || echo 0)

if [[ "$queued_count" -gt 0 && "$running_count" -eq 0 ]]; then
log_warn "Phase 2b: Dispatch stall detected — $queued_count queued, 0 running, 0 dispatched this pulse"

# Diagnose: is there an active batch?
local active_batch_count
active_batch_count=$(db "$SUPERVISOR_DB" "
SELECT COUNT(*) FROM batches
WHERE status IN ('active', 'running');" 2>/dev/null || echo 0)

if [[ "$active_batch_count" -eq 0 ]]; then
log_warn "Phase 2b: No active batch found — queued tasks have no batch to dispatch from"
# Auto-recovery: trigger auto-pickup to create a batch
# This handles the case where tasks were added to the DB but no batch was created
local stall_repos
stall_repos=$(db "$SUPERVISOR_DB" "SELECT DISTINCT repo FROM tasks WHERE status = 'queued';" 2>/dev/null || echo "")
if [[ -n "$stall_repos" ]]; then
while IFS= read -r stall_repo; do
[[ -z "$stall_repo" ]] && continue
log_info "Phase 2b: Re-running auto-pickup for $stall_repo to create batch"
cmd_auto_pickup --repo "$stall_repo" 2>>"$SUPERVISOR_LOG" || true
done <<<"$stall_repos"
fi
else
# Batch exists but dispatch failed — log diagnostic info
local batch_info
batch_info=$(db -separator '|' "$SUPERVISOR_DB" "
SELECT id, concurrency, status FROM batches
WHERE status IN ('active', 'running')
LIMIT 1;" 2>/dev/null || echo "")
log_warn "Phase 2b: Active batch exists ($batch_info) but dispatch produced 0 — check concurrency limits and provider health"
fi

# Track stall count in state_log for the AI self-reflection to pick up
db "$SUPERVISOR_DB" "
INSERT INTO state_log (task_id, from_state, to_state, reason)
VALUES ('supervisor', 'dispatch', 'stalled',
'$(sql_escape "Dispatch stall: $queued_count queued, 0 running, 0 dispatched. Active batches: $active_batch_count")');
" 2>/dev/null || true
fi
fi

# Phase 2.5: Contest mode — check running contests for completion (t1011)
# If any contest has all entries complete, evaluate cross-rankings and apply winner
local contest_helper="${SCRIPT_DIR}/contest-helper.sh"
Expand Down
Loading