From 2ffe42fc03163c77b2e30464786a09ac0a503d17 Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Thu, 19 Feb 2026 22:45:24 +0000
Subject: [PATCH 1/2] feat: add pre-dispatch CLI health check to prevent
 worker_never_started failures (t1113)

Before dispatch, verify the AI CLI binary exists and can execute. If the
check fails, defer dispatch to next pulse instead of spawning a worker that
will never produce output. This prevents burning retries on environment
issues (missing binary, broken installation, PATH problems).

The check_cli_health() function:
- Verifies binary exists in PATH (command -v)
- Runs lightweight version check with 10s timeout
- Caches result per-pulse and for 5 minutes (file-based)
- Logs diagnostic detail on failure for troubleshooting

Also adds cli_health=ok to dispatch metadata log for auditability.
---
 .agents/scripts/supervisor/dispatch.sh | 118 +++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh
index f46de2a0a..14a7ed9b4 100755
--- a/.agents/scripts/supervisor/dispatch.sh
+++ b/.agents/scripts/supervisor/dispatch.sh
@@ -1503,6 +1503,109 @@ reset_failure_dedup_state() {
 	return 0
 }
 
+#######################################
+# Pre-dispatch CLI health check (t1113)
+#
+# Verifies the AI CLI binary exists, is executable, and can produce output
+# before spawning a worker, so retries are not wasted on environment issues
+# (the "worker_never_started:no_sentinel" failure pattern).
+#
+# Strategy:
+#   1. Check binary exists in PATH (command -v)
+#   2. Run a lightweight version check (10s limit if timeout/gtimeout exists)
+#   3. Cache success in _PULSE_CLI_VERIFIED and a 5-minute file cache. The
+#      flag does not survive a $(...) call; the file cache covers that case.
+#
+# $1: ai_cli - the CLI binary name (e.g., "opencode", "claude")
+#
+# Exit codes:
+#   0 = CLI healthy, proceed with dispatch
+#   1 = CLI not found or not executable
+#
+# Outputs: diagnostic message on failure (for dispatch log)
+#######################################
+check_cli_health() {
+	local ai_cli="$1"
+
+	# Pulse-level fast path: if CLI was already verified in this pulse, skip
+	if [[ -n "${_PULSE_CLI_VERIFIED:-}" ]]; then
+		log_verbose "CLI health: pulse-verified OK (skipping check)"
+		return 0
+	fi
+
+	# File-based cache: avoid re-checking within 5 minutes
+	local cache_dir="$SUPERVISOR_DIR/health"
+	mkdir -p "$cache_dir"
+	local cli_cache_file="$cache_dir/cli-${ai_cli}"
+	if [[ -f "$cli_cache_file" ]]; then
+		local cached_at now age
+		cached_at=$(cat "$cli_cache_file" 2>/dev/null || echo "0")
+		[[ "$cached_at" =~ ^[0-9]+$ ]] || cached_at=0 # empty/corrupt cache => stale
+		now=$(date +%s)
+		age=$((now - cached_at))
+		if [[ "$age" -lt 300 ]]; then
+			log_verbose "CLI health: cached OK ($age seconds ago)"
+			_PULSE_CLI_VERIFIED="true"
+			return 0
+		fi
+	fi
+
+	# Check 1: binary exists in PATH
+	if ! command -v "$ai_cli" &>/dev/null; then
+		log_error "CLI health check FAILED: '$ai_cli' not found in PATH"
+		log_error "PATH=$PATH"
+		echo "cli_not_found:${ai_cli}"
+		return 1
+	fi
+
+	# Check 2: binary is executable and can produce version output
+	local version_output=""
+	local version_exit=0 # stays 0 on success; '|| version_exit=$?' records failures
+
+	# Use timeout to prevent hanging on broken installations
+	local timeout_cmd=""
+	if command -v gtimeout &>/dev/null; then
+		timeout_cmd="gtimeout"
+	elif command -v timeout &>/dev/null; then
+		timeout_cmd="timeout"
+	fi
+
+	if [[ "$ai_cli" == "opencode" ]]; then
+		if [[ -n "$timeout_cmd" ]]; then
+			version_output=$("$timeout_cmd" 10 "$ai_cli" version 2>&1) || version_exit=$?
+		else
+			version_output=$("$ai_cli" version 2>&1) || version_exit=$?
+		fi
+	else
+		# claude CLI
+		if [[ -n "$timeout_cmd" ]]; then
+			version_output=$("$timeout_cmd" 10 "$ai_cli" --version 2>&1) || version_exit=$?
+		else
+			version_output=$("$ai_cli" --version 2>&1) || version_exit=$?
+		fi
+	fi
+
+	# Healthy = exit 0, or any output unless timed out (124/137) or not runnable (126/127)
+	if [[ "$version_exit" -eq 0 ]] || [[ -n "$version_output" && ! "$version_exit" =~ ^(124|126|127|137)$ ]]; then
+		# Cache the healthy result
+		date +%s >"$cli_cache_file" 2>/dev/null || true
+		_PULSE_CLI_VERIFIED="true"
+		log_info "CLI health: OK ($ai_cli: ${version_output:0:80})"
+		return 0
+	fi
+
+	# Version check failed
+	if [[ "$version_exit" -eq 124 || "$version_exit" -eq 137 ]]; then
+		log_error "CLI health check FAILED: '$ai_cli' timed out (10s)"
+		echo "cli_timeout:${ai_cli}"
+	else
+		log_error "CLI health check FAILED: '$ai_cli' exited with code $version_exit"
+		log_error "Output: ${version_output:0:200}"
+		echo "cli_error:${ai_cli}:exit_${version_exit}"
+	fi
+	return 1
+}
+
 #######################################
 # Pre-dispatch model health check (t132.3, t233)
 # Two-tier probe strategy:
@@ -2354,6 +2457,20 @@ cmd_dispatch() {
 	local ai_cli
 	ai_cli=$(resolve_ai_cli) || return 1
 
+	# Pre-dispatch CLI health check (t1113): verify the AI CLI binary exists and
+	# can execute before creating worktrees and spawning workers. This prevents
+	# the "worker_never_started:no_sentinel" failure pattern where the CLI is
+	# invoked but never produces output due to environment issues (missing binary,
+	# broken installation, PATH misconfiguration). Deferring here avoids burning
+	# retries on environment problems that won't resolve between retry attempts.
+	local cli_health_exit=0 cli_health_detail=""
+	cli_health_detail=$(check_cli_health "$ai_cli") || cli_health_exit=$?
+ if [[ "$cli_health_exit" -ne 0 ]]; then + log_error "CLI health check failed for $task_id ($ai_cli): $cli_health_detail — deferring dispatch" + log_error "Fix: ensure '$ai_cli' is installed and in PATH, then retry" + return 3 # Defer to next pulse (same as provider unavailable) + fi + # Pre-dispatch model availability check (t233 — replaces simple health check) # Calls model-availability-helper.sh check before spawning workers. # Distinct exit codes prevent wasted dispatch attempts: @@ -2450,6 +2567,7 @@ cmd_dispatch() { echo "dispatch_type=${verify_mode:+verify}" echo "verify_reason=${verify_reason:-}" echo "hung_timeout_seconds=${dispatch_hung_timeout}" + echo "cli_health=ok" echo "=== END DISPATCH METADATA ===" echo "" } >"$log_file" 2>/dev/null || true From 011cf2f680f9dd476e776652b61ce42fe80d11ce Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Thu, 19 Feb 2026 22:46:47 +0000 Subject: [PATCH 2/2] feat: add ENVIRONMENT failure mode and re-queue worker_never_started without burning retries (t1113) Reclassify worker_never_started, log_file_missing, log_file_empty, and related dispatch infrastructure failures from LOGIC to new ENVIRONMENT category. These failures indicate the CLI/environment is broken, not the task itself. When the pulse detects an ENVIRONMENT failure: - Re-queues the task (evaluating -> queued) without incrementing retry count - Invalidates CLI health cache so next dispatch re-verifies the environment - Stores failure pattern for diagnostics without marking as task failure - Logs proof-log entry with failure_mode=ENVIRONMENT for auditability This prevents the Feb 13 scenario where 5 tasks exhausted their retries on worker_never_started:no_sentinel when the underlying issue was a broken CLI environment that no amount of task retries would fix. 
--- .agents/scripts/supervisor/evaluate.sh | 24 +++++++--- .agents/scripts/supervisor/pulse.sh | 66 +++++++++++++++++++------- 2 files changed, 65 insertions(+), 25 deletions(-) diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index b1e5b0b57..f93c53177 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -666,11 +666,15 @@ link_pr_to_task() { # five broad categories for pattern tracking and model routing decisions. # # Categories: -# TRANSIENT - recoverable with retry (rate limits, timeouts, backend blips) -# RESOURCE - infrastructure/environment issue (auth, OOM, disk) -# LOGIC - task/code problem (merge conflict, test failure, build error) -# BLOCKED - external dependency (human needed, upstream, missing context) -# AMBIGUOUS - unclear cause (clean exit, max retries, unknown) +# TRANSIENT - recoverable with retry (rate limits, timeouts, backend blips) +# RESOURCE - infrastructure/environment issue (auth, OOM, disk) +# ENVIRONMENT - dispatch infrastructure failure (t1113: CLI missing, worker never +# started, log file missing). These are NOT task/code problems — +# retrying won't help until the environment is fixed. The pulse +# handles these by deferring re-queue without burning retry count. +# LOGIC - task/code problem (merge conflict, test failure, build error) +# BLOCKED - external dependency (human needed, upstream, missing context) +# AMBIGUOUS - unclear cause (clean exit, max retries, unknown) # # $1: outcome_detail (e.g., "rate_limited", "auth_error", "merge_conflict") # @@ -691,9 +695,15 @@ classify_failure_mode() { billing_credits_exhausted | out_of_memory) echo "RESOURCE" ;; - merge_conflict | test_fail* | lint_* | build_* | \ - worker_never_started* | log_file_missing* | log_file_empty | \ + worker_never_started* | log_file_missing* | log_file_empty | \ no_log_path_in_db* | dispatch_script_not_executable) + # t1113: Reclassified from LOGIC to ENVIRONMENT. 
These failures indicate + # the dispatch infrastructure (CLI binary, worktree, permissions) is broken, + # not the task itself. Retrying the task won't help — the environment must + # be fixed first. The pulse defers these without burning retry count. + echo "ENVIRONMENT" + ;; + merge_conflict | test_fail* | lint_* | build_*) echo "LOGIC" ;; blocked:* | waiting* | upstream* | missing_context* | \ diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index 7c3a0b4f3..1bb688b46 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -1616,24 +1616,54 @@ cmd_pulse() { attempt_self_heal "$tid" "blocked" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true ;; failed) - log_error " $tid: FAILED ($outcome_detail)" - # Proof-log: failed decision (t218) - write_proof_log --task "$tid" --event "failed" --stage "evaluate" \ - --decision "failed:$outcome_detail" \ - --maker "pulse:phase1" 2>/dev/null || true - cmd_transition "$tid" "failed" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true - failed_count=$((failed_count + 1)) - # Clean up worker process tree and PID file (t128.7) - cleanup_worker_processes "$tid" - # Auto-update TODO.md and send notification (t128.4) - update_todo_on_blocked "$tid" "FAILED: $outcome_detail" 2>>"$SUPERVISOR_LOG" || true - send_task_notification "$tid" "failed" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true - # Store failure pattern in memory (t128.6) - store_failure_pattern "$tid" "failed" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true - # Add failed:model label to GitHub issue (t1010) - add_model_label "$tid" "failed" "$tid_model" "${tid_repo:-.}" 2>>"$SUPERVISOR_LOG" || true - # Self-heal: attempt diagnostic subtask (t150) - attempt_self_heal "$tid" "failed" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true + # t1113: Classify failure mode to distinguish environment issues from + # task/code problems. 
Environment failures (worker_never_started, + # log_file_missing, etc.) are re-queued without burning retry count + # since the task itself isn't at fault. + local failed_fmode="" + failed_fmode=$(classify_failure_mode "$outcome_detail" 2>/dev/null) || failed_fmode="AMBIGUOUS" + + if [[ "$failed_fmode" == "ENVIRONMENT" ]]; then + # t1113: Environment failure — re-queue without incrementing retry count. + # The CLI/environment was broken, not the task. Burning retries here + # would exhaust max_retries on infrastructure issues, permanently + # failing tasks that would succeed once the environment is fixed. + log_warn " $tid: ENVIRONMENT failure ($outcome_detail) — re-queuing without retry increment (t1113)" + write_proof_log --task "$tid" --event "environment_failure" --stage "evaluate" \ + --decision "requeue:$outcome_detail" \ + --evidence "failure_mode=ENVIRONMENT,retry_preserved=true" \ + --maker "pulse:phase1:t1113" 2>/dev/null || true + # Clean up worker process tree and PID file + cleanup_worker_processes "$tid" + # Transition back to queued (preserves current retry count) + cmd_transition "$tid" "queued" --error "environment:$outcome_detail" 2>>"$SUPERVISOR_LOG" || true + # Store pattern for diagnostics but don't mark as task failure + store_failure_pattern "$tid" "environment" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true + # Invalidate CLI health cache so next pulse re-checks + local cli_cache_dir="${SUPERVISOR_DIR}/health" + rm -f "$cli_cache_dir"/cli-* 2>/dev/null || true + _PULSE_CLI_VERIFIED="" + log_info " $tid: CLI health cache invalidated — next dispatch will re-verify" + else + log_error " $tid: FAILED ($outcome_detail)" + # Proof-log: failed decision (t218) + write_proof_log --task "$tid" --event "failed" --stage "evaluate" \ + --decision "failed:$outcome_detail" \ + --maker "pulse:phase1" 2>/dev/null || true + cmd_transition "$tid" "failed" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true + failed_count=$((failed_count + 1)) + 
# Clean up worker process tree and PID file (t128.7) + cleanup_worker_processes "$tid" + # Auto-update TODO.md and send notification (t128.4) + update_todo_on_blocked "$tid" "FAILED: $outcome_detail" 2>>"$SUPERVISOR_LOG" || true + send_task_notification "$tid" "failed" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true + # Store failure pattern in memory (t128.6) + store_failure_pattern "$tid" "failed" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true + # Add failed:model label to GitHub issue (t1010) + add_model_label "$tid" "failed" "$tid_model" "${tid_repo:-.}" 2>>"$SUPERVISOR_LOG" || true + # Self-heal: attempt diagnostic subtask (t150) + attempt_self_heal "$tid" "failed" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true + fi ;; esac done <<<"$running_tasks"