From 2ffe42fc03163c77b2e30464786a09ac0a503d17 Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Thu, 19 Feb 2026 22:45:24 +0000
Subject: [PATCH 1/2] feat: add pre-dispatch CLI health check to prevent
 worker_never_started failures (t1113)

Before dispatch, verify the AI CLI binary exists and can execute. If the
check fails, defer dispatch to next pulse instead of spawning a worker
that will never produce output. This prevents burning retries on
environment issues (missing binary, broken installation, PATH problems).

The check_cli_health() function:
- Verifies binary exists in PATH (command -v)
- Runs lightweight version check with 10s timeout
- Caches result per-pulse and for 5 minutes (file-based)
- Logs diagnostic detail on failure for troubleshooting

Also adds cli_health=ok to dispatch metadata log for auditability.
---
 .agents/scripts/supervisor/dispatch.sh | 118 +++++++++++++++++++++++++
 1 file changed, 118 insertions(+)

diff --git a/.agents/scripts/supervisor/dispatch.sh b/.agents/scripts/supervisor/dispatch.sh
index f46de2a0a..14a7ed9b4 100755
--- a/.agents/scripts/supervisor/dispatch.sh
+++ b/.agents/scripts/supervisor/dispatch.sh
@@ -1503,6 +1503,109 @@ reset_failure_dedup_state() {
 	return 0
 }
 
+#######################################
+# Pre-dispatch CLI health check (t1113)
+#
+# Verifies the AI CLI binary exists, is executable, and can produce output
+# before spawning a worker. This prevents wasting retries on environment
+# issues where the CLI was invoked but never produced output (the
+# "worker_never_started:no_sentinel" failure pattern).
+#
+# Strategy:
+#   1. Check binary exists in PATH (command -v)
+#   2. Run a lightweight version check (10s limit if gtimeout/timeout exists)
+#   3. Cache a healthy result: _PULSE_CLI_VERIFIED flag plus a 5-minute
+#      timestamp file at $SUPERVISOR_DIR/health/cli-<ai_cli>
+#
+# NOTE: when invoked inside $(...) this runs in a subshell, so the
+# _PULSE_CLI_VERIFIED assignment is lost; only the file cache persists.
+#
+# Globals: SUPERVISOR_DIR (read), _PULSE_CLI_VERIFIED (read/written)
+# $1: ai_cli - the CLI binary name (e.g., "opencode", "claude")
+#
+# Exit codes:
+#   0 = CLI healthy, proceed with dispatch
+#   1 = CLI not found, timed out, or failed without producing output
+#
+# Outputs: on failure, one diagnostic token on stdout (for dispatch log):
+#   cli_not_found:<cli> | cli_timeout:<cli> | cli_error:<cli>:exit_<code>
+#######################################
+check_cli_health() {
+	local ai_cli="$1"
+
+	# Pulse-level fast path: if CLI was already verified in this pulse, skip
+	if [[ -n "${_PULSE_CLI_VERIFIED:-}" ]]; then
+		log_verbose "CLI health: pulse-verified OK (skipping check)"
+		return 0
+	fi
+
+	# File-based cache: avoid re-checking within 5 minutes
+	local cache_dir="$SUPERVISOR_DIR/health"
+	mkdir -p "$cache_dir"
+	local cli_cache_file="$cache_dir/cli-${ai_cli}"
+	if [[ -f "$cli_cache_file" ]]; then
+		local cached_at
+		cached_at=$(cat "$cli_cache_file" 2>/dev/null || echo "0")
+		# Empty/corrupt cache content would break the arithmetic; treat as stale
+		[[ "$cached_at" =~ ^[1-9][0-9]*$ ]] || cached_at=0
+		local now
+		now=$(date +%s)
+		local age=$((now - cached_at))
+		if [[ "$age" -lt 300 ]]; then
+			log_verbose "CLI health: cached OK ($age seconds ago)"
+			_PULSE_CLI_VERIFIED="true"
+			return 0
+		fi
+	fi
+
+	# Check 1: binary exists in PATH
+	if ! command -v "$ai_cli" &>/dev/null; then
+		log_error "CLI health check FAILED: '$ai_cli' not found in PATH"
+		log_error "PATH=$PATH"
+		echo "cli_not_found:${ai_cli}"
+		return 1
+	fi
+
+	# Check 2: binary is executable and can produce version output
+	local version_output=""
+	local version_exit=0 # must start at 0: "|| version_exit=$?" only runs on failure
+
+	# opencode uses a "version" subcommand; other CLIs (claude) take --version
+	local version_arg="--version"
+	if [[ "$ai_cli" == "opencode" ]]; then
+		version_arg="version"
+	fi
+
+	# Use timeout to prevent hanging on broken installations
+	local -a probe_cmd=("$ai_cli" "$version_arg")
+	if command -v gtimeout &>/dev/null; then
+		probe_cmd=(gtimeout 10 "${probe_cmd[@]}")
+	elif command -v timeout &>/dev/null; then
+		probe_cmd=(timeout 10 "${probe_cmd[@]}")
+	fi
+	version_output=$("${probe_cmd[@]}" 2>&1) || version_exit=$?
+
+	# If version command succeeded (exit 0) or produced output, CLI is working
+	if [[ "$version_exit" -eq 0 ]] || [[ -n "$version_output" && "$version_exit" -ne 124 && "$version_exit" -ne 137 ]]; then
+		# Cache the healthy result
+		date +%s >"$cli_cache_file" 2>/dev/null || true
+		_PULSE_CLI_VERIFIED="true"
+		log_info "CLI health: OK ($ai_cli: ${version_output:0:80})"
+		return 0
+	fi
+
+	# Version check failed
+	if [[ "$version_exit" -eq 124 || "$version_exit" -eq 137 ]]; then
+		log_error "CLI health check FAILED: '$ai_cli' timed out (10s)"
+		echo "cli_timeout:${ai_cli}"
+	else
+		log_error "CLI health check FAILED: '$ai_cli' exited with code $version_exit"
+		log_error "Output: ${version_output:0:200}"
+		echo "cli_error:${ai_cli}:exit_${version_exit}"
+	fi
+	return 1
+}
+
 #######################################
 # Pre-dispatch model health check (t132.3, t233)
 # Two-tier probe strategy:
@@ -2354,6 +2457,20 @@ cmd_dispatch() {
 	local ai_cli
 	ai_cli=$(resolve_ai_cli) || return 1
 
+	# Pre-dispatch CLI health check (t1113): verify the AI CLI binary exists and
+	# can execute before creating worktrees and spawning workers. This prevents
+	# the "worker_never_started:no_sentinel" failure pattern where the CLI is
+	# invoked but never produces output due to environment issues (missing binary,
+	# broken installation, PATH misconfiguration). Deferring here avoids burning
+	# retries on environment problems that won't resolve between retry attempts.
+	local cli_health_exit=0 cli_health_detail=""
+	cli_health_detail=$(check_cli_health "$ai_cli") || cli_health_exit=$?
+	if [[ "$cli_health_exit" -ne 0 ]]; then
+		log_error "CLI health check failed for $task_id ($ai_cli): $cli_health_detail — deferring dispatch"
+		log_error "Fix: ensure '$ai_cli' is installed and in PATH, then retry"
+		return 3 # Defer to next pulse (same as provider unavailable)
+	fi
+
 	# Pre-dispatch model availability check (t233 — replaces simple health check)
 	# Calls model-availability-helper.sh check before spawning workers.
 	# Distinct exit codes prevent wasted dispatch attempts:
@@ -2450,6 +2567,7 @@ cmd_dispatch() {
 		echo "dispatch_type=${verify_mode:+verify}"
 		echo "verify_reason=${verify_reason:-}"
 		echo "hung_timeout_seconds=${dispatch_hung_timeout}"
+		echo "cli_health=ok"
 		echo "=== END DISPATCH METADATA ==="
 		echo ""
 	} >"$log_file" 2>/dev/null || true

From 011cf2f680f9dd476e776652b61ce42fe80d11ce Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Thu, 19 Feb 2026 22:46:47 +0000
Subject: [PATCH 2/2] feat: add ENVIRONMENT failure mode and re-queue
 worker_never_started without burning retries (t1113)

Reclassify worker_never_started, log_file_missing, log_file_empty, and
related dispatch infrastructure failures from LOGIC to new ENVIRONMENT
category. These failures indicate the CLI/environment is broken, not the
task itself.

When the pulse detects an ENVIRONMENT failure:
- Re-queues the task (evaluating -> queued) without incrementing retry count
- Invalidates CLI health cache so next dispatch re-verifies the environment
- Stores failure pattern for diagnostics without marking as task failure
- Logs proof-log entry with failure_mode=ENVIRONMENT for auditability

This prevents the Feb 13 scenario where 5 tasks exhausted their retries
on worker_never_started:no_sentinel when the underlying issue was a
broken CLI environment that no amount of task retries would fix.
---
 .agents/scripts/supervisor/evaluate.sh | 24 +++++++---
 .agents/scripts/supervisor/pulse.sh    | 66 +++++++++++++++++++-------
 2 files changed, 65 insertions(+), 25 deletions(-)

diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh
index b1e5b0b57..f93c53177 100755
--- a/.agents/scripts/supervisor/evaluate.sh
+++ b/.agents/scripts/supervisor/evaluate.sh
@@ -666,11 +666,15 @@ link_pr_to_task() {
 # five broad categories for pattern tracking and model routing decisions.
 #
 # Categories:
-#   TRANSIENT - recoverable with retry (rate limits, timeouts, backend blips)
-#   RESOURCE  - infrastructure/environment issue (auth, OOM, disk)
-#   LOGIC     - task/code problem (merge conflict, test failure, build error)
-#   BLOCKED   - external dependency (human needed, upstream, missing context)
-#   AMBIGUOUS - unclear cause (clean exit, max retries, unknown)
+#   TRANSIENT   - recoverable with retry (rate limits, timeouts, backend blips)
+#   RESOURCE    - resource/credential issue (auth, OOM, disk, billing credits)
+#   ENVIRONMENT - dispatch infrastructure failure (t1113: CLI missing, worker never
+#                 started, log file missing). These are NOT task/code problems —
+#                 retrying won't help until the environment is fixed. The pulse
+#                 re-queues these tasks without burning retry count.
+#   LOGIC       - task/code problem (merge conflict, test failure, build error)
+#   BLOCKED     - external dependency (human needed, upstream, missing context)
+#   AMBIGUOUS   - unclear cause (clean exit, max retries, unknown)
 #
 # $1: outcome_detail (e.g., "rate_limited", "auth_error", "merge_conflict")
 #
@@ -691,9 +695,15 @@ classify_failure_mode() {
 		billing_credits_exhausted | out_of_memory)
 		echo "RESOURCE"
 		;;
-	merge_conflict | test_fail* | lint_* | build_* | \
-		worker_never_started* | log_file_missing* | log_file_empty | \
+	worker_never_started* | log_file_missing* | log_file_empty | \
 		no_log_path_in_db* | dispatch_script_not_executable)
+		# t1113: Reclassified from LOGIC to ENVIRONMENT. These failures indicate
+		# the dispatch infrastructure (CLI binary, worktree, permissions) is broken,
+		# not the task itself. Retrying the task won't help — the environment must
+		# be fixed first. The pulse re-queues these without burning retry count.
+		echo "ENVIRONMENT"
+		;;
+	merge_conflict | test_fail* | lint_* | build_*)
 		echo "LOGIC"
 		;;
 	blocked:* | waiting* | upstream* | missing_context* | \
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index 7c3a0b4f3..1bb688b46 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -1616,24 +1616,54 @@ cmd_pulse() {
 				attempt_self_heal "$tid" "blocked" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true
 				;;
 			failed)
-				log_error "  $tid: FAILED ($outcome_detail)"
-				# Proof-log: failed decision (t218)
-				write_proof_log --task "$tid" --event "failed" --stage "evaluate" \
-					--decision "failed:$outcome_detail" \
-					--maker "pulse:phase1" 2>/dev/null || true
-				cmd_transition "$tid" "failed" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
-				failed_count=$((failed_count + 1))
-				# Clean up worker process tree and PID file (t128.7)
-				cleanup_worker_processes "$tid"
-				# Auto-update TODO.md and send notification (t128.4)
-				update_todo_on_blocked "$tid" "FAILED: $outcome_detail" 2>>"$SUPERVISOR_LOG" || true
-				send_task_notification "$tid" "failed" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
-				# Store failure pattern in memory (t128.6)
-				store_failure_pattern "$tid" "failed" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true
-				# Add failed:model label to GitHub issue (t1010)
-				add_model_label "$tid" "failed" "$tid_model" "${tid_repo:-.}" 2>>"$SUPERVISOR_LOG" || true
-				# Self-heal: attempt diagnostic subtask (t150)
-				attempt_self_heal "$tid" "failed" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true
+				# t1113: Classify failure mode to distinguish environment issues from
+				# task/code problems. Environment failures (worker_never_started,
+				# log_file_missing, etc.) are re-queued without burning retry count
+				# since the task itself isn't at fault.
+				local failed_fmode=""
+				failed_fmode=$(classify_failure_mode "$outcome_detail" 2>/dev/null) || failed_fmode="AMBIGUOUS"
+
+				if [[ "$failed_fmode" == "ENVIRONMENT" ]]; then
+					# t1113: Environment failure — re-queue without incrementing retry count.
+					# The CLI/environment was broken, not the task. Burning retries here
+					# would exhaust max_retries on infrastructure issues, permanently
+					# failing tasks that would succeed once the environment is fixed.
+					log_warn "  $tid: ENVIRONMENT failure ($outcome_detail) — re-queuing without retry increment (t1113)"
+					write_proof_log --task "$tid" --event "environment_failure" --stage "evaluate" \
+						--decision "requeue:$outcome_detail" \
+						--evidence "failure_mode=ENVIRONMENT,retry_preserved=true" \
+						--maker "pulse:phase1:t1113" 2>/dev/null || true
+					# Clean up worker process tree and PID file
+					cleanup_worker_processes "$tid"
+					# Transition back to queued (preserves current retry count)
+					cmd_transition "$tid" "queued" --error "environment:$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
+					# Store pattern for diagnostics but don't mark as task failure
+					store_failure_pattern "$tid" "environment" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true
+					# Invalidate CLI health cache so next pulse re-checks
+					local cli_cache_dir="${SUPERVISOR_DIR}/health"
+					rm -f "$cli_cache_dir"/cli-* 2>/dev/null || true
+					_PULSE_CLI_VERIFIED=""
+					log_info "  $tid: CLI health cache invalidated — next dispatch will re-verify"
+				else
+					log_error "  $tid: FAILED ($outcome_detail)"
+					# Proof-log: failed decision (t218)
+					write_proof_log --task "$tid" --event "failed" --stage "evaluate" \
+						--decision "failed:$outcome_detail" \
+						--maker "pulse:phase1" 2>/dev/null || true
+					cmd_transition "$tid" "failed" --error "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
+					failed_count=$((failed_count + 1))
+					# Clean up worker process tree and PID file (t128.7)
+					cleanup_worker_processes "$tid"
+					# Auto-update TODO.md and send notification (t128.4)
+					update_todo_on_blocked "$tid" "FAILED: $outcome_detail" 2>>"$SUPERVISOR_LOG" || true
+					send_task_notification "$tid" "failed" "$outcome_detail" 2>>"$SUPERVISOR_LOG" || true
+					# Store failure pattern in memory (t128.6)
+					store_failure_pattern "$tid" "failed" "$outcome_detail" "$tid_desc" 2>>"$SUPERVISOR_LOG" || true
+					# Add failed:model label to GitHub issue (t1010)
+					add_model_label "$tid" "failed" "$tid_model" "${tid_repo:-.}" 2>>"$SUPERVISOR_LOG" || true
+					# Self-heal: attempt diagnostic subtask (t150)
+					attempt_self_heal "$tid" "failed" "$outcome_detail" "${batch_id:-}" 2>>"$SUPERVISOR_LOG" || true
+				fi
 				;;
 			esac
 		done <<<"$running_tasks"