-
Notifications
You must be signed in to change notification settings - Fork 7
t1113: Add worker_never_started diagnostic and auto-retry with environment check #1980
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1503,6 +1503,109 @@ reset_failure_dedup_state() { | |||||
| return 0 | ||||||
| } | ||||||
|
|
||||||
| ####################################### | ||||||
| # Pre-dispatch CLI health check (t1113) | ||||||
| # | ||||||
| # Verifies the AI CLI binary exists, is executable, and can produce output | ||||||
| # before spawning a worker. This prevents wasting retries on environment | ||||||
| # issues where the CLI was invoked but never produced output (the | ||||||
| # "worker_never_started:no_sentinel" failure pattern). | ||||||
| # | ||||||
| # Strategy: | ||||||
| # 1. Check binary exists in PATH (command -v) | ||||||
| # 2. Run a lightweight version/help check to verify it can execute | ||||||
| # 3. Cache result for the pulse duration (pulse-level flag) | ||||||
| # | ||||||
| # $1: ai_cli - the CLI binary name (e.g., "opencode", "claude") | ||||||
| # | ||||||
| # Exit codes: | ||||||
| # 0 = CLI healthy, proceed with dispatch | ||||||
| # 1 = CLI not found or not executable | ||||||
| # | ||||||
| # Outputs: diagnostic message on failure (for dispatch log) | ||||||
| ####################################### | ||||||
| check_cli_health() { | ||||||
| local ai_cli="$1" | ||||||
|
|
||||||
| # Pulse-level fast path: if CLI was already verified in this pulse, skip | ||||||
| if [[ -n "${_PULSE_CLI_VERIFIED:-}" ]]; then | ||||||
| log_verbose "CLI health: pulse-verified OK (skipping check)" | ||||||
| return 0 | ||||||
| fi | ||||||
|
|
||||||
| # File-based cache: avoid re-checking within 5 minutes | ||||||
| local cache_dir="$SUPERVISOR_DIR/health" | ||||||
| mkdir -p "$cache_dir" | ||||||
| local cli_cache_file="$cache_dir/cli-${ai_cli}" | ||||||
| if [[ -f "$cli_cache_file" ]]; then | ||||||
| local cached_at | ||||||
| cached_at=$(cat "$cli_cache_file" 2>/dev/null || echo "0") | ||||||
| local now | ||||||
| now=$(date +%s) | ||||||
| local age=$((now - cached_at)) | ||||||
| if [[ "$age" -lt 300 ]]; then | ||||||
| log_verbose "CLI health: cached OK ($age seconds ago)" | ||||||
| _PULSE_CLI_VERIFIED="true" | ||||||
| return 0 | ||||||
| fi | ||||||
| fi | ||||||
|
|
||||||
| # Check 1: binary exists in PATH | ||||||
| if ! command -v "$ai_cli" &>/dev/null; then | ||||||
| log_error "CLI health check FAILED: '$ai_cli' not found in PATH" | ||||||
| log_error "PATH=$PATH" | ||||||
| echo "cli_not_found:${ai_cli}" | ||||||
| return 1 | ||||||
| fi | ||||||
|
|
||||||
| # Check 2: binary is executable and can produce version output | ||||||
| local version_output="" | ||||||
| local version_exit=1 | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Initializing
Suggested change
|
||||||
|
|
||||||
| # Use timeout to prevent hanging on broken installations | ||||||
| local timeout_cmd="" | ||||||
| if command -v gtimeout &>/dev/null; then | ||||||
| timeout_cmd="gtimeout" | ||||||
| elif command -v timeout &>/dev/null; then | ||||||
| timeout_cmd="timeout" | ||||||
| fi | ||||||
|
|
||||||
| if [[ "$ai_cli" == "opencode" ]]; then | ||||||
| if [[ -n "$timeout_cmd" ]]; then | ||||||
| version_output=$("$timeout_cmd" 10 "$ai_cli" version 2>&1) || version_exit=$? | ||||||
| else | ||||||
| version_output=$("$ai_cli" version 2>&1) || version_exit=$? | ||||||
| fi | ||||||
| else | ||||||
| # claude CLI | ||||||
| if [[ -n "$timeout_cmd" ]]; then | ||||||
| version_output=$("$timeout_cmd" 10 "$ai_cli" --version 2>&1) || version_exit=$? | ||||||
| else | ||||||
| version_output=$("$ai_cli" --version 2>&1) || version_exit=$? | ||||||
| fi | ||||||
| fi | ||||||
|
|
||||||
| # If version command succeeded (exit 0) or produced output, CLI is working | ||||||
| if [[ "$version_exit" -eq 0 ]] || [[ -n "$version_output" && "$version_exit" -ne 124 && "$version_exit" -ne 137 ]]; then | ||||||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The health check logic is currently too lenient. If the version command fails (e.g., exit 1) but still produces some output (like an error message), it will be considered 'healthy' because of the
Suggested change
|
||||||
| # Cache the healthy result | ||||||
| date +%s >"$cli_cache_file" 2>/dev/null || true | ||||||
| _PULSE_CLI_VERIFIED="true" | ||||||
| log_info "CLI health: OK ($ai_cli: ${version_output:0:80})" | ||||||
| return 0 | ||||||
| fi | ||||||
|
Comment on lines
+1562
to
+1595
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
The More importantly this creates a false positive: a binary that exits non-zero (broken installation, startup crash) but still emits any output to stderr/stdout (which 🐛 Proposed fix — separate assignment from capture- local version_output=""
- local version_exit=1
+ local version_output=""
+ local version_exit=0
if [[ "$ai_cli" == "opencode" ]]; then
if [[ -n "$timeout_cmd" ]]; then
- version_output=$("$timeout_cmd" 10 "$ai_cli" version 2>&1) || version_exit=$?
+ version_output=$("$timeout_cmd" 10 "$ai_cli" version 2>&1)
+ version_exit=$?
else
- version_output=$("$ai_cli" version 2>&1) || version_exit=$?
+ version_output=$("$ai_cli" version 2>&1)
+ version_exit=$?
fi
else
if [[ -n "$timeout_cmd" ]]; then
- version_output=$("$timeout_cmd" 10 "$ai_cli" --version 2>&1) || version_exit=$?
+ version_output=$("$timeout_cmd" 10 "$ai_cli" --version 2>&1)
+ version_exit=$?
else
- version_output=$("$ai_cli" --version 2>&1) || version_exit=$?
+ version_output=$("$ai_cli" --version 2>&1)
+ version_exit=$?
fi
fiWith this change, a clean exit 0 (even with empty output) is treated as healthy via the first condition, and a non-zero exit is correctly treated as a failure unless there is output (which preserves the "output-even-on-non-zero" lenient path intentionally). 🤖 Prompt for AI Agents |
||||||
|
|
||||||
| # Version check failed | ||||||
| if [[ "$version_exit" -eq 124 || "$version_exit" -eq 137 ]]; then | ||||||
| log_error "CLI health check FAILED: '$ai_cli' timed out (10s)" | ||||||
| echo "cli_timeout:${ai_cli}" | ||||||
| else | ||||||
| log_error "CLI health check FAILED: '$ai_cli' exited with code $version_exit" | ||||||
| log_error "Output: ${version_output:0:200}" | ||||||
| echo "cli_error:${ai_cli}:exit_${version_exit}" | ||||||
| fi | ||||||
| return 1 | ||||||
| } | ||||||
|
|
||||||
| ####################################### | ||||||
| # Pre-dispatch model health check (t132.3, t233) | ||||||
| # Two-tier probe strategy: | ||||||
|
|
@@ -2354,6 +2457,20 @@ cmd_dispatch() { | |||||
| local ai_cli | ||||||
| ai_cli=$(resolve_ai_cli) || return 1 | ||||||
|
|
||||||
| # Pre-dispatch CLI health check (t1113): verify the AI CLI binary exists and | ||||||
| # can execute before creating worktrees and spawning workers. This prevents | ||||||
| # the "worker_never_started:no_sentinel" failure pattern where the CLI is | ||||||
| # invoked but never produces output due to environment issues (missing binary, | ||||||
| # broken installation, PATH misconfiguration). Deferring here avoids burning | ||||||
| # retries on environment problems that won't resolve between retry attempts. | ||||||
| local cli_health_exit=0 cli_health_detail="" | ||||||
| cli_health_detail=$(check_cli_health "$ai_cli") || cli_health_exit=$? | ||||||
| if [[ "$cli_health_exit" -ne 0 ]]; then | ||||||
| log_error "CLI health check failed for $task_id ($ai_cli): $cli_health_detail — deferring dispatch" | ||||||
| log_error "Fix: ensure '$ai_cli' is installed and in PATH, then retry" | ||||||
| return 3 # Defer to next pulse (same as provider unavailable) | ||||||
| fi | ||||||
|
|
||||||
| # Pre-dispatch model availability check (t233 — replaces simple health check) | ||||||
| # Calls model-availability-helper.sh check before spawning workers. | ||||||
| # Distinct exit codes prevent wasted dispatch attempts: | ||||||
|
|
@@ -2450,6 +2567,7 @@ cmd_dispatch() { | |||||
| echo "dispatch_type=${verify_mode:+verify}" | ||||||
| echo "verify_reason=${verify_reason:-}" | ||||||
| echo "hung_timeout_seconds=${dispatch_hung_timeout}" | ||||||
| echo "cli_health=ok" | ||||||
| echo "=== END DISPATCH METADATA ===" | ||||||
| echo "" | ||||||
| } >"$log_file" 2>/dev/null || true | ||||||
|
|
||||||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The arithmetic expansion at line 1545 is unsafe because
cached_atcan be empty if the cache file exists but is empty (in which casecatsucceeds and the|| echo "0"fallback is not triggered). This would cause a syntax error in the arithmetic expansion. Additionally, following the repository style guide (line 11), declarations and assignments for expansions that can fail should be separate to ensure exit code safety.References
local var="$1"pattern in functions (declare and assign separately for exit code safety) (link)set -e,var=$(cat file || echo 0)is not safe for arithmetic if the file can be empty.caton an empty file succeeds, makingvaran empty string and causing arithmetic expansion to fail. Use parameter expansion with a default value (e.g.,$((... - ${var:-0}))) to handle empty strings robustly.