From 1467cf0dcca5956064cd245db55811841c375e7a Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Wed, 18 Feb 2026 02:48:09 +0000 Subject: [PATCH 1/2] feat: add failure mode classification and output quality rating to evaluate.sh (t1096) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add classify_failure_mode(): maps outcome_detail strings to 5 categories (TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS) for pattern tracking - Add rate_output_quality(): derives 3-point quality score (0/1/2) from outcome type without extra AI calls - Add record_evaluation_metadata(): records richer fields to pattern tracker after each worker assessment - Add evaluate_worker_with_metadata(): thin wrapper that calls evaluate_worker(), classifies failure mode, rates quality, records metadata, returns verdict unchanged - Extend evaluate_with_ai() to request FMODE and QUALITY in AI prompt; parses extended VERDICT:type:detail:FMODE:mode:QUALITY:n format with fallback to basic format for backward compatibility - Update pulse.sh to call evaluate_worker_with_metadata() instead of evaluate_worker() - Extend store_failure_pattern() and store_success_pattern() in memory-integration.sh to accept and propagate failure_mode and quality_score fields - Add --failure-mode and --quality-score options to pattern-tracker-helper.sh cmd_record() with validation and documentation Chose tag-based approach for new fields (failure_mode:X, quality:N) to avoid schema changes — matches existing pattern-tracker conventions and enables immediate filtering via SQLite LIKE queries. 
--- .agents/scripts/pattern-tracker-helper.sh | 46 ++- .agents/scripts/supervisor/evaluate.sh | 335 +++++++++++++++++- .../scripts/supervisor/memory-integration.sh | 25 ++ .agents/scripts/supervisor/pulse.sh | 11 +- 4 files changed, 391 insertions(+), 26 deletions(-) diff --git a/.agents/scripts/pattern-tracker-helper.sh b/.agents/scripts/pattern-tracker-helper.sh index 29adab692..73a907cbf 100755 --- a/.agents/scripts/pattern-tracker-helper.sh +++ b/.agents/scripts/pattern-tracker-helper.sh @@ -93,6 +93,7 @@ cmd_record() { local failure_mode="" local tokens_in="" local tokens_out="" + local quality_score="" while [[ $# -gt 0 ]]; do case "$1" in @@ -148,6 +149,10 @@ cmd_record() { tokens_out="$2" shift 2 ;; + --quality-score) + quality_score="$2" + shift 2 + ;; # t1096: 0|1|2 *) if [[ -z "$description" ]]; then description="$1" @@ -211,13 +216,30 @@ cmd_record() { esac fi - # Validate failure_mode if provided (t1095) + # Build memory type + local memory_type + if [[ "$outcome" == "success" ]]; then + memory_type="SUCCESS_PATTERN" + else + memory_type="FAILURE_PATTERN" + fi + + # Validate failure_mode if provided (t1095/t1096) if [[ -n "$failure_mode" ]]; then case "$failure_mode" in - hallucination | context-miss | incomplete | wrong-file | timeout) ;; + hallucination | context-miss | incomplete | wrong-file | timeout | TRANSIENT | RESOURCE | LOGIC | BLOCKED | AMBIGUOUS | NONE) ;; *) - log_error "Invalid failure_mode: $failure_mode (use hallucination, context-miss, incomplete, wrong-file, or timeout)" - return 1 + log_warn "Non-standard failure_mode: $failure_mode (standard: hallucination, context-miss, incomplete, wrong-file, timeout, TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS, NONE)" + ;; + esac + fi + + # Validate quality_score if provided (t1096) + if [[ -n "$quality_score" ]]; then + case "$quality_score" in + 0 | 1 | 2) ;; + *) + log_warn "Non-standard quality_score: $quality_score (standard: 0=no_output 1=partial 2=complete)" ;; esac fi @@ -232,14 +254,6 
@@ cmd_record() { return 1 fi - # Build memory type - local memory_type - if [[ "$outcome" == "success" ]]; then - memory_type="SUCCESS_PATTERN" - else - memory_type="FAILURE_PATTERN" - fi - # Build tags local all_tags="pattern" [[ -n "$task_type" ]] && all_tags="$all_tags,$task_type" @@ -248,6 +262,9 @@ cmd_record() { [[ -n "$duration" ]] && all_tags="$all_tags,duration:$duration" [[ -n "$retries" ]] && all_tags="$all_tags,retries:$retries" [[ -n "$strategy" ]] && all_tags="$all_tags,strategy:$strategy" + # t1096: include failure mode and quality score in tags for filtering + [[ -n "$failure_mode" ]] && all_tags="$all_tags,failure_mode:$failure_mode" + [[ -n "$quality_score" ]] && all_tags="$all_tags,quality:$quality_score" [[ -n "$tags" ]] && all_tags="$all_tags,$tags" # Build content with structured metadata @@ -257,6 +274,9 @@ cmd_record() { [[ -n "$task_id" ]] && content="$content [id:$task_id]" [[ -n "$duration" ]] && content="$content [duration:${duration}s]" [[ -n "$retries" && "$retries" != "0" ]] && content="$content [retries:$retries]" + # t1096: append failure mode and quality score to content + [[ -n "$failure_mode" ]] && content="$content [fmode:$failure_mode]" + [[ -n "$quality_score" ]] && content="$content [quality:$quality_score]" # Store via memory-helper.sh and capture the returned ID # The last line of store output is the bare mem_YYYYMMDDHHMMSS_hex ID. 
@@ -1098,6 +1118,8 @@ RECORD OPTIONS: --task-id Task identifier (e.g., t102.3) --duration How long the task took --retries Number of retries before completion + --failure-mode Failure category (t1096): TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE + --quality Output quality (t1096): 0=no_output 1=partial 2=complete --tags Additional comma-separated tags --strategy Dispatch strategy: normal, prompt-repeat, escalated (t1095) --quality CI quality: ci-pass-first-try, ci-pass-after-fix, needs-human (t1095) diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index 21b8c7356..60986537d 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -659,6 +659,194 @@ link_pr_to_task() { return 0 } +####################################### +# Classify a failure outcome_detail string into a failure mode category (t1096) +# +# Maps the granular outcome_detail strings from evaluate_worker() into +# five broad categories for pattern tracking and model routing decisions. 
+# +# Categories: +# TRANSIENT - recoverable with retry (rate limits, timeouts, backend blips) +# RESOURCE - infrastructure/environment issue (auth, OOM, disk) +# LOGIC - task/code problem (merge conflict, test failure, build error) +# BLOCKED - external dependency (human needed, upstream, missing context) +# AMBIGUOUS - unclear cause (clean exit, max retries, unknown) +# +# $1: outcome_detail (e.g., "rate_limited", "auth_error", "merge_conflict") +# +# Outputs: category string on stdout +# Returns: 0 always +####################################### +classify_failure_mode() { + local detail="$1" + + case "$detail" in + rate_limited | backend_quota_error | backend_infrastructure_error | \ + retry:backend* | quota* | 429*) + echo "TRANSIENT" + ;; + auth_error | unauthorized | forbidden | 401* | 403* | \ + billing_credits_exhausted) + echo "RESOURCE" + ;; + timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \ + work_in_progress) + echo "TRANSIENT" + ;; + out_of_memory) + echo "RESOURCE" + ;; + merge_conflict | test_fail* | lint_* | build_* | \ + worker_never_started* | log_file_missing* | log_file_empty | \ + no_log_path_in_db* | dispatch_script_not_executable) + echo "LOGIC" + ;; + blocked:* | waiting* | upstream* | missing_context* | \ + verify_incomplete_no_pr | verify_not_started_needs_full) + echo "BLOCKED" + ;; + clean_exit_no_signal | max_retries | \ + ambiguous_skipped_ai | ambiguous_ai_unavailable | ambiguous* | "") + echo "AMBIGUOUS" + ;; + *) + echo "AMBIGUOUS" + ;; + esac + return 0 +} + +####################################### +# Rate the output quality of a worker based on outcome type (t1096) +# +# Derives a 3-point quality score from the outcome type without an extra +# AI call. Only AMBIGUOUS failure modes trigger AI quality assessment. 
+# +# Scale: +# 0 = no_output - worker produced nothing usable +# 1 = partial - some progress, incomplete or broken artifact +# 2 = complete - deliverable matches task intent +# +# $1: outcome_type (complete|retry|blocked|failed) +# $2: outcome_detail (for context) +# +# Outputs: quality score (0, 1, or 2) on stdout +# Returns: 0 always +####################################### +rate_output_quality() { + local outcome_type="$1" + local outcome_detail="${2:-}" + + case "$outcome_type" in + complete) + # task_obsolete = task was already done, still counts as complete + echo "2" + ;; + retry) + # work_in_progress = partial commits exist + if [[ "$outcome_detail" == "work_in_progress" ]]; then + echo "1" + else + echo "1" + fi + ;; + blocked) + # auth/billing blocks = no output; merge conflict = partial + case "$outcome_detail" in + auth_error | billing_credits_exhausted | out_of_memory) + echo "0" + ;; + merge_conflict) + echo "1" + ;; + *) + echo "0" + ;; + esac + ;; + failed) + # worker_never_started / log missing = truly no output + case "$outcome_detail" in + worker_never_started* | log_file_missing* | log_file_empty | \ + no_log_path_in_db* | max_retries) + echo "0" + ;; + *) + echo "0" + ;; + esac + ;; + *) + echo "0" + ;; + esac + return 0 +} + +####################################### +# Record evaluation metadata to pattern tracker (t1096) +# +# Called after evaluate_worker() resolves a verdict. Stores richer metadata +# than the basic store_success/failure_pattern calls: failure mode category, +# output quality score, AI eval flag, and log quality signals. 
+# +# $1: task_id +# $2: outcome_type (complete|retry|blocked|failed) +# $3: outcome_detail +# $4: failure_mode (TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE) +# $5: quality_score (0|1|2) +# $6: ai_evaluated (true|false) — whether AI eval was used +# +# Returns: 0 always (non-blocking) +####################################### +record_evaluation_metadata() { + local task_id="$1" + local outcome_type="$2" + local outcome_detail="$3" + local failure_mode="${4:-AMBIGUOUS}" + local quality_score="${5:-0}" + local ai_evaluated="${6:-false}" + + local pattern_helper="${SCRIPT_DIR}/pattern-tracker-helper.sh" + if [[ ! -x "$pattern_helper" ]]; then + pattern_helper="$HOME/.aidevops/agents/scripts/pattern-tracker-helper.sh" + fi + if [[ ! -x "$pattern_helper" ]]; then + return 0 + fi + + # Map outcome_type to pattern-tracker outcome + local pt_outcome="failure" + [[ "$outcome_type" == "complete" ]] && pt_outcome="success" + + # Build extra tags for new fields + local extra_tags="failure_mode:${failure_mode},quality:${quality_score}" + [[ "$ai_evaluated" == "true" ]] && extra_tags="${extra_tags},ai_eval:true" + + # Look up model tier from DB for routing context + local model_tier="" + if [[ -n "${SUPERVISOR_DB:-}" ]]; then + local task_model + task_model=$(db "$SUPERVISOR_DB" "SELECT model FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "") + if [[ -n "$task_model" ]] && command -v model_to_tier &>/dev/null; then + model_tier=$(model_to_tier "$task_model" 2>/dev/null || echo "") + fi + fi + + # Build description + local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]" + + "$pattern_helper" record \ + --outcome "$pt_outcome" \ + --task-type "feature" \ + --task-id "$task_id" \ + --description "$description" \ + --tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}" \ + 2>/dev/null || true + + return 0 +} + ####################################### # 
Evaluate a completed worker's outcome using log analysis # Returns: complete:, retry:, blocked:, failed: @@ -1083,9 +1271,16 @@ evaluate_worker() { } ####################################### -# Dispatch a cheap AI call to evaluate ambiguous worker outcomes +# Dispatch a cheap AI call to evaluate ambiguous worker outcomes (t1096) # Uses Sonnet for speed (~30s) and cost efficiency +# # Returns: complete:, retry:, blocked: +# +# Extended format (t1096): AI is asked to also classify failure mode and +# rate output quality. The response is parsed for the extended fields and +# stored as metadata. The returned verdict is the standard format (unchanged +# for callers) — extended fields are captured as side-effects via +# _AI_EVAL_FMODE and _AI_EVAL_QUALITY shell variables for the caller. ####################################### evaluate_with_ai() { local task_id="$1" @@ -1104,13 +1299,17 @@ evaluate_with_ai() { local task_desc task_desc=$(db "$SUPERVISOR_DB" "SELECT description FROM tasks WHERE id = '$escaped_id';" 2>/dev/null || echo "") + # Extended prompt (t1096): request failure mode classification and quality rating + # alongside the standard verdict. Format: VERDICT:type:detail:FMODE:mode:QUALITY:n + # Failure modes: TRANSIENT RESOURCE LOGIC BLOCKED AMBIGUOUS + # Quality: 0=no_output 1=partial 2=complete local eval_prompt - eval_prompt="You are evaluating the outcome of an automated task worker. Respond with EXACTLY one line in the format: VERDICT:: + eval_prompt="You are evaluating the outcome of an automated task worker. 
Respond with EXACTLY one line in the format: +VERDICT:::FMODE::QUALITY: -Types: -- complete: (task finished successfully) -- retry: (transient failure, worth retrying) -- blocked: (needs human intervention) +Types: complete|retry|blocked +Failure modes: TRANSIENT (rate limit/timeout/backend) | RESOURCE (auth/OOM/billing) | LOGIC (merge conflict/build/test) | BLOCKED (human needed/upstream) | AMBIGUOUS (unclear) | NONE (for complete) +Quality: 0=no_output 1=partial_progress 2=fully_complete Task: $task_id Description: ${task_desc:-unknown} @@ -1124,8 +1323,10 @@ Analyze the log and determine the outcome. Look for: 1. Did the task complete its objective? (code changes, PR created, tests passing) 2. Is there a transient error that a retry would fix? (network, rate limit, timeout) 3. Is there a permanent blocker? (auth, permissions, merge conflict, missing dependency) +4. What failure mode category best describes the issue? +5. How much useful output did the worker produce? (0=nothing, 1=partial, 2=complete) -Respond with ONLY the verdict line, nothing else." +Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_limited:FMODE:TRANSIENT:QUALITY:1" local ai_result="" local eval_timeout=60 @@ -1147,14 +1348,53 @@ Respond with ONLY the verdict line, nothing else." 
--output-format text 2>/dev/null || echo "") fi - # Parse the VERDICT line from AI response + # Parse extended VERDICT line (t1096): try full format first, fall back to basic + # Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n + # Basic format: VERDICT:type:detail (legacy / fallback) local verdict_line - verdict_line=$(echo "$ai_result" | grep -o 'VERDICT:[a-z]*:[a-z_]*' | head -1 || true) + verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true) if [[ -n "$verdict_line" ]]; then - # Strip VERDICT: prefix and return - local verdict="${verdict_line#VERDICT:}" - log_info "AI eval for $task_id: $verdict" + # Parse extended format + # Format: VERDICT:type:detail:FMODE:mode:QUALITY:n + local raw="${verdict_line#VERDICT:}" + # Extract type:detail (everything before :FMODE:) + local verdict="${raw%%:FMODE:*}" + # Extract failure mode (between :FMODE: and :QUALITY:) + local fmode_part="${raw#*:FMODE:}" + local ai_fmode="${fmode_part%%:QUALITY:*}" + # Extract quality (after :QUALITY:) + local ai_quality="${fmode_part##*:QUALITY:}" + + log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]" + + # Export extended fields for record_evaluation_metadata() caller + # These are set as global-scope variables (prefixed to avoid collision) + _AI_EVAL_FMODE="$ai_fmode" + _AI_EVAL_QUALITY="$ai_quality" + + # Store AI evaluation in state log for audit trail + db "$SUPERVISOR_DB" " + INSERT INTO state_log (task_id, from_state, to_state, reason) + VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating', + 'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}'); + " 2>/dev/null || true + + echo "$verdict" + return 0 + fi + + # Fallback: try basic VERDICT format (AI didn't include extended fields) + local basic_verdict_line + basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+' | head -1 || true) + + if [[ -n "$basic_verdict_line" ]]; then + local 
verdict="${basic_verdict_line#VERDICT:}" + log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)" + + # Clear extended fields (not provided by AI in this response) + _AI_EVAL_FMODE="" + _AI_EVAL_QUALITY="" # Store AI evaluation in state log for audit trail db "$SUPERVISOR_DB" " @@ -1172,6 +1412,77 @@ Respond with ONLY the verdict line, nothing else." return 1 } +####################################### +# Evaluate worker outcome and record richer metadata to pattern tracker (t1096) +# +# Thin wrapper around evaluate_worker() that: +# 1. Calls evaluate_worker() to get the standard verdict +# 2. Classifies failure mode via classify_failure_mode() +# 3. Rates output quality via rate_output_quality() +# 4. Records metadata to pattern tracker with new fields +# 5. Returns the original verdict unchanged (callers unaffected) +# +# This is the preferred entry point for the pulse cycle. evaluate_worker() +# remains available for direct use (e.g., cmd_evaluate, tests). +# +# $1: task_id +# $2: skip_ai_eval (optional, default false) +# +# Outputs: same as evaluate_worker() — "type:detail" +# Returns: 0 on success, 1 if task not found +####################################### +evaluate_worker_with_metadata() { + local task_id="$1" + local skip_ai_eval="${2:-false}" + + # Reset AI eval extended fields before calling evaluate_worker + _AI_EVAL_FMODE="" + _AI_EVAL_QUALITY="" + + # Run core evaluation + local verdict + verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1 + + # Parse verdict into type and detail + local outcome_type="${verdict%%:*}" + local outcome_detail="${verdict#*:}" + + # Determine if AI eval was used (extended fields set by evaluate_with_ai) + local ai_evaluated="false" + local failure_mode quality_score + + if [[ -n "$_AI_EVAL_FMODE" ]]; then + # AI provided failure mode classification + ai_evaluated="true" + failure_mode="$_AI_EVAL_FMODE" + quality_score="${_AI_EVAL_QUALITY:-$(rate_output_quality "$outcome_type" 
"$outcome_detail")}" + else + # Deterministic classification from outcome strings + if [[ "$outcome_type" == "complete" ]]; then + failure_mode="NONE" + else + failure_mode=$(classify_failure_mode "$outcome_detail") + fi + quality_score=$(rate_output_quality "$outcome_type" "$outcome_detail") + fi + + # Record to pattern tracker (non-blocking — failures here must not affect verdict) + record_evaluation_metadata \ + "$task_id" \ + "$outcome_type" \ + "$outcome_detail" \ + "$failure_mode" \ + "$quality_score" \ + "$ai_evaluated" \ + 2>/dev/null || true + + log_info "evaluate_worker_with_metadata: $task_id → $verdict [fmode:${failure_mode}] [quality:${quality_score}] [ai:${ai_evaluated}]" + + # Return the original verdict unchanged + echo "$verdict" + return 0 +} + ####################################### # Manually evaluate a task's worker outcome # Useful for debugging or forcing evaluation of a stuck task diff --git a/.agents/scripts/supervisor/memory-integration.sh b/.agents/scripts/supervisor/memory-integration.sh index 1c5edf7ef..b61e4121c 100755 --- a/.agents/scripts/supervisor/memory-integration.sh +++ b/.agents/scripts/supervisor/memory-integration.sh @@ -58,12 +58,21 @@ $task_memories" # Called when a task fails, is blocked, or retries # Tags with supervisor context for future recall # Uses FAILURE_PATTERN type for pattern-tracker integration (t102.3) +# +# $1: task_id +# $2: outcome_type (blocked|failed|retry) +# $3: outcome_detail +# $4: description (optional) +# $5: failure_mode (optional, t1096 — TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS) +# $6: quality_score (optional, t1096 — 0|1|2) ####################################### store_failure_pattern() { local task_id="$1" local outcome_type="$2" local outcome_detail="$3" local description="${4:-}" + local failure_mode="${5:-}" + local quality_score="${6:-}" if [[ ! 
-x "$MEMORY_HELPER" ]]; then return 0 @@ -119,10 +128,16 @@ store_failure_pattern() { content="[task:feature] $content | Task: $description" fi [[ -n "$model_tier" ]] && content="$content [model:$model_tier]" + # t1096: append failure mode and quality score when provided + [[ -n "$failure_mode" ]] && content="$content [fmode:$failure_mode]" + [[ -n "$quality_score" ]] && content="$content [quality:$quality_score]" # Build tags with model info for pattern-tracker queries local tags="supervisor,pattern,$task_id,$outcome_type,$outcome_detail" [[ -n "$model_tier" ]] && tags="$tags,model:$model_tier" + # t1096: include failure mode and quality in tags for pattern-tracker filtering + [[ -n "$failure_mode" ]] && tags="$tags,failure_mode:$failure_mode" + [[ -n "$quality_score" ]] && tags="$tags,quality:$quality_score" "$MEMORY_HELPER" store \ --auto \ @@ -139,11 +154,17 @@ store_failure_pattern() { # Store a success pattern in memory after task completion # Records what worked for future reference # Uses SUCCESS_PATTERN type for pattern-tracker integration (t102.3) +# +# $1: task_id +# $2: detail (PR URL or outcome detail) +# $3: description (optional) +# $4: quality_score (optional, t1096 — 0|1|2; defaults to 2 for complete) ####################################### store_success_pattern() { local task_id="$1" local detail="${2:-}" local description="${3:-}" + local quality_score="${4:-2}" if [[ ! 
-x "$MEMORY_HELPER" ]]; then return 0 @@ -201,6 +222,8 @@ store_success_pattern() { if [[ "$task_tool_count" -gt 0 ]]; then content="$content [task_tool:$task_tool_count]" fi + # t1096: append quality score (always 2 for success, but caller may override) + content="$content [quality:${quality_score}]" # Build tags with model and duration info for pattern-tracker queries local tags="supervisor,pattern,$task_id,complete" @@ -208,6 +231,8 @@ store_success_pattern() { [[ -n "$duration_secs" ]] && tags="$tags,duration:$duration_secs" [[ "$retries" -gt 0 ]] && tags="$tags,retries:$retries" [[ "$task_tool_count" -gt 0 ]] && tags="$tags,task_tool:$task_tool_count" + # t1096: include quality in tags for pattern-tracker filtering + tags="$tags,quality:${quality_score},failure_mode:NONE" "$MEMORY_HELPER" store \ --auto \ diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index e17d8f44b..5747179c1 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -211,7 +211,14 @@ cmd_pulse() { fi local outcome - outcome=$(evaluate_worker "$tid" "$skip_ai") + # t1096: use evaluate_worker_with_metadata() to capture richer metadata + # (failure mode, output quality) and record to pattern tracker. + # Falls back to evaluate_worker() if the wrapper is unavailable. 
+ if command -v evaluate_worker_with_metadata &>/dev/null; then + outcome=$(evaluate_worker_with_metadata "$tid" "$skip_ai") + else + outcome=$(evaluate_worker "$tid" "$skip_ai") + fi local outcome_type="${outcome%%:*}" local outcome_detail="${outcome#*:}" @@ -220,7 +227,7 @@ cmd_pulse() { _eval_duration=$(_proof_log_stage_duration "$tid" "evaluate") write_proof_log --task "$tid" --event "evaluate" --stage "evaluate" \ --decision "$outcome" --evidence "skip_ai=$skip_ai" \ - --maker "evaluate_worker" \ + --maker "evaluate_worker_with_metadata" \ ${_eval_duration:+--duration "$_eval_duration"} 2>/dev/null || true # Budget tracking: record spend from worker log (t1100) From ea491eb4123c32e1037ea71f306321c0e64aea31 Mon Sep 17 00:00:00 2001 From: marcusquinn <6428977+marcusquinn@users.noreply.github.com> Date: Wed, 18 Feb 2026 03:40:24 +0000 Subject: [PATCH 2/2] fix: address review feedback for t1096 --- .agents/scripts/pattern-tracker-helper.sh | 6 +- .agents/scripts/supervisor/evaluate.sh | 116 +++++++++++----------- .agents/scripts/supervisor/pulse.sh | 4 +- 3 files changed, 64 insertions(+), 62 deletions(-) diff --git a/.agents/scripts/pattern-tracker-helper.sh b/.agents/scripts/pattern-tracker-helper.sh index 73a907cbf..1b3a26e85 100755 --- a/.agents/scripts/pattern-tracker-helper.sh +++ b/.agents/scripts/pattern-tracker-helper.sh @@ -1118,13 +1118,13 @@ RECORD OPTIONS: --task-id Task identifier (e.g., t102.3) --duration How long the task took --retries Number of retries before completion - --failure-mode Failure category (t1096): TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE - --quality Output quality (t1096): 0=no_output 1=partial 2=complete --tags Additional comma-separated tags --strategy Dispatch strategy: normal, prompt-repeat, escalated (t1095) --quality CI quality: ci-pass-first-try, ci-pass-after-fix, needs-human (t1095) + --quality-score Output quality rating (t1096): 0=no_output 1=partial 2=complete --failure-mode Failure classification: hallucination, 
context-miss, - incomplete, wrong-file, timeout (t1095) + incomplete, wrong-file, timeout (t1095) or + TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS, NONE (t1096) --tokens-in Input token count (t1095) --tokens-out Output token count (t1095) diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh index 60986537d..204d92194 100755 --- a/.agents/scripts/supervisor/evaluate.sh +++ b/.agents/scripts/supervisor/evaluate.sh @@ -682,18 +682,13 @@ classify_failure_mode() { case "$detail" in rate_limited | backend_quota_error | backend_infrastructure_error | \ - retry:backend* | quota* | 429*) - echo "TRANSIENT" - ;; - auth_error | unauthorized | forbidden | 401* | 403* | \ - billing_credits_exhausted) - echo "RESOURCE" - ;; - timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \ + retry:backend* | quota* | 429* | \ + timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \ work_in_progress) echo "TRANSIENT" ;; - out_of_memory) + auth_error | unauthorized | forbidden | 401* | 403* | \ + billing_credits_exhausted | out_of_memory) echo "RESOURCE" ;; merge_conflict | test_fail* | lint_* | build_* | \ @@ -743,12 +738,8 @@ rate_output_quality() { echo "2" ;; retry) - # work_in_progress = partial commits exist - if [[ "$outcome_detail" == "work_in_progress" ]]; then - echo "1" - else - echo "1" - fi + # All retries imply some form of progress or attempt + echo "1" ;; blocked) # auth/billing blocks = no output; merge conflict = partial @@ -765,16 +756,8 @@ rate_output_quality() { esac ;; failed) - # worker_never_started / log missing = truly no output - case "$outcome_detail" in - worker_never_started* | log_file_missing* | log_file_empty | \ - no_log_path_in_db* | max_retries) - echo "0" - ;; - *) - echo "0" - ;; - esac + # All failed outcomes are considered to have no usable output + echo "0" ;; *) echo "0" @@ -833,12 +816,28 @@ record_evaluation_metadata() { fi fi + # Look up task type from DB tags if 
available, fallback to "unknown" + # TODO(t1096): extract real task type from TODO.md tags or DB metadata + local task_type="unknown" + if [[ -n "${SUPERVISOR_DB:-}" ]]; then + local task_desc + task_desc=$(db "$SUPERVISOR_DB" "SELECT description FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "") + # Infer type from description keywords (best-effort) + case "$task_desc" in + *bugfix* | *fix* | *bug*) task_type="bugfix" ;; + *refactor*) task_type="refactor" ;; + *test*) task_type="testing" ;; + *doc*) task_type="docs" ;; + *) task_type="feature" ;; + esac + fi + # Build description local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]" "$pattern_helper" record \ --outcome "$pt_outcome" \ - --task-type "feature" \ + --task-type "$task_type" \ --task-id "$task_id" \ --description "$description" \ --tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}" \ @@ -1277,10 +1276,9 @@ evaluate_worker() { # Returns: complete:, retry:, blocked: # # Extended format (t1096): AI is asked to also classify failure mode and -# rate output quality. The response is parsed for the extended fields and -# stored as metadata. The returned verdict is the standard format (unchanged -# for callers) — extended fields are captured as side-effects via -# _AI_EVAL_FMODE and _AI_EVAL_QUALITY shell variables for the caller. +# rate output quality. When extended fields are present, the stdout output +# encodes them as type:detail:FMODE:mode:QUALITY:n so they survive subshell +# capture. evaluate_worker_with_metadata() parses and strips these fields. ####################################### evaluate_with_ai() { local task_id="$1" @@ -1352,7 +1350,7 @@ Respond with ONLY the verdict line, nothing else. 
Example: VERDICT:retry:rate_li # Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n # Basic format: VERDICT:type:detail (legacy / fallback) local verdict_line - verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true) + verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z0-9_-]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true) if [[ -n "$verdict_line" ]]; then # Parse extended format @@ -1368,11 +1366,6 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]" - # Export extended fields for record_evaluation_metadata() caller - # These are set as global-scope variables (prefixed to avoid collision) - _AI_EVAL_FMODE="$ai_fmode" - _AI_EVAL_QUALITY="$ai_quality" - # Store AI evaluation in state log for audit trail db "$SUPERVISOR_DB" " INSERT INTO state_log (task_id, from_state, to_state, reason) @@ -1380,22 +1373,21 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li 'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}'); " 2>/dev/null || true - echo "$verdict" + # Encode AI-derived fields in stdout so they survive subshell capture. + # Format: type:detail:FMODE:mode:QUALITY:n + # evaluate_worker_with_metadata() parses and strips these before returning. 
+ echo "${verdict}:FMODE:${ai_fmode}:QUALITY:${ai_quality}" return 0 fi # Fallback: try basic VERDICT format (AI didn't include extended fields) local basic_verdict_line - basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+' | head -1 || true) + basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z0-9_-]+' | head -1 || true) if [[ -n "$basic_verdict_line" ]]; then local verdict="${basic_verdict_line#VERDICT:}" log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)" - # Clear extended fields (not provided by AI in this response) - _AI_EVAL_FMODE="" - _AI_EVAL_QUALITY="" - # Store AI evaluation in state log for audit trail db "$SUPERVISOR_DB" " INSERT INTO state_log (task_id, from_state, to_state, reason) @@ -1435,28 +1427,36 @@ evaluate_worker_with_metadata() { local task_id="$1" local skip_ai_eval="${2:-false}" - # Reset AI eval extended fields before calling evaluate_worker - _AI_EVAL_FMODE="" - _AI_EVAL_QUALITY="" + # Run core evaluation (captures stdout which may include AI-derived fields) + local raw_verdict + raw_verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1 + + # Parse AI-derived extended fields from stdout if present. + # Extended format: type:detail:FMODE:mode:QUALITY:n + # Basic format: type:detail + local verdict ai_evaluated="false" + local failure_mode quality_score - # Run core evaluation - local verdict - verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1 + if [[ "$raw_verdict" == *":FMODE:"*":QUALITY:"* ]]; then + # AI eval encoded extended fields in stdout — extract them + ai_evaluated="true" + # Strip :FMODE:...:QUALITY:... 
to get the original verdict + verdict="${raw_verdict%%:FMODE:*}" + # Extract failure mode (between :FMODE: and :QUALITY:) + local fmode_part="${raw_verdict#*:FMODE:}" + failure_mode="${fmode_part%%:QUALITY:*}" + # Extract quality (after last :QUALITY:) + quality_score="${fmode_part##*:QUALITY:}" + else + # No AI fields — use deterministic classification + verdict="$raw_verdict" + fi # Parse verdict into type and detail local outcome_type="${verdict%%:*}" local outcome_detail="${verdict#*:}" - # Determine if AI eval was used (extended fields set by evaluate_with_ai) - local ai_evaluated="false" - local failure_mode quality_score - - if [[ -n "$_AI_EVAL_FMODE" ]]; then - # AI provided failure mode classification - ai_evaluated="true" - failure_mode="$_AI_EVAL_FMODE" - quality_score="${_AI_EVAL_QUALITY:-$(rate_output_quality "$outcome_type" "$outcome_detail")}" - else + if [[ "$ai_evaluated" != "true" ]]; then # Deterministic classification from outcome strings if [[ "$outcome_type" == "complete" ]]; then failure_mode="NONE" diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh index 5747179c1..f897fb289 100755 --- a/.agents/scripts/supervisor/pulse.sh +++ b/.agents/scripts/supervisor/pulse.sh @@ -211,11 +211,13 @@ cmd_pulse() { fi local outcome + local eval_maker="evaluate_worker" # t1096: use evaluate_worker_with_metadata() to capture richer metadata # (failure mode, output quality) and record to pattern tracker. # Falls back to evaluate_worker() if the wrapper is unavailable. 
if command -v evaluate_worker_with_metadata &>/dev/null; then outcome=$(evaluate_worker_with_metadata "$tid" "$skip_ai") + eval_maker="evaluate_worker_with_metadata" else outcome=$(evaluate_worker "$tid" "$skip_ai") fi @@ -227,7 +229,7 @@ cmd_pulse() { _eval_duration=$(_proof_log_stage_duration "$tid" "evaluate") write_proof_log --task "$tid" --event "evaluate" --stage "evaluate" \ --decision "$outcome" --evidence "skip_ai=$skip_ai" \ - --maker "evaluate_worker_with_metadata" \ + --maker "$eval_maker" \ ${_eval_duration:+--duration "$_eval_duration"} 2>/dev/null || true # Budget tracking: record spend from worker log (t1100)