From 1467cf0dcca5956064cd245db55811841c375e7a Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Wed, 18 Feb 2026 02:48:09 +0000
Subject: [PATCH 1/2] feat: add failure mode classification and output quality
 rating to evaluate.sh (t1096)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add classify_failure_mode(): maps outcome_detail strings to 5 categories
  (TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS) for pattern tracking
- Add rate_output_quality(): derives 3-point quality score (0/1/2) from
  outcome type without extra AI calls
- Add record_evaluation_metadata(): records richer fields to pattern tracker
  after each worker assessment
- Add evaluate_worker_with_metadata(): thin wrapper that calls evaluate_worker(),
  classifies failure mode, rates quality, records metadata, returns verdict unchanged
- Extend evaluate_with_ai() to request FMODE and QUALITY in AI prompt;
  parses extended VERDICT:type:detail:FMODE:mode:QUALITY:n format with
  fallback to basic format for backward compatibility
- Update pulse.sh to call evaluate_worker_with_metadata() instead of evaluate_worker()
- Extend store_failure_pattern() and store_success_pattern() in memory-integration.sh
  to accept and propagate failure_mode and quality_score fields
- Add --quality-score option to pattern-tracker-helper.sh cmd_record() and extend
  --failure-mode to accept the new categories, with validation and documentation

Chose tag-based approach for new fields (failure_mode:X, quality:N) to avoid
schema changes — matches existing pattern-tracker conventions and enables
immediate filtering via SQLite LIKE queries.
---
 .agents/scripts/pattern-tracker-helper.sh     |  46 ++-
 .agents/scripts/supervisor/evaluate.sh        | 335 +++++++++++++++++-
 .../scripts/supervisor/memory-integration.sh  |  25 ++
 .agents/scripts/supervisor/pulse.sh           |  11 +-
 4 files changed, 391 insertions(+), 26 deletions(-)

diff --git a/.agents/scripts/pattern-tracker-helper.sh b/.agents/scripts/pattern-tracker-helper.sh
index 29adab692..73a907cbf 100755
--- a/.agents/scripts/pattern-tracker-helper.sh
+++ b/.agents/scripts/pattern-tracker-helper.sh
@@ -93,6 +93,7 @@ cmd_record() {
 	local failure_mode=""
 	local tokens_in=""
 	local tokens_out=""
+	local quality_score=""
 
 	while [[ $# -gt 0 ]]; do
 		case "$1" in
@@ -148,6 +149,10 @@ cmd_record() {
 			tokens_out="$2"
 			shift 2
 			;;
+		--quality-score)
+			quality_score="$2"
+			shift 2
+			;; # t1096: 0|1|2
 		*)
 			if [[ -z "$description" ]]; then
 				description="$1"
@@ -211,13 +216,30 @@ cmd_record() {
 		esac
 	fi
 
-	# Validate failure_mode if provided (t1095)
+	# Build memory type
+	local memory_type
+	if [[ "$outcome" == "success" ]]; then
+		memory_type="SUCCESS_PATTERN"
+	else
+		memory_type="FAILURE_PATTERN"
+	fi
+
+	# Validate failure_mode if provided (t1095/t1096)
 	if [[ -n "$failure_mode" ]]; then
 		case "$failure_mode" in
-		hallucination | context-miss | incomplete | wrong-file | timeout) ;;
+		hallucination | context-miss | incomplete | wrong-file | timeout | TRANSIENT | RESOURCE | LOGIC | BLOCKED | AMBIGUOUS | NONE) ;;
 		*)
-			log_error "Invalid failure_mode: $failure_mode (use hallucination, context-miss, incomplete, wrong-file, or timeout)"
-			return 1
+			log_warn "Non-standard failure_mode: $failure_mode (standard: hallucination, context-miss, incomplete, wrong-file, timeout, TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS, NONE)"
+			;;
+		esac
+	fi
+
+	# Validate quality_score if provided (t1096)
+	if [[ -n "$quality_score" ]]; then
+		case "$quality_score" in
+		0 | 1 | 2) ;;
+		*)
+			log_warn "Non-standard quality_score: $quality_score (standard: 0=no_output 1=partial 2=complete)"
 			;;
 		esac
 	fi
@@ -232,14 +254,6 @@ cmd_record() {
 		return 1
 	fi
 
-	# Build memory type
-	local memory_type
-	if [[ "$outcome" == "success" ]]; then
-		memory_type="SUCCESS_PATTERN"
-	else
-		memory_type="FAILURE_PATTERN"
-	fi
-
 	# Build tags
 	local all_tags="pattern"
 	[[ -n "$task_type" ]] && all_tags="$all_tags,$task_type"
@@ -248,6 +262,9 @@ cmd_record() {
 	[[ -n "$duration" ]] && all_tags="$all_tags,duration:$duration"
 	[[ -n "$retries" ]] && all_tags="$all_tags,retries:$retries"
 	[[ -n "$strategy" ]] && all_tags="$all_tags,strategy:$strategy"
+	# t1096: include failure mode and quality score in tags for filtering
+	[[ -n "$failure_mode" ]] && all_tags="$all_tags,failure_mode:$failure_mode"
+	[[ -n "$quality_score" ]] && all_tags="$all_tags,quality:$quality_score"
 	[[ -n "$tags" ]] && all_tags="$all_tags,$tags"
 
 	# Build content with structured metadata
@@ -257,6 +274,9 @@ cmd_record() {
 	[[ -n "$task_id" ]] && content="$content [id:$task_id]"
 	[[ -n "$duration" ]] && content="$content [duration:${duration}s]"
 	[[ -n "$retries" && "$retries" != "0" ]] && content="$content [retries:$retries]"
+	# t1096: append failure mode and quality score to content
+	[[ -n "$failure_mode" ]] && content="$content [fmode:$failure_mode]"
+	[[ -n "$quality_score" ]] && content="$content [quality:$quality_score]"
 
 	# Store via memory-helper.sh and capture the returned ID
 	# The last line of store output is the bare mem_YYYYMMDDHHMMSS_hex ID.
@@ -1098,6 +1118,8 @@ RECORD OPTIONS:
     --task-id <id>                Task identifier (e.g., t102.3)
     --duration <seconds>          How long the task took
     --retries <count>             Number of retries before completion
+    --failure-mode <mode>         Failure category (t1096): TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE
+    --quality <score>             Output quality (t1096): 0=no_output 1=partial 2=complete
     --tags <tags>                 Additional comma-separated tags
     --strategy <type>             Dispatch strategy: normal, prompt-repeat, escalated (t1095)
     --quality <level>             CI quality: ci-pass-first-try, ci-pass-after-fix, needs-human (t1095)
diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh
index 21b8c7356..60986537d 100755
--- a/.agents/scripts/supervisor/evaluate.sh
+++ b/.agents/scripts/supervisor/evaluate.sh
@@ -659,6 +659,194 @@ link_pr_to_task() {
 	return 0
 }
 
+#######################################
+# Classify a failure outcome_detail string into a failure mode category (t1096)
+#
+# Maps the granular outcome_detail strings from evaluate_worker() into
+# five broad categories for pattern tracking and model routing decisions.
+#
+# Categories:
+#   TRANSIENT - recoverable with retry (rate limits, timeouts, backend blips)
+#   RESOURCE  - infrastructure/environment issue (auth, OOM, disk)
+#   LOGIC     - task/code problem (merge conflict, test failure, build error)
+#   BLOCKED   - external dependency (human needed, upstream, missing context)
+#   AMBIGUOUS - unclear cause (clean exit, max retries, unknown)
+#
+# $1: outcome_detail (e.g., "rate_limited", "auth_error", "merge_conflict")
+#
+# Outputs: category string on stdout
+# Returns: 0 always
+#######################################
+classify_failure_mode() {
+	local detail="$1"
+
+	case "$detail" in
+	rate_limited | backend_quota_error | backend_infrastructure_error | \
+		retry:backend* | quota* | 429*)
+		echo "TRANSIENT"
+		;;
+	auth_error | unauthorized | forbidden | 401* | 403* | \
+		billing_credits_exhausted)
+		echo "RESOURCE"
+		;;
+	timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \
+		work_in_progress)
+		echo "TRANSIENT"
+		;;
+	out_of_memory)
+		echo "RESOURCE"
+		;;
+	merge_conflict | test_fail* | lint_* | build_* | \
+		worker_never_started* | log_file_missing* | log_file_empty | \
+		no_log_path_in_db* | dispatch_script_not_executable)
+		echo "LOGIC"
+		;;
+	blocked:* | waiting* | upstream* | missing_context* | \
+		verify_incomplete_no_pr | verify_not_started_needs_full)
+		echo "BLOCKED"
+		;;
+	clean_exit_no_signal | max_retries | \
+		ambiguous_skipped_ai | ambiguous_ai_unavailable | ambiguous* | "")
+		echo "AMBIGUOUS"
+		;;
+	*)
+		echo "AMBIGUOUS"
+		;;
+	esac
+	return 0
+}
+
+#######################################
+# Rate the output quality of a worker based on outcome type (t1096)
+#
+# Derives a 3-point quality score from the outcome type without an extra
+# AI call. Only AMBIGUOUS failure modes trigger AI quality assessment.
+#
+# Scale:
+#   0 = no_output    - worker produced nothing usable
+#   1 = partial      - some progress, incomplete or broken artifact
+#   2 = complete     - deliverable matches task intent
+#
+# $1: outcome_type (complete|retry|blocked|failed)
+# $2: outcome_detail (for context)
+#
+# Outputs: quality score (0, 1, or 2) on stdout
+# Returns: 0 always
+#######################################
+rate_output_quality() {
+	local outcome_type="$1"
+	local outcome_detail="${2:-}"
+
+	case "$outcome_type" in
+	complete)
+		# task_obsolete = task was already done, still counts as complete
+		echo "2"
+		;;
+	retry)
+		# work_in_progress = partial commits exist
+		if [[ "$outcome_detail" == "work_in_progress" ]]; then
+			echo "1"
+		else
+			echo "1"
+		fi
+		;;
+	blocked)
+		# auth/billing blocks = no output; merge conflict = partial
+		case "$outcome_detail" in
+		auth_error | billing_credits_exhausted | out_of_memory)
+			echo "0"
+			;;
+		merge_conflict)
+			echo "1"
+			;;
+		*)
+			echo "0"
+			;;
+		esac
+		;;
+	failed)
+		# worker_never_started / log missing = truly no output
+		case "$outcome_detail" in
+		worker_never_started* | log_file_missing* | log_file_empty | \
+			no_log_path_in_db* | max_retries)
+			echo "0"
+			;;
+		*)
+			echo "0"
+			;;
+		esac
+		;;
+	*)
+		echo "0"
+		;;
+	esac
+	return 0
+}
+
+#######################################
+# Record evaluation metadata to pattern tracker (t1096)
+#
+# Called after evaluate_worker() resolves a verdict. Stores richer metadata
+# than the basic store_success/failure_pattern calls: failure mode category,
+# output quality score, AI eval flag, and (when known) the model tier.
+#
+# $1: task_id
+# $2: outcome_type (complete|retry|blocked|failed)
+# $3: outcome_detail
+# $4: failure_mode (TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE)
+# $5: quality_score (0|1|2)
+# $6: ai_evaluated (true|false) — whether AI eval was used
+#
+# Returns: 0 always (non-blocking)
+#######################################
+record_evaluation_metadata() {
+	local task_id="$1"
+	local outcome_type="$2"
+	local outcome_detail="$3"
+	local failure_mode="${4:-AMBIGUOUS}"
+	local quality_score="${5:-0}"
+	local ai_evaluated="${6:-false}"
+
+	local pattern_helper="${SCRIPT_DIR}/pattern-tracker-helper.sh"
+	if [[ ! -x "$pattern_helper" ]]; then
+		pattern_helper="$HOME/.aidevops/agents/scripts/pattern-tracker-helper.sh"
+	fi
+	if [[ ! -x "$pattern_helper" ]]; then
+		return 0
+	fi
+
+	# Map outcome_type to pattern-tracker outcome
+	local pt_outcome="failure"
+	[[ "$outcome_type" == "complete" ]] && pt_outcome="success"
+
+	# Build extra tags for new fields
+	local extra_tags="failure_mode:${failure_mode},quality:${quality_score}"
+	[[ "$ai_evaluated" == "true" ]] && extra_tags="${extra_tags},ai_eval:true"
+
+	# Look up model tier from DB for routing context
+	local model_tier=""
+	if [[ -n "${SUPERVISOR_DB:-}" ]]; then
+		local task_model
+		task_model=$(db "$SUPERVISOR_DB" "SELECT model FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "")
+		if [[ -n "$task_model" ]] && command -v model_to_tier &>/dev/null; then
+			model_tier=$(model_to_tier "$task_model" 2>/dev/null || echo "")
+		fi
+	fi
+
+	# Build description
+	local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]"
+
+	"$pattern_helper" record \
+		--outcome "$pt_outcome" \
+		--task-type "feature" \
+		--task-id "$task_id" \
+		--description "$description" \
+		--tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}" \
+		2>/dev/null || true
+
+	return 0
+}
+
 #######################################
 # Evaluate a completed worker's outcome using log analysis
 # Returns: complete:<detail>, retry:<reason>, blocked:<reason>, failed:<reason>
@@ -1083,9 +1271,16 @@ evaluate_worker() {
 }
 
 #######################################
-# Dispatch a cheap AI call to evaluate ambiguous worker outcomes
+# Dispatch a cheap AI call to evaluate ambiguous worker outcomes (t1096)
 # Uses Sonnet for speed (~30s) and cost efficiency
+#
 # Returns: complete:<detail>, retry:<reason>, blocked:<reason>
+#
+# Extended format (t1096): AI is asked to also classify failure mode and
+# rate output quality. The response is parsed for the extended fields and
+# stored as metadata. The returned verdict is the standard format (unchanged
+# for callers) — extended fields are captured as side-effects via
+# _AI_EVAL_FMODE and _AI_EVAL_QUALITY shell variables for the caller.
 #######################################
 evaluate_with_ai() {
 	local task_id="$1"
@@ -1104,13 +1299,17 @@ evaluate_with_ai() {
 	local task_desc
 	task_desc=$(db "$SUPERVISOR_DB" "SELECT description FROM tasks WHERE id = '$escaped_id';" 2>/dev/null || echo "")
 
+	# Extended prompt (t1096): request failure mode classification and quality rating
+	# alongside the standard verdict. Format: VERDICT:type:detail:FMODE:mode:QUALITY:n
+	# Failure modes: TRANSIENT RESOURCE LOGIC BLOCKED AMBIGUOUS
+	# Quality: 0=no_output 1=partial 2=complete
 	local eval_prompt
-	eval_prompt="You are evaluating the outcome of an automated task worker. Respond with EXACTLY one line in the format: VERDICT:<type>:<detail>
+	eval_prompt="You are evaluating the outcome of an automated task worker. Respond with EXACTLY one line in the format:
+VERDICT:<type>:<detail>:FMODE:<failure_mode>:QUALITY:<quality>
 
-Types:
-- complete:<what_succeeded> (task finished successfully)
-- retry:<reason> (transient failure, worth retrying)
-- blocked:<reason> (needs human intervention)
+Types: complete|retry|blocked
+Failure modes: TRANSIENT (rate limit/timeout/backend) | RESOURCE (auth/OOM/billing) | LOGIC (merge conflict/build/test) | BLOCKED (human needed/upstream) | AMBIGUOUS (unclear) | NONE (for complete)
+Quality: 0=no_output 1=partial_progress 2=fully_complete
 
 Task: $task_id
 Description: ${task_desc:-unknown}
@@ -1124,8 +1323,10 @@ Analyze the log and determine the outcome. Look for:
 1. Did the task complete its objective? (code changes, PR created, tests passing)
 2. Is there a transient error that a retry would fix? (network, rate limit, timeout)
 3. Is there a permanent blocker? (auth, permissions, merge conflict, missing dependency)
+4. What failure mode category best describes the issue?
+5. How much useful output did the worker produce? (0=nothing, 1=partial, 2=complete)
 
-Respond with ONLY the verdict line, nothing else."
+Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_limited:FMODE:TRANSIENT:QUALITY:1"
 
 	local ai_result=""
 	local eval_timeout=60
@@ -1147,14 +1348,53 @@ Respond with ONLY the verdict line, nothing else."
 			--output-format text 2>/dev/null || echo "")
 	fi
 
-	# Parse the VERDICT line from AI response
+	# Parse extended VERDICT line (t1096): try full format first, fall back to basic
+	# Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n
+	# Basic format: VERDICT:type:detail (legacy / fallback)
 	local verdict_line
-	verdict_line=$(echo "$ai_result" | grep -o 'VERDICT:[a-z]*:[a-z_]*' | head -1 || true)
+	verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true)
 
 	if [[ -n "$verdict_line" ]]; then
-		# Strip VERDICT: prefix and return
-		local verdict="${verdict_line#VERDICT:}"
-		log_info "AI eval for $task_id: $verdict"
+		# Parse extended format
+		# Format: VERDICT:type:detail:FMODE:mode:QUALITY:n
+		local raw="${verdict_line#VERDICT:}"
+		# Extract type:detail (everything before :FMODE:)
+		local verdict="${raw%%:FMODE:*}"
+		# Extract failure mode (between :FMODE: and :QUALITY:)
+		local fmode_part="${raw#*:FMODE:}"
+		local ai_fmode="${fmode_part%%:QUALITY:*}"
+		# Extract quality (after :QUALITY:)
+		local ai_quality="${fmode_part##*:QUALITY:}"
+
+		log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]"
+
+		# Export extended fields for record_evaluation_metadata() caller
+		# These are set as global-scope variables (prefixed to avoid collision)
+		_AI_EVAL_FMODE="$ai_fmode"
+		_AI_EVAL_QUALITY="$ai_quality"
+
+		# Store AI evaluation in state log for audit trail
+		db "$SUPERVISOR_DB" "
+            INSERT INTO state_log (task_id, from_state, to_state, reason)
+            VALUES ('$(sql_escape "$task_id")', 'evaluating', 'evaluating',
+                    'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}');
+        " 2>/dev/null || true
+
+		echo "$verdict"
+		return 0
+	fi
+
+	# Fallback: try basic VERDICT format (AI didn't include extended fields)
+	local basic_verdict_line
+	basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+' | head -1 || true)
+
+	if [[ -n "$basic_verdict_line" ]]; then
+		local verdict="${basic_verdict_line#VERDICT:}"
+		log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)"
+
+		# Clear extended fields (not provided by AI in this response)
+		_AI_EVAL_FMODE=""
+		_AI_EVAL_QUALITY=""
 
 		# Store AI evaluation in state log for audit trail
 		db "$SUPERVISOR_DB" "
@@ -1172,6 +1412,77 @@ Respond with ONLY the verdict line, nothing else."
 	return 1
 }
 
+#######################################
+# Evaluate worker outcome and record richer metadata to pattern tracker (t1096)
+#
+# Thin wrapper around evaluate_worker() that:
+#   1. Calls evaluate_worker() to get the standard verdict
+#   2. Classifies failure mode via classify_failure_mode()
+#   3. Rates output quality via rate_output_quality()
+#   4. Records metadata to pattern tracker with new fields
+#   5. Returns the original verdict unchanged (callers unaffected)
+#
+# This is the preferred entry point for the pulse cycle. evaluate_worker()
+# remains available for direct use (e.g., cmd_evaluate, tests).
+#
+# $1: task_id
+# $2: skip_ai_eval (optional, default false)
+#
+# Outputs: same as evaluate_worker() — "type:detail"
+# Returns: 0 on success, 1 if task not found
+#######################################
+evaluate_worker_with_metadata() {
+	local task_id="$1"
+	local skip_ai_eval="${2:-false}"
+
+	# Reset AI eval extended fields before calling evaluate_worker
+	_AI_EVAL_FMODE=""
+	_AI_EVAL_QUALITY=""
+
+	# Run core evaluation
+	local verdict
+	verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1
+
+	# Parse verdict into type and detail
+	local outcome_type="${verdict%%:*}"
+	local outcome_detail="${verdict#*:}"
+
+	# Determine if AI eval was used (extended fields set by evaluate_with_ai)
+	local ai_evaluated="false"
+	local failure_mode quality_score
+
+	if [[ -n "$_AI_EVAL_FMODE" ]]; then
+		# AI provided failure mode classification
+		ai_evaluated="true"
+		failure_mode="$_AI_EVAL_FMODE"
+		quality_score="${_AI_EVAL_QUALITY:-$(rate_output_quality "$outcome_type" "$outcome_detail")}"
+	else
+		# Deterministic classification from outcome strings
+		if [[ "$outcome_type" == "complete" ]]; then
+			failure_mode="NONE"
+		else
+			failure_mode=$(classify_failure_mode "$outcome_detail")
+		fi
+		quality_score=$(rate_output_quality "$outcome_type" "$outcome_detail")
+	fi
+
+	# Record to pattern tracker (non-blocking — failures here must not affect verdict)
+	record_evaluation_metadata \
+		"$task_id" \
+		"$outcome_type" \
+		"$outcome_detail" \
+		"$failure_mode" \
+		"$quality_score" \
+		"$ai_evaluated" \
+		2>/dev/null || true
+
+	log_info "evaluate_worker_with_metadata: $task_id → $verdict [fmode:${failure_mode}] [quality:${quality_score}] [ai:${ai_evaluated}]"
+
+	# Return the original verdict unchanged
+	echo "$verdict"
+	return 0
+}
+
 #######################################
 # Manually evaluate a task's worker outcome
 # Useful for debugging or forcing evaluation of a stuck task
diff --git a/.agents/scripts/supervisor/memory-integration.sh b/.agents/scripts/supervisor/memory-integration.sh
index 1c5edf7ef..b61e4121c 100755
--- a/.agents/scripts/supervisor/memory-integration.sh
+++ b/.agents/scripts/supervisor/memory-integration.sh
@@ -58,12 +58,21 @@ $task_memories"
 # Called when a task fails, is blocked, or retries
 # Tags with supervisor context for future recall
 # Uses FAILURE_PATTERN type for pattern-tracker integration (t102.3)
+#
+# $1: task_id
+# $2: outcome_type (blocked|failed|retry)
+# $3: outcome_detail
+# $4: description (optional)
+# $5: failure_mode (optional, t1096 — TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS)
+# $6: quality_score (optional, t1096 — 0|1|2)
 #######################################
 store_failure_pattern() {
 	local task_id="$1"
 	local outcome_type="$2"
 	local outcome_detail="$3"
 	local description="${4:-}"
+	local failure_mode="${5:-}"
+	local quality_score="${6:-}"
 
 	if [[ ! -x "$MEMORY_HELPER" ]]; then
 		return 0
@@ -119,10 +128,16 @@ store_failure_pattern() {
 		content="[task:feature] $content | Task: $description"
 	fi
 	[[ -n "$model_tier" ]] && content="$content [model:$model_tier]"
+	# t1096: append failure mode and quality score when provided
+	[[ -n "$failure_mode" ]] && content="$content [fmode:$failure_mode]"
+	[[ -n "$quality_score" ]] && content="$content [quality:$quality_score]"
 
 	# Build tags with model info for pattern-tracker queries
 	local tags="supervisor,pattern,$task_id,$outcome_type,$outcome_detail"
 	[[ -n "$model_tier" ]] && tags="$tags,model:$model_tier"
+	# t1096: include failure mode and quality in tags for pattern-tracker filtering
+	[[ -n "$failure_mode" ]] && tags="$tags,failure_mode:$failure_mode"
+	[[ -n "$quality_score" ]] && tags="$tags,quality:$quality_score"
 
 	"$MEMORY_HELPER" store \
 		--auto \
@@ -139,11 +154,17 @@ store_failure_pattern() {
 # Store a success pattern in memory after task completion
 # Records what worked for future reference
 # Uses SUCCESS_PATTERN type for pattern-tracker integration (t102.3)
+#
+# $1: task_id
+# $2: detail (PR URL or outcome detail)
+# $3: description (optional)
+# $4: quality_score (optional, t1096 — 0|1|2; defaults to 2 for complete)
 #######################################
 store_success_pattern() {
 	local task_id="$1"
 	local detail="${2:-}"
 	local description="${3:-}"
+	local quality_score="${4:-2}"
 
 	if [[ ! -x "$MEMORY_HELPER" ]]; then
 		return 0
@@ -201,6 +222,8 @@ store_success_pattern() {
 	if [[ "$task_tool_count" -gt 0 ]]; then
 		content="$content [task_tool:$task_tool_count]"
 	fi
+	# t1096: append quality score (defaults to 2 for success; caller may override)
+	content="$content [quality:${quality_score}]"
 
 	# Build tags with model and duration info for pattern-tracker queries
 	local tags="supervisor,pattern,$task_id,complete"
@@ -208,6 +231,8 @@ store_success_pattern() {
 	[[ -n "$duration_secs" ]] && tags="$tags,duration:$duration_secs"
 	[[ "$retries" -gt 0 ]] && tags="$tags,retries:$retries"
 	[[ "$task_tool_count" -gt 0 ]] && tags="$tags,task_tool:$task_tool_count"
+	# t1096: include quality in tags for pattern-tracker filtering
+	tags="$tags,quality:${quality_score},failure_mode:NONE"
 
 	"$MEMORY_HELPER" store \
 		--auto \
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index e17d8f44b..5747179c1 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -211,7 +211,14 @@ cmd_pulse() {
 			fi
 
 			local outcome
-			outcome=$(evaluate_worker "$tid" "$skip_ai")
+			# t1096: use evaluate_worker_with_metadata() to capture richer metadata
+			# (failure mode, output quality) and record to pattern tracker.
+			# Falls back to evaluate_worker() if the wrapper is unavailable.
+			if command -v evaluate_worker_with_metadata &>/dev/null; then
+				outcome=$(evaluate_worker_with_metadata "$tid" "$skip_ai")
+			else
+				outcome=$(evaluate_worker "$tid" "$skip_ai")
+			fi
 			local outcome_type="${outcome%%:*}"
 			local outcome_detail="${outcome#*:}"
 
@@ -220,7 +227,7 @@ cmd_pulse() {
 			_eval_duration=$(_proof_log_stage_duration "$tid" "evaluate")
 			write_proof_log --task "$tid" --event "evaluate" --stage "evaluate" \
 				--decision "$outcome" --evidence "skip_ai=$skip_ai" \
-				--maker "evaluate_worker" \
+				--maker "evaluate_worker_with_metadata" \
 				${_eval_duration:+--duration "$_eval_duration"} 2>/dev/null || true
 
 			# Budget tracking: record spend from worker log (t1100)

From ea491eb4123c32e1037ea71f306321c0e64aea31 Mon Sep 17 00:00:00 2001
From: marcusquinn <6428977+marcusquinn@users.noreply.github.com>
Date: Wed, 18 Feb 2026 03:40:24 +0000
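Subject: [PATCH 2/2] fix: address review feedback for t1096

- Pass AI-derived FMODE/QUALITY through evaluate_with_ai() stdout instead of
  global variables, which were lost in the $(...) subshell capture;
  evaluate_worker_with_metadata() parses and strips the extended fields
- Merge duplicate TRANSIENT/RESOURCE arms in classify_failure_mode() and
  collapse redundant branches in rate_output_quality()
- Infer task type from the task description instead of hard-coding "feature"
- Allow digits and hyphens in the AI verdict detail regex
- Document --quality-score in the pattern-tracker help and drop the duplicate
  --failure-mode/--quality entries
- Record the evaluator actually used as the proof-log maker in pulse.sh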
---
 .agents/scripts/pattern-tracker-helper.sh |   6 +-
 .agents/scripts/supervisor/evaluate.sh    | 116 +++++++++++-----------
 .agents/scripts/supervisor/pulse.sh       |   4 +-
 3 files changed, 64 insertions(+), 62 deletions(-)

diff --git a/.agents/scripts/pattern-tracker-helper.sh b/.agents/scripts/pattern-tracker-helper.sh
index 73a907cbf..1b3a26e85 100755
--- a/.agents/scripts/pattern-tracker-helper.sh
+++ b/.agents/scripts/pattern-tracker-helper.sh
@@ -1118,13 +1118,13 @@ RECORD OPTIONS:
     --task-id <id>                Task identifier (e.g., t102.3)
     --duration <seconds>          How long the task took
     --retries <count>             Number of retries before completion
-    --failure-mode <mode>         Failure category (t1096): TRANSIENT|RESOURCE|LOGIC|BLOCKED|AMBIGUOUS|NONE
-    --quality <score>             Output quality (t1096): 0=no_output 1=partial 2=complete
     --tags <tags>                 Additional comma-separated tags
     --strategy <type>             Dispatch strategy: normal, prompt-repeat, escalated (t1095)
     --quality <level>             CI quality: ci-pass-first-try, ci-pass-after-fix, needs-human (t1095)
+    --quality-score <n>           Output quality rating (t1096): 0=no_output 1=partial 2=complete
     --failure-mode <mode>         Failure classification: hallucination, context-miss,
-                                  incomplete, wrong-file, timeout (t1095)
+                                  incomplete, wrong-file, timeout (t1095) or
+                                  TRANSIENT, RESOURCE, LOGIC, BLOCKED, AMBIGUOUS, NONE (t1096)
     --tokens-in <count>           Input token count (t1095)
     --tokens-out <count>          Output token count (t1095)
 
diff --git a/.agents/scripts/supervisor/evaluate.sh b/.agents/scripts/supervisor/evaluate.sh
index 60986537d..204d92194 100755
--- a/.agents/scripts/supervisor/evaluate.sh
+++ b/.agents/scripts/supervisor/evaluate.sh
@@ -682,18 +682,13 @@ classify_failure_mode() {
 
 	case "$detail" in
 	rate_limited | backend_quota_error | backend_infrastructure_error | \
-		retry:backend* | quota* | 429*)
-		echo "TRANSIENT"
-		;;
-	auth_error | unauthorized | forbidden | 401* | 403* | \
-		billing_credits_exhausted)
-		echo "RESOURCE"
-		;;
-	timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \
+		retry:backend* | quota* | 429* | \
+		timeout | interrupted_sigint | killed_sigkill | terminated_sigterm | \
 		work_in_progress)
 		echo "TRANSIENT"
 		;;
-	out_of_memory)
+	auth_error | unauthorized | forbidden | 401* | 403* | \
+		billing_credits_exhausted | out_of_memory)
 		echo "RESOURCE"
 		;;
 	merge_conflict | test_fail* | lint_* | build_* | \
@@ -743,12 +738,8 @@ rate_output_quality() {
 		echo "2"
 		;;
 	retry)
-		# work_in_progress = partial commits exist
-		if [[ "$outcome_detail" == "work_in_progress" ]]; then
-			echo "1"
-		else
-			echo "1"
-		fi
+		# All retries imply some form of progress or attempt
+		echo "1"
 		;;
 	blocked)
 		# auth/billing blocks = no output; merge conflict = partial
@@ -765,16 +756,8 @@ rate_output_quality() {
 		esac
 		;;
 	failed)
-		# worker_never_started / log missing = truly no output
-		case "$outcome_detail" in
-		worker_never_started* | log_file_missing* | log_file_empty | \
-			no_log_path_in_db* | max_retries)
-			echo "0"
-			;;
-		*)
-			echo "0"
-			;;
-		esac
+		# All failed outcomes are considered to have no usable output
+		echo "0"
 		;;
 	*)
 		echo "0"
@@ -833,12 +816,28 @@ record_evaluation_metadata() {
 		fi
 	fi
 
+	# Infer task type from the DB task description; stays "unknown" when no DB is set
+	# TODO(t1096): extract real task type from TODO.md tags or DB metadata
+	local task_type="unknown"
+	if [[ -n "${SUPERVISOR_DB:-}" ]]; then
+		local task_desc
+		task_desc=$(db "$SUPERVISOR_DB" "SELECT description FROM tasks WHERE id = '$(sql_escape "$task_id")';" 2>/dev/null || echo "")
+		# Infer type from description keywords (best-effort)
+		case "$task_desc" in
+		*bugfix* | *fix* | *bug*) task_type="bugfix" ;;
+		*refactor*) task_type="refactor" ;;
+		*test*) task_type="testing" ;;
+		*doc*) task_type="docs" ;;
+		*) task_type="feature" ;;
+		esac
+	fi
+
 	# Build description
 	local description="Worker $task_id: ${outcome_type}:${outcome_detail} [fmode:${failure_mode}] [quality:${quality_score}]"
 
 	"$pattern_helper" record \
 		--outcome "$pt_outcome" \
-		--task-type "feature" \
+		--task-type "$task_type" \
 		--task-id "$task_id" \
 		--description "$description" \
 		--tags "supervisor,evaluate,${outcome_type},${extra_tags}${model_tier:+,model:${model_tier}}" \
@@ -1277,10 +1276,9 @@ evaluate_worker() {
 # Returns: complete:<detail>, retry:<reason>, blocked:<reason>
 #
 # Extended format (t1096): AI is asked to also classify failure mode and
-# rate output quality. The response is parsed for the extended fields and
-# stored as metadata. The returned verdict is the standard format (unchanged
-# for callers) — extended fields are captured as side-effects via
-# _AI_EVAL_FMODE and _AI_EVAL_QUALITY shell variables for the caller.
+# rate output quality. When extended fields are present, the stdout output
+# encodes them as type:detail:FMODE:mode:QUALITY:n so they survive subshell
+# capture. evaluate_worker_with_metadata() parses and strips these fields.
 #######################################
 evaluate_with_ai() {
 	local task_id="$1"
@@ -1352,7 +1350,7 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
 	# Full format: VERDICT:type:detail:FMODE:mode:QUALITY:n
 	# Basic format: VERDICT:type:detail (legacy / fallback)
 	local verdict_line
-	verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true)
+	verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z0-9_-]+:FMODE:[A-Z]+:QUALITY:[012]' | head -1 || true)
 
 	if [[ -n "$verdict_line" ]]; then
 		# Parse extended format
@@ -1368,11 +1366,6 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
 
 		log_info "AI eval for $task_id: $verdict [fmode:${ai_fmode}] [quality:${ai_quality}]"
 
-		# Export extended fields for record_evaluation_metadata() caller
-		# These are set as global-scope variables (prefixed to avoid collision)
-		_AI_EVAL_FMODE="$ai_fmode"
-		_AI_EVAL_QUALITY="$ai_quality"
-
 		# Store AI evaluation in state log for audit trail
 		db "$SUPERVISOR_DB" "
             INSERT INTO state_log (task_id, from_state, to_state, reason)
@@ -1380,22 +1373,21 @@ Respond with ONLY the verdict line, nothing else. Example: VERDICT:retry:rate_li
                     'AI eval verdict: $verdict fmode:${ai_fmode} quality:${ai_quality}');
         " 2>/dev/null || true
 
-		echo "$verdict"
+		# Encode AI-derived fields in stdout so they survive subshell capture.
+		# Format: type:detail:FMODE:mode:QUALITY:n
+		# evaluate_worker_with_metadata() parses and strips these before returning.
+		echo "${verdict}:FMODE:${ai_fmode}:QUALITY:${ai_quality}"
 		return 0
 	fi
 
 	# Fallback: try basic VERDICT format (AI didn't include extended fields)
 	local basic_verdict_line
-	basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z_]+' | head -1 || true)
+	basic_verdict_line=$(echo "$ai_result" | grep -oE 'VERDICT:[a-z]+:[a-z0-9_-]+' | head -1 || true)
 
 	if [[ -n "$basic_verdict_line" ]]; then
 		local verdict="${basic_verdict_line#VERDICT:}"
 		log_info "AI eval for $task_id: $verdict (basic format — no fmode/quality)"
 
-		# Clear extended fields (not provided by AI in this response)
-		_AI_EVAL_FMODE=""
-		_AI_EVAL_QUALITY=""
-
 		# Store AI evaluation in state log for audit trail
 		db "$SUPERVISOR_DB" "
             INSERT INTO state_log (task_id, from_state, to_state, reason)
@@ -1435,28 +1427,36 @@ evaluate_worker_with_metadata() {
 	local task_id="$1"
 	local skip_ai_eval="${2:-false}"
 
-	# Reset AI eval extended fields before calling evaluate_worker
-	_AI_EVAL_FMODE=""
-	_AI_EVAL_QUALITY=""
+	# Run core evaluation (captures stdout which may include AI-derived fields)
+	local raw_verdict
+	raw_verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1
+
+	# Parse AI-derived extended fields from stdout if present.
+	# Extended format: type:detail:FMODE:mode:QUALITY:n
+	# Basic format:    type:detail
+	local verdict ai_evaluated="false"
+	local failure_mode quality_score
 
-	# Run core evaluation
-	local verdict
-	verdict=$(evaluate_worker "$task_id" "$skip_ai_eval") || return 1
+	if [[ "$raw_verdict" == *":FMODE:"*":QUALITY:"* ]]; then
+		# AI eval encoded extended fields in stdout — extract them
+		ai_evaluated="true"
+		# Strip :FMODE:...:QUALITY:... to get the original verdict
+		verdict="${raw_verdict%%:FMODE:*}"
+		# Extract failure mode (between :FMODE: and :QUALITY:)
+		local fmode_part="${raw_verdict#*:FMODE:}"
+		failure_mode="${fmode_part%%:QUALITY:*}"
+		# Extract quality (after last :QUALITY:)
+		quality_score="${fmode_part##*:QUALITY:}"
+	else
+		# No AI fields — use deterministic classification
+		verdict="$raw_verdict"
+	fi
 
 	# Parse verdict into type and detail
 	local outcome_type="${verdict%%:*}"
 	local outcome_detail="${verdict#*:}"
 
-	# Determine if AI eval was used (extended fields set by evaluate_with_ai)
-	local ai_evaluated="false"
-	local failure_mode quality_score
-
-	if [[ -n "$_AI_EVAL_FMODE" ]]; then
-		# AI provided failure mode classification
-		ai_evaluated="true"
-		failure_mode="$_AI_EVAL_FMODE"
-		quality_score="${_AI_EVAL_QUALITY:-$(rate_output_quality "$outcome_type" "$outcome_detail")}"
-	else
+	if [[ "$ai_evaluated" != "true" ]]; then
 		# Deterministic classification from outcome strings
 		if [[ "$outcome_type" == "complete" ]]; then
 			failure_mode="NONE"
diff --git a/.agents/scripts/supervisor/pulse.sh b/.agents/scripts/supervisor/pulse.sh
index 5747179c1..f897fb289 100755
--- a/.agents/scripts/supervisor/pulse.sh
+++ b/.agents/scripts/supervisor/pulse.sh
@@ -211,11 +211,13 @@ cmd_pulse() {
 			fi
 
 			local outcome
+			local eval_maker="evaluate_worker"
 			# t1096: use evaluate_worker_with_metadata() to capture richer metadata
 			# (failure mode, output quality) and record to pattern tracker.
 			# Falls back to evaluate_worker() if the wrapper is unavailable.
 			if command -v evaluate_worker_with_metadata &>/dev/null; then
 				outcome=$(evaluate_worker_with_metadata "$tid" "$skip_ai")
+				eval_maker="evaluate_worker_with_metadata"
 			else
 				outcome=$(evaluate_worker "$tid" "$skip_ai")
 			fi
@@ -227,7 +229,7 @@ cmd_pulse() {
 			_eval_duration=$(_proof_log_stage_duration "$tid" "evaluate")
 			write_proof_log --task "$tid" --event "evaluate" --stage "evaluate" \
 				--decision "$outcome" --evidence "skip_ai=$skip_ai" \
-				--maker "evaluate_worker_with_metadata" \
+				--maker "$eval_maker" \
 				${_eval_duration:+--duration "$_eval_duration"} 2>/dev/null || true
 
 			# Budget tracking: record spend from worker log (t1100)