diff --git a/.agents/scripts/model-availability-helper.sh b/.agents/scripts/model-availability-helper.sh new file mode 100755 index 000000000..bfcdcb2d4 --- /dev/null +++ b/.agents/scripts/model-availability-helper.sh @@ -0,0 +1,1249 @@ +#!/usr/bin/env bash +# shellcheck disable=SC1091 + +# Model Availability Helper - Probe before dispatch +# Lightweight provider health checks using direct HTTP API calls. +# Tests API key validity, model availability, and rate limits. +# Caches results with short TTL to avoid redundant probes. +# +# Usage: model-availability-helper.sh [command] [options] +# +# Commands: +# check [provider|model] Check if a provider/model is available (exit 0=yes, 1=no) +# probe [--all] Probe all configured providers (or specific one) +# status Show cached availability status for all providers +# rate-limits Show current rate limit status from cache +# resolve Resolve best available model for a tier (with fallback) +# invalidate [provider] Clear cache for a provider (or all) +# help Show this help +# +# Options: +# --json Output in JSON format +# --quiet Suppress informational output +# --force Bypass cache and probe live +# --ttl N Override cache TTL in seconds (default: 300) +# +# Integration: +# - Called by supervisor-helper.sh before dispatch (replaces inline health check) +# - Uses direct HTTP API calls (~1-2s) instead of full AI CLI sessions (~8s) +# - Reads API keys from: env vars > gopass > credentials.sh +# - Cache: SQLite at ~/.aidevops/.agent-workspace/model-availability.db +# +# Exit codes: +# 0 - Provider/model available +# 1 - Provider/model unavailable or error +# 2 - Rate limited (retry after delay) +# 3 - API key invalid or missing +# +# Author: AI DevOps Framework +# Version: 1.0.0 + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" || exit +source "${SCRIPT_DIR}/shared-constants.sh" + +set -euo pipefail + +init_log_file + +# ============================================================================= +# Configuration +# 
# =============================================================================

readonly AVAILABILITY_DIR="${HOME}/.aidevops/.agent-workspace"
readonly AVAILABILITY_DB="${AVAILABILITY_DIR}/model-availability.db"
readonly DEFAULT_HEALTH_TTL=300   # 5 minutes for health checks
readonly DEFAULT_RATELIMIT_TTL=60 # 1 minute for rate limit data
readonly PROBE_TIMEOUT=10         # HTTP request timeout in seconds

# Known providers list
readonly KNOWN_PROVIDERS="anthropic openai google openrouter groq deepseek"

# Provider API endpoints for lightweight probes
# These endpoints are chosen for minimal cost: /models endpoints are free
# and return quickly, confirming both key validity and API availability.
# Uses functions instead of associative arrays for bash 3.2 compatibility (macOS).
#
# Arguments: $1 - provider name
# Outputs:   the provider's /models URL on stdout
# Returns:   0 for a known provider, 1 (and no output) otherwise
get_provider_endpoint() {
    local provider="$1"
    case "$provider" in
    anthropic) echo "https://api.anthropic.com/v1/models" ;;
    openai) echo "https://api.openai.com/v1/models" ;;
    google) echo "https://generativelanguage.googleapis.com/v1beta/models" ;;
    openrouter) echo "https://openrouter.ai/api/v1/models" ;;
    groq) echo "https://api.groq.com/openai/v1/models" ;;
    deepseek) echo "https://api.deepseek.com/v1/models" ;;
    *) return 1 ;;
    esac
    return 0
}

# Provider to env var mapping (comma-separated for multiple options)
#
# Arguments: $1 - provider name
# Outputs:   comma-separated list of candidate variable names on stdout
# Returns:   0 for a known provider, 1 (and no output) otherwise
get_provider_key_vars() {
    local provider="$1"
    case "$provider" in
    anthropic) echo "ANTHROPIC_API_KEY" ;;
    openai) echo "OPENAI_API_KEY" ;;
    google) echo "GOOGLE_API_KEY,GEMINI_API_KEY" ;;
    openrouter) echo "OPENROUTER_API_KEY" ;;
    groq) echo "GROQ_API_KEY" ;;
    deepseek) echo "DEEPSEEK_API_KEY" ;;
    *) return 1 ;;
    esac
    return 0
}

# Check if a provider name is known
# Returns: 0 if $1 is one of the providers handled above, 1 otherwise.
is_known_provider() {
    local provider="$1"
    case "$provider" in
    anthropic | openai | google | openrouter | groq | deepseek) return 0 ;;
    *) return 1 ;;
    esac
}

# Tier to primary/fallback model mapping
# Format: primary_provider/model|fallback_provider/model
#
# Arguments: $1 - tier name
# Outputs:   "primary|fallback" on stdout
# Returns:   0 for a known tier, 1 (and no output) otherwise
get_tier_models() {
    local tier="$1"
    case "$tier" in
    haiku) echo "anthropic/claude-3-5-haiku-20241022|google/gemini-2.5-flash" ;;
    flash) echo "google/gemini-2.5-flash|openai/gpt-4.1-mini" ;;
    sonnet) echo "anthropic/claude-sonnet-4-20250514|openai/gpt-4.1" ;;
    pro) echo "google/gemini-2.5-pro|anthropic/claude-sonnet-4-20250514" ;;
    opus) echo "anthropic/claude-opus-4-6|openai/o3" ;;
    health) echo "anthropic/claude-sonnet-4-5|google/gemini-2.5-flash" ;;
    eval) echo "anthropic/claude-sonnet-4-5|google/gemini-2.5-flash" ;;
    coding) echo "anthropic/claude-opus-4-6|openai/o3" ;;
    *) return 1 ;;
    esac
    return 0
}

# Check if a tier name is known
# Returns: 0 if $1 is a tier handled by get_tier_models, 1 otherwise.
is_known_tier() {
    local tier="$1"
    case "$tier" in
    haiku | flash | sonnet | pro | opus | health | eval | coding) return 0 ;;
    *) return 1 ;;
    esac
}

# =============================================================================
# Database Setup
# =============================================================================

# Create the cache directory and the SQLite schema (idempotent).
# Tables: provider_health, model_availability, rate_limits, probe_log.
# Returns: 0 on success, 1 if the directory or the schema cannot be created.
init_db() {
    # Report the real cause instead of letting sqlite3 fail with a generic error.
    if ! mkdir -p "$AVAILABILITY_DIR" 2>/dev/null; then
        print_error "Failed to create availability directory: $AVAILABILITY_DIR"
        return 1
    fi

    # Output is discarded because the PRAGMA statements echo their new values.
    sqlite3 "$AVAILABILITY_DB" "
        PRAGMA journal_mode=WAL;
        PRAGMA busy_timeout=5000;

        CREATE TABLE IF NOT EXISTS provider_health (
            provider       TEXT PRIMARY KEY,
            status         TEXT NOT NULL DEFAULT 'unknown',
            http_code      INTEGER DEFAULT 0,
            response_ms    INTEGER DEFAULT 0,
            error_message  TEXT DEFAULT '',
            models_count   INTEGER DEFAULT 0,
            checked_at     TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
            ttl_seconds    INTEGER NOT NULL DEFAULT $DEFAULT_HEALTH_TTL
        );

        CREATE TABLE IF NOT EXISTS model_availability (
            model_id       TEXT NOT NULL,
            provider       TEXT NOT NULL,
            available      INTEGER NOT NULL DEFAULT 0,
            checked_at     TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
            ttl_seconds    INTEGER NOT NULL DEFAULT $DEFAULT_HEALTH_TTL,
            PRIMARY KEY (model_id, provider)
        );

        CREATE TABLE IF NOT EXISTS rate_limits (
            provider           TEXT PRIMARY KEY,
            requests_limit     INTEGER DEFAULT 0,
            requests_remaining INTEGER DEFAULT 0,
            requests_reset     TEXT DEFAULT '',
            tokens_limit       INTEGER DEFAULT 0,
            tokens_remaining   INTEGER DEFAULT 0,
            tokens_reset       TEXT DEFAULT '',
            checked_at         TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
            ttl_seconds        INTEGER NOT NULL DEFAULT $DEFAULT_RATELIMIT_TTL
        );

        CREATE TABLE IF NOT EXISTS probe_log (
            id             INTEGER PRIMARY KEY AUTOINCREMENT,
            provider       TEXT NOT NULL,
            action         TEXT NOT NULL,
            result         TEXT NOT NULL,
            duration_ms    INTEGER DEFAULT 0,
            details        TEXT DEFAULT '',
            timestamp      TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
        );

        CREATE INDEX IF NOT EXISTS idx_probe_log_provider ON probe_log(provider);
        CREATE INDEX IF NOT EXISTS idx_probe_log_timestamp ON probe_log(timestamp);
    " >/dev/null 2>/dev/null || {
        print_error "Failed to initialize availability database"
        return 1
    }
    return 0
}

# Run a SQL statement against the availability database.
# Arguments: $1 - SQL text (callers must escape values with sql_escape)
# Outputs:   sqlite3's default pipe-separated rows on stdout; stderr is dropped
# Returns:   sqlite3's exit status
db_query() {
    local query="$1"
    sqlite3 -cmd ".timeout 5000" "$AVAILABILITY_DB" "$query" 2>/dev/null
    return $?
}

# Same as db_query but asks sqlite3 for JSON output (-json).
db_query_json() {
    local query="$1"
    sqlite3 -cmd ".timeout 5000" -json "$AVAILABILITY_DB" "$query" 2>/dev/null
    return $?
}

# Escape a value for use inside a single-quoted SQL string literal by
# doubling every single quote.
# Arguments: $1 - raw value
# Outputs:   escaped value on stdout
sql_escape() {
    local val="$1"
    # Keep the quote in a variable: backslash-escaped quotes inside a
    # double-quoted ${var//pat/rep} are handled differently across bash versions.
    local quote="'"
    # printf, not echo: echo would swallow values such as "-n" or "-e".
    printf '%s\n' "${val//$quote/$quote$quote}"
    return 0
}

# =============================================================================
# API Key Resolution
# =============================================================================
# Resolves API keys from three sources (in priority order):
# 1. Environment variables
# 2. gopass encrypted secrets
# 3. credentials.sh plaintext fallback
# SECURITY: Never prints key values. Returns 0 if found, 1 if not.

# Find which variable holds the API key for a provider.
#
# Arguments: $1 - provider name
# Outputs:   the NAME of the variable holding the key (never its value)
# Returns:   0 if a key was found, 1 otherwise
# Side effects: a key found in gopass is exported, and credentials.sh is
#   sourced, in the CURRENT shell. When this function runs inside $(...),
#   those changes are lost to the caller; use _get_key_value for the value.
resolve_api_key() {
    local provider="$1"
    local key_vars
    key_vars=$(get_provider_key_vars "$provider" 2>/dev/null) || return 1

    if [[ -z "$key_vars" ]]; then
        return 1
    fi

    # Declared local so the helper does not leak globals into callers.
    local -a var_names=()
    local var_name
    IFS=',' read -ra var_names <<<"$key_vars"

    # Source 1: Environment variable
    for var_name in "${var_names[@]}"; do
        if [[ -n "${!var_name:-}" ]]; then
            echo "$var_name"
            return 0
        fi
    done

    # Source 2: gopass (if available)
    if command -v gopass &>/dev/null; then
        local key_val
        for var_name in "${var_names[@]}"; do
            # One call both tests for the secret and fetches it.
            key_val=$(gopass show "aidevops/${var_name}" 2>/dev/null) || continue
            if [[ -n "$key_val" ]]; then
                # Export the key into the environment for this session
                export "${var_name}=${key_val}"
                echo "$var_name"
                return 0
            fi
        done
    fi

    # Source 3: credentials.sh (plaintext fallback)
    local creds_file="${HOME}/.config/aidevops/credentials.sh"
    if [[ -f "$creds_file" ]]; then
        # Source the file to get variables (safe: we control this file).
        # Its stdout is discarded so it cannot pollute the variable name we print.
        # shellcheck disable=SC1090
        source "$creds_file" >/dev/null
        for var_name in "${var_names[@]}"; do
            if [[ -n "${!var_name:-}" ]]; then
                echo "$var_name"
                return 0
            fi
        done
    fi

    return 1
}

# Get the actual key value (internal use only, never printed to output)
#
# Arguments: $1 - provider name
# Outputs:   the key value on stdout (capture it; do not log it)
# Returns:   0 if a key was found, 1 otherwise
_get_key_value() {
    local provider="$1"
    local key_vars
    key_vars=$(get_provider_key_vars "$provider" 2>/dev/null) || return 1

    if [[ -z "$key_vars" ]]; then
        return 1
    fi

    local -a var_names=()
    local var_name
    IFS=',' read -ra var_names <<<"$key_vars"

    # Run the resolver in THIS shell (not in a command substitution) so keys
    # that live only in gopass or credentials.sh are actually loaded before we
    # read them. Callers typically invoked resolve_api_key inside $(...), where
    # its export/source had no effect on our environment.
    resolve_api_key "$provider" >/dev/null || return 1

    for var_name in "${var_names[@]}"; do
        if [[ -n "${!var_name:-}" ]]; then
            printf '%s\n' "${!var_name}"
            return 0
        fi
    done

    return 1
}

# =============================================================================
# Cache Management
# =============================================================================

# Tell whether the cached row for a provider is still within its TTL.
#
# Arguments:
#   $1 - provider name
#   $2 - cache table (provider_health, rate_limits or model_availability;
#        default: provider_health)
#   $3 - optional TTL override in seconds
# Returns: 0 if a row exists and is younger than the TTL, 1 otherwise
#   (including unknown table, unreadable timestamp or non-numeric TTL).
is_cache_valid() {
    local provider="$1"
    local table="${2:-provider_health}"
    local custom_ttl="${3:-}"

    # The table name is spliced into SQL, so only accept the known cache tables.
    case "$table" in
    provider_health | rate_limits | model_availability) ;;
    *) return 1 ;;
    esac

    # Let SQLite compute the age: checked_at is stored in UTC, and parsing it
    # with date(1) is platform-specific (BSD date would read it as local time).
    local row
    row=$(db_query "
        SELECT CAST(strftime('%s', 'now') AS INTEGER) - CAST(strftime('%s', checked_at) AS INTEGER),
               ttl_seconds
        FROM $table
        WHERE provider = '$(sql_escape "$provider")'
        LIMIT 1;
    ") || return 1

    if [[ -z "$row" ]]; then
        return 1
    fi

    local age="${row%%|*}"
    local ttl_seconds="${row#*|}"

    # Allow TTL override
    if [[ -n "$custom_ttl" ]]; then
        ttl_seconds="$custom_ttl"
    fi

    # Reject anything non-numeric before doing arithmetic on it. A negative
    # age (clock skew) is allowed and counts as fresh.
    local age_digits="${age#-}"
    case "$age_digits" in
    '' | *[!0-9]*) return 1 ;;
    esac
    case "$ttl_seconds" in
    '' | *[!0-9]*) return 1 ;;
    esac

    if ((age < ttl_seconds)); then
        return 0
    fi

    return 1
}

# Delete cached rows for one provider, or for all providers when $1 is empty.
# Clears provider_health, model_availability and rate_limits (not probe_log).
invalidate_cache() {
    local provider="${1:-}"

    if [[ -z "$provider" ]]; then
        db_query "DELETE FROM provider_health;"
        db_query "DELETE FROM model_availability;"
        db_query "DELETE FROM rate_limits;"
        print_info "All availability caches cleared"
    else
        local escaped
        escaped=$(sql_escape "$provider")
        db_query "DELETE FROM provider_health WHERE provider = '$escaped';"
        db_query "DELETE FROM model_availability WHERE provider = '$escaped';"
        db_query "DELETE FROM rate_limits WHERE provider = '$escaped';"
        print_info "Cache cleared for provider: $provider"
    fi
    return 0
}

# =============================================================================
# Provider Probing
# =============================================================================

# Probe a single provider via its /models endpoint.
# This is a lightweight check: the /models endpoint is free on all providers,
# returns quickly, and confirms both API key validity and service availability.
#
# Returns: 0=healthy, 1=unhealthy, 2=rate-limited, 3=key-invalid
#
# Arguments:
#   $1 - provider name
#   $2 - "true" to bypass the cache and probe live (default: false)
#   $3 - optional TTL override in seconds for the cache check
#   $4 - "true" to suppress informational output (default: false)
# Side effects: upserts provider_health, may upsert rate_limits, appends to
#   probe_log and prunes it to the newest 100 rows for the provider.
# NOTE(review): the API key is passed to curl on its command line (header or,
#   for google, query string), so it is visible in the process list while the
#   probe runs.
probe_provider() {
    local provider="$1"
    local force="${2:-false}"
    local custom_ttl="${3:-}"
    local quiet="${4:-false}"

    # Check cache first (unless forced)
    if [[ "$force" != "true" ]] && is_cache_valid "$provider" "provider_health" "$custom_ttl"; then
        local cached_status
        cached_status=$(db_query "SELECT status FROM provider_health WHERE provider = '$(sql_escape "$provider")';")
        case "$cached_status" in
        healthy)
            [[ "$quiet" != "true" ]] && print_info "$provider: cached healthy"
            return 0
            ;;
        rate_limited)
            [[ "$quiet" != "true" ]] && print_warning "$provider: cached rate-limited"
            return 2
            ;;
        key_invalid)
            [[ "$quiet" != "true" ]] && print_warning "$provider: cached key-invalid"
            return 3
            ;;
        no_key)
            # Same exit code as the live "no API key" path below.
            [[ "$quiet" != "true" ]] && print_warning "$provider: cached no API key"
            return 3
            ;;
        *)
            [[ "$quiet" != "true" ]] && print_warning "$provider: cached unhealthy"
            return 1
            ;;
        esac
    fi

    # Resolve API key
    local key_var
    if ! key_var=$(resolve_api_key "$provider"); then
        [[ "$quiet" != "true" ]] && print_warning "$provider: no API key configured"
        _record_health "$provider" "no_key" 0 0 "No API key found" 0
        return 3
    fi

    local api_key
    if ! api_key=$(_get_key_value "$provider"); then
        [[ "$quiet" != "true" ]] && print_warning "$provider: could not resolve API key value"
        _record_health "$provider" "no_key" 0 0 "Key var $key_var found but empty" 0
        return 3
    fi

    local endpoint
    endpoint=$(get_provider_endpoint "$provider" 2>/dev/null) || true
    if [[ -z "$endpoint" ]]; then
        [[ "$quiet" != "true" ]] && print_error "$provider: no endpoint configured"
        return 1
    fi

    # Build curl command based on provider auth style.
    # -D - writes the response headers to stdout ahead of the body; -w appends
    # two trailer lines: the HTTP status code, then the total time in seconds.
    local -a curl_args=(-s -w '\n%{http_code}\n%{time_total}' --max-time "$PROBE_TIMEOUT" -D -)
    case "$provider" in
    anthropic)
        curl_args+=(-H "x-api-key: ${api_key}" -H "anthropic-version: 2023-06-01")
        ;;
    google)
        # Google uses query parameter for key
        endpoint="${endpoint}?key=${api_key}&pageSize=1"
        ;;
    *)
        # OpenAI-compatible: Bearer token
        curl_args+=(-H "Authorization: Bearer ${api_key}")
        ;;
    esac

    # Execute probe. A curl failure is not fatal: it is classified below.
    local response
    response=$(curl "${curl_args[@]}" "$endpoint" 2>/dev/null) || true

    # Headers arrive with CRLF line endings; drop the CRs so that the blank
    # separator line is really empty and header values carry no trailing \r.
    response="${response//$'\r'/}"

    # Peel the two -w trailer lines off the end: last = time_total,
    # second-to-last = http_code. Done with parameter expansion to stay
    # portable (no GNU-only `head -n -2`).
    local nl=$'\n'
    local time_total="" http_code="" payload=""
    if [[ "$response" == *"$nl"*"$nl"* ]]; then
        time_total=${response##*$nl}
        payload=${response%$nl*}
        http_code=${payload##*$nl}
        payload=${payload%$nl*}
    fi
    case "$http_code" in
    [0-9][0-9][0-9]) ;;
    *) http_code="" ;; # anything else is not a status code
    esac

    # Separate headers from body (split on the first blank line)
    local headers="$payload" body=""
    if [[ "$payload" == *"$nl$nl"* ]]; then
        headers=${payload%%$nl$nl*}
        body=${payload#*$nl$nl}
    fi

    # curl's own timing is used instead of `date +%s%N`, which BSD date does
    # not support.
    local duration_ms
    duration_ms=$(_probe_seconds_to_ms "$time_total")

    # Parse rate limit headers (provider-specific)
    _parse_rate_limits "$provider" "$headers"

    # Determine status based on HTTP code
    local status="unknown"
    local error_msg=""
    local models_count=0
    local exit_code=1

    case "$http_code" in
    200)
        status="healthy"
        exit_code=0
        # Count models in response (google nests them under .models)
        local jq_filter='.data | length'
        if [[ "$provider" == "google" ]]; then
            jq_filter='.models | length'
        fi
        models_count=$(jq -r "$jq_filter" <<<"$body" 2>/dev/null) || models_count=0
        models_count=$(_int_or_zero "$models_count")
        [[ "$quiet" != "true" ]] && print_success "$provider: healthy (${duration_ms}ms, $models_count models)"
        ;;
    401 | 403)
        status="key_invalid"
        error_msg="Authentication failed (HTTP $http_code)"
        exit_code=3
        [[ "$quiet" != "true" ]] && print_error "$provider: API key invalid (HTTP $http_code)"
        ;;
    429)
        status="rate_limited"
        error_msg="Rate limited (HTTP 429)"
        exit_code=2
        [[ "$quiet" != "true" ]] && print_warning "$provider: rate limited"
        ;;
    500 | 502 | 503 | 504)
        status="unhealthy"
        error_msg="Server error (HTTP $http_code)"
        exit_code=1
        [[ "$quiet" != "true" ]] && print_error "$provider: server error (HTTP $http_code)"
        ;;
    "" | 000)
        # curl reports 000 when no HTTP response was received at all.
        status="unreachable"
        error_msg="Connection failed or timeout"
        exit_code=1
        [[ "$quiet" != "true" ]] && print_error "$provider: unreachable (timeout or DNS failure)"
        ;;
    *)
        status="unhealthy"
        error_msg="Unexpected HTTP $http_code"
        exit_code=1
        [[ "$quiet" != "true" ]] && print_warning "$provider: unexpected response (HTTP $http_code)"
        ;;
    esac

    # Record health status
    _record_health "$provider" "$status" "${http_code:-0}" "$duration_ms" "$error_msg" "$models_count"

    # Log the probe
    db_query "
        INSERT INTO probe_log (provider, action, result, duration_ms, details)
        VALUES (
            '$(sql_escape "$provider")',
            'health_probe',
            '$(sql_escape "$status")',
            $duration_ms,
            '$(sql_escape "HTTP ${http_code:-000}, $models_count models")'
        );
    " || true

    # Prune old probe logs (keep last 100 per provider); id breaks ties between
    # rows that share a one-second timestamp.
    db_query "
        DELETE FROM probe_log WHERE id IN (
            SELECT id FROM probe_log
            WHERE provider = '$(sql_escape "$provider")'
            ORDER BY timestamp DESC, id DESC
            LIMIT -1 OFFSET 100
        );
    " || true

    return "$exit_code"
}

# Print $1 if it is a non-negative decimal integer (normalised, so "000"
# becomes "0"), otherwise print 0. Used before splicing numbers into SQL.
_int_or_zero() {
    case "${1:-}" in
    '' | *[!0-9]*) echo 0 ;;
    *) echo "$((10#$1))" ;; # 10# avoids octal interpretation of leading zeros
    esac
    return 0
}

# Convert a decimal seconds string such as "0.123456" (curl %{time_total})
# to whole milliseconds. Prints 0 for empty or malformed input.
_probe_seconds_to_ms() {
    local secs="${1:-}"
    secs="${secs/,/.}" # tolerate a decimal comma
    local whole="${secs%%.*}"
    local frac=""
    if [[ "$secs" == *.* ]]; then
        frac="${secs#*.}"
    fi
    frac="${frac}000"
    frac="${frac:0:3}"
    case "${whole}${frac}" in
    '' | *[!0-9]*)
        echo 0
        return 0
        ;;
    esac
    echo "$((10#${whole:-0} * 1000 + 10#$frac))"
    return 0
}

# Upsert the provider_health row for a provider.
# Arguments: $1 provider, $2 status, $3 http_code, $4 duration_ms,
#            $5 error message, $6 models_count
# Numeric arguments are coerced to integers so that an empty or malformed
# value cannot produce invalid SQL (which `|| true` would silently hide).
# Always returns 0: recording is best-effort.
_record_health() {
    local provider="$1"
    local status="$2"
    local http_code
    http_code=$(_int_or_zero "${3:-}")
    local duration_ms
    duration_ms=$(_int_or_zero "${4:-}")
    local error_msg="${5:-}"
    local models_count
    models_count=$(_int_or_zero "${6:-}")

    db_query "
        INSERT INTO provider_health (provider, status, http_code, response_ms, error_message, models_count, checked_at, ttl_seconds)
        VALUES (
            '$(sql_escape "$provider")',
            '$(sql_escape "$status")',
            $http_code,
            $duration_ms,
            '$(sql_escape "$error_msg")',
            $models_count,
            strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
            $DEFAULT_HEALTH_TTL
        )
        ON CONFLICT(provider) DO UPDATE SET
            status = excluded.status,
            http_code = excluded.http_code,
            response_ms = excluded.response_ms,
            error_message = excluded.error_message,
            models_count = excluded.models_count,
            checked_at = excluded.checked_at,
            ttl_seconds = excluded.ttl_seconds;
    " || true
    return 0
}

# =============================================================================
# Rate Limit Parsing
# =============================================================================

# Print the value of the first header whose name starts with the given prefix
# (case-insensitive). Only the first whitespace-delimited token of the value
# is printed. Prints nothing when there is no match.
# Arguments: $1 - header block (one header per line), $2 - header name prefix
_rate_limit_header() {
    local headers="$1"
    local prefix="$2"
    if [[ -z "$prefix" ]]; then
        return 0
    fi
    awk -v want="$prefix" '
        {
            line = $0
            sub(/\r$/, "", line)
            if (index(tolower(line), tolower(want)) == 1 && index(line, ":") > 0) {
                sub(/^[^:]*:[ \t]*/, "", line)
                sub(/[ \t].*$/, "", line)
                print line
                exit
            }
        }' <<<"$headers"
    return 0
}

# Extract rate-limit headers from a probe response and upsert rate_limits.
# Arguments: $1 - provider name, $2 - response header block
# Nothing is stored unless a request limit or remaining count was found.
# Header values come from the network, so numeric fields are validated before
# being placed unquoted into SQL; reset fields are stored as escaped text.
# Always returns 0: recording is best-effort.
_parse_rate_limits() {
    local provider="$1"
    local headers="$2"

    # Header names per provider; empty means "not reported by this provider".
    local h_req_limit h_req_remaining h_req_reset=""
    local h_tok_limit="" h_tok_remaining="" h_tok_reset=""
    case "$provider" in
    anthropic)
        h_req_limit='anthropic-ratelimit-requests-limit'
        h_req_remaining='anthropic-ratelimit-requests-remaining'
        h_req_reset='anthropic-ratelimit-requests-reset'
        h_tok_limit='anthropic-ratelimit-tokens-limit'
        h_tok_remaining='anthropic-ratelimit-tokens-remaining'
        h_tok_reset='anthropic-ratelimit-tokens-reset'
        ;;
    openai | groq)
        h_req_limit='x-ratelimit-limit-requests'
        h_req_remaining='x-ratelimit-remaining-requests'
        h_req_reset='x-ratelimit-reset-requests'
        h_tok_limit='x-ratelimit-limit-tokens'
        h_tok_remaining='x-ratelimit-remaining-tokens'
        h_tok_reset='x-ratelimit-reset-tokens'
        ;;
    *)
        # Other providers: try generic x-ratelimit headers
        h_req_limit='x-ratelimit-limit'
        h_req_remaining='x-ratelimit-remaining'
        ;;
    esac

    local req_limit req_remaining req_reset
    local tok_limit tok_remaining tok_reset
    req_limit=$(_int_or_zero "$(_rate_limit_header "$headers" "$h_req_limit")")
    req_remaining=$(_int_or_zero "$(_rate_limit_header "$headers" "$h_req_remaining")")
    req_reset=$(_rate_limit_header "$headers" "$h_req_reset")
    tok_limit=$(_int_or_zero "$(_rate_limit_header "$headers" "$h_tok_limit")")
    tok_remaining=$(_int_or_zero "$(_rate_limit_header "$headers" "$h_tok_remaining")")
    tok_reset=$(_rate_limit_header "$headers" "$h_tok_reset")

    # Only store if we got meaningful data
    if [[ "$req_limit" != "0" || "$req_remaining" != "0" ]]; then
        db_query "
            INSERT INTO rate_limits (provider, requests_limit, requests_remaining, requests_reset,
                tokens_limit, tokens_remaining, tokens_reset, checked_at, ttl_seconds)
            VALUES (
                '$(sql_escape "$provider")',
                $req_limit,
                $req_remaining,
                '$(sql_escape "$req_reset")',
                $tok_limit,
                $tok_remaining,
                '$(sql_escape "$tok_reset")',
                strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
                $DEFAULT_RATELIMIT_TTL
            )
            ON CONFLICT(provider) DO UPDATE SET
                requests_limit = excluded.requests_limit,
                requests_remaining = excluded.requests_remaining,
                requests_reset = excluded.requests_reset,
                tokens_limit = excluded.tokens_limit,
                tokens_remaining = excluded.tokens_remaining,
                tokens_reset = excluded.tokens_reset,
                checked_at = excluded.checked_at,
                ttl_seconds = excluded.ttl_seconds;
        " || true
    fi
    return 0
}

# =============================================================================
# Model Availability Check
# =============================================================================

# Check if a specific model is available from its provider.
# First checks provider health, then verifies the model exists in the
# provider's model list (from the cached /models response or model-registry).
check_model_available() {
    # $1 - model spec: "provider/model" or a bare model name
    # $2 - "true" to force a live provider probe (default: false)
    # $3 - "true" to suppress messages (default: false)
    # Returns 0 when the model is considered available, otherwise 1 or the
    # non-zero status of the provider probe.
    local model_spec="$1"
    local force="${2:-false}"
    local quiet="${3:-false}"

    # Split an explicit "provider/model" spec, otherwise guess the provider
    # from the model name prefix.
    local provider=""
    local model_id="$model_spec"
    case "$model_spec" in
    */*)
        provider="${model_spec%%/*}"
        model_id="${model_spec#*/}"
        ;;
    claude*) provider="anthropic" ;;
    gpt* | o3* | o4*) provider="openai" ;;
    gemini*) provider="google" ;;
    deepseek*) provider="deepseek" ;;
    llama*) provider="groq" ;;
    esac

    if [[ -z "$provider" ]]; then
        if [[ "$quiet" != "true" ]]; then
            print_error "Cannot determine provider for: $model_spec"
        fi
        return 1
    fi

    # Provider health gates everything else; pass its failure code through.
    local health_rc=0
    probe_provider "$provider" "$force" "" "$quiet" || health_rc=$?
    if ((health_rc != 0)); then
        return "$health_rc"
    fi

    # A fresh model-level cache entry (still inside its TTL) decides directly.
    local safe_model safe_provider
    safe_model=$(sql_escape "$model_id")
    safe_provider=$(sql_escape "$provider")

    local cache_hit
    cache_hit=$(db_query "
        SELECT available FROM model_availability
        WHERE model_id = '$safe_model' AND provider = '$safe_provider'
        AND (julianday('now') - julianday(checked_at)) * 86400 < ttl_seconds;
    ")

    case "$cache_hit" in
    "") ;; # no usable cache entry, keep going
    1)
        if [[ "$quiet" != "true" ]]; then
            print_info "$model_spec: available (cached)"
        fi
        return 0
        ;;
    *)
        if [[ "$quiet" != "true" ]]; then
            print_warning "$model_spec: unavailable (cached)"
        fi
        return 1
        ;;
    esac

    # Look the model up in the model-registry database when one exists.
    local registry_db="${AVAILABILITY_DIR}/model-registry.db"
    local registry_hits=0
    if [[ -f "$registry_db" ]]; then
        registry_hits=$(sqlite3 -cmd ".timeout 5000" "$registry_db" "
            SELECT COUNT(*) FROM provider_models
            WHERE model_id LIKE '%${safe_model}%'
            AND provider LIKE '%${safe_provider}%';
        " 2>/dev/null || echo "0")
    fi

    # Either way the model is cached as available: a healthy provider is the
    # primary signal, the registry only changes the message.
    if [[ "$registry_hits" -gt 0 ]]; then
        _record_model_availability "$model_id" "$provider" 1
        if [[ "$quiet" != "true" ]]; then
            print_success "$model_spec: available (registry confirmed)"
        fi
        return 0
    fi

    _record_model_availability "$model_id" "$provider" 1
    if [[ "$quiet" != "true" ]]; then
        print_info "$model_spec: assumed available (provider healthy)"
    fi
    return 0
}

# Upsert one model_availability row.
# $1 - model id, $2 - provider, $3 - availability flag (1 or 0)
# Best-effort: a failed write is ignored and the function still returns 0.
_record_model_availability() {
    local model_id="$1"
    local provider="$2"
    local available="$3"

    local upsert_sql="
        INSERT INTO model_availability (model_id, provider, available, checked_at, ttl_seconds)
        VALUES (
            '$(sql_escape "$model_id")',
            '$(sql_escape "$provider")',
            $available,
            strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
            $DEFAULT_HEALTH_TTL
        )
        ON CONFLICT(model_id, provider) DO UPDATE SET
            available = excluded.available,
            checked_at = excluded.checked_at,
            ttl_seconds = excluded.ttl_seconds;
    "
    db_query "$upsert_sql" || true
    return 0
}

# =============================================================================
# Tier Resolution with Fallback
# =============================================================================

# Resolve the best available model for a given tier.
# Checks primary model first, falls back to secondary if primary is unavailable.
+# Output: provider/model_id on stdout +# Returns: 0 if a model was resolved, 1 if no model available for this tier +resolve_tier() { + local tier="$1" + local force="${2:-false}" + local quiet="${3:-false}" + + local tier_spec + tier_spec=$(get_tier_models "$tier" 2>/dev/null) || true + if [[ -z "$tier_spec" ]]; then + [[ "$quiet" != "true" ]] && print_error "Unknown tier: $tier" + return 1 + fi + + local primary fallback + primary="${tier_spec%%|*}" + fallback="${tier_spec#*|}" + + # Try primary + if check_model_available "$primary" "$force" "true"; then + echo "$primary" + [[ "$quiet" != "true" ]] && print_success "Resolved $tier -> $primary (primary)" + return 0 + fi + + # Try fallback + if [[ -n "$fallback" && "$fallback" != "$primary" ]]; then + if check_model_available "$fallback" "$force" "true"; then + echo "$fallback" + [[ "$quiet" != "true" ]] && print_warning "Resolved $tier -> $fallback (fallback, primary $primary unavailable)" + return 0 + fi + fi + + [[ "$quiet" != "true" ]] && print_error "No available model for tier: $tier (tried $primary, $fallback)" + return 1 +} + +# ============================================================================= +# Commands +# ============================================================================= + +cmd_check() { + local target="${1:-}" + local force=false + local quiet=false + local json_flag=false + local custom_ttl="" + shift || true + + while [[ $# -gt 0 ]]; do + case "$1" in + --force) force=true; shift ;; + --quiet) quiet=true; shift ;; + --json) json_flag=true; shift ;; + --ttl) custom_ttl="${2:-}"; shift 2 ;; + *) shift ;; + esac + done + + if [[ -z "$target" ]]; then + print_error "Usage: model-availability-helper.sh check " + return 1 + fi + + # Determine if target is a provider name, tier, or model spec + if is_known_provider "$target"; then + probe_provider "$target" "$force" "$custom_ttl" "$quiet" + return $? 
+ elif is_known_tier "$target"; then + resolve_tier "$target" "$force" "$quiet" >/dev/null + return $? + else + # Assume it's a model spec (provider/model or model name) + check_model_available "$target" "$force" "$quiet" + return $? + fi +} + +cmd_probe() { + local all=false + local target="" + local force=false + local quiet=false + local json_flag=false + + while [[ $# -gt 0 ]]; do + case "$1" in + --all) all=true; shift ;; + --force) force=true; shift ;; + --quiet) quiet=true; shift ;; + --json) json_flag=true; shift ;; + *) + if [[ -z "$target" ]]; then + target="$1" + fi + shift + ;; + esac + done + + if [[ -n "$target" ]] && ! is_known_provider "$target"; then + print_error "Unknown provider: $target" + print_info "Available: $KNOWN_PROVIDERS" + return 1 + fi + + local providers_to_probe=() + if [[ -n "$target" ]]; then + providers_to_probe=("$target") + elif [[ "$all" == "true" ]]; then + # Probe all known providers + for p in $KNOWN_PROVIDERS; do + providers_to_probe+=("$p") + done + else + # Probe only providers with configured keys + for p in $KNOWN_PROVIDERS; do + if resolve_api_key "$p" >/dev/null 2>&1; then + providers_to_probe+=("$p") + fi + done + fi + + if [[ ${#providers_to_probe[@]} -eq 0 ]]; then + print_warning "No providers to probe (no API keys configured)" + return 1 + fi + + [[ "$quiet" != "true" ]] && echo "" + [[ "$quiet" != "true" ]] && echo "Provider Availability Probe" + [[ "$quiet" != "true" ]] && echo "===========================" + [[ "$quiet" != "true" ]] && echo "" + + local healthy=0 unhealthy=0 no_key=0 + for provider in "${providers_to_probe[@]}"; do + local exit_code=0 + probe_provider "$provider" "$force" "" "$quiet" || exit_code=$? 
#######################################
# Render a cached timestamp as a short relative age.
# Arguments:
#   $1 - timestamp in YYYY-MM-DDTHH:MM:SSZ form
# Outputs:
#   "<n>s ago", "<n>m ago" or "<n>h ago" on stdout; the raw value when it
#   cannot be parsed; "n/a" when it is empty.
# Returns:
#   0 always
#######################################
_status_age_display() {
    local stamp="${1:-}"
    local stamp_epoch now_epoch age

    if [[ -z "$stamp" ]]; then
        echo "n/a"
        return 0
    fi

    if [[ "$(uname)" == "Darwin" ]]; then
        # BSD date parses the value as local time unless -u is given; the
        # literal trailing "Z" in the format suggests the stored value is UTC
        # (NOTE(review): confirm against the code that writes checked_at).
        stamp_epoch=$(date -j -u -f "%Y-%m-%dT%H:%M:%SZ" "$stamp" "+%s" 2>/dev/null || echo "0")
    else
        stamp_epoch=$(date -d "$stamp" "+%s" 2>/dev/null || echo "0")
    fi

    # Unparsable input: show what is stored instead of "hours since 1970".
    if [[ ! "$stamp_epoch" =~ ^[0-9]+$ || "$stamp_epoch" -eq 0 ]]; then
        echo "$stamp"
        return 0
    fi

    now_epoch=$(date "+%s")
    age=$(( now_epoch - stamp_epoch ))
    # Clock skew can put the stamp slightly in the future; never print "-3s ago".
    if [[ "$age" -lt 0 ]]; then
        age=0
    fi

    if [[ "$age" -lt 60 ]]; then
        echo "${age}s ago"
    elif [[ "$age" -lt 3600 ]]; then
        echo "$((age / 60))m ago"
    else
        echo "$((age / 3600))h ago"
    fi
    return 0
}

#######################################
# Show the cached availability state: provider health, rate limits,
# the primary/fallback model of every tier, and the ten latest probes.
# Globals:
#   AVAILABILITY_DB (read), GREEN RED YELLOW NC (read)
# Arguments:
#   --json  emit one JSON object with "providers" and "rate_limits"
#           (built from two db_query_json calls) instead of the tables
#   any other argument is ignored
# Outputs:
#   Report on stdout
# Returns:
#   0, including when the database file does not exist yet
#######################################
cmd_status() {
    local json_flag=false

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --json) json_flag=true; shift ;;
            *) shift ;;
        esac
    done

    if [[ ! -f "$AVAILABILITY_DB" ]]; then
        print_warning "No availability data. Run 'model-availability-helper.sh probe' first."
        return 0
    fi

    if [[ "$json_flag" == "true" ]]; then
        echo "{"
        echo " \"providers\":"
        db_query_json "SELECT provider, status, http_code, response_ms, models_count, error_message, checked_at FROM provider_health ORDER BY provider;"
        echo ","
        echo " \"rate_limits\":"
        db_query_json "SELECT provider, requests_limit, requests_remaining, requests_reset, tokens_limit, tokens_remaining, tokens_reset, checked_at FROM rate_limits ORDER BY provider;"
        echo "}"
        return 0
    fi

    echo ""
    echo "Model Availability Status"
    echo "========================="
    echo ""

    echo "Provider Health:"
    echo ""
    printf " %-12s %-12s %-6s %-8s %-8s %-20s\n" \
        "Provider" "Status" "HTTP" "Time" "Models" "Last Check"
    printf " %-12s %-12s %-6s %-8s %-8s %-20s\n" \
        "--------" "------" "----" "----" "------" "----------"

    # db_query rows are '|'-separated, one provider per line.
    db_query "
        SELECT provider, status, http_code, response_ms, models_count, checked_at
        FROM provider_health ORDER BY provider;
    " | while IFS='|' read -r prov stat code ms models checked; do
        local status_display="$stat"
        case "$stat" in
            healthy) status_display="${GREEN}healthy${NC}" ;;
            unhealthy|unreachable) status_display="${RED}$stat${NC}" ;;
            rate_limited) status_display="${YELLOW}rate-ltd${NC}" ;;
            key_invalid) status_display="${RED}bad-key${NC}" ;;
            no_key) status_display="${YELLOW}no-key${NC}" ;;
        esac

        local age_display
        age_display=$(_status_age_display "$checked")

        # %b so the colour escape sequences in status_display are interpreted.
        printf " %-12s %-12b %-6s %-8s %-8s %-20s\n" \
            "$prov" "$status_display" "$code" "${ms}ms" "$models" "$age_display"
    done

    # Show rate limits if available
    local rl_count
    rl_count=$(db_query "SELECT COUNT(*) FROM rate_limits WHERE requests_limit > 0;")
    if [[ "$rl_count" -gt 0 ]]; then
        echo ""
        echo "Rate Limits:"
        echo ""
        printf " %-12s %-15s %-15s %-15s\n" \
            "Provider" "Req Remaining" "Tok Remaining" "Reset"
        printf " %-12s %-15s %-15s %-15s\n" \
            "--------" "-------------" "-------------" "-----"

        db_query "
            SELECT provider, requests_limit, requests_remaining, requests_reset,
                   tokens_limit, tokens_remaining, tokens_reset
            FROM rate_limits WHERE requests_limit > 0 ORDER BY provider;
        " | while IFS='|' read -r prov rl rr rres tl tr _tres; do
            local req_display="${rr}/${rl}"
            local tok_display="${tr}/${tl}"
            # A token limit of 0 is displayed as "n/a" rather than "x/0".
            [[ "$tl" == "0" ]] && tok_display="n/a"
            printf " %-12s %-15s %-15s %-15s\n" \
                "$prov" "$req_display" "$tok_display" "${rres:-n/a}"
        done
    fi

    # Show tier resolution
    echo ""
    echo "Tier Resolution:"
    echo ""
    printf " %-8s %-35s %-35s\n" "Tier" "Primary" "Fallback"
    printf " %-8s %-35s %-35s\n" "----" "-------" "--------"
    local tier spec primary fallback
    for tier in haiku flash sonnet pro opus health eval coding; do
        spec=$(get_tier_models "$tier" 2>/dev/null) || spec=""
        # The spec is split on the first '|' into primary and fallback.
        primary="${spec%%|*}"
        fallback=""
        if [[ "$spec" == *"|"* ]]; then
            fallback="${spec#*|}"
        fi
        printf " %-8s %-35s %-35s\n" "$tier" "$primary" "$fallback"
    done

    echo ""

    # Show recent probe log
    local log_count
    log_count=$(db_query "SELECT COUNT(*) FROM probe_log;")
    if [[ "$log_count" -gt 0 ]]; then
        echo "Recent Probes (last 10):"
        echo ""
        db_query "
            SELECT timestamp, provider, action, result, duration_ms
            FROM probe_log ORDER BY timestamp DESC LIMIT 10;
        " | while IFS='|' read -r ts prov _action result ms; do
            echo " $ts $prov $result ${ms}ms"
        done
        echo ""
    fi

    return 0
}
#######################################
# Resolve a tier name to a model id and print it.
# Arguments:
#   $1       tier name (required)
#   --force  passed to resolve_tier as its second argument ("true")
#   --quiet  passed to resolve_tier as its third argument ("true")
#   --json   print a JSON object instead of the bare "provider/model" id
#   any other argument is ignored
# Outputs:
#   Plain mode: the resolved id on success, nothing on failure.
#   JSON mode: an "available" object on success, an "unavailable" object
#   on failure.
# Returns:
#   1 when no tier is given; otherwise the exit status of resolve_tier.
#######################################
cmd_resolve() {
    local tier="${1:-}"
    local force=false
    local quiet=false
    local json_flag=false
    shift || true

    while [[ $# -gt 0 ]]; do
        case "$1" in
            --force) force=true; shift ;;
            --quiet) quiet=true; shift ;;
            --json) json_flag=true; shift ;;
            *) shift ;;
        esac
    done

    if [[ -z "$tier" ]]; then
        print_error "Usage: model-availability-helper.sh resolve <tier> [--force] [--quiet] [--json]"
        print_info "Available tiers: haiku flash sonnet pro opus health eval coding"
        return 1
    fi

    # Capture the status with '||': this script runs under 'set -e', and a bare
    # failing 'var=$(cmd)' assignment would abort the script before the failure
    # branches below could run.
    local resolved=""
    local exit_code=0
    resolved=$(resolve_tier "$tier" "$force" "$quiet") || exit_code=$?

    if [[ "$json_flag" == "true" ]]; then
        # The tier is caller-supplied; escape backslashes and double quotes so
        # the emitted object stays valid JSON.
        local tier_json=${tier//\\/\\\\}
        tier_json=${tier_json//\"/\\\"}

        if [[ $exit_code -eq 0 ]]; then
            # Split "provider/model" on the first '/'.
            local provider model_id
            provider="${resolved%%/*}"
            model_id="${resolved#*/}"
            printf '{"tier":"%s","provider":"%s","model":"%s","full_id":"%s","status":"available"}\n' \
                "$tier_json" "$provider" "$model_id" "$resolved"
        else
            printf '{"tier":"%s","status":"unavailable"}\n' "$tier_json"
        fi
    elif [[ $exit_code -eq 0 ]]; then
        echo "$resolved"
    fi

    return "$exit_code"
}
#######################################
# Entry point: pick the sub-command handler and run it.
# Arguments:
#   $1    sub-command name (defaults to "help")
#   $2..  forwarded unchanged to the handler
# Returns:
#   The handler's exit status; 1 when init_db fails or the sub-command
#   is not recognised.
#######################################
main() {
    local subcommand="${1:-help}"
    shift || true

    # Help needs no database, so it is served before init_db is attempted.
    case "$subcommand" in
        help|--help|-h)
            cmd_help
            return $?
            ;;
    esac

    init_db || return 1

    # Map the sub-command (and its aliases) to a handler function.
    local handler
    case "$subcommand" in
        check)
            handler=cmd_check
            ;;
        probe)
            handler=cmd_probe
            ;;
        status)
            handler=cmd_status
            ;;
        rate-limits|ratelimits|rate_limits)
            handler=cmd_rate_limits
            ;;
        resolve)
            handler=cmd_resolve
            ;;
        invalidate|clear|flush)
            handler=cmd_invalidate
            ;;
        *)
            print_error "Unknown command: $subcommand"
            cmd_help
            return 1
            ;;
    esac

    "$handler" "$@"
    return $?
}
Slow path: Full AI CLI probe (spawns session, ~8-15s) # Returns 0 if healthy, 1 if unhealthy. Timeout: 15 seconds. # Result is cached for 5 minutes to avoid repeated probes. ####################################### @@ -2669,13 +2683,47 @@ check_model_health() { return 0 fi - # Cache key: cli + model, stored as a file with timestamp + # Fast path: use model-availability-helper.sh for lightweight HTTP probe (t132.3) + # This checks the provider's /models endpoint (~1-2s) instead of spawning + # a full AI CLI session (~8-15s). Falls through to slow path on failure. + local availability_helper="${SCRIPT_DIR}/model-availability-helper.sh" + if [[ -x "$availability_helper" ]]; then + local provider_name="" + if [[ -n "$model" && "$model" == *"/"* ]]; then + provider_name="${model%%/*}" + else + provider_name="anthropic" # Default provider + fi + + local avail_exit=0 + "$availability_helper" check "$provider_name" --quiet 2>/dev/null || avail_exit=$? + + case "$avail_exit" in + 0) + _PULSE_HEALTH_VERIFIED="true" + log_info "Model health: OK via availability helper (fast path)" + return 0 + ;; + 2) + log_warn "Model health check: rate limited (via availability helper)" + return 1 + ;; + 3) + log_warn "Model health check: API key invalid (via availability helper)" + return 1 + ;; + *) + log_verbose "Availability helper returned $avail_exit, falling through to CLI probe" + ;; + esac + fi + + # Slow path: file-based cache check (legacy, kept for environments without the helper) local cache_dir="$SUPERVISOR_DIR/health" mkdir -p "$cache_dir" local cache_key="${ai_cli}-${model//\//_}" local cache_file="$cache_dir/${cache_key}" - # Check cache (valid for 300 seconds / 5 minutes) if [[ -f "$cache_file" ]]; then local cached_at cached_at=$(cat "$cache_file") @@ -2689,7 +2737,7 @@ check_model_health() { fi fi - # Resolve timeout command (macOS lacks coreutils timeout) + # Slow path: spawn AI CLI for a trivial prompt local timeout_cmd="" if command -v gtimeout &>/dev/null; then 
timeout_cmd="gtimeout" @@ -2697,7 +2745,6 @@ check_model_health() { timeout_cmd="timeout" fi - # Send a trivial prompt local probe_result="" local probe_exit=1 @@ -2711,7 +2758,6 @@ check_model_health() { probe_result=$("$timeout_cmd" 15 "${probe_cmd[@]}" 2>&1) probe_exit=$? else - # Fallback: background process with manual kill after 15s local probe_pid probe_tmpfile probe_tmpfile=$(mktemp) ("${probe_cmd[@]}" > "$probe_tmpfile" 2>&1) & @@ -2724,7 +2770,7 @@ check_model_health() { if kill -0 "$probe_pid" 2>/dev/null; then kill "$probe_pid" 2>/dev/null || true wait "$probe_pid" 2>/dev/null || true - probe_exit=124 # Simulate timeout exit code + probe_exit=124 else wait "$probe_pid" 2>/dev/null || true probe_exit=$? @@ -2764,21 +2810,16 @@ check_model_health() { fi # Check for known failure patterns - # NOTE: Patterns must not match inside JSON fields (e.g. timestamps contain digits - # like "1770503..." which falsely matched bare "503"). Use word boundaries or - # anchored patterns. The probe returns JSON lines from opencode run --format json. 
if echo "$probe_result" | grep -qiE 'endpoints failed|Quota protection|over[_ -]?usage|quota reset|"status":[[:space:]]*503|HTTP 503|503 Service|service unavailable' 2>/dev/null; then log_warn "Model health check FAILED: provider error detected" return 1 fi - # Timeout (exit 124) = unhealthy if [[ "$probe_exit" -eq 124 ]]; then log_warn "Model health check FAILED: timeout (15s)" return 1 fi - # Empty response with non-zero exit = unhealthy if [[ -z "$probe_result" && "$probe_exit" -ne 0 ]]; then log_warn "Model health check FAILED: empty response (exit $probe_exit)" return 1 diff --git a/.agents/tools/context/model-routing.md b/.agents/tools/context/model-routing.md index 0886d5a6e..86c5f18e1 100644 --- a/.agents/tools/context/model-routing.md +++ b/.agents/tools/context/model-routing.md @@ -180,6 +180,32 @@ Interactive commands: `/compare-models` (with live web fetch), `/compare-models- The model registry (`model-registry-helper.sh`) maintains a SQLite database tracking all known models across providers. It syncs from subagent frontmatter, embedded pricing data, and live provider APIs. Use `model-registry-helper.sh status` to check registry health and `model-registry-helper.sh check` to verify configured models are available. +## Model Availability (Pre-Dispatch) + +The model availability checker (`model-availability-helper.sh`) provides lightweight, cached health probes for use before dispatch. Unlike the model registry (which tracks what models exist), the availability checker tests whether providers are currently responding and API keys are valid. 
+ +```bash +# Check if a provider is healthy (fast: direct HTTP, ~1-2s, cached 5min) +model-availability-helper.sh check anthropic + +# Check a specific model +model-availability-helper.sh check anthropic/claude-sonnet-4-20250514 + +# Resolve best available model for a tier (with automatic fallback) +model-availability-helper.sh resolve opus + +# Probe all configured providers +model-availability-helper.sh probe + +# View cached status and rate limits +model-availability-helper.sh status +model-availability-helper.sh rate-limits +``` + +The supervisor uses this automatically during dispatch (t132.3). The availability helper is ~4-8x faster than the previous CLI-based health probe because it calls provider `/models` endpoints directly via HTTP instead of spawning a full AI CLI session. + +Exit codes: 0=available, 1=unavailable, 2=rate-limited, 3=API-key-invalid. + ## Decision Flowchart diff --git a/tests/test-model-availability.sh b/tests/test-model-availability.sh new file mode 100755 index 000000000..500a35ebd --- /dev/null +++ b/tests/test-model-availability.sh @@ -0,0 +1,295 @@ +#!/usr/bin/env bash +# test-model-availability.sh +# +# Tests for model-availability-helper.sh (t132.3) +# Validates: syntax, help output, DB init, cache logic, tier resolution, +# and integration with supervisor resolve_model/check_model_health. +# +# Usage: bash tests/test-model-availability.sh [--verbose] +# +# Exit codes: 0 = all pass, 1 = failures found + +set -euo pipefail + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
# Execute a command, bounded by $TIMEOUT_CMD when one was detected.
#   $1   - time limit in seconds (only used when TIMEOUT_CMD is non-empty)
#   $2.. - command and its arguments
# Returns the command's (or the timeout wrapper's) exit status.
run_with_timeout() {
    local limit="$1"
    shift
    if [[ -z "$TIMEOUT_CMD" ]]; then
        "$@"
    else
        "$TIMEOUT_CMD" "$limit" "$@"
    fi
}

# --- Test Framework ---
# Running tallies, bumped by pass/fail/skip.
PASS_COUNT=0
FAIL_COUNT=0
SKIP_COUNT=0
TOTAL_COUNT=0

# Count a passing test; the line is printed only in --verbose mode.
pass() {
    PASS_COUNT=$((PASS_COUNT + 1))
    TOTAL_COUNT=$((TOTAL_COUNT + 1))
    [[ "$VERBOSE" != "--verbose" ]] && return 0
    printf " \033[0;32mPASS\033[0m %s\n" "$1"
    return 0
}

# Count a failing test; always printed. $2 is an optional detail line.
fail() {
    FAIL_COUNT=$((FAIL_COUNT + 1))
    TOTAL_COUNT=$((TOTAL_COUNT + 1))
    printf " \033[0;31mFAIL\033[0m %s\n" "$1"
    [[ -z "${2:-}" ]] || printf " %s\n" "$2"
    return 0
}

# Count a skipped test; the line is printed only in --verbose mode.
skip() {
    SKIP_COUNT=$((SKIP_COUNT + 1))
    TOTAL_COUNT=$((TOTAL_COUNT + 1))
    [[ "$VERBOSE" != "--verbose" ]] && return 0
    printf " \033[0;33mSKIP\033[0m %s\n" "$1"
    return 0
}

# Print a bold section heading preceded by a blank line.
section() {
    printf "\n"
    printf "\033[1m=== %s ===\033[0m\n" "$1"
}
[[ "$sc_errors" -eq 0 ]]; then + pass "shellcheck (0 errors)" + else + fail "shellcheck ($sc_errors errors)" "$(echo "$sc_output" | head -5)" + fi +else + skip "shellcheck not installed" +fi + +# Help command +help_output=$(run_with_timeout 5 bash "$HELPER" help 2>&1) || true +if [[ -n "$help_output" ]]; then + pass "help command produces output" +else + fail "help command produces output" "No output" +fi + +# Help mentions key commands +if echo "$help_output" | grep -qi "check"; then + pass "help mentions 'check' command" +else + fail "help mentions 'check' command" +fi + +if echo "$help_output" | grep -qi "probe"; then + pass "help mentions 'probe' command" +else + fail "help mentions 'probe' command" +fi + +if echo "$help_output" | grep -qi "resolve"; then + pass "help mentions 'resolve' command" +else + fail "help mentions 'resolve' command" +fi + +if echo "$help_output" | grep -qi "rate-limits"; then + pass "help mentions 'rate-limits' command" +else + fail "help mentions 'rate-limits' command" +fi + +# ============================================================ +# SECTION 2: Status command (no prior data) +# ============================================================ +section "Status Command (Empty State)" + +status_output=$(run_with_timeout 5 bash "$HELPER" status 2>&1) || true +if [[ -n "$status_output" ]]; then + pass "status command runs without error" +else + fail "status command runs without error" "No output or error" +fi + +# ============================================================ +# SECTION 3: Resolve command (tier resolution) +# ============================================================ +section "Tier Resolution" + +# Test that resolve returns a model spec for known tiers +for tier in haiku flash sonnet pro opus health eval coding; do + resolve_output=$(run_with_timeout 15 bash "$HELPER" resolve "$tier" --quiet 2>&1) || true + # Even without API keys, resolve should return the primary model + # (it falls through to the primary when no probe 
is possible) + if [[ -n "$resolve_output" && "$resolve_output" == *"/"* ]]; then + pass "resolve $tier -> $resolve_output" + else + # May fail if no API keys configured - that's OK for CI + skip "resolve $tier (no API keys or provider unavailable)" + fi +done + +# Test unknown tier (use || true to prevent set -e from aborting on expected failure) +if run_with_timeout 5 bash "$HELPER" resolve "nonexistent" --quiet >/dev/null 2>&1; then + fail "resolve unknown tier returns error" "Expected non-zero exit" +else + pass "resolve unknown tier returns error" +fi + +# ============================================================ +# SECTION 4: Check command +# ============================================================ +section "Check Command" + +# Check with unknown provider (use if to prevent set -e from aborting on expected failure) +if run_with_timeout 5 bash "$HELPER" check "nonexistent_provider_xyz" --quiet >/dev/null 2>&1; then + fail "check unknown target returns error" "Expected non-zero exit, got 0" +else + pass "check unknown target returns error" +fi + +# Check with known provider (may succeed or fail depending on keys) +# Use || true to prevent set -e from aborting on non-zero exit +for provider in anthropic openai google; do + check_exit=0 + run_with_timeout 15 bash "$HELPER" check "$provider" --quiet >/dev/null 2>&1 || check_exit=$? + case "$check_exit" in + 0) pass "check $provider: healthy" ;; + 1) pass "check $provider: unhealthy (expected without key)" ;; + 2) pass "check $provider: rate limited" ;; + 3) pass "check $provider: no key (expected in CI)" ;; + *) fail "check $provider: unexpected exit code $check_exit" ;; + esac +done + +# ============================================================ +# SECTION 5: Invalidate command +# ============================================================ +section "Cache Invalidation" + +run_with_timeout 5 bash "$HELPER" invalidate >/dev/null 2>&1 +invalidate_exit=$? 
+if [[ $invalidate_exit -eq 0 ]]; then + pass "invalidate all caches" +else + fail "invalidate all caches" "Exit code: $invalidate_exit" +fi + +run_with_timeout 5 bash "$HELPER" invalidate anthropic >/dev/null 2>&1 +invalidate_prov_exit=$? +if [[ $invalidate_prov_exit -eq 0 ]]; then + pass "invalidate specific provider cache" +else + fail "invalidate specific provider cache" "Exit code: $invalidate_prov_exit" +fi + +# ============================================================ +# SECTION 6: Supervisor integration +# ============================================================ +section "Supervisor Integration" + +# Verify supervisor references the availability helper +if grep -q "model-availability-helper.sh" "$SUPERVISOR"; then + pass "supervisor references model-availability-helper.sh" +else + fail "supervisor references model-availability-helper.sh" +fi + +# Verify resolve_model() has availability helper fast path +if grep -q "availability_helper.*resolve" "$SUPERVISOR"; then + pass "resolve_model() uses availability helper" +else + fail "resolve_model() uses availability helper" +fi + +# Verify check_model_health() has availability helper fast path +if grep -q "availability_helper.*check" "$SUPERVISOR"; then + pass "check_model_health() uses availability helper fast path" +else + fail "check_model_health() uses availability helper fast path" +fi + +# Verify check_model_health() still has CLI fallback +if grep -q 'health-check' "$SUPERVISOR"; then + pass "check_model_health() retains CLI fallback (slow path)" +else + fail "check_model_health() retains CLI fallback (slow path)" +fi + +# ============================================================ +# SECTION 7: JSON output +# ============================================================ +section "JSON Output" + +# Status --json +json_status=$(run_with_timeout 5 bash "$HELPER" status --json 2>&1) || true +if echo "$json_status" | grep -q "{" 2>/dev/null; then + pass "status --json produces JSON" +else + skip "status 
--json (no data to format)" +fi + +# Resolve --json +json_resolve=$(run_with_timeout 15 bash "$HELPER" resolve sonnet --json --quiet 2>&1) || true +if echo "$json_resolve" | grep -q "tier" 2>/dev/null; then + pass "resolve --json produces JSON with tier field" +else + skip "resolve --json (provider may be unavailable)" +fi + +# ============================================================ +# SUMMARY +# ============================================================ +echo "" +echo "========================================" +printf " \033[1mResults: %d total, \033[0;32m%d passed\033[0m, \033[0;31m%d failed\033[0m, \033[0;33m%d skipped\033[0m\n" \ + "$TOTAL_COUNT" "$PASS_COUNT" "$FAIL_COUNT" "$SKIP_COUNT" +echo "========================================" + +if [[ "$FAIL_COUNT" -gt 0 ]]; then + echo "" + printf "\033[0;31mFAILURES DETECTED - review output above\033[0m\n" + exit 1 +else + echo "" + printf "\033[0;32mAll tests passed.\033[0m\n" + exit 0 +fi