diff --git a/.agents/scripts/compare-models-helper.sh b/.agents/scripts/compare-models-helper.sh index a89b2920c..682d269f7 100755 --- a/.agents/scripts/compare-models-helper.sh +++ b/.agents/scripts/compare-models-helper.sh @@ -143,12 +143,12 @@ get_all_tier_patterns() { # Model Database (embedded reference data) # ============================================================================= # Format: model_id|provider|display_name|context_window|input_price_per_1m|output_price_per_1m|tier|capabilities|best_for -# Prices in USD per 1M tokens. Last updated: 2025-02-08. +# Prices in USD per 1M tokens. Last updated: 2026-02-18. # Sources: Anthropic, OpenAI, Google official pricing pages. -readonly MODEL_DATA="claude-opus-4|Anthropic|Claude Opus 4|200000|15.00|75.00|high|code,reasoning,architecture,vision,tools|Architecture decisions, novel problems, complex multi-step reasoning -claude-sonnet-4|Anthropic|Claude Sonnet 4|200000|3.00|15.00|medium|code,reasoning,vision,tools|Code implementation, review, most development tasks -claude-haiku-3.5|Anthropic|Claude 3.5 Haiku|200000|0.80|4.00|low|code,reasoning,vision,tools|Triage, classification, simple transforms, formatting +readonly MODEL_DATA="claude-opus-4-6|Anthropic|Claude Opus 4.6|200000|5.00|25.00|high|code,reasoning,architecture,vision,tools|Architecture decisions, novel problems, complex multi-step reasoning +claude-sonnet-4-6|Anthropic|Claude Sonnet 4.6|200000|3.00|15.00|medium|code,reasoning,vision,tools|Code implementation, review, most development tasks +claude-haiku-4-5|Anthropic|Claude Haiku 4.5|200000|1.00|5.00|low|code,reasoning,vision,tools|Triage, classification, simple transforms, formatting gpt-4.1|OpenAI|GPT-4.1|1048576|2.00|8.00|medium|code,reasoning,vision,tools,search|Coding, instruction following, long context gpt-4.1-mini|OpenAI|GPT-4.1 Mini|1048576|0.40|1.60|low|code,reasoning,vision,tools|Cost-efficient coding and general tasks gpt-4.1-nano|OpenAI|GPT-4.1 Nano|1048576|0.10|0.40|low|code,reasoning,tools|Fast classification, simple transforms @@ -169,31 +169,31 @@ llama-4-scout|Meta|Llama 4 Scout|512000|0.15|0.40|low|code,reasoning,vision,tool # ============================================================================= # Maps aidevops internal tiers to recommended models -readonly TIER_MAP="haiku|claude-haiku-3.5|Triage, classification, simple transforms +readonly TIER_MAP="haiku|claude-haiku-4-5|Triage, classification, simple transforms flash|gemini-2.5-flash|Large context reads, summarization, bulk processing -sonnet|claude-sonnet-4|Code implementation, review, most development tasks +sonnet|claude-sonnet-4-6|Code implementation, review, most development tasks pro|gemini-2.5-pro|Large codebase analysis, complex reasoning with big context -opus|claude-opus-4|Architecture decisions, complex multi-step reasoning" +opus|claude-opus-4-6|Architecture decisions, complex multi-step reasoning" # ============================================================================= # Task-to-Model Recommendations # ============================================================================= -readonly TASK_RECOMMENDATIONS="code review|claude-sonnet-4|o4-mini|gemini-2.5-flash -code implementation|claude-sonnet-4|gpt-4.1|gemini-2.5-pro -architecture design|claude-opus-4|o3|gemini-2.5-pro -bug fixing|claude-sonnet-4|gpt-4.1|o4-mini -refactoring|claude-sonnet-4|gpt-4.1|gemini-2.5-pro -documentation|claude-sonnet-4|gpt-4o|gemini-2.5-flash -testing|claude-sonnet-4|gpt-4.1|o4-mini -classification|claude-haiku-3.5|gpt-4.1-nano|gemini-2.5-flash -summarization|gemini-2.5-flash|gpt-4o-mini|claude-haiku-3.5 -large codebase analysis|gemini-2.5-pro|gpt-4.1|claude-sonnet-4 +readonly TASK_RECOMMENDATIONS="code review|claude-sonnet-4-6|o4-mini|gemini-2.5-flash +code implementation|claude-sonnet-4-6|gpt-4.1|gemini-2.5-pro +architecture design|claude-opus-4-6|o3|gemini-2.5-pro +bug fixing|claude-sonnet-4-6|gpt-4.1|o4-mini +refactoring|claude-sonnet-4-6|gpt-4.1|gemini-2.5-pro +documentation|claude-sonnet-4-6|gpt-4o|gemini-2.5-flash +testing|claude-sonnet-4-6|gpt-4.1|o4-mini +classification|claude-haiku-4-5|gpt-4.1-nano|gemini-2.5-flash +summarization|gemini-2.5-flash|gpt-4o-mini|claude-haiku-4-5 +large codebase analysis|gemini-2.5-pro|gpt-4.1|claude-sonnet-4-6 math reasoning|o3|deepseek-r1|gemini-2.5-pro -security audit|claude-opus-4|o3|claude-sonnet-4 -data extraction|gemini-2.5-flash|gpt-4o-mini|claude-haiku-3.5 -commit messages|claude-haiku-3.5|gpt-4.1-nano|gemini-2.5-flash -pr description|claude-sonnet-4|gpt-4o|gemini-2.5-flash" +security audit|claude-opus-4-6|o3|claude-sonnet-4-6 +data extraction|gemini-2.5-flash|gpt-4o-mini|claude-haiku-4-5 +commit messages|claude-haiku-4-5|gpt-4.1-nano|gemini-2.5-flash +pr description|claude-sonnet-4-6|gpt-4o|gemini-2.5-flash" # ============================================================================= # Helper Functions @@ -480,8 +480,8 @@ cmd_recommend() { if [[ "$found" != "true" ]]; then echo "No exact task match. Showing general recommendations:" echo "" - echo " High capability: claude-opus-4 or o3" - echo " Balanced: claude-sonnet-4 or gpt-4.1" + echo " High capability: claude-opus-4-6 or o3" + echo " Balanced: claude-sonnet-4-6 or gpt-4.1" echo " Budget: gemini-2.5-flash or gpt-4.1-nano" echo " Large context: gemini-2.5-pro or gpt-4.1 (1M tokens)" echo "" @@ -1003,9 +1003,9 @@ cmd_help() { echo "" echo "Scoring examples:" echo " compare-models-helper.sh score --task 'fix React bug' --type code \\" - echo " --model claude-sonnet-4 --correctness 9 --completeness 8 --quality 8 --clarity 9 --adherence 9 \\" + echo " --model claude-sonnet-4-6 --correctness 9 --completeness 8 --quality 8 --clarity 9 --adherence 9 \\" echo " --model gpt-4.1 --correctness 8 --completeness 7 --quality 7 --clarity 8 --adherence 8 \\" - echo " --winner claude-sonnet-4" + echo " --winner claude-sonnet-4-6" echo " compare-models-helper.sh results" echo " compare-models-helper.sh results --model sonnet --limit 5" echo "" @@ -1441,8 +1441,8 @@ SQL } # Record a comparison result -# Usage: cmd_score --task "description" --type "code" --evaluator "claude-opus-4" \ -# --model "claude-sonnet-4" --correctness 9 --completeness 8 --quality 7 \ +# Usage: cmd_score --task "description" --type "code" --evaluator "claude-opus-4-6" \ +# --model "claude-sonnet-4-6" --correctness 9 --completeness 8 --quality 7 \ # --clarity 8 --adherence 9 --latency 1200 --tokens 500 \ # --strengths "Fast, accurate" --weaknesses "Verbose" \ # [--model "gpt-4.1" --correctness 8 ...] diff --git a/.agents/scripts/generate-models-md.sh b/.agents/scripts/generate-models-md.sh index 097c6e272..7d0b08671 100755 --- a/.agents/scripts/generate-models-md.sh +++ b/.agents/scripts/generate-models-md.sh @@ -157,11 +157,11 @@ generate_routing_tiers() { sm.tier, sm.model_id, CASE sm.tier - WHEN 'haiku' THEN '~0.25x' + WHEN 'haiku' THEN '~0.33x' WHEN 'flash' THEN '~0.20x' WHEN 'sonnet' THEN '1x (baseline)' WHEN 'pro' THEN '~1.5x' - WHEN 'opus' THEN '~3x' + WHEN 'opus' THEN '~1.7x' ELSE '?' END FROM subagent_models sm diff --git a/.agents/tools/ai-assistants/models/haiku.md b/.agents/tools/ai-assistants/models/haiku.md index b90ac93cb..8cc172822 100644 --- a/.agents/tools/ai-assistants/models/haiku.md +++ b/.agents/tools/ai-assistants/models/haiku.md @@ -1,7 +1,7 @@ --- description: Lightweight model for triage, classification, and simple transforms mode: subagent -model: anthropic/claude-3-5-haiku-20241022 +model: anthropic/claude-haiku-4-5-20251001 model-tier: haiku model-fallback: google/gemini-2.5-flash-preview-05-20 tools: @@ -39,8 +39,9 @@ You are a lightweight, fast AI assistant optimized for simple tasks. | Field | Value | |-------|-------| | Provider | Anthropic | -| Model | claude-3-5-haiku | +| Model | claude-haiku-4-5 | | Context | 200K tokens | -| Input cost | $0.80/1M tokens | -| Output cost | $4.00/1M tokens | +| Max output | 64K tokens | +| Input cost | $1.00/1M tokens | +| Output cost | $5.00/1M tokens | | Tier | haiku (lowest cost) | diff --git a/.agents/tools/ai-assistants/models/opus.md b/.agents/tools/ai-assistants/models/opus.md index f3102e15d..0411a18b9 100644 --- a/.agents/tools/ai-assistants/models/opus.md +++ b/.agents/tools/ai-assistants/models/opus.md @@ -1,7 +1,7 @@ --- description: Highest-capability model for architecture decisions, novel problems, and complex multi-step reasoning mode: subagent -model: anthropic/claude-opus-4-20250514 +model: anthropic/claude-opus-4-6 model-tier: opus model-fallback: openai/o3 fallback-chain: @@ -37,7 +37,7 @@ You are the highest-capability AI assistant, reserved for the most complex and c - Only use this tier when the task genuinely requires it - Most coding tasks are better served by sonnet tier -- Cost is approximately 3x sonnet -- justify the spend +- Cost is approximately 1.7x sonnet -- justify the spend - If the task is primarily about large context, use pro tier instead ## Model Details @@ -45,8 +45,9 @@ You are the highest-capability AI assistant, reserved for the most complex and c | Field | Value | |-------|-------| | Provider | Anthropic | -| Model | claude-opus-4 | -| Context | 200K tokens | -| Input cost | $15.00/1M tokens | -| Output cost | $75.00/1M tokens | +| Model | claude-opus-4-6 | +| Context | 200K tokens (1M beta) | +| Max output | 128K tokens | +| Input cost | $5.00/1M tokens | +| Output cost | $25.00/1M tokens | | Tier | opus (highest capability, highest cost) | diff --git a/MODELS.md b/MODELS.md index 589525e84..acf5c4262 100644 --- a/MODELS.md +++ b/MODELS.md @@ -3,9 +3,9 @@ Live performance data from pattern-tracker and response-scoring databases. Auto-generated by `generate-models-md.sh` — do not edit manually. -**Last updated**: 2026-02-18T16:40:00Z +**Last updated**: 2026-02-18T17:35:03Z -- **Pattern data points**: 884 +- **Pattern data points**: 902 - **Scored responses**: 18 - **Date range**: 2026-02-05 to 2026-02-18 @@ -13,14 +13,14 @@ Auto-generated by `generate-models-md.sh` — do not edit manually. | Model | Provider | Tier | Context | Input/1M | Output/1M | |-------|----------|------|---------|----------|-----------| -| claude-opus-4 | Anthropic | opus | 200K | $15.00 | $75.00 | +| claude-opus-4-6 | Anthropic | opus | 200K | $5.00 | $25.00 | | o3 | OpenAI | opus | 200K | $10.00 | $40.00 | -| claude-sonnet-4 | Anthropic | sonnet | 200K | $3.00 | $15.00 | +| claude-sonnet-4-6 | Anthropic | sonnet | 200K | $3.00 | $15.00 | | gemini-2.5-pro | Google | sonnet | 1M | $1.25 | $10.00 | | gpt-4.1 | OpenAI | sonnet | 1M | $2.00 | $8.00 | | gpt-4o | OpenAI | sonnet | 128K | $2.50 | $10.00 | | o4-mini | OpenAI | sonnet | 200K | $1.10 | $4.40 | -| claude-haiku-3.5 | Anthropic | haiku | 200K | $0.80 | $4.00 | +| claude-haiku-4-5 | Anthropic | haiku | 200K | $1.00 | $5.00 | | deepseek-r1 | DeepSeek | haiku | 131K | $0.55 | $2.19 | | deepseek-v3 | DeepSeek | haiku | 131K | $0.27 | $1.10 | | gemini-2.0-flash | Google | haiku | 1M | $0.10 | $0.40 | @@ -37,11 +37,11 @@ Active model assignments for each dispatch tier: | Tier | Primary Model | Relative Cost | |------|---------------|---------------| -| haiku | claude-3-5-haiku | ~0.25x | +| haiku | claude-haiku-4-5 | ~0.33x | | flash | gemini-2.5-flash-preview-05-20 | ~0.20x | -| sonnet | claude-sonnet-4 | 1x (baseline) | +| sonnet | claude-sonnet-4-6 | 1x (baseline) | | pro | gemini-2.5-pro-preview-06-05 | ~1.5x | -| opus | claude-opus-4 | ~3x | +| opus | claude-opus-4-6 | ~1.7x | ## Performance Leaderboard @@ -49,8 +49,8 @@ Success rates from autonomous task execution (pattern-tracker data): | Model | Tasks | Successes | Failures | Success Rate | Last Used | |-------|-------|-----------|----------|--------------|-----------| -| opus | 507 | 499 | 8 | 98% | 2026-02-18 | -| sonnet | 163 | 163 | 0 | 100% | 2026-02-18 | +| opus | 512 | 504 | 8 | 98% | 2026-02-18 | +| sonnet | 175 | 175 | 0 | 100% | 2026-02-18 | | pro | 2 | 2 | 0 | 100% | 2026-02-18 | | haiku | 1 | 0 | 1 | 0% | 2026-02-05 | @@ -58,8 +58,8 @@ Success rates from autonomous task execution (pattern-tracker data): | Task Type | Tasks | Successes | Failures | Success Rate | |-----------|-------|-----------|----------|--------------| -| feature | 541 | 522 | 19 | 96% | -| bugfix | 10 | 7 | 3 | 70% | +| feature | 551 | 532 | 19 | 96% | +| bugfix | 17 | 14 | 3 | 82% | | refactor | 1 | 0 | 1 | 0% | | testing | 1 | 1 | 0 | 100% | | security | 1 | 1 | 0 | 100% | diff --git a/TODO.md b/TODO.md index 3caa86603..3707e82e4 100644 --- a/TODO.md +++ b/TODO.md @@ -1540,7 +1540,7 @@ t019.3.4,Update AGENTS.md with Beads integration docs,,beads,1h,45m,2025-12-21T1 - [ ] t1127 Fix create_improvement action type recognition in executor #bugfix #auto-dispatch #self-improvement ~15m model:haiku category:automation — The supervisor executor rejects 'create_improvement' as 'invalid type' — 3 skips in last 5 cycles. The AI prompt defines create_improvement as a valid action type, but the executor's action type whitelist doesn't include it. Add 'create_improvement' to the executor's recognized action types, mapping it to create_task with the additional #self-improvement tag and category metadata. ref:GH#1686 assignee:marcusquinn started:2026-02-18T16:49:43Z -- [ ] t1128 Update model registry with current model IDs #bugfix #auto-dispatch ~30m model:sonnet category:data — The model-registry.db has stale entries: `claude-opus-4` should be `claude-opus-4-6`, `claude-sonnet-4` should include dated variants like `claude-sonnet-4-20250514`, and newer models may be missing entirely. The `generate-models-md.sh` script produces MODELS.md from this registry, so outdated source data propagates to all repos. Steps: (1) audit current model IDs against provider APIs/docs, (2) update model-registry.db entries, (3) re-run generate-models-md.sh, (4) verify MODELS.md output is accurate. ref:GH#1690 assignee:marcusquinn started:2026-02-18T17:01:09Z logged:2026-02-18 +- [x] t1128 Update model registry with current model IDs #bugfix #auto-dispatch ~30m model:sonnet category:data — The model-registry.db has stale entries: `claude-opus-4` should be `claude-opus-4-6`, `claude-sonnet-4` should include dated variants like `claude-sonnet-4-20250514`, and newer models may be missing entirely. The `generate-models-md.sh` script produces MODELS.md from this registry, so outdated source data propagates to all repos. Steps: (1) audit current model IDs against provider APIs/docs, (2) update model-registry.db entries, (3) re-run generate-models-md.sh, (4) verify MODELS.md output is accurate. ref:GH#1690 assignee:marcusquinn started:2026-02-18T17:01:09Z logged:2026-02-18 pr:#1712 completed:2026-02-18 - [ ] t1129 Include MODELS.md in aidevops init workflow for per-repo performance tracking #feature #auto-dispatch ~2h model:sonnet category:observability — Currently `generate-models-md.sh` produces a global MODELS.md from the pattern-tracker and response-scoring databases. This is useful for per-repo tracking of which models perform best on that repo's task types. Steps: (1) update `aidevops init` to generate MODELS.md in the project root, (2) add MODELS.md to the list of files tracked by git (not gitignored), (3) add a periodic refresh mechanism (e.g., during supervisor pulse or on commit) so the data stays current, (4) consider filtering pattern data by repo path so each repo's MODELS.md reflects its own task history rather than global stats. ref:GH#1691 logged:2026-02-18