diff --git a/.github/sync-manifest.yml b/.github/sync-manifest.yml index f25c9a7e0..be490d61e 100644 --- a/.github/sync-manifest.yml +++ b/.github/sync-manifest.yml @@ -72,6 +72,15 @@ workflows: - source: .github/workflows/agents-auto-label.yml description: "Auto-label - suggests/applies labels based on semantic matching (Phase 5A)" + - source: .github/workflows/agents-capability-check.yml + description: "Capability check - pre-flight agent feasibility gate (Phase 3A)" + + - source: .github/workflows/agents-decompose.yml + description: "Task decomposition - breaks large issues into sub-tasks (Phase 3B)" + + - source: .github/workflows/agents-dedup.yml + description: "Duplicate detection - flags similar open issues (Phase 3C)" + - source: .github/workflows/agents-guard.yml description: "Agents guard - enforces agents workflow protections (Health 45)" diff --git a/docs/plans/langchain-post-code-rollout.md b/docs/plans/langchain-post-code-rollout.md index c5db10b12..954257b6b 100644 --- a/docs/plans/langchain-post-code-rollout.md +++ b/docs/plans/langchain-post-code-rollout.md @@ -1,8 +1,8 @@ # LangChain Post-Code Production Capabilities - Evaluation & Rollout Plan -> **Date:** January 7, 2026 -> **Status:** Phase 4 Planning - Full Automation Design -> **Last Validation:** 2026-01-07 (Phase 4 Planning Added) +> **Date:** January 8, 2026 +> **Status:** Phase 3 Workflows Created, Pending Consumer Sync +> **Last Validation:** 2026-01-08 (Thorough audit - all workflows created) --- @@ -14,9 +14,9 @@ |--------|---------|---------------------|--------| | `topic_splitter.py` | Split multi-topic ChatGPT conversations into separate issues | `agents-63-issue-intake.yml` | ✅ Working | | `issue_formatter.py` | Format raw issue text to AGENT_ISSUE_TEMPLATE | `agents-issue-optimizer.yml` | ✅ Implemented | -| `issue_optimizer.py` | Analyze issues and suggest improvements | `agents-issue-optimizer.yml` | ⚠️ Partial | -| `capability_check.py` | Pre-flight check if agent can complete 
tasks | Not integrated | ❌ Not connected | -| `task_decomposer.py` | Break large tasks into smaller actionable items | Not integrated | ❌ Not connected | +| `issue_optimizer.py` | Analyze issues and suggest improvements | `agents-issue-optimizer.yml` | ✅ Implemented (LLM enabled) | +| `capability_check.py` | Pre-flight check if agent can complete tasks | `agents-capability-check.yml` | ✅ Script + Workflow ready | +| `task_decomposer.py` | Break large tasks into smaller actionable items | `agents-decompose.yml` | ✅ Script + Workflow ready | ### B. PR Verification (Post-Code) @@ -30,9 +30,9 @@ | Script | Purpose | Status | |--------|---------|--------| -| `semantic_matcher.py` | Embedding-based semantic similarity | ⚠️ Available but unused | -| `label_matcher.py` | Match issues to labels semantically | ⚠️ Available but unused | -| `issue_dedup.py` | Detect duplicate issues | ⚠️ Available but unused | +| `semantic_matcher.py` | Embedding-based semantic similarity | ✅ Tests passing, used by label_matcher | +| `label_matcher.py` | Match issues to labels semantically | ✅ Workflow created (`agents-auto-label.yml`) | +| `issue_dedup.py` | Detect duplicate issues | ✅ Script + Workflow ready (`agents-dedup.yml`) | ### D. Core Infrastructure @@ -223,7 +223,7 @@ - [x] **Live test on Travel-Plan-Permission #318:** LLM evaluation produces scores and verdict (OpenAI: 62% confidence, detailed scores) - [x] **Live test on Travel-Plan-Permission #318:** Comment posted on PR with evaluation results (within 3 minutes of merge) - [x] Follow-up issue creation **DISABLED** (no longer automatically created) -- [ ] **Fix GitHub Models authentication** - 401 error in consumer repos (models permission missing) +- [x] **GitHub Models authentication** - ✅ FIXED (Travel-Plan-Permission PR #301 shows both providers working 2026-01-08) ### Phase 2: Issue Formatting & Cleanup (1 Step) @@ -277,28 +277,30 @@ ``` 3. 
**Implementation tasks:** - - [ ] Create `agents-capability-check.yml` workflow - - [ ] Add `needs-human` label to consumer repos via sync - - [ ] Trigger on `agent:codex` label added OR new workflow label - - [ ] Post comment explaining blockers when agent cannot proceed + - [x] Create `agents-capability-check.yml` workflow ✅ **DONE 2026-01-08** + - [x] Trigger on `agent:codex` label added + - [x] Post capability report comment with blockers + - [x] Add `needs-human` label + remove `agent:codex` when BLOCKED + - [ ] Add `needs-human` label to consumer repos via label sync (pending) **Step 3B: Task Decomposition** 1. **Implementation tasks:** - - [ ] Create `agents-decompose.yml` workflow - - [ ] Add `agents:decompose` label to label sync config - - [ ] Call `task_decomposer.py` when label applied - - [ ] Output: Either create sub-issues OR add checklist to parent + - [x] Create `agents-decompose.yml` workflow ✅ **DONE 2026-01-08** + - [x] Trigger on `agents:decompose` label added + - [x] Posts sub-task checklist as comment + - [x] Removes trigger label after posting + - [ ] Add `agents:decompose` label to consumer repos via label sync (pending) **Step 3C: Duplicate Detection (Testing Focus)** 1. **Critical concern:** False positives - we don't want to close valid issues 2. **Approach:** Comment-only mode first, no auto-close 3. **Implementation tasks:** - - [ ] Create `agents-dedup.yml` workflow - - [ ] Trigger on issue opened - - [ ] Compare against open issues using embeddings - - [ ] Post comment if >85% similarity detected (link to potential duplicate) + - [x] Create `agents-dedup.yml` workflow ✅ **DONE 2026-01-08** + - [x] Trigger on issue opened (skips bot-created issues) + - [x] Compare against open issues using embeddings (0.85 threshold) + - [x] Post warning comment if duplicate detected with links - [ ] Track false positive rate over testing period 4. **Testing metrics to track:** @@ -309,85 +311,438 @@ **Step 3D: Semantic Label Matching** 1. 
**Implementation tasks:** - - [ ] Create `agents-auto-label.yml` workflow OR integrate into existing - - [ ] Use `label_matcher.py` for semantic similarity - - [ ] Post comment with suggestions OR auto-apply at >90% confidence + - [x] Create `agents-auto-label.yml` workflow ✅ **DONE 2026-01-08** + - [x] Trigger on issue opened/edited (skips already-labeled) + - [x] Auto-apply labels at ≥90% confidence + - [x] Post suggestion comment for 75%-90% matches + - [x] Uses `label_matcher.py` + `semantic_matcher.py` --- -## Phase 3 Testing Plan (Manager-Database) +## Deployment Verification Plan -**Test Repository:** Manager-Database -**Test Duration:** 2 weeks (7 issues minimum) -**Start Date:** Ready to begin (all consumer repos synced) +> **Purpose:** Verify all workflows function correctly across ALL consumer repos +> **Status:** Ready to Execute +> **Prerequisite:** PR merged to main, sync workflow completed -### Test Issue #1: Capability Check Validation +### Known Issue - RESOLVED ✅ -**Purpose:** Validate capability_check.py correctly identifies agent blockers +**`verify:compare` on Trend_Model_Project PR #4249** +- **Status:** ✅ NOT A BUG - Working as designed +- **Root Cause:** PR #4249 is **not merged** (state: OPEN) +- **Expected Behavior:** Verifier workflow only runs on merged PRs -**Test Scenarios:** -1. **Issue requiring external API** - Should flag "needs credentials/external dependency" -2. **Issue requiring database migration** - Should flag "needs infrastructure/manual step" -3. **Normal code-only issue** - Should pass capability check +**Investigation (2026-01-08):** +1. Checked workflow run logs for `Agents Verifier` run #20803754012 +2. Found: "PR not merged; skipping verifier" - correct behavior +3. Confirmed via `gh pr view 4249`: `mergedAt: null, state: OPEN` +4. 
The `verify:compare` label was applied but PR wasn't merged yet -**Test Issue Ideas for Manager-Database:** -- "Integrate with external payment API" (should fail - external dep) -- "Add database migration for new schema" (should fail - infra) -- "Refactor logging module" (should pass - code only) +**Additional findings:** +- `verify:*` labels exist in Travel-Plan-Permission ✅ +- `verify:*` labels do NOT exist in Trend_Model_Project repo label list + - But GitHub allows ad-hoc label creation on PRs + - To properly trigger workflows, labels should be pre-created +- **Action:** Run `scripts/create_verifier_labels.py` on Trend_Model_Project -### Test Issue #2: Task Decomposition Validation +--- -**Purpose:** Validate task_decomposer.py produces useful sub-tasks +### Phase 1: Sync Deployment (All 7 Repos) + +After merging to main, verify sync creates PRs in all consumer repos. + +| Repo | Sync PR # | Sync PR Status | New Workflows Present | Notes | +|------|-----------|----------------|----------------------|-------| +| Manager-Database | - | ⏳ | - | Primary test repo | +| Template | - | ⏳ | - | | +| trip-planner | - | ⏳ | - | | +| Travel-Plan-Permission | - | ⏳ | - | Verify workflows working, has labels ✅ | +| Portable-Alpha-Extension-Model | - | ⏳ | - | | +| Trend_Model_Project | - | ⏳ | - | Needs verify labels created | +| Collab-Admin | - | ⏳ | - | | + +**Checklist:** +- [ ] Merge Phase 3 PR to main in Workflows repo +- [ ] Verify sync workflow triggered (Actions tab) +- [ ] Check each consumer repo for sync PR +- [ ] Review sync PR for correct workflow files: + - `agents-capability-check.yml` + - `agents-decompose.yml` + - `agents-dedup.yml` + - `agents-auto-label.yml` + - `agents-verify-to-issue.yml` +- [ ] Review bot comments on sync PRs for code issues +- [ ] Merge sync PRs to each consumer repo +- [ ] Create `verify:*` labels in repos that need them: + - Run: `python scripts/create_verifier_labels.py --execute` -**Test Scenario:** -- Create large issue with 5+ 
implied tasks -- Apply `agents:decompose` label -- Verify sub-tasks are actionable and correctly scoped +--- -**Test Issue Idea:** -- "Implement comprehensive health check endpoint with retry logic, circuit breaker, metrics, and alerting integration" +### Phase 2: Existing Workflow Verification (Cross-Repo) + +Verify already-deployed workflows work across repos. + +#### 2A. Verify Label Prerequisites + +**Required labels for verifier:** +| Repo | `verify:checkbox` | `verify:evaluate` | `verify:compare` | Action | +|------|-------------------|-------------------|------------------|--------| +| Travel-Plan-Permission | ✅ | ✅ | ✅ | None | +| Trend_Model_Project | ❌ | ❌ | ❌ | Create labels | +| Manager-Database | ❓ | ❓ | ❓ | Check & create | +| Template | ❓ | ❓ | ❓ | Check & create | +| trip-planner | ❓ | ❓ | ❓ | Check & create | +| Portable-Alpha-Extension-Model | ❓ | ❓ | ❓ | Check & create | +| Collab-Admin | ❓ | ❓ | ❓ | Check & create | + +**Script to check all repos:** +```bash +for repo in Travel-Plan-Permission Trend_Model_Project Manager-Database Template \ + trip-planner Portable-Alpha-Extension-Model Collab-Admin; do + echo "=== stranske/$repo ===" + gh api "repos/stranske/$repo/labels?per_page=100" \ + --jq '[.[] | select(.name | startswith("verify:")) | .name] | join(", ") | if . == "" then "NONE" else . end' +done +``` -### Test Issue #3: Duplicate Detection Validation +#### 2B. Verify `verify:evaluate` on Merged PRs -**Purpose:** Measure false positive rate for issue_dedup.py +Test on a **merged** PR in each repo (verifier only works post-merge): +- Missing labels (`verify:compare` not in repo) +- Missing secrets (`OPENAI_API_KEY` not configured) +- Workflow file syntax error +- Permissions issue -**Test Scenarios:** -1. **True duplicate** - Create issue very similar to existing (should detect) -2. **Related but different** - Create issue in same area but different ask (should NOT flag) -3. 
**Unrelated** - Create issue in different area (should NOT flag) +#### 2B. Verify `verify:evaluate` Across Repos -**Success Criteria:** -- True positives detected: 100% -- False positive rate: <5% -- Clear explanation in comment linking to potential duplicate +Test on a merged PR in each repo: + +| Repo | Test PR # | Label Added | Workflow Ran | Comment Posted | Result | +|------|-----------|-------------|--------------|----------------|--------| +| Manager-Database | - | ⏳ | - | - | - | +| Travel-Plan-Permission | #301 | ✅ | ✅ | ✅ | ✅ Known working | +| Trend_Model_Project | #4249 | ✅ | ❓ | ❓ | 🔍 Investigating | +| trip-planner | - | ⏳ | - | - | - | + +#### 2C. Verify `agents:optimize` Across Repos + +Test on an issue in each repo: + +| Repo | Test Issue # | Label Added | Workflow Ran | Comment Posted | Result | +|------|--------------|-------------|--------------|----------------|--------| +| Manager-Database | #184 | ✅ | ✅ | ✅ | ✅ Known working | +| Travel-Plan-Permission | - | ⏳ | - | - | - | +| Trend_Model_Project | - | ⏳ | - | - | - | + +--- + +### Phase 3: New Workflow Verification (Cross-Repo) + +After sync PRs merged, verify each new Phase 3 workflow in multiple repos. + +#### 3A. `agents-capability-check.yml` + +**Trigger:** Add `agent:codex` label to issue +**Test:** Create issue with external dependency, verify BLOCKED response + +| Repo | Test Issue # | Label Added | Workflow Ran | Report Posted | Correct Verdict | Notes | +|------|--------------|-------------|--------------|---------------|-----------------|-------| +| Manager-Database | - | ⏳ | - | - | - | Primary test | +| Travel-Plan-Permission | - | ⏳ | - | - | - | Secondary test | + +**Test Issue Content:** +```markdown +## Why +We need to integrate with external payment service. 
+ +## Tasks +- [ ] Set up Stripe API credentials +- [ ] Implement payment webhook handler + +## Acceptance Criteria +- [ ] Payments process successfully +``` + +**Expected:** `needs-human` label added, `agent:codex` removed, blocker comment posted + +#### 3B. `agents-decompose.yml` + +**Trigger:** Add `agents:decompose` label to issue +**Test:** Create large issue, verify sub-task breakdown + +| Repo | Test Issue # | Label Added | Workflow Ran | Sub-tasks Posted | Label Removed | Notes | +|------|--------------|-------------|--------------|------------------|---------------|-------| +| Manager-Database | - | ⏳ | - | - | - | Primary test | +| Travel-Plan-Permission | - | ⏳ | - | - | - | Secondary test | + +**Test Issue Content:** +```markdown +## Why +Need comprehensive health check system. + +## Tasks +- [ ] Implement health check with retry logic, circuit breaker, + metrics collection, alerting, and dependency aggregation +``` + +**Expected:** Comment with 4-6 specific sub-tasks, `agents:decompose` label removed + +#### 3C. `agents-dedup.yml` + +**Trigger:** Automatic on issue creation +**Test:** Create issue similar to existing open issue + +| Repo | Existing Issue # | New Test Issue # | Workflow Ran | Duplicate Warning | Correct Link | Notes | +|------|------------------|------------------|--------------|-------------------|--------------|-------| +| Manager-Database | #133 | - | ⏳ | - | - | Similar to "GET managers" | +| Travel-Plan-Permission | - | - | ⏳ | - | - | Find existing issue first | + +**Expected:** Warning comment posted linking to similar issue + +#### 3D. 
`agents-auto-label.yml` + +**Trigger:** Automatic on issue creation (no existing labels) +**Test:** Create unlabeled issue with clear category -### Test Issue #4: Label Matching Validation +| Repo | Test Issue # | Workflow Ran | Labels Suggested | Labels Applied | Accuracy | Notes | +|------|--------------|--------------|------------------|----------------|----------|-------| +| Manager-Database | - | ⏳ | - | - | - | | +| Travel-Plan-Permission | - | ⏳ | - | - | - | | -**Purpose:** Validate label_matcher.py suggests correct labels +**Test Issue:** "Fix crash when database connection times out" (expect `bug` label) -**Test Scenario:** -- Create unlabeled issues in different categories -- Verify label suggestions match expected labels -- Track suggestion accuracy +#### 3E. `agents-verify-to-issue.yml` -### Test Issues Created (Manager-Database) +**Trigger:** Add `verify:create-issue` label to merged PR with verification comment +**Test:** Use PR that has `verify:evaluate` comment + +| Repo | Test PR # | Has Verify Comment | Label Added | Issue Created | Linked Correctly | Notes | +|------|-----------|-------------------|-------------|---------------|------------------|-------| +| Travel-Plan-Permission | #301 | ✅ | ⏳ | - | - | Has existing verification | +| Manager-Database | - | ⏳ | - | - | - | Need PR with verification first | + +--- + +### Phase 4: Troubleshooting Guide + +#### Workflow Not Running + +1. **Check workflow file exists** in repo under `.github/workflows/` +2. **Check trigger conditions** match (label name, event type) +3. **Check Actions tab** - may be disabled or erroring silently +4. **Check permissions** in workflow file vs repo settings + +#### Workflow Runs But No Comment Posted + +1. **Check logs** for Python/LLM errors +2. **Check secrets** - `GITHUB_TOKEN`, `OPENAI_API_KEY` configured +3. **Check permissions** in workflow - needs `issues: write` or `pull-requests: write` + +#### LLM Errors (401, timeout, etc.) + +1. 
**GitHub Models 401:** Token lacks `models` permission - falls back to OpenAI +2. **OpenAI 401:** Check `OPENAI_API_KEY` secret in repo +3. **Timeout:** LLM call taking too long - check input size + +#### Cross-Repo Differences + +If workflow works in Repo A but not Repo B: +1. Compare workflow file contents (may be out of sync) +2. Compare repo secrets configuration +3. Compare repo Actions permissions +4. Check for repo-specific branch protection rules + +--- + +### Verification Summary + +| Workflow | Repos Tested | Repos Passing | Status | +|----------|--------------|---------------|--------| +| `verify:evaluate` | 1/7 | 1 | ✅ Travel-Plan-Permission working | +| `verify:compare` | 1/7 | 1 | ✅ NOT A BUG - PR #4249 not merged (expected skip) | +| `agents:optimize` | 1/7 | 1 | ✅ Manager-Database working | +| `agents-capability-check` | 0/7 | - | ⏳ Pending sync | +| `agents-decompose` | 0/7 | - | ⏳ Pending sync | +| `agents-dedup` | 0/7 | - | ⏳ Pending sync | +| `agents-auto-label` | 0/7 | - | ⏳ Pending sync | +| `agents-verify-to-issue` | 0/7 | - | ⏳ Pending sync | + +**Investigation completed (2026-01-08):** +- Trend_Model_Project PR #4249 is OPEN, not merged +- Verifier correctly skipped (designed for merged PRs only) +- Labels need to be created in repos (only Travel-Plan-Permission has them) + +**Minimum for Phase 3 Completion:** Each workflow tested in ≥2 repos, passing in ≥2 repos + +--- + +## Phase 3 Functional Testing (Manager-Database) + +> **Purpose:** Validate workflows produce correct results (after deployment verified) +> **Status:** Blocked on deployment verification +> **Test Repository:** Manager-Database (primary), Travel-Plan-Permission (secondary) + +### Test Suite A: Capability Check (3 issues) + +**Workflow:** `agents-capability-check.yml` +**Trigger:** Add `agent:codex` label to issue +**Expected:** Posts capability report, adds `needs-human` if BLOCKED + +| Test | Issue Title | Tasks Description | Expected Result | Pass Criteria | 
+|------|-------------|-------------------|-----------------|---------------| +| A1 | "Integrate Stripe Payment Processing" | External API, webhooks, credentials | 🚫 BLOCKED | `needs-human` added, `agent:codex` removed, blocker explanation posted | +| A2 | "Add database migration for user roles" | Schema changes, migration scripts | 🚫 BLOCKED or ⚠️ REVIEW | Flags infrastructure/manual requirement | +| A3 | "Refactor logging to use structured format" | Code-only changes | ✅ PROCEED | No `needs-human`, agent proceeds normally | + +**Test A1 Issue Body:** +```markdown +## Why +We need to accept credit card payments. + +## Tasks +- [ ] Set up Stripe account and get API keys +- [ ] Implement payment intent creation +- [ ] Handle webhook events for payment confirmation +- [ ] Store transaction records + +## Acceptance Criteria +- [ ] Payments process successfully in test mode +- [ ] Webhooks update order status +``` + +**Test A3 Issue Body:** +```markdown +## Why +Current logging is unstructured and hard to parse. 
+ +## Tasks +- [ ] Replace print statements with structured logger +- [ ] Add log levels (INFO, WARNING, ERROR) +- [ ] Include timestamp and context in log output + +## Acceptance Criteria +- [ ] All log output is JSON formatted +- [ ] Log level can be configured via environment variable +``` + +--- -| Issue | Purpose | Expected Result | -|-------|---------|-----------------| -| #193 | Capability Check - External service (Stripe) | ❌ SHOULD FAIL - requires Stripe API credentials, webhook endpoint | -| #194 | Task Decomposition - Large issue (10 tasks) | ✅ SHOULD DECOMPOSE - into 3-5 sub-issues or checklist | -| #196 | Duplicate Detection - Similar to #133 | ⚠️ SHOULD DETECT - ~85%+ similarity to "Add GET Endpoint for Managers List" | +### Test Suite B: Task Decomposition (3 issues) + +**Workflow:** `agents-decompose.yml` +**Trigger:** Add `agents:decompose` label to issue +**Expected:** Posts sub-task checklist comment, removes trigger label + +| Test | Issue Title | Complexity | Expected Result | Pass Criteria | +|------|-------------|------------|-----------------|---------------| +| B1 | "Implement health check with circuit breaker" | 5+ tasks | 4-6 sub-tasks | Clear, actionable sub-tasks posted | +| B2 | "Add comprehensive API documentation" | Many implied tasks | 5-8 sub-tasks | Covers all doc types (endpoints, examples, errors) | +| B3 | "Simple: Add version endpoint" | 1-2 tasks | 1-2 sub-tasks or "already small" | Doesn't over-decompose simple issues | + +**Test B1 Issue Body:** +```markdown +## Why +We need robust health checks with failure isolation. 
+ +## Tasks +- [ ] Implement health check endpoint with retry logic, circuit breaker pattern, + metrics collection, alerting integration, and dependency health aggregation + +## Acceptance Criteria +- [ ] Health check returns status of all dependencies +- [ ] Circuit breaker opens after 3 consecutive failures +- [ ] Metrics exported to monitoring system +``` + +--- + +### Test Suite C: Duplicate Detection (4 issues) + +**Workflow:** `agents-dedup.yml` +**Trigger:** Automatic on issue creation (not bot-created) +**Expected:** Warning comment if >85% similar to existing open issue + +| Test | Issue Title | Similarity To | Expected Result | Pass Criteria | +|------|-------------|---------------|-----------------|---------------| +| C1 | "Add GET endpoint for all managers" | Existing #133 | ⚠️ DUPLICATE | Warning posted, links to #133 | +| C2 | "Add PUT endpoint to update manager" | Related area | ✅ NO FLAG | Different operation, no warning | +| C3 | "Implement caching layer" | Unrelated | ✅ NO FLAG | Different domain, no warning | +| C4 | "Get list of all managers from database" | Phrased differently | ⚠️ DUPLICATE | Semantic match despite different words | + +**Success Metrics:** +- True positive rate: ≥90% (C1, C4 correctly flagged) +- False positive rate: <10% (C2, C3 not flagged) +- Link accuracy: 100% (correct issue linked) + +--- + +### Test Suite D: Auto-Label (2 issues) + +**Workflow:** `agents-auto-label.yml` +**Trigger:** Automatic on issue creation/edit (skips labeled issues) +**Expected:** Suggests or applies labels based on content + +| Test | Issue Title | Content Theme | Expected Labels | Pass Criteria | +|------|-------------|---------------|-----------------|---------------| +| D1 | "Fix crash when database connection fails" | Bug, database | `bug` suggested/applied | Correct category identified | +| D2 | "Add support for bulk manager import" | Feature, enhancement | `enhancement` suggested | Feature vs bug distinction correct | + +**Note:** Label 
matching depends on repo having well-described labels. Manager-Database has: `bug`, `enhancement`, `documentation`, `agent:codex`, etc. + +--- + +### Test Execution Tracking + +| Suite | Test | Issue # | Created | Workflow Ran | Result | Notes | +|-------|------|---------|---------|--------------|--------|-------| +| A | A1 | - | ⏳ | - | - | | +| A | A2 | - | ⏳ | - | - | | +| A | A3 | - | ⏳ | - | - | | +| B | B1 | - | ⏳ | - | - | | +| B | B2 | - | ⏳ | - | - | | +| B | B3 | - | ⏳ | - | - | | +| C | C1 | - | ⏳ | - | - | | +| C | C2 | - | ⏳ | - | - | | +| C | C3 | - | ⏳ | - | - | | +| C | C4 | - | ⏳ | - | - | | +| D | D1 | - | ⏳ | - | - | | +| D | D2 | - | ⏳ | - | - | | + +**Total:** 0/12 tests executed + +--- ### Testing Metrics Dashboard -| Script | Test Issues | True Positives | False Positives | Accuracy | Status | -|--------|-------------|----------------|-----------------|----------|--------| -| capability_check.py | #193 (1/3) | - | - | - | 🔄 Testing | -| task_decomposer.py | #194 (1/2) | - | - | - | 🔄 Testing | -| issue_dedup.py | #196 (1/3) | - | - | <5% target | 🔄 Testing | -| label_matcher.py | 0/3 | - | - | - | ⏳ Pending | +| Workflow | Tests | Passed | Failed | Accuracy | Status | +|----------|-------|--------|--------|----------|--------| +| `agents-capability-check.yml` | 0/3 | - | - | - | ⏳ Pending | +| `agents-decompose.yml` | 0/3 | - | - | - | ⏳ Pending | +| `agents-dedup.yml` | 0/4 | - | - | - | ⏳ Pending | +| `agents-auto-label.yml` | 0/2 | - | - | - | ⏳ Pending | -**Total test issues created:** 3/11 on Manager-Database +**Overall Phase 3 Test Status:** 0/12 complete + +--- + +### Rollback Plan + +If any workflow causes issues in consumer repos: + +1. **Immediate:** Remove workflow file from consumer repo manually +2. **Short-term:** Update sync-manifest.yml to exclude problematic workflow +3. **Fix:** Debug in Workflows repo, create fix PR +4. 
**Re-deploy:** Re-run sync after fix merged + +### Success Criteria for Phase 3 Completion + +- [ ] All 12 test issues created and workflows triggered +- [ ] Capability check: ≥2/3 tests pass (correctly identifies blockers) +- [ ] Task decomposition: ≥2/3 tests pass (produces useful sub-tasks) +- [ ] Duplicate detection: ≥3/4 tests pass (low false positive rate) +- [ ] Auto-label: ≥1/2 tests pass (suggests relevant labels) +- [ ] No workflow errors or crashes +- [ ] User feedback: workflows provide value (not just noise) --- @@ -397,23 +752,23 @@ |-------|-------|-------|-----------|--------| | 1 | PR Verification | 2 | Manager-Database | ✅ Deployed, 7/7 repos synced | | 2 | Issue Formatting | 1 | Manager-Database | ✅ Deployed & tested - Quality: 7.5/10 | -| 3 | Pre-Agent Intelligence | 4 | Manager-Database | 🔄 Testing - 3/11 test issues created | -| 4 | Full Automation & Cleanup | 5 | Manager-Database | 📋 Planning | +| 3 | Pre-Agent Intelligence | 4 | Manager-Database | ✅ All 4 workflows created, in sync manifest | +| 4 | Full Automation & Cleanup | 5 | Manager-Database | 🔄 Implementation started | **Phase 3 Components:** -- **3A:** Capability Check - Pre-agent feasibility gate (supplements agents:optimize) -- **3B:** Task Decomposition - Auto-split large issues -- **3C:** Duplicate Detection - Comment-only mode, track false positives -- **3D:** Semantic Labeling - Auto-suggest/apply labels +- **3A:** Capability Check - Pre-agent feasibility gate - ✅ Script + Workflow created (`agents-capability-check.yml`) +- **3B:** Task Decomposition - Auto-split large issues - ✅ Script + Workflow created (`agents-decompose.yml`) +- **3C:** Duplicate Detection - Comment-only mode - ✅ Script + Workflow created (`agents-dedup.yml`) +- **3D:** Semantic Labeling - Auto-suggest/apply labels - ✅ Script + Workflow created (`agents-auto-label.yml`) **Phase 4 Components:** -- **4A:** Label Cleanup - Remove bloat, standardize across repos -- **4B:** User Guide - Operational documentation 
for label system -- **4C:** Auto-Pilot Label - End-to-end issue-to-merge automation -- **4D:** Conflict Resolution - Automated merge conflict handling in keepalive -- **4E:** Verify-to-Issue - Create follow-up issues from verification feedback +- **4A:** Label Cleanup - ✅ Script created (`scripts/cleanup_labels.py`) +- **4B:** User Guide - Operational documentation for label system - 📋 Deferred +- **4C:** Auto-Pilot Label - End-to-end issue-to-merge automation - 📋 Planning +- **4D:** Conflict Resolution - ✅ Script created (`conflict_detector.js`), in sync manifest +- **4E:** Verify-to-Issue - ✅ Workflow created (`agents-verify-to-issue.yml`), in sync manifest -**Total: 12 deployment actions** - Phases 1-2 deployed. Phases 3-4 in planning/testing. +**Total: 12 deployment actions** - Phases 1-2 deployed. Phase 3 scripts ready. Phase 4 partially implemented. **Substantive Quality Assessment:** - **agents:optimize:** 8.6/10 - Provides valuable, actionable analysis @@ -444,13 +799,32 @@ - "Implement logging before health checks" - "Retry logic blocks enhanced error logging" -### Phase 3 Implementation (Next) -1. **Step 3A: Capability Check** - Create `agents-capability-check.yml`, integrate with issue workflow - - Supplements existing agents:optimize (quality) with feasibility gate - - Runs BEFORE agent assignment, not after -2. **Step 3B: Task Decomposition** - Create `agents-decompose.yml` workflow -3. **Step 3C: Duplicate Detection** - Create `agents-dedup.yml` (comment-only, track false positives) -4. **Step 3D: Label Matching** - Integrate into issue workflow +### Phase 3 Implementation - UPDATED 2026-01-08 + +**Scripts Status:** All 4 Phase 3 scripts have passing tests (129 tests total) +- ✅ `capability_check.py` - 57 tests passing +- ✅ `task_decomposer.py` - 51 tests passing +- ✅ `issue_dedup.py` - 6 tests passing +- ✅ `label_matcher.py` - 6 tests passing + +**Workflows Status:** +1. 
~~**Step 3D: Label Matching**~~ ✅ `agents-auto-label.yml` created and in sync manifest +2. ~~**Step 3A: Capability Check**~~ ✅ `agents-capability-check.yml` created and in sync manifest +3. ~~**Step 3B: Task Decomposition**~~ ✅ `agents-decompose.yml` created and in sync manifest +4. ~~**Step 3C: Duplicate Detection**~~ ✅ `agents-dedup.yml` created and in sync manifest + +**⚠️ PENDING:** All Phase 3 workflows created but not yet synced to consumer repos. Trigger sync workflow to deploy. + +### Phase 4 Implementation - STARTED 2026-01-08 + +**Completed:** +1. ✅ **4A: Label Cleanup** - `scripts/cleanup_labels.py` created (296 lines) +2. ✅ **4D: Conflict Resolution** - `conflict_detector.js` created (365 lines), in sync manifest +3. ✅ **4E: Verify-to-Issue** - `agents-verify-to-issue.yml` created (203 lines), in sync manifest + +**Pending:** +4. **4B: User Guide** - Create `docs/WORKFLOW_USER_GUIDE.md` - 📋 Deferred +5. **4C: Auto-Pilot** - Create `agents-auto-pilot.yml` - ❌ NOT STARTED ### Future Enhancements 1. **Compare mode refinement** - Currently uses gpt-4o (GitHub) vs gpt-5.2 (OpenAI) @@ -461,7 +835,7 @@ ## Phase 4: Full Automation & Cleanup (5 Initiatives) -> **Status:** Planning +> **Status:** Implementation Started > **Goal:** Streamline end-to-end automation from issue to merged PR ### 4A. Label Cleanup & Standardization @@ -658,13 +1032,25 @@ Step 8: verify:evaluate on merged PR ### 4D. Conflict Resolution in Keepalive +> **Status:** ✅ Script Implemented - Integration Pending + **Problem:** Most common reason keepalive stalls is merge conflicts. Agents handle conflicts well when prompted, but current pipeline doesn't automatically detect/respond. 
**Current State:** -- Keepalive detects "Gate failed" but doesn't distinguish conflict from test failure -- Agent eventually addresses conflicts but wastes cycles +- ✅ `conflict_detector.js` created (366 lines) with full conflict detection logic +- ✅ In sync manifest for consumer repos +- ❌ Integration with `keepalive_gate.js` pending +- ❌ Integration with `agents-keepalive-loop.yml` pending + +**Implementation Checklist:** +- [x] Create `.github/scripts/conflict_detector.js` +- [ ] Add conflict detection to `keepalive_gate.js` +- [x] Create `.github/codex/prompts/fix_merge_conflicts.md` (in sync manifest) +- [ ] Update `agents-keepalive-loop.yml` to use conflict prompt +- [ ] Add conflict metrics to keepalive summary +- [ ] Test with intentionally conflicted branches on Manager-Database -**Full Implementation Plan:** +**Full Implementation Plan (Reference):** **Step 1: Conflict Detection Module** Create `scripts/conflict_detector.js`: @@ -755,11 +1141,19 @@ Add to `agents-keepalive-loop.yml`: ### 4E. Verification-to-Issue Workflow +> **Status:** ✅ Implemented - In Sync Manifest + **Problem:** When `verify:evaluate` or `verify:compare` identifies issues, there's no automated way to create follow-up work. **Note:** We previously disabled automatic issue creation because it was too aggressive. This is a **user-triggered** alternative. -**Proposed Label:** `verify:create-issue` +**Implementation Status:** +- ✅ `agents-verify-to-issue.yml` created (203 lines) +- ✅ Added to sync manifest for consumer repos +- ✅ Triggers on `verify:create-issue` label +- ⏳ Pending: Live testing on consumer repo + +**Label:** `verify:create-issue` **Flow:** @@ -1057,19 +1451,98 @@ os.environ["LANGCHAIN_PROJECT"] = "workflows-agents" ## Implementation Priority -| Initiative | Effort | Value | Priority | Notes | -|------------|--------|-------|----------|-------| -| 4A. Label Cleanup | Low | Medium | ✅ Ready | 5 bloat labels + per-repo audit | -| 4B. 
User Guide | Medium | High | Defer | 📋 After other features stable | +| 4C. Auto-Pilot | High | High | Test carefully | ❌ Not started | +| 4D. Conflict Resolution | Medium | High | In Progress | ✅ Script done, integration pending | +| 4E. Verify-to-Issue | Low | Medium | Ready | ✅ **Implemented & synced** | +| 5A. Auto-labeling | Low | Medium | Ready | ✅ **Workflow created** | +| 5B. Coverage PR Check | Low | Medium | Ready | ⚠️ Existing workflow, enhance | +| 5D. Dependabot Auto-merge | Low | Medium | Ready | ⚠️ Extend existing | +| 5E. Issue Lint | Low | Low | Later | ❌ Not started | +| 5F. Cross-Repo Linking | - | - | Skipped | ❌ Not implementing | +| 5G. Metrics Dashboard | Medium | Medium | Ready | ❌ Not started | + +--- + +## What's Next - Prioritized Action Items + +### Immediate (Can Do Now) + +1. **Sync Phase 3 Workflows** - Workflows created and in sync manifest; trigger the sync workflow to deploy to consumer repos: + - `agents-capability-check.yml` - Gate before agent assignment + - `agents-decompose.yml` - Split large issues automatically + - `agents-dedup.yml` - Detect duplicate issues + +2.
**Integrate Conflict Detector** - Script exists, add to keepalive pipeline: + - Update `keepalive_gate.js` to call `conflict_detector.js` + - Add conflict prompt routing in `keepalive_prompt_routing.js` + +3. **Test 4E Verify-to-Issue** - Workflow deployed, needs live test: + - Find merged PR with verification feedback + - Add `verify:create-issue` label + - Validate issue creation and linking + +4. **Test Auto-Label Workflow** - Deployed to consumer repos: + - Create test issue with clear topic (e.g., "bug" or "documentation") + - Verify label suggestions appear + +### Short Term (1-2 weeks) + +5. **Label Cleanup Audit** - Per-repo idiosyncratic labels: + - ~~Create `scripts/cleanup_labels.py`~~ ✅ Script created + - Audit Manager-Database first + - Generate cleanup PRs with human approval + +6. ~~**GitHub Models Authentication Fix**~~ ✅ FIXED 2026-01-08: + - Both providers confirmed working in consumer repos (Travel-Plan-Permission PR #301) + +### Medium Term (2-4 weeks) + +7. **Auto-Pilot Design & Testing** - High risk, careful rollout: + - Design state machine for sequential workflow triggers + - Test on Manager-Database with controlled simple issues + - Add safety limits (max iterations, token budgets) + +8.
**User Guide Documentation** - After Phase 4 features stable: + - Create `docs/WORKFLOW_USER_GUIDE.md` + - Add to sync manifest + - Include label decision tree + +--- + +## Test Results Summary (2026-01-08) + +### Phase 3 Script Test Coverage + +| Script | Tests | Status | +|--------|-------|--------| +| `capability_check.py` | 57 | ✅ All passing | +| `task_decomposer.py` | 51 | ✅ All passing | +| `issue_dedup.py` | 6 | ✅ All passing | +| `label_matcher.py` | 6 | ✅ All passing | +| **Total** | **129** | **✅ All passing** | + +### Deployed Workflows + +| Workflow | Phase | Consumer Sync | +|----------|-------|---------------| +| `agents-issue-optimizer.yml` | 2 | ✅ Synced | +| `agents-verifier.yml` | 1 | ✅ Synced | +| `agents-auto-label.yml` | 3D | ✅ In manifest | +| `agents-verify-to-issue.yml` | 4E | ✅ In manifest | + +### Implemented but Not Workflow-Integrated + +| Component | Purpose | Next Step | +|-----------|---------|-----------| +| `conflict_detector.js` | Detect merge conflicts | Integrate with keepalive | +| `capability_check.py` | Pre-agent feasibility | Create workflow | +| `task_decomposer.py` | Split large issues | Create workflow | +| `issue_dedup.py` | Find duplicates | Create workflow | ### Test Results Documentation Full substantive analysis available at `/tmp/substantive_test_analysis.md`: diff --git a/scripts/langchain/pr_verifier.py b/scripts/langchain/pr_verifier.py index 9edc21158..70e60bc4e 100755 --- a/scripts/langchain/pr_verifier.py +++ b/scripts/langchain/pr_verifier.py @@ -458,6 +458,14 @@ def _parse_llm_response(content: str, provider: str) -> EvaluationResult: ) +def _is_auth_error(exc: Exception) -> bool: + """Check if an exception is an authentication/authorization error.""" + exc_str = str(exc).lower() + # Common auth error patterns from various LLM APIs + auth_patterns = ["401", "unauthorized", "forbidden", "403", "permission", "authentication"] + return any(pattern in exc_str for pattern in auth_patterns) + + def evaluate_pr( 
context: str, diff: str | None = None, @@ -484,6 +492,34 @@ def evaluate_pr( try: response = client.invoke(prompt) except Exception as exc: # pragma: no cover - exercised in integration + # If auth error and not explicitly requesting a provider, try fallback + if _is_auth_error(exc) and provider is None: + fallback_provider = "openai" if "github-models" in provider_name else "github-models" + fallback_resolved = _get_llm_client(model=model, provider=fallback_provider) + if fallback_resolved is not None: + fallback_client, fallback_provider_name = fallback_resolved + try: + response = fallback_client.invoke(prompt) + content = getattr(response, "content", None) or str(response) + result = _parse_llm_response(content, fallback_provider_name) + # Add note about fallback + if result.summary: + result = EvaluationResult( + verdict=result.verdict, + scores=result.scores, + concerns=result.concerns, + summary=result.summary, + provider_used=fallback_provider_name, + model=result.model, + used_llm=result.used_llm, + error=f"Primary provider ({provider_name}) failed, used fallback", + raw_content=result.raw_content, + ) + return result + except Exception as fallback_exc: + return _fallback_evaluation( + f"Primary ({provider_name}): {exc}; Fallback ({fallback_provider_name}): {fallback_exc}" + ) return _fallback_evaluation(f"LLM invocation failed: {exc}") content = getattr(response, "content", None) or str(response) diff --git a/templates/consumer-repo/.github/workflows/agents-auto-label.yml b/templates/consumer-repo/.github/workflows/agents-auto-label.yml new file mode 100644 index 000000000..8908fb7d2 --- /dev/null +++ b/templates/consumer-repo/.github/workflows/agents-auto-label.yml @@ -0,0 +1,265 @@ +name: Auto-Label Issues + +# Suggests or applies labels to new issues based on semantic matching +# Uses label_matcher.py for embedding-based similarity + +on: + issues: + types: [opened, edited] + +permissions: + contents: read + issues: write + +env: + # Threshold for 
auto-applying labels (very high confidence) + AUTO_APPLY_THRESHOLD: "0.90" + # Threshold for suggesting labels (lower, for comments) + SUGGEST_THRESHOLD: "0.75" + +jobs: + auto-label: + runs-on: ubuntu-latest + # Skip if issue already has agent-related labels + if: | + !contains(github.event.issue.labels.*.name, 'agents:formatted') && + !contains(github.event.issue.labels.*.name, 'agent:codex') && + !contains(github.event.issue.labels.*.name, 'automated') + + steps: + - name: Checkout Workflows repo + uses: actions/checkout@v6 + with: + repository: stranske/Workflows + path: workflows-repo + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + cd workflows-repo + pip install -e ".[langchain]" --quiet + + - name: Get repo labels + id: get-labels + uses: actions/github-script@v8 + with: + script: | + // Paginate to get all labels (handles repos with >100 labels) + const labels = await github.paginate( + github.rest.issues.listLabelsForRepo, + { + owner: context.repo.owner, + repo: context.repo.repo, + per_page: 100 + } + ); + + const labelData = labels.map(l => ({ + name: l.name, + description: l.description || '' + })); + + core.setOutput('labels_json', JSON.stringify(labelData)); + core.info(`Found ${labels.length} labels in repo`); + + - name: Match labels + id: match + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + LABELS_JSON: ${{ steps.get-labels.outputs.labels_json }} + ISSUE_TITLE: ${{ github.event.issue.title }} + ISSUE_BODY: ${{ github.event.issue.body }} + run: | + cd workflows-repo + python3 << 'PYTHON_SCRIPT' + import json + import os + import sys + + # Add scripts to path + sys.path.insert(0, '.') + + from scripts.langchain.label_matcher import ( + build_label_vector_store, + find_similar_labels, + LabelRecord, + ) + + # Get issue content + issue_title = os.environ.get('ISSUE_TITLE', '') + issue_body = 
os.environ.get('ISSUE_BODY', '') + query = f"{issue_title}\n\n{issue_body}" + + # Get thresholds + auto_threshold = float(os.environ.get('AUTO_APPLY_THRESHOLD', '0.90')) + suggest_threshold = float(os.environ.get('SUGGEST_THRESHOLD', '0.75')) + + # Parse labels + labels_json = os.environ.get('LABELS_JSON', '[]') + labels = json.loads(labels_json) + + if not labels: + print("No labels found in repo") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_suggestions=false\n') + sys.exit(0) + + # Build vector store + label_records = [ + LabelRecord(name=l['name'], description=l['description']) + for l in labels + ] + store = build_label_vector_store(label_records) + + if store is None: + print("Could not build label vector store (missing embeddings)") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_suggestions=false\n') + sys.exit(0) + + # Find matches + matches = find_similar_labels(store, query, threshold=suggest_threshold, k=5) + + if not matches: + print("No label matches found above threshold") + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_suggestions=false\n') + sys.exit(0) + + # Separate auto-apply from suggestions + auto_apply = [m for m in matches if m.score >= auto_threshold] + suggestions = [m for m in matches if suggest_threshold <= m.score < auto_threshold] + + print(f"Auto-apply labels ({auto_threshold}+ confidence):") + for m in auto_apply: + print(f" - {m.label.name}: {m.score:.2%}") + + print(f"Suggested labels ({suggest_threshold}-{auto_threshold} confidence):") + for m in suggestions: + print(f" - {m.label.name}: {m.score:.2%}") + + # Output results + auto_labels = json.dumps([m.label.name for m in auto_apply]) + suggest_json = json.dumps([ + {'name': m.label.name, 'score': f'{m.score:.0%}'} + for m in suggestions + ]) + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_suggestions=true\n') + f.write(f'auto_apply_labels={auto_labels}\n') + 
f.write(f'suggested_labels={suggest_json}\n') + + PYTHON_SCRIPT + + - name: Apply high-confidence labels + if: | + steps.match.outputs.has_suggestions == 'true' && + steps.match.outputs.auto_apply_labels != '[]' + uses: actions/github-script@v8 + with: + script: | + const autoApplyLabels = JSON.parse('${{ steps.match.outputs.auto_apply_labels }}'); + + if (autoApplyLabels.length === 0) { + core.info('No labels to auto-apply'); + return; + } + + // Get current labels + const { data: issue } = await github.rest.issues.get({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + + const currentLabels = issue.labels.map(l => l.name); + const newLabels = autoApplyLabels.filter(l => !currentLabels.includes(l)); + + if (newLabels.length === 0) { + core.info('All suggested labels already present'); + return; + } + + // Add labels + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: newLabels + }); + + core.info(`Applied labels: ${newLabels.join(', ')}`); + + - name: Post suggestion comment + if: | + steps.match.outputs.has_suggestions == 'true' && + steps.match.outputs.suggested_labels != '[]' + uses: actions/github-script@v8 + with: + script: | + const suggestedLabels = JSON.parse('${{ steps.match.outputs.suggested_labels }}'); + const autoApplied = JSON.parse('${{ steps.match.outputs.auto_apply_labels }}'); + + if (suggestedLabels.length === 0) { + core.info('No suggestions to post'); + return; + } + + // Build suggestion list + const suggestions = suggestedLabels + .map(l => `- \`${l.name}\` (${l.score} confidence)`) + .join('\n'); + + let body = `### 🏷️ Label Suggestions\n\n`; + body += `Based on the issue content, these labels might be relevant:\n\n`; + body += `${suggestions}\n\n`; + + if (autoApplied.length > 0) { + const appliedStr = autoApplied.map(l => `\`${l}\``).join(', '); + body += `**Auto-applied:** ${appliedStr}\n\n`; + } + 
+ body += `<details>\n<summary>How to use these suggestions</summary>\n\n`; + body += `- Click the label name in the sidebar to add it\n`; + const ghCmd = `gh issue edit ${context.issue.number} --add-label "label-name"`; + body += `- Or use the GitHub CLI: \`${ghCmd}\`\n`; + body += `</details>
\n\n`; + body += `---\n*Auto-generated by label matcher*`; + + // Check for existing suggestion comment + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + per_page: 30 + }); + + const existingComment = comments.find(c => + c.body.includes('### 🏷️ Label Suggestions') && + c.user.type === 'Bot' + ); + + if (existingComment) { + // Update existing comment + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: body + }); + core.info('Updated existing suggestion comment'); + } else { + // Create new comment + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + core.info('Posted label suggestions'); + } diff --git a/templates/consumer-repo/.github/workflows/agents-capability-check.yml b/templates/consumer-repo/.github/workflows/agents-capability-check.yml new file mode 100644 index 000000000..57ef05b81 --- /dev/null +++ b/templates/consumer-repo/.github/workflows/agents-capability-check.yml @@ -0,0 +1,215 @@ +name: Capability Check + +# Pre-flight check before agent assignment to identify blockers +# Uses capability_check.py to detect issues agents cannot complete + +on: + issues: + types: [labeled] + +permissions: + contents: read + issues: write + models: read + +jobs: + capability-check: + runs-on: ubuntu-latest + # Trigger when agent:codex is added (pre-agent gate) + if: github.event.label.name == 'agent:codex' + + steps: + - name: Checkout Workflows repo + uses: actions/checkout@v6 + with: + repository: stranske/Workflows + path: workflows-repo + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + cd workflows-repo + pip install -e ".[langchain]" --quiet + + - name: Extract issue content 
+ id: extract + uses: actions/github-script@v8 + with: + script: | + const issue = context.payload.issue; + const body = issue.body || ''; + + // Extract Tasks section + const tasksMatch = body.match(/## Tasks\s*\n([\s\S]*?)(?=##|$)/i); + const tasks = tasksMatch ? tasksMatch[1].trim() : ''; + + // Extract Acceptance Criteria section + const acceptanceMatch = body.match(/## Acceptance [Cc]riteria\s*\n([\s\S]*?)(?=##|$)/i); + const acceptance = acceptanceMatch ? acceptanceMatch[1].trim() : ''; + + // Write to files for Python script + const fs = require('fs'); + fs.writeFileSync('tasks.md', tasks || 'No tasks defined'); + fs.writeFileSync('acceptance.md', acceptance || 'No acceptance criteria defined'); + + core.setOutput('has_tasks', tasks ? 'true' : 'false'); + core.setOutput('has_acceptance', acceptance ? 'true' : 'false'); + + - name: Run capability check + id: check + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONPATH: ${{ github.workspace }}/workflows-repo + run: | + cd workflows-repo + python -c " + import json + import os + import sys + sys.path.insert(0, '.') + + from scripts.langchain.capability_check import check_capability + + # Read extracted content + tasks = open('../tasks.md').read() + acceptance = open('../acceptance.md').read() + + # Run capability check + result = check_capability(tasks, acceptance) + + if result is None: + print('::warning::Could not run capability check (LLM unavailable)') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('check_failed=true\n') + sys.exit(0) + + # Output results + result_dict = result.to_dict() + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('check_failed=false\n') + f.write(f'recommendation={result.recommendation}\n') + f.write(f'blocked_count={len(result.blocked_tasks)}\n') + f.write(f'partial_count={len(result.partial_tasks)}\n') + f.write(f'result_json={json.dumps(result_dict)}\n') + + print(f'Recommendation: 
{result.recommendation}') + print(f'Blocked tasks: {len(result.blocked_tasks)}') + print(f'Partial tasks: {len(result.partial_tasks)}') + print(f'Actionable tasks: {len(result.actionable_tasks)}') + " + + - name: Add needs-human label if blocked + if: steps.check.outputs.recommendation == 'BLOCKED' + uses: actions/github-script@v8 + with: + script: | + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + labels: ['needs-human'] + }); + + // Remove agent:codex since agent can't complete this + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + name: 'agent:codex' + }); + } catch (e) { + core.warning('Could not remove agent:codex label'); + } + + - name: Post capability report + if: steps.check.outputs.check_failed != 'true' + uses: actions/github-script@v8 + env: + RESULT_JSON: ${{ steps.check.outputs.result_json }} + RECOMMENDATION: ${{ steps.check.outputs.recommendation }} + with: + script: | + const result = JSON.parse(process.env.RESULT_JSON || '{}'); + const recommendation = process.env.RECOMMENDATION || 'UNKNOWN'; + + let emoji = '✅'; + let status = 'Agent can proceed'; + if (recommendation === 'BLOCKED') { + emoji = '🚫'; + status = 'Agent cannot complete this issue'; + } else if (recommendation === 'REVIEW_NEEDED') { + emoji = '⚠️'; + status = 'Some tasks may need human assistance'; + } + + let body = `### ${emoji} Capability Check: ${status}\n\n`; + body += `**Recommendation:** ${recommendation}\n\n`; + + if (result.actionable_tasks && result.actionable_tasks.length > 0) { + body += `**✅ Actionable Tasks (${result.actionable_tasks.length}):**\n`; + result.actionable_tasks.forEach(t => { body += `- ${t}\n`; }); + body += '\n'; + } + + if (result.partial_tasks && result.partial_tasks.length > 0) { + body += `**⚠️ Partial Tasks (${result.partial_tasks.length}):**\n`; + 
result.partial_tasks.forEach(t => { + body += `- ${t.task}\n - *Limitation:* ${t.limitation}\n`; + }); + body += '\n'; + } + + if (result.blocked_tasks && result.blocked_tasks.length > 0) { + body += `**🚫 Blocked Tasks (${result.blocked_tasks.length}):**\n`; + result.blocked_tasks.forEach(t => { + body += `- ${t.task}\n - *Reason:* ${t.reason}\n`; + if (t.suggested_action) { + body += ` - *Suggested Action:* ${t.suggested_action}\n`; + } + }); + body += '\n'; + } + + if (result.human_actions_needed && result.human_actions_needed.length > 0) { + body += `**👤 Human Actions Needed:**\n`; + result.human_actions_needed.forEach(a => { body += `- ${a}\n`; }); + body += '\n'; + } + + body += `---\n*Auto-generated by capability check*`; + + // Check for existing comment + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + per_page: 50 + }); + + const existingComment = comments.find(c => + c.body.includes('### ✅ Capability Check') || + c.body.includes('### ⚠️ Capability Check') || + c.body.includes('### 🚫 Capability Check') + ); + + if (existingComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: body + }); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + } diff --git a/templates/consumer-repo/.github/workflows/agents-decompose.yml b/templates/consumer-repo/.github/workflows/agents-decompose.yml new file mode 100644 index 000000000..aadd1fc1b --- /dev/null +++ b/templates/consumer-repo/.github/workflows/agents-decompose.yml @@ -0,0 +1,195 @@ +name: Task Decomposition + +# Decomposes large issues into smaller, actionable sub-tasks +# Uses task_decomposer.py for intelligent task splitting + +on: + issues: + types: [labeled] + +permissions: + contents: 
read + issues: write + models: read + +jobs: + decompose: + runs-on: ubuntu-latest + # Trigger when agents:decompose label is added + if: github.event.label.name == 'agents:decompose' + + steps: + - name: Checkout Workflows repo + uses: actions/checkout@v6 + with: + repository: stranske/Workflows + path: workflows-repo + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + cd workflows-repo + pip install -e ".[langchain]" --quiet + + - name: Extract issue content + id: extract + uses: actions/github-script@v8 + with: + script: | + const issue = context.payload.issue; + const body = issue.body || ''; + const title = issue.title || ''; + + // Extract Tasks section + const tasksMatch = body.match(/## Tasks\s*\n([\s\S]*?)(?=##|$)/i); + const tasks = tasksMatch ? tasksMatch[1].trim() : ''; + + // Extract Scope section + const scopeMatch = body.match(/## Scope\s*\n([\s\S]*?)(?=##|$)/i); + const scope = scopeMatch ? scopeMatch[1].trim() : ''; + + // Build context for decomposition + const context_text = [ + `# ${title}`, + '', + scope ? `## Scope\n${scope}` : '', + '', + tasks ? `## Current Tasks\n${tasks}` : 'No tasks defined' + ].filter(Boolean).join('\n'); + + const fs = require('fs'); + fs.writeFileSync('issue_context.md', context_text); + + core.setOutput('issue_title', title); + core.setOutput('has_tasks', tasks ? 
'true' : 'false'); + + - name: Decompose tasks + id: decompose + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONPATH: ${{ github.workspace }}/workflows-repo + run: | + cd workflows-repo + python -c " + import json + import os + import sys + sys.path.insert(0, '.') + + from scripts.langchain.task_decomposer import decompose_task + + # Read issue context + context = open('../issue_context.md').read() + + # Decompose the task + result = decompose_task(context) + + if result is None: + print('::warning::Could not decompose task (LLM unavailable)') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('decompose_failed=true\n') + sys.exit(0) + + # Output results + subtasks = result.get('subtasks', []) + + # Build markdown list + subtask_md = '\n'.join([f'- [ ] {t}' for t in subtasks]) + + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('decompose_failed=false\n') + f.write(f'subtask_count={len(subtasks)}\n') + + # Write subtasks to file for multiline handling + with open('../subtasks.md', 'w') as f: + f.write(subtask_md) + + print(f'Generated {len(subtasks)} subtasks') + for t in subtasks: + print(f' - {t}') + " + + - name: Post decomposition comment + if: steps.decompose.outputs.decompose_failed != 'true' + uses: actions/github-script@v8 + env: + SUBTASK_COUNT: ${{ steps.decompose.outputs.subtask_count }} + with: + script: | + const fs = require('fs'); + const subtasks = fs.readFileSync('subtasks.md', 'utf8'); + const count = parseInt(process.env.SUBTASK_COUNT || '0'); + + if (count === 0) { + core.info('No subtasks generated'); + return; + } + + let body = `### 📋 Task Decomposition\n\n`; + body += `This issue has been analyzed and broken down into **${count} sub-tasks**.\n\n`; + body += `**Suggested Sub-Tasks:**\n\n`; + body += subtasks + '\n\n'; + body += `
<details>\n<summary>How to use these sub-tasks</summary>\n\n`; + body += `**Option 1: Update this issue**\n`; + body += `Copy the sub-tasks above and `; + body += `replace the Tasks section in the issue body.\n\n`; + body += `**Option 2: Create child issues**\n`; + body += `For larger efforts, create a separate issue `; + body += `for each sub-task and link them here.\n\n`; + body += `**Option 3: Use as-is**\n`; + body += `Work through the sub-tasks sequentially, `; + body += `checking off as you complete each one.\n`; + body += `</details>
\n\n`; + body += `---\n*Auto-generated by task decomposer*`; + + // Check for existing comment + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + per_page: 50 + }); + + const existingComment = comments.find(c => + c.body.includes('### 📋 Task Decomposition') + ); + + if (existingComment) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: existingComment.id, + body: body + }); + core.info('Updated existing decomposition comment'); + } else { + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + core.info('Posted decomposition comment'); + } + + - name: Remove trigger label + uses: actions/github-script@v8 + continue-on-error: true + with: + script: | + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + name: 'agents:decompose' + }); + core.info('Removed agents:decompose label'); + } catch (error) { + core.warning('Could not remove label: ' + error.message); + } diff --git a/templates/consumer-repo/.github/workflows/agents-dedup.yml b/templates/consumer-repo/.github/workflows/agents-dedup.yml new file mode 100644 index 000000000..6be508f67 --- /dev/null +++ b/templates/consumer-repo/.github/workflows/agents-dedup.yml @@ -0,0 +1,198 @@ +name: Duplicate Detection + +# Detects potential duplicate issues using semantic similarity +# Uses issue_dedup.py for embedding-based matching + +on: + issues: + types: [opened] + +permissions: + contents: read + issues: write + models: read + +env: + # Similarity threshold for flagging duplicates (0.0-1.0) + # 0.85 = very similar, reduces false positives + SIMILARITY_THRESHOLD: "0.85" + +jobs: + dedup: + runs-on: ubuntu-latest + # Skip issues created by bots to avoid 
noise + if: github.event.issue.user.type != 'Bot' + + steps: + - name: Checkout Workflows repo + uses: actions/checkout@v6 + with: + repository: stranske/Workflows + path: workflows-repo + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.12" + + - name: Install dependencies + run: | + cd workflows-repo + pip install -e ".[langchain]" --quiet + + - name: Get open issues + id: get-issues + uses: actions/github-script@v8 + with: + script: | + // Get all open issues (excluding this one) + const { data: issues } = await github.rest.issues.listForRepo({ + owner: context.repo.owner, + repo: context.repo.repo, + state: 'open', + per_page: 100 + }); + + // Filter out the current issue and PRs + const otherIssues = issues.filter(i => + i.number !== context.issue.number && + !i.pull_request + ); + + // Simplify for Python + const issueData = otherIssues.map(i => ({ + number: i.number, + title: i.title, + body: i.body || '', + html_url: i.html_url + })); + + const fs = require('fs'); + fs.writeFileSync('open_issues.json', JSON.stringify(issueData, null, 2)); + + core.setOutput('issue_count', issueData.length); + core.info(`Found ${issueData.length} other open issues to compare against`); + + - name: Check for duplicates + id: check + if: steps.get-issues.outputs.issue_count > 0 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} + PYTHONPATH: ${{ github.workspace }}/workflows-repo + NEW_ISSUE_TITLE: ${{ github.event.issue.title }} + NEW_ISSUE_BODY: ${{ github.event.issue.body }} + run: | + cd workflows-repo + python -c " + import json + import os + import sys + sys.path.insert(0, '.') + + from scripts.langchain.issue_dedup import ( + build_issue_vector_store, + find_similar_issues, + IssueRecord, + ) + + # Load open issues + with open('../open_issues.json') as f: + issues_data = json.load(f) + + if not issues_data: + print('No issues to compare against') + with open(os.environ['GITHUB_OUTPUT'], 
'a') as f: + f.write('has_duplicates=false\n') + sys.exit(0) + + # Build vector store + issues = [IssueRecord( + number=i['number'], + title=i['title'], + body=i['body'], + url=i['html_url'] + ) for i in issues_data] + + store = build_issue_vector_store(issues) + + if store is None: + print('::warning::Could not build vector store (embeddings unavailable)') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_duplicates=false\n') + sys.exit(0) + + # Check new issue against existing + new_title = os.environ.get('NEW_ISSUE_TITLE', '') + new_body = os.environ.get('NEW_ISSUE_BODY', '') + query = f'{new_title}\n\n{new_body}' + + threshold = float(os.environ.get('SIMILARITY_THRESHOLD', '0.85')) + matches = find_similar_issues(store, query, threshold=threshold, k=3) + + if not matches: + print('No duplicates found above threshold') + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_duplicates=false\n') + sys.exit(0) + + # Output results + duplicates = [{ + 'number': m.issue.number, + 'title': m.issue.title, + 'url': m.issue.url, + 'score': f'{m.score:.0%}' + } for m in matches] + + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write('has_duplicates=true\n') + f.write(f'duplicate_count={len(duplicates)}\n') + + # Write to file for GitHub script + with open('../duplicates.json', 'w') as f: + json.dump(duplicates, f) + + print(f'Found {len(duplicates)} potential duplicates:') + for d in duplicates: + print(f' - #{d[\"number\"]}: {d[\"title\"]} ({d[\"score\"]})') + " + + - name: Post duplicate warning + if: steps.check.outputs.has_duplicates == 'true' + uses: actions/github-script@v8 + with: + script: | + const fs = require('fs'); + const duplicates = JSON.parse(fs.readFileSync('duplicates.json', 'utf8')); + + if (duplicates.length === 0) { + return; + } + + let body = `### ⚠️ Potential Duplicate Detected\n\n`; + body += `This issue appears similar to existing open issues:\n\n`; + + duplicates.forEach(d => { + body += `- **#${d.number}** 
- [${d.title}](${d.url}) (${d.score} similarity)\n`; + }); + + body += `\n
<details>\n<summary>What should I do?</summary>\n\n`; + body += `1. **Review the linked issues** `; + body += `to see if they address the same problem\n`; + body += `2. **If duplicate:** Close this issue `; + body += `and add your context to the existing one\n`; + body += `3. **If different:** Add a comment `; + body += `explaining how this issue is distinct\n`; + body += `4. **If related:** Link the issues and keep both open\n`; + body += `</details>
\n\n`; + body += `---\n*Auto-generated by duplicate detection • `; + body += `False positive? Just ignore this comment.*`; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: body + }); + + core.info(`Posted duplicate warning for ${duplicates.length} potential matches`); diff --git a/templates/consumer-repo/.github/workflows/agents-verify-to-issue.yml b/templates/consumer-repo/.github/workflows/agents-verify-to-issue.yml new file mode 100644 index 000000000..469ae9e99 --- /dev/null +++ b/templates/consumer-repo/.github/workflows/agents-verify-to-issue.yml @@ -0,0 +1,206 @@ +name: Create Issue from Verification + +# Creates a follow-up issue from verification feedback when user adds verify:create-issue label +# This is a user-triggered workflow (not automatic) to avoid aggressive issue creation + +on: + pull_request_target: + types: [labeled] + +permissions: + contents: read + pull-requests: write + issues: write + +jobs: + create-issue: + if: github.event.label.name == 'verify:create-issue' + runs-on: ubuntu-latest + steps: + - name: Check PR is merged + id: check-merged + uses: actions/github-script@v8 + with: + script: | + const pr = context.payload.pull_request; + if (!pr.merged) { + core.setFailed('PR must be merged before creating follow-up issue'); + return; + } + core.setOutput('merged', 'true'); + + - name: Find and extract verification feedback + id: extract + if: steps.check-merged.outputs.merged == 'true' + uses: actions/github-script@v8 + with: + script: | + const { data: comments } = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.pull_request.number, + per_page: 100 + }); + + // Look for verification report comment + const verifyComment = comments.find(c => + c.body.includes('## PR Verification Report') || + c.body.includes('## PR Verification Comparison') || + 
c.body.includes('### Concerns') || + c.body.includes('Verdict:') + ); + + if (!verifyComment) { + const msg = 'No verification comment found on this PR. '; + core.setFailed(msg + 'Add verify:evaluate or verify:compare label first.'); + return; + } + + const comment = verifyComment.body; + core.info('Found verification comment'); + + // Extract CONCERNS section + const concernsMatch = comment.match(/### Concerns\s*\n([\s\S]*?)(?=###|##|$)/i); + let concerns = concernsMatch ? concernsMatch[1].trim() : ''; + + // Also try alternate formats + if (!concerns) { + const altMatch = comment.match(/\*\*Concerns:\*\*\s*([\s\S]*?)(?=\*\*|##|$)/i); + concerns = altMatch ? altMatch[1].trim() : ''; + } + + // Extract low scores (anything < 7/10) + const scoreMatches = [...comment.matchAll(/(\w+):\s*(\d+)\/10/gi)]; + const lowScores = scoreMatches + .filter(m => parseInt(m[2]) < 7) + .map(m => `- ${m[1]}: ${m[2]}/10`); + + // Extract verdict + const verdictMatch = comment.match(/Verdict:\s*\*?\*?(\w+)\*?\*?/i); + const verdict = verdictMatch ? verdictMatch[1] : 'Unknown'; + + // Build summary + let summary = ''; + if (concerns) { + summary += '### Concerns from Verification\n\n' + concerns + '\n\n'; + } + if (lowScores.length > 0) { + summary += '### Scores Below 7/10\n\n' + lowScores.join('\n') + '\n\n'; + } + if (!summary) { + summary = 'No specific concerns extracted from verification report.'; + summary += '\n\nPlease review the original verification comment for details.'; + } + + // Set outputs using environment file (handles multi-line content) + const fs = require('fs'); + const envFile = process.env.GITHUB_OUTPUT; + + // Use delimiter for multi-line output + const delim = 'EOF_' + Math.random().toString(36).substring(2); + const outLine = `concerns_summary<<${delim}\n${summary}\n${delim}\n`; + fs.appendFileSync(envFile, outLine); + + core.setOutput('verdict', verdict); + core.setOutput('has_concerns', (concerns || lowScores.length > 0) ? 
'true' : 'false'); + + - name: Create follow-up issue + id: create-issue + if: steps.check-merged.outputs.merged == 'true' + uses: actions/github-script@v8 + env: + VERDICT: ${{ steps.extract.outputs.verdict }} + CONCERNS_SUMMARY: ${{ steps.extract.outputs.concerns_summary }} + with: + script: | + const prNumber = context.payload.pull_request.number; + const prTitle = context.payload.pull_request.title; + const prUrl = context.payload.pull_request.html_url; + const concernsSummary = process.env.CONCERNS_SUMMARY || 'No concerns extracted.'; + const verdict = process.env.VERDICT || 'Unknown'; + + const issueBody = [ + '## Follow-up from PR #' + prNumber, + '', + 'Original PR: [#' + prNumber + ' - ' + prTitle + '](' + prUrl + ')', + 'Verification Verdict: ' + verdict, + '', + '---', + '', + concernsSummary, + '', + '## Suggested Tasks', + '', + '- [ ] Review the concerns identified above', + '- [ ] Address each issue or document why it is not applicable', + '- [ ] Update tests if needed', + '- [ ] Consider re-verification after changes', + '', + '---', + '', + '## Context', + '', + 'This issue was created from verification feedback on a merged PR.', + '', + '
', + 'How to use this issue', + '', + '1. Add `agents:optimize` label to get AI-suggested improvements', + '2. Add `agents:apply-suggestions` to format for agent work', + '3. Add `agent:codex` to assign to an agent', + '', + 'Or work on it manually - the choice is yours!', + '', + '
', + '', + '---', + '*Auto-generated by verify-to-issue workflow*' + ].join('\n'); + + const issue = await github.rest.issues.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: '[Follow-up] Address verification concerns from PR #' + prNumber, + body: issueBody, + labels: ['follow-up', 'agents:optimize'] + }); + + core.info('Created issue #' + issue.data.number); + core.setOutput('issue_number', issue.data.number); + core.setOutput('issue_url', issue.data.html_url); + + - name: Comment on original PR + if: steps.check-merged.outputs.merged == 'true' + uses: actions/github-script@v8 + env: + ISSUE_NUMBER: ${{ steps.create-issue.outputs.issue_number }} + with: + script: | + const issueNumber = process.env.ISSUE_NUMBER; + let body = '📋 Follow-up issue created: #' + issueNumber + '\n\n'; + body += 'Verification concerns have been captured in the new issue for tracking.'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.pull_request.number, + body: body + }); + + - name: Remove trigger label + if: steps.check-merged.outputs.merged == 'true' + uses: actions/github-script@v8 + continue-on-error: true + with: + script: | + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.pull_request.number, + name: 'verify:create-issue' + }); + core.info('Removed verify:create-issue label'); + } catch (error) { + core.warning('Could not remove label: ' + error.message); + } diff --git a/tests/scripts/test_pr_verifier_fallback.py b/tests/scripts/test_pr_verifier_fallback.py new file mode 100644 index 000000000..967c15197 --- /dev/null +++ b/tests/scripts/test_pr_verifier_fallback.py @@ -0,0 +1,109 @@ +"""Tests for PR verifier auth error fallback.""" + +import scripts.langchain.pr_verifier as pr_verifier + + +class FakeResponse: + def __init__(self, content: str) -> None: + self.content = content + + 
+class FakeClient: + def __init__(self, name: str, succeed: bool = True) -> None: + self.name = name + self.succeed = succeed + self.invoked = False + + def invoke(self, prompt: str) -> FakeResponse: + self.invoked = True + if not self.succeed: + raise Exception("401 Unauthorized: models permission required") + return FakeResponse( + '{"verdict":"PASS","scores":{"correctness":8,"completeness":7,' + '"quality":7,"testing":6,"risks":5},"concerns":[],"summary":"ok"}' + ) + + +def test_is_auth_error_detects_401() -> None: + exc = Exception("401 Unauthorized: models permission required") + assert pr_verifier._is_auth_error(exc) is True + + +def test_is_auth_error_detects_forbidden() -> None: + exc = Exception("403 Forbidden: access denied") + assert pr_verifier._is_auth_error(exc) is True + + +def test_is_auth_error_detects_permission() -> None: + exc = Exception("Error: permission denied for models API") + assert pr_verifier._is_auth_error(exc) is True + + +def test_is_auth_error_rejects_other_errors() -> None: + exc = Exception("Rate limit exceeded") + assert pr_verifier._is_auth_error(exc) is False + + +def test_evaluate_pr_falls_back_on_auth_error(monkeypatch) -> None: + """When primary provider fails with auth error, fallback to alternate provider.""" + primary_client = FakeClient("github-models", succeed=False) + fallback_client = FakeClient("openai", succeed=True) + + call_count = [0] + + def mock_get_client(model=None, provider=None): + call_count[0] += 1 + if call_count[0] == 1: + # First call: return failing github-models client + return (primary_client, "github-models/gpt-4o") + else: + # Second call: return working openai client + return (fallback_client, "openai/gpt-4o") + + monkeypatch.setattr(pr_verifier, "_get_llm_client", mock_get_client) + monkeypatch.setattr(pr_verifier, "_prepare_prompt", lambda ctx, diff: "test prompt") + + result = pr_verifier.evaluate_pr("test context") + + assert primary_client.invoked is True + assert fallback_client.invoked is 
True + assert result.verdict == "PASS" + assert result.provider_used == "openai/gpt-4o" + assert "fallback" in (result.error or "").lower() + + +def test_evaluate_pr_no_fallback_when_provider_explicit(monkeypatch) -> None: + """When provider is explicitly specified, don't fallback on auth error.""" + primary_client = FakeClient("github-models", succeed=False) + + def mock_get_client(model=None, provider=None): + return (primary_client, "github-models/gpt-4o") + + monkeypatch.setattr(pr_verifier, "_get_llm_client", mock_get_client) + monkeypatch.setattr(pr_verifier, "_prepare_prompt", lambda ctx, diff: "test prompt") + + # Explicitly request github-models - should NOT fallback + result = pr_verifier.evaluate_pr("test context", provider="github-models") + + assert primary_client.invoked is True + assert result.used_llm is False # Fallback evaluation + assert "401" in (result.error or "") + + +def test_evaluate_pr_no_fallback_on_non_auth_error(monkeypatch) -> None: + """When error is not auth-related, don't attempt fallback.""" + + class RateLimitClient: + def invoke(self, prompt: str): + raise Exception("Rate limit exceeded") + + def mock_get_client(model=None, provider=None): + return (RateLimitClient(), "github-models/gpt-4o") + + monkeypatch.setattr(pr_verifier, "_get_llm_client", mock_get_client) + monkeypatch.setattr(pr_verifier, "_prepare_prompt", lambda ctx, diff: "test prompt") + + result = pr_verifier.evaluate_pr("test context") + + assert result.used_llm is False + assert "Rate limit" in (result.error or "")