diff --git a/.github/scripts/conflict_detector.js b/.github/scripts/conflict_detector.js index 12ab605f..148c8244 100644 --- a/.github/scripts/conflict_detector.js +++ b/.github/scripts/conflict_detector.js @@ -19,6 +19,34 @@ const IGNORED_CONFLICT_FILES = [ 'residual-trend-history.ndjson', ]; +// Comments from automation often mention "conflict" but should not block execution. +const IGNORED_COMMENT_AUTHORS = new Set([ + 'github-actions[bot]', + 'github-merge-queue[bot]', + 'dependabot[bot]', + 'github', +]); + +const IGNORED_COMMENT_MARKERS = [ + '`, + '', + `Closes #${issueNumber}`, + '', + issue.body || '' + ].join('\n'); + + try { + const { data: pr } = await github.rest.pulls.create({ + owner: context.repo.owner, + repo: context.repo.repo, + title: prTitle, + head: branchName, + base: 'main', + body: prBody + }); + + core.info(`Created PR #${pr.number}`); + + // Add standard agent labels to the PR (separate try-catch to not fail PR creation) + let labelsAdded = false; + try { + await github.rest.issues.addLabels({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: pr.number, + labels: ['agent:codex', 'agents:keepalive', 'autofix'] + }); + labelsAdded = true; + core.info(`Added agent labels to PR #${pr.number}`); + } catch (labelError) { + const errMsg = labelError?.message || String(labelError); + core.warning(`Failed to add labels to PR #${pr.number}: ${errMsg}`); + } + + const labelStatus = labelsAdded + ? '✅ Added labels: `agent:codex`, `agents:keepalive`, `autofix`' + : '⚠️ Could not add labels (add manually)'; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: `🤖 **Auto-pilot step ${stepCount}**: PR created! + + ✅ Created PR #${pr.number} from branch \`${branchName}\` + ${labelStatus} + + The PR will now go through CI checks. 
Auto-pilot will continue monitoring.` + }); + + } catch (e) { + if (e.status === 422 && e.message?.includes('already exists')) { + core.info('PR already exists - this is fine'); + } else { + // PR creation failed - report but don't fail workflow + core.warning(`Failed to create PR: ${e.message}`); + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: `🤖 **Auto-pilot step ${stepCount}**: Could not create PR + + ⚠️ Branch \`${branchName}\` exists but PR creation failed. + + Error: ${e.message} + + Please create the PR manually or check permissions.` + }); + } + } + + - name: Report - Monitoring PR + if: steps.next.outputs.next_step == 'monitor-pr' + uses: actions/github-script@v7 + with: + script: | + const prNumber = '${{ steps.context.outputs.linked_pr }}'; + core.info(`PR #${prNumber} exists. Keepalive and autofix will handle CI.`); + + - name: Report - Done + if: steps.next.outputs.next_step == 'done' + uses: actions/github-script@v7 + env: + ISSUE_NUMBER: ${{ steps.context.outputs.issue_number }} + with: + script: | + const issueNumber = parseInt(process.env.ISSUE_NUMBER); + + // Remove auto-pilot label since we're done + try { + await github.rest.issues.removeLabel({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + name: 'agents:auto-pilot' + }); + } catch (e) { + // Label might already be removed (404) - that's OK + if (e && e.status === 404) { + core.info('Auto-pilot label already removed or not found'); + } else { + core.warning(`Unexpected error removing auto-pilot label: ${e?.message || e}`); + } + } + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: issueNumber, + body: `## ✅ Auto-Pilot Complete + + This issue has been fully processed: + - ✅ Issue formatted and optimized + - ✅ Agent assigned and PR created + - ✅ PR merged + - ✅ Verification triggered + + Thank you for 
using auto-pilot! 🚀` + }); + + core.info('Auto-pilot complete!'); diff --git a/.github/workflows/agents-dedup.yml b/.github/workflows/agents-dedup.yml index 6be508f6..5820afbd 100644 --- a/.github/workflows/agents-dedup.yml +++ b/.github/workflows/agents-dedup.yml @@ -14,8 +14,9 @@ permissions: env: # Similarity threshold for flagging duplicates (0.0-1.0) - # 0.85 = very similar, reduces false positives - SIMILARITY_THRESHOLD: "0.85" + # 0.92 = very high similarity required, reduces false positives from + # issues in the same domain/feature area that share vocabulary + SIMILARITY_THRESHOLD: "0.92" jobs: dedup: @@ -24,20 +25,16 @@ jobs: if: github.event.issue.user.type != 'Bot' steps: - - name: Checkout Workflows repo - uses: actions/checkout@v6 - with: - repository: stranske/Workflows - path: workflows-repo + - name: Checkout repository + uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: - python-version: "3.12" + python-version: "3.11" - name: Install dependencies run: | - cd workflows-repo pip install -e ".[langchain]" --quiet - name: Get open issues @@ -79,11 +76,10 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} - PYTHONPATH: ${{ github.workspace }}/workflows-repo + PYTHONPATH: ${{ github.workspace }} NEW_ISSUE_TITLE: ${{ github.event.issue.title }} NEW_ISSUE_BODY: ${{ github.event.issue.body }} run: | - cd workflows-repo python -c " import json import os @@ -97,7 +93,7 @@ jobs: ) # Load open issues - with open('../open_issues.json') as f: + with open('open_issues.json') as f: issues_data = json.load(f) if not issues_data: @@ -127,9 +123,31 @@ jobs: new_body = os.environ.get('NEW_ISSUE_BODY', '') query = f'{new_title}\n\n{new_body}' - threshold = float(os.environ.get('SIMILARITY_THRESHOLD', '0.85')) + threshold = float(os.environ.get('SIMILARITY_THRESHOLD', '0.92')) matches = find_similar_issues(store, query, threshold=threshold, k=3) + # Additional filter: require title 
similarity for true duplicates + # This reduces false positives from issues in the same domain/feature area + # that share vocabulary but are different tasks + filtered_matches = [] + new_title_lower = new_title.lower().strip() + for m in matches: + match_title_lower = m.issue.title.lower().strip() + # Check for significant title overlap + title_words_new = set(new_title_lower.split()) + title_words_match = set(match_title_lower.split()) + shared_words = title_words_new.intersection(title_words_match) + # Keep a match when score >= 0.95 or shared words are >= 40% of the larger title word set + max_words = max(len(title_words_new), len(title_words_match), 1) + overlap_ratio = len(shared_words) / max_words + if m.score >= 0.95 or overlap_ratio >= 0.4: + filtered_matches.append(m) + print(f' Match #{m.issue.number}: {m.score:.0%}, overlap={overlap_ratio:.0%}') + else: + print(f' Skip #{m.issue.number}: {m.score:.0%}, overlap={overlap_ratio:.0%}') + + matches = filtered_matches + if not matches: print('No duplicates found above threshold') with open(os.environ['GITHUB_OUTPUT'], 'a') as f: @@ -149,7 +167,7 @@ jobs: f.write(f'duplicate_count={len(duplicates)}\n') # Write to file for GitHub script - with open('../duplicates.json', 'w') as f: + with open('duplicates.json', 'w') as f: json.dump(duplicates, f) print(f'Found {len(duplicates)} potential duplicates:') diff --git a/.github/workflows/agents-verifier.yml b/.github/workflows/agents-verifier.yml index 1eaeb104..37bfce60 100644 --- a/.github/workflows/agents-verifier.yml +++ b/.github/workflows/agents-verifier.yml @@ -38,23 +38,20 @@ on: default: 'evaluate' model: description: >- - Model for evaluation. GitHub Models: gpt-4o, gpt-4o-mini, - text-embedding-3-large, text-embedding-3-small, Meta-Llama-3.1-405B-Instruct, - Meta-Llama-3.1-70B-Instruct, Meta-Llama-3-70B-Instruct | - OpenAI: gpt-5.2, o1, o1-preview, o1-mini, o3-mini (if available), gpt-4o, gpt-4o-mini, - gpt-4-turbo, gpt-4, gpt-3.5-turbo. 
Use script scripts/update_model_list.sh - to check current availability. + Model for evaluation. GitHub Models: gpt-4o (default), Mistral-large-2407, + Meta-Llama-3.1-405B-Instruct | OpenAI (requires key): o1, gpt-5.2. + For stricter evaluation, use compare mode with different model families. required: false type: string - default: 'gpt-4o-mini' + default: 'gpt-4o' model2: description: >- - Second model for compare mode. High quality options: - GitHub Models: gpt-4o, Meta-Llama-3.1-405B-Instruct | - OpenAI: gpt-5.2, o1, gpt-4o, gpt-4-turbo. Efficient: gpt-4o-mini, o1-mini + Second model for compare mode (cross-provider verification). + Default: Mistral-large-2407 (GitHub Models) paired with gpt-5.2 (OpenAI). + Using different providers ensures diverse evaluation perspectives. required: false type: string - default: '' + default: 'Mistral-large-2407' provider: description: 'LLM provider (OpenAI requires OPENAI_API_KEY secret)' required: true @@ -156,12 +153,13 @@ jobs: core.info(`Verifier triggered with mode: ${mode}`); core.setOutput('should_run', 'true'); core.setOutput('mode', mode); - // For compare mode, use high-quality models from different providers + // For compare mode, use models from different families/providers if (mode === 'compare') { - core.setOutput('model', 'gpt-4o'); // GitHub Models - current flagship - core.setOutput('model2', 'gpt-5.2'); // OpenAI - GPT-5.2 + // gpt-5.2 (OpenAI) + Mistral-large (GitHub Models) for cross-provider comparison + core.setOutput('model', 'gpt-5.2'); + core.setOutput('model2', 'Mistral-large-2407'); } else { - core.setOutput('model', ''); // Use default + core.setOutput('model', ''); // Use default (gpt-4o) core.setOutput('model2', ''); } core.setOutput('provider', ''); // Use default diff --git a/scripts/langchain/issue_formatter.py b/scripts/langchain/issue_formatter.py index c527299f..d89d64d2 100755 --- a/scripts/langchain/issue_formatter.py +++ b/scripts/langchain/issue_formatter.py @@ -309,6 +309,62 @@ def 
_append_raw_issue_section(formatted: str, issue_body: str) -> str: return f"{formatted.rstrip()}{details}\n" +def _extract_tasks_from_formatted(body: str) -> list[str]: + lines = body.splitlines() + header = "## Tasks" + try: + header_idx = next(i for i, line in enumerate(lines) if line.strip() == header) + except StopIteration: + return [] + end_idx = next( + ( + i + for i in range(header_idx + 1, len(lines)) + if lines[i].startswith("## ") and lines[i].strip() != header + ), + len(lines), + ) + tasks: list[str] = [] + for line in lines[header_idx + 1 : end_idx]: + if not line.strip(): + continue + match = LIST_ITEM_REGEX.match(line) + if not match: + continue + indent, _, remainder = match.groups() + if indent.strip(): + continue + text = remainder.strip() + checkbox = CHECKBOX_REGEX.match(text) + if checkbox: + text = checkbox.group(2).strip() + if not text or text == "_Not provided._": + continue + tasks.append(text) + return tasks + + +def _apply_task_decomposition(formatted: str, *, use_llm: bool) -> str: + tasks = _extract_tasks_from_formatted(formatted) + if not tasks: + return formatted + + from scripts.langchain import task_decomposer + + suggestions: list[dict[str, Any]] = [] + for task in tasks: + decomposition = task_decomposer.decompose_task(task, use_llm=use_llm) + sub_tasks = decomposition.get("sub_tasks") or [] + if sub_tasks: + suggestions.append({"task": task, "split_suggestions": sub_tasks}) + if not suggestions: + return formatted + + from scripts.langchain import issue_optimizer + + return issue_optimizer._apply_task_decomposition(formatted, {"task_splitting": suggestions}) + + def format_issue_body(issue_body: str, *, use_llm: bool = True) -> dict[str, Any]: if not issue_body: issue_body = "" @@ -327,6 +383,7 @@ def format_issue_body(issue_body: str, *, use_llm: bool = True) -> dict[str, Any content = getattr(response, "content", None) or str(response) formatted = content.strip() if _formatted_output_valid(formatted): + formatted = 
_apply_task_decomposition(formatted, use_llm=use_llm) formatted = _append_raw_issue_section(formatted, issue_body) return { "formatted_body": formatted, @@ -337,7 +394,9 @@ def format_issue_body(issue_body: str, *, use_llm: bool = True) -> dict[str, Any # Fall through to fallback if LLM fails (import, auth, API errors) pass - formatted = _append_raw_issue_section(_format_issue_fallback(issue_body), issue_body) + formatted = _format_issue_fallback(issue_body) + formatted = _apply_task_decomposition(formatted, use_llm=use_llm) + formatted = _append_raw_issue_section(formatted, issue_body) return { "formatted_body": formatted, "provider_used": None, diff --git a/tools/llm_provider.py b/tools/llm_provider.py index 2db6c931..08bd23e6 100644 --- a/tools/llm_provider.py +++ b/tools/llm_provider.py @@ -31,7 +31,9 @@ # GitHub Models API endpoint (OpenAI-compatible) GITHUB_MODELS_BASE_URL = "https://models.inference.ai.azure.com" -DEFAULT_MODEL = "gpt-4o-mini" +# Use gpt-4o for evaluation - best available on GitHub Models +# gpt-4o-mini was too lenient and passed obvious deficiencies +DEFAULT_MODEL = "gpt-4o" def _setup_langsmith_tracing() -> bool: