diff --git a/.github/scripts/__tests__/keepalive-loop.test.js b/.github/scripts/__tests__/keepalive-loop.test.js index 0e34cee77..cd51462b9 100644 --- a/.github/scripts/__tests__/keepalive-loop.test.js +++ b/.github/scripts/__tests__/keepalive-loop.test.js @@ -1989,3 +1989,123 @@ test('normaliseChecklistSection preserves non-list content', () => { assert.equal(result, expected); }); + +test('updateKeepaliveLoopSummary displays LLM provider analysis details', async () => { + const existingState = formatStateComment({ + trace: 'trace-llm', + iteration: 1, + max_iterations: 5, + failure_threshold: 3, + }); + const github = buildGithubStub({ + comments: [{ id: 77, body: existingState, html_url: 'https://example.com/77' }], + }); + await updateKeepaliveLoopSummary({ + github, + context: buildContext(123), + core: buildCore(), + inputs: { + prNumber: 123, + action: 'run', + runResult: 'success', + gateConclusion: 'success', + tasksTotal: 4, + tasksUnchecked: 2, + keepaliveEnabled: true, + autofixEnabled: false, + iteration: 1, + maxIterations: 5, + failureThreshold: 3, + trace: 'trace-llm', + llm_provider: 'github-models', + llm_confidence: 0.95, + llm_analysis_run: true, + }, + }); + + assert.equal(github.actions.length, 1); + assert.equal(github.actions[0].type, 'update'); + assert.match(github.actions[0].body, /### 🧠 Task Analysis/); + assert.match(github.actions[0].body, /GitHub Models \(primary\)/); + assert.match(github.actions[0].body, /Confidence \| 95%/); +}); + +test('updateKeepaliveLoopSummary shows fallback warning for OpenAI provider', async () => { + const existingState = formatStateComment({ + trace: 'trace-openai', + iteration: 1, + max_iterations: 5, + failure_threshold: 3, + }); + const github = buildGithubStub({ + comments: [{ id: 78, body: existingState, html_url: 'https://example.com/78' }], + }); + await updateKeepaliveLoopSummary({ + github, + context: buildContext(123), + core: buildCore(), + inputs: { + prNumber: 123, + action: 'run', + 
runResult: 'success', + gateConclusion: 'success', + tasksTotal: 4, + tasksUnchecked: 2, + keepaliveEnabled: true, + autofixEnabled: false, + iteration: 1, + maxIterations: 5, + failureThreshold: 3, + trace: 'trace-openai', + llm_provider: 'openai', + llm_confidence: 0.87, + llm_analysis_run: true, + }, + }); + + assert.equal(github.actions.length, 1); + assert.equal(github.actions[0].type, 'update'); + assert.match(github.actions[0].body, /### 🧠 Task Analysis/); + assert.match(github.actions[0].body, /OpenAI \(fallback\)/); + assert.match(github.actions[0].body, /Primary provider.*was unavailable/); +}); + +test('updateKeepaliveLoopSummary shows regex fallback warning', async () => { + const existingState = formatStateComment({ + trace: 'trace-regex', + iteration: 1, + max_iterations: 5, + failure_threshold: 3, + }); + const github = buildGithubStub({ + comments: [{ id: 79, body: existingState, html_url: 'https://example.com/79' }], + }); + await updateKeepaliveLoopSummary({ + github, + context: buildContext(123), + core: buildCore(), + inputs: { + prNumber: 123, + action: 'run', + runResult: 'success', + gateConclusion: 'success', + tasksTotal: 4, + tasksUnchecked: 2, + keepaliveEnabled: true, + autofixEnabled: false, + iteration: 1, + maxIterations: 5, + failureThreshold: 3, + trace: 'trace-regex', + llm_provider: 'regex-fallback', + llm_confidence: 0.7, + llm_analysis_run: true, + }, + }); + + assert.equal(github.actions.length, 1); + assert.equal(github.actions[0].type, 'update'); + assert.match(github.actions[0].body, /### 🧠 Task Analysis/); + assert.match(github.actions[0].body, /Regex \(fallback\)/); + assert.match(github.actions[0].body, /Primary provider.*was unavailable/); +}); diff --git a/.github/scripts/keepalive_loop.js b/.github/scripts/keepalive_loop.js index df3459610..a5afb996c 100644 --- a/.github/scripts/keepalive_loop.js +++ b/.github/scripts/keepalive_loop.js @@ -950,6 +950,11 @@ async function updateKeepaliveLoopSummary({ github, context, 
core, inputs }) { const agentSummary = normalise(inputs.agent_summary ?? inputs.agentSummary ?? inputs.codex_summary ?? inputs.codexSummary); const runUrl = normalise(inputs.run_url ?? inputs.runUrl); + // LLM task analysis details + const llmProvider = normalise(inputs.llm_provider ?? inputs.llmProvider); + const llmConfidence = toNumber(inputs.llm_confidence ?? inputs.llmConfidence, 0); + const llmAnalysisRun = toBool(inputs.llm_analysis_run ?? inputs.llmAnalysisRun, false); + const { state: previousState, commentId } = await loadKeepaliveState({ github, context, @@ -1211,6 +1216,29 @@ async function updateKeepaliveLoopSummary({ github, context, core, inputs }) { } } + // LLM analysis details - show which provider was used for task completion detection + if (llmAnalysisRun && llmProvider) { + const providerIcon = llmProvider === 'github-models' ? 'βœ…' : + llmProvider === 'openai' ? '⚠️' : + llmProvider === 'regex-fallback' ? 'πŸ”Ά' : 'ℹ️'; + const providerLabel = llmProvider === 'github-models' ? 'GitHub Models (primary)' : + llmProvider === 'openai' ? 'OpenAI (fallback)' : + llmProvider === 'regex-fallback' ? 
'Regex (fallback)' : llmProvider; + const confidencePercent = Math.round(llmConfidence * 100); + summaryLines.push( + '', + '### 🧠 Task Analysis', + `| Provider | ${providerIcon} ${providerLabel} |`, + `| Confidence | ${confidencePercent}% |`, + ); + if (llmProvider !== 'github-models') { + summaryLines.push( + '', + `> ⚠️ Primary provider (GitHub Models) was unavailable; used ${providerLabel} instead.`, + ); + } + } + if (isTransientFailure) { summaryLines.push( '', @@ -1682,12 +1710,13 @@ async function analyzeTaskCompletion({ github, context, prNumber, baseSha, headS * @param {number} params.prNumber - PR number * @param {string} params.baseSha - Base SHA (before agent work) * @param {string} params.headSha - Head SHA (after agent work) + * @param {string[]} [params.llmCompletedTasks] - Tasks marked complete by LLM analysis * @param {object} [params.core] - Optional core for logging * @returns {Promise<{updated: boolean, tasksChecked: number, details: string}>} */ -async function autoReconcileTasks({ github, context, prNumber, baseSha, headSha, core }) { +async function autoReconcileTasks({ github, context, prNumber, baseSha, headSha, llmCompletedTasks, core }) { const log = (msg) => core?.info?.(msg) || console.log(msg); - + // Get current PR body let pr; try { @@ -1710,13 +1739,39 @@ async function autoReconcileTasks({ github, context, prNumber, baseSha, headSha, return { updated: false, tasksChecked: 0, details: 'No tasks found in PR body' }; } - // Analyze what tasks may have been completed + // Build high-confidence matches from multiple sources + let highConfidence = []; + + // Source 1: LLM analysis (highest priority if available) + if (llmCompletedTasks && Array.isArray(llmCompletedTasks) && llmCompletedTasks.length > 0) { + log(`LLM analysis found ${llmCompletedTasks.length} completed task(s)`); + for (const task of llmCompletedTasks) { + highConfidence.push({ + task, + reason: 'LLM session analysis', + confidence: 'high', + source: 'llm', + }); + } + } 
+ + // Source 2: Commit/file analysis (fallback or supplementary) const analysis = await analyzeTaskCompletion({ github, context, prNumber, baseSha, headSha, taskText, core }); - // Only auto-check high-confidence matches - const highConfidence = analysis.matches.filter(m => m.confidence === 'high'); + // Add commit-based matches that aren't already covered by LLM + const llmTasksLower = new Set((llmCompletedTasks || []).map(t => t.toLowerCase())); + const commitMatches = analysis.matches + .filter(m => m.confidence === 'high') + .filter(m => !llmTasksLower.has(m.task.toLowerCase())); + + if (commitMatches.length > 0) { + log(`Commit analysis found ${commitMatches.length} additional task(s)`); + for (const match of commitMatches) { + highConfidence.push({ ...match, source: 'commit' }); + } + } if (highConfidence.length === 0) { log('No high-confidence task matches to auto-check'); @@ -1766,14 +1821,26 @@ async function autoReconcileTasks({ github, context, prNumber, baseSha, headSha, return { updated: false, tasksChecked: 0, - details: `Failed to update PR: ${error.message}` + details: `Failed to update PR: ${error.message}`, + sources: { llm: 0, commit: 0 }, }; } + // Count matches by source for reporting + const llmCount = highConfidence.filter(m => m.source === 'llm').length; + const commitCount = highConfidence.filter(m => m.source === 'commit').length; + + // Build detailed description + const sourceDesc = []; + if (llmCount > 0) sourceDesc.push(`${llmCount} from LLM analysis`); + if (commitCount > 0) sourceDesc.push(`${commitCount} from commit analysis`); + const sourceInfo = sourceDesc.length > 0 ? 
` (${sourceDesc.join(', ')})` : '';
+
   return {
     updated: true,
     tasksChecked: checkedCount,
-    details: `Auto-checked ${checkedCount} task(s): ${highConfidence.map(m => m.task.slice(0, 30) + '...').join(', ')}`
+    details: `Auto-checked ${checkedCount} task(s)${sourceInfo}: ${highConfidence.map(m => m.task.slice(0, 30) + '...').join(', ')}`,
+    sources: { llm: llmCount, commit: commitCount },
   };
 }
diff --git a/.github/workflows/agents-keepalive-loop.yml b/.github/workflows/agents-keepalive-loop.yml
index 88bde2650..5ad9cff8d 100644
--- a/.github/workflows/agents-keepalive-loop.yml
+++ b/.github/workflows/agents-keepalive-loop.yml
@@ -362,6 +362,26 @@ jobs:
             const beforeSha = '${{ needs.evaluate.outputs.head_sha }}'; // SHA before agent ran
             const headSha = '${{ needs.run-codex.outputs.commit-sha }}'; // SHA after agent ran
 
+            // LLM analysis metadata (toJSON() emits escaped JS literals -- prevents script injection)
+            const llmProvider = ${{ toJSON(needs.run-codex.outputs.llm-provider || '') }};
+            const llmConfidence = ${{ toJSON(needs.run-codex.outputs.llm-confidence || '') }};
+            const llmAnalysisRun = '${{ needs.run-codex.outputs.llm-analysis-run }}' === 'true';
+
+            // Parse LLM completed tasks if available (task text is attacker-influenced; keep it quoted via toJSON)
+            let llmCompletedTasks = [];
+            const llmTasksJson = ${{ toJSON(needs.run-codex.outputs.llm-completed-tasks || '[]') }};
+            try {
+              llmCompletedTasks = JSON.parse(llmTasksJson);
+              if (llmCompletedTasks.length > 0) {
+                core.info(`LLM analysis found ${llmCompletedTasks.length} completed task(s)`);
+                if (llmProvider) {
+                  core.info(`LLM provider: ${llmProvider} (confidence: ${llmConfidence})`);
+                }
+              }
+            } catch (e) {
+              core.debug(`Failed to parse LLM tasks: ${e.message}`);
+            }
+
             if (!prNumber || !beforeSha || !headSha) {
               core.info('Missing required inputs for task reconciliation');
               return;
@@ -371,19 +391,24 @@ jobs:
             core.info(`Comparing ${beforeSha.slice(0, 7)} → ${headSha.slice(0, 7)}`);
 
             const result = await autoReconcileTasks({
-              github, context, prNumber, baseSha: beforeSha, headSha, core
+              github, context, prNumber, baseSha: beforeSha, headSha,
llmCompletedTasks, core }); if (result.updated) { core.info(`βœ… ${result.details}`); - core.notice(`Auto-checked ${result.tasksChecked} task(s) based on commit analysis`); + core.notice(`Auto-checked ${result.tasksChecked} task(s) based on analysis`); } else { core.info(`ℹ️ ${result.details}`); } - // Output for step summary + // Output for step summary and downstream reporting core.setOutput('tasks_checked', result.tasksChecked); core.setOutput('reconciliation_details', result.details); + core.setOutput('llm_provider', llmProvider); + core.setOutput('llm_confidence', llmConfidence); + core.setOutput('llm_analysis_run', llmAnalysisRun); + core.setOutput('llm_tasks_count', llmCompletedTasks.length); + core.setOutput('commit_tasks_count', result.sources?.commit || 0); - name: Update summary comment uses: actions/github-script@v7 @@ -415,5 +440,9 @@ jobs: agent_commit_sha: '${{ needs.run-codex.outputs.commit-sha }}', agent_files_changed: '${{ needs.run-codex.outputs.files-changed }}', agent_summary: process.env.CODEX_SUMMARY || '', + // LLM analysis details for task completion reporting + llm_provider: '${{ needs.run-codex.outputs.llm-provider || '' }}', + llm_confidence: '${{ needs.run-codex.outputs.llm-confidence || '' }}', + llm_analysis_run: '${{ needs.run-codex.outputs.llm-analysis-run }}' === 'true', }; await updateKeepaliveLoopSummary({ github, context, core, inputs }); diff --git a/.github/workflows/reusable-codex-run.yml b/.github/workflows/reusable-codex-run.yml index 1cb935f25..dae9c0e33 100644 --- a/.github/workflows/reusable-codex-run.yml +++ b/.github/workflows/reusable-codex-run.yml @@ -27,6 +27,11 @@ on: required: false default: '' type: string + workflows_ref: + description: 'The ref of the Workflows repo to checkout for scripts. Defaults to main.' + required: false + default: 'main' + type: string max_runtime_minutes: description: 'Upper bound for the job runtime in minutes.' 
required: false @@ -116,6 +121,14 @@ jobs: error-category: ${{ steps.classify_failure.outputs.error_category }} error-type: ${{ steps.classify_failure.outputs.error_type }} error-recovery: ${{ steps.classify_failure.outputs.error_recovery }} + # LLM analysis outputs + llm-analysis-run: ${{ steps.llm_analysis.outputs.llm-analysis-run }} + llm-completed-tasks: ${{ steps.llm_analysis.outputs.completed-tasks }} + llm-has-completions: ${{ steps.llm_analysis.outputs.has-completions }} + llm-provider: ${{ steps.llm_analysis.outputs.provider }} + llm-confidence: ${{ steps.llm_analysis.outputs.confidence }} + session-event-count: ${{ steps.analyze_session.outputs.event-count }} + session-todo-count: ${{ steps.analyze_session.outputs.todo-count }} steps: - name: Mint GitHub App token (preferred) id: app_token @@ -163,16 +176,15 @@ jobs: ref: ${{ inputs.pr_ref || github.ref }} token: ${{ steps.auth_token.outputs.checkout_token }} - # Checkout Workflows repo scripts for post-completion and error handling + # Checkout Workflows repo scripts for post-completion, error handling, and LLM analysis # These scripts are in stranske/Workflows but need to be available when # this reusable workflow runs in consumer repos - name: Checkout Workflows scripts uses: actions/checkout@v4 with: repository: stranske/Workflows - ref: main - sparse-checkout: .github/scripts - sparse-checkout-cone-mode: false + # Use the workflows_ref input which should match the @ref in the uses: line + ref: ${{ inputs.workflows_ref }} path: .workflows-lib token: ${{ steps.auth_token.outputs.checkout_token }} @@ -216,6 +228,17 @@ jobs: if [ -f pyproject.toml ]; then python -m pip install -e ".[dev]" || python -m pip install -e . fi + + - name: Install Workflows repo LLM dependencies + run: | + # Install LLM dependencies from Workflows repo for session analysis + if [ -f .workflows-lib/tools/requirements.txt ]; then + echo "Installing LLM analysis dependencies..." 
+          python -m pip install -r .workflows-lib/tools/requirements.txt || {
+            echo "::notice::LLM dependencies not installed, will fall back to regex analysis"
+          }
+          fi
+
       - name: Validate prompt template integrity
         id: guard
         env:
@@ -370,8 +393,10 @@ jobs:
           PR_NUM="${{ inputs.pr_number }}"
           if [ -n "${PR_NUM}" ]; then
             OUTPUT_FILE="codex-output-${PR_NUM}.md"
+            SESSION_JSONL="codex-session-${PR_NUM}.jsonl"
           else
             OUTPUT_FILE="codex-output.md"
+            SESSION_JSONL="codex-session.jsonl"
           fi
           SANDBOX="${{ inputs.sandbox }}"
           EXTRA_ARGS="${{ inputs.codex_args }}"
@@ -385,14 +410,15 @@ jobs:
           echo "Prompt file: $PROMPT_FILE"
           echo "Sandbox: $SANDBOX"
 
-          # Run codex exec with prompt from file
+          # Run codex exec with --json to capture rich session data
+          # JSONL events go to stdout only; stderr stays on the job log so $SESSION_JSONL stays parseable
           # Build command array to handle EXTRA_ARGS properly
           # NOTE: --mode flag not yet supported by Codex CLI, removed for now
           CODEX_EXIT=0
           if [ -n "${EXTRA_ARGS:-}" ]; then
-            eval "codex exec --skip-git-repo-check --sandbox \"$SANDBOX\" --output-last-message \"$OUTPUT_FILE\" $EXTRA_ARGS \"\$(cat \"\$PROMPT_FILE\")\"" || CODEX_EXIT=$?
+            eval "codex exec --json --skip-git-repo-check --sandbox \"$SANDBOX\" --output-last-message \"$OUTPUT_FILE\" $EXTRA_ARGS \"\$(cat \"\$PROMPT_FILE\")\"" > "$SESSION_JSONL" || CODEX_EXIT=$?
           else
-            codex exec --skip-git-repo-check --sandbox "$SANDBOX" --output-last-message "$OUTPUT_FILE" "$(cat "$PROMPT_FILE")" || CODEX_EXIT=$?
+            codex exec --json --skip-git-repo-check --sandbox "$SANDBOX" --output-last-message "$OUTPUT_FILE" "$(cat "$PROMPT_FILE")" > "$SESSION_JSONL" || CODEX_EXIT=$?
fi echo "exit-code=${CODEX_EXIT}" >> "$GITHUB_OUTPUT" @@ -420,6 +446,130 @@ jobs: # Exit with original code to mark job as failed if Codex failed exit "$CODEX_EXIT" + - name: Analyze Codex session + id: analyze_session + if: always() + env: + PYTHONPATH: ${{ github.workspace }} + PR_NUM: ${{ inputs.pr_number }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + + if [ -n "${PR_NUM}" ]; then + SESSION_JSONL="codex-session-${PR_NUM}.jsonl" + else + SESSION_JSONL="codex-session.jsonl" + fi + export SESSION_JSONL + + # Check if session file exists and has content + if [ ! -f "$SESSION_JSONL" ] || [ ! -s "$SESSION_JSONL" ]; then + echo "No session JSONL found or file is empty" + echo "session-available=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + echo "Session JSONL captured: $(wc -l < "$SESSION_JSONL") lines" + echo "session-available=true" >> "$GITHUB_OUTPUT" + + # Basic parsing (always available) + python3 << 'PYEOF' + import os + import sys + sys.path.insert(0, '.') + + session_file = os.environ.get("SESSION_JSONL", "codex-session.jsonl") + github_output = os.environ.get("GITHUB_OUTPUT", "/dev/null") + + try: + from tools.codex_jsonl_parser import parse_codex_jsonl_file + + session = parse_codex_jsonl_file(session_file) + + print(f"::notice::Session parsed: {session.raw_event_count} events") + print(f"::notice::Agent messages: {len(session.agent_messages)}") + print(f"::notice::Commands: {len(session.commands)} ({len(session.successful_commands)} ok, {len(session.failed_commands)} failed)") + print(f"::notice::File changes: {len(session.file_changes)}") + print(f"::notice::Todo items: {len(session.todo_items)}") + + if session.parse_errors: + print(f"::warning::Parse errors: {len(session.parse_errors)}") + + # Output key metrics for downstream steps + with open(github_output, "a") as f: + f.write(f"event-count={session.raw_event_count}\n") + f.write(f"message-count={len(session.agent_messages)}\n") + 
f.write(f"command-count={len(session.commands)}\n") + f.write(f"file-change-count={len(session.file_changes)}\n") + f.write(f"todo-count={len(session.todo_items)}\n") + f.write(f"completed-todo-count={len(session.completed_todos)}\n") + + except ImportError as e: + print(f"::notice::Session parser not available: {e}") + except Exception as e: + print(f"::warning::Session analysis failed: {e}") + PYEOF + + - name: Analyze task completion with LLM + id: llm_analysis + if: always() && steps.analyze_session.outputs.session-available == 'true' && inputs.pr_number != '' + env: + PYTHONPATH: ${{ github.workspace }}/.workflows-lib:${{ github.workspace }} + PR_NUM: ${{ inputs.pr_number }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + set -euo pipefail + + SESSION_JSONL="codex-session-${PR_NUM}.jsonl" + ANALYSIS_FILE="codex-analysis-${PR_NUM}.json" + + # Fetch PR body to extract tasks + echo "Fetching PR #${PR_NUM} body..." + PR_BODY=$(gh pr view "${PR_NUM}" --json body --jq '.body' 2>/dev/null || echo "") + + if [ -z "$PR_BODY" ]; then + echo "::notice::Could not fetch PR body, skipping LLM analysis" + echo "llm-analysis-run=false" >> "$GITHUB_OUTPUT" + exit 0 + fi + + # Save PR body to temp file + echo "$PR_BODY" > pr_body.md + + # Run full LLM analysis and save JSON output + # Scripts are in .workflows-lib from Workflows repo checkout + echo "Running LLM-powered task completion analysis..." 
+          python3 .workflows-lib/scripts/analyze_codex_session.py \
+            --session-file "$SESSION_JSONL" \
+            --pr-body-file pr_body.md \
+            --output json > "$ANALYSIS_FILE" || {
+              echo "::warning::LLM analysis failed, continuing without it"
+              cat "$ANALYSIS_FILE" 2>/dev/null || true  # Show partial stdout for debugging
+              echo "llm-analysis-run=false" >> "$GITHUB_OUTPUT"
+              rm -f "$ANALYSIS_FILE"
+              exit 0
+            }
+
+          # Also output to GitHub Actions for visibility
+          python3 .workflows-lib/scripts/analyze_codex_session.py \
+            --session-file "$SESSION_JSONL" \
+            --pr-body-file pr_body.md \
+            --output github-actions || true
+
+          echo "llm-analysis-run=true" >> "$GITHUB_OUTPUT"
+          echo "analysis-file=$ANALYSIS_FILE" >> "$GITHUB_OUTPUT"
+
+          # Extract key fields for downstream use
+          if [ -f "$ANALYSIS_FILE" ]; then
+            COMPLETED=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(json.dumps(d.get('completed_tasks', [])))")
+            PROVIDER=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('provider', 'unknown'))")
+            CONFIDENCE=$(python3 -c "import json; d=json.load(open('$ANALYSIS_FILE')); print(d.get('confidence', 0))")
+            echo "completed-tasks=$COMPLETED" >> "$GITHUB_OUTPUT"
+            echo "provider=$PROVIDER" >> "$GITHUB_OUTPUT"
+            echo "confidence=$CONFIDENCE" >> "$GITHUB_OUTPUT"
+          fi
+
       - name: Commit and push changes
         id: commit
         env:
@@ -555,6 +705,8 @@ jobs:
           name: codex-output-${{ inputs.pr_number || github.run_id }}
           path: |
             codex-output*.md
+            codex-session*.jsonl
+            codex-analysis*.json
           if-no-files-found: ignore
 
       - name: Post completion checkpoint comment
diff --git a/docs/plans/langchain-keepalive-integration.md b/docs/plans/langchain-keepalive-integration.md
new file mode 100644
index 000000000..0ee97c2af
--- /dev/null
+++ b/docs/plans/langchain-keepalive-integration.md
@@ -0,0 +1,471 @@
+# LangChain Keepalive Integration Plan
+
+> **Status**: Planning
+> **Created**: 2026-01-02
+> **Target Branch**: `feature/langchain-analysis`
+> **Test Consumer**:
`stranske/Portable-Alpha-Extension-Model` + +--- + +## Summary of Findings + +### 1. Session Data Sources (Multiple Options!) + +We discovered **three different data sources** from the Codex CLI, each with different richness levels: + +#### Option A: Final Summary (`--output-last-message`) - Current +**Current state**: We capture via `codex-output-*.md` artifacts. +- Uploaded by `reusable-codex-run.yml` line 553 +- Contains Codex's final summary message only +- Artifact name format: `codex-output-{pr_number}` + +**Pros**: Simple, low data volume +**Cons**: Limited context, misses intermediate steps + +--- + +#### Option B: JSONL Event Stream (`--json`) - **Recommended** +The Codex CLI has a `--json` flag that streams **detailed events as JSONL**: + +```bash +codex exec --json --output-last-message "$OUTPUT_FILE" "$PROMPT" 2>&1 | tee "$SESSION_LOG" +``` + +**Event types available** (from [exec.md](https://github.com/openai/codex/blob/main/docs/exec.md#json-output-mode)): +- `thread.started` / `turn.started` / `turn.completed` / `turn.failed` +- `item.started` / `item.updated` / `item.completed` + +**Item types**: +| Type | Contains | LLM Analysis Potential | +|------|----------|----------------------| +| `agent_message` | Assistant responses | ⭐⭐⭐ High - explicit completion statements | +| `reasoning` | Model thinking summaries | ⭐⭐⭐ High - reveals intent and progress | +| `command_execution` | Shell commands + exit codes + output | ⭐⭐ Medium - shows actual work done | +| `file_change` | Files added/modified/deleted | ⭐⭐ Medium - concrete evidence | +| `mcp_tool_call` | MCP tool invocations | ⭐ Low - implementation detail | +| `web_search` | Web search actions | ⭐ Low - implementation detail | +| `todo_list` | Task tracking | ⭐⭐⭐ High - direct task mapping! 
| + +**Known issues** (from GitHub): +- [#4776](https://github.com/openai/codex/issues/4776): Field names changed (`item_type`β†’`type`, `assistant_message`β†’`agent_message`) +- [#5276](https://github.com/openai/codex/issues/5276): Reasoning token usage not yet in output +- Schema may evolve - need graceful parsing + +**Pros**: Rich data, shows reasoning and progress, includes todo tracking! +**Cons**: More data to process, schema changes over time + +--- + +#### Option C: Session Files (`~/.codex/sessions/`) +Full session history saved to disk: +``` +~/.codex/sessions/YYYY/MM/DD/rollout-*.jsonl +``` + +**Pros**: Most complete data, includes full token counts +**Cons**: Requires file system access post-run, may not persist in CI + +--- + +#### Option D: TUI Session Recording (`CODEX_TUI_RECORD_SESSION=1`) +Environment variable enables detailed logging: +```bash +CODEX_TUI_RECORD_SESSION=1 codex ... +# Logs to ~/.codex/log/session-YYYYMMDDTHHMMSSZ.jsonl +``` + +**Pros**: Captures all TUI events +**Cons**: Designed for interactive mode, may not work with `codex exec` + +--- + +### Data Source Selection for Testing + +| Phase | Data Source | Why | +|-------|-------------|-----| +| Test 1 | Option A (summary only) | Baseline comparison | +| Test 2 | Option B (`--json` stream) | Recommended - rich + practical | +| Test 3 | Option B subset | Only `agent_message` + `reasoning` + `todo_list` | + +**Priority fields for analysis**: +1. `agent_message` - What did Codex say it accomplished? +2. `reasoning` - What was it thinking? +3. `todo_list` - Direct mapping to PR checkboxes! +4. `file_change` - Concrete evidence of work + +### 2. 
GitHub Models API βœ… + +**Verified working** with your GitHub token: +```bash +curl -s "https://models.inference.ai.azure.com/chat/completions" \ + -H "Authorization: Bearer $GITHUB_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"messages":[{"role":"user","content":"test"}],"model":"gpt-4o-mini"}' +``` + +**Integration approach**: Use LangChain's OpenAI integration with custom base URL: +```python +from langchain_openai import ChatOpenAI + +llm = ChatOpenAI( + model="gpt-4o-mini", + base_url="https://models.inference.ai.azure.com", + api_key=os.environ["GITHUB_TOKEN"], # GitHub token works! +) +``` + +No separate `langchain-github` package needed. + +--- + +## Community Tools & Research + +### Existing Codex Session Analysis Tools + +| Tool | Description | Relevance | +|------|-------------|-----------| +| [codex-session-view](https://github.com/AcidicSoil/codex-session-view) | Visualizer with **AI Session Coach** that analyzes sessions using LLM | ⭐⭐⭐ Reference implementation! | +| [codex-history-list](https://github.com/shinshin86/codex-history-list) | CLI to list sessions, extracts cwd and first user request | ⭐⭐ Parsing patterns | +| [codex_usage_report](https://github.com/rubens-amaral/codex_usage_report) | Go CLI analyzing session logs for rate limits | ⭐ Token tracking | +| [cxusage](https://github.com/zaharsyahrafi/cxusage) | Daily usage aggregation from session logs | ⭐ Aggregation patterns | + +**Key insight from `codex-session-view`**: Uses AI Session Coach with multiple providers (OpenAI, Gemini, LM Studio) - validates our provider fallback approach! 
+ +### LangChain Integration Patterns + +**No direct Codexβ†’LangChain library exists**, but relevant LangChain components: + +| Component | Use Case | +|-----------|----------| +| `TrajectoryEvalChain` | Evaluates agent step sequences - similar to our task completion analysis | +| `LogStreamCallbackHandler` | Real-time event streaming - pattern for processing JSONL | +| `FileCallbackHandler` | Persists agent actions - reference for our logging | + +**LangChain trajectory format** (from `trajectory_eval_chain.py`): +```python +def get_agent_trajectory(steps: Sequence[tuple[AgentAction, str]]) -> str: + return "\n\n".join([ + f"""Step {i}: +Tool used: {action.tool} +Tool input: {action.tool_input} +Tool output: {output}""" + for i, (action, output) in enumerate(steps, 1) + ]) +``` + +This pattern maps well to Codex JSONL events! + +### Gap Analysis + +- ❌ No existing Codex JSONL β†’ LangChain message converter +- ❌ Python SDK for Codex still proposed ([#5320](https://github.com/openai/codex/issues/5320)) +- βœ… Community has validated LLM-based session analysis approach +- βœ… Our provider fallback matches `codex-session-view` pattern + +--- + +## Provider Fallback Chain + +``` +β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β” +β”‚ 1. GitHub Models API (gpt-4o-mini) β”‚ +β”‚ - Uses existing GITHUB_TOKEN β”‚ +β”‚ - Free with Copilot subscription β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 2. OpenAI API (gpt-4o-mini) β”‚ +β”‚ - Uses OPENAI_API_KEY secret β”‚ +β”‚ - ~$0.0006 per analysis β”‚ +β”œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ +β”‚ 3. 
Regex Fallback β”‚ +β”‚ - No API calls β”‚ +β”‚ - Basic pattern matching β”‚ +β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜ +``` + +--- + +## Analysis Timing Options + +| Option | When | Pros | Cons | +|--------|------|------|------| +| **A: Every round** | After each Codex run | Most accurate, catches all completions | Higher API usage | +| **B: On stall** | After round with no checkbox changes | Targeted intervention | Delays detection by 1 round | +| **C: Conditional** | Round 1 always, then only on stall | Balances accuracy vs cost | More logic complexity | +| **D: Post-CI** | After CI completes | Can correlate CI results with tasks | Adds latency | + +**Testing plan**: Run A vs C to measure cost/benefit trade-off. + +--- + +## Dependencies to Add + +```toml +# pyproject.toml [project.optional-dependencies] +langchain = [ + "langchain-core>=0.3.0", + "langchain-openai>=0.3.0", +] +``` + +**Note**: Keep as optional dependency so workflows without LLM still function. 
+ +--- + +## Files to Create/Modify + +### New Files in `tools/` + +| File | Purpose | +|------|---------| +| `llm_provider.py` | Provider abstraction with GitHub β†’ OpenAI β†’ regex fallback | +| `langchain_task_extractor.py` | LLM-enhanced task/scope extraction | +| `codex_log_analyzer.py` | Session output analysis for completion detection | +| `ci_failure_triage.py` | CI failure classification and fix suggestions | +| `update_pr_checkboxes.py` | GitHub API wrapper to update PR body checkboxes | +| `post_progress_comment.py` | Posts analysis comment when work incomplete | + +### Workflow Modifications + +| File | Change | +|------|--------| +| `.github/workflows/reusable-codex-run.yml` | Add post-run analysis step | +| `.github/scripts/keepalive_loop.js` | Inject analysis into next prompt | + +--- + +## Testing Plan + +### Phase 0: Data Source Evaluation + +**Goal**: Determine which session data source provides best signal-to-noise for task completion detection. + +| Test | Data Source | Method | +|------|-------------|--------| +| **0.1** | Summary only (Option A) | Current `--output-last-message` | +| **0.2** | Full JSONL (Option B) | `--json` piped to file | +| **0.3** | Filtered JSONL | Only `agent_message` + `reasoning` + `todo_list` events | + +**Evaluation criteria**: +- Can the LLM accurately detect task completion? +- What's the token cost per analysis? +- How robust is parsing to schema changes? + +**Workflow change for Option B**: +```yaml +# Current: +codex exec --output-last-message "$OUTPUT_FILE" "$PROMPT" + +# Enhanced: +codex exec --json --output-last-message "$OUTPUT_FILE" "$PROMPT" 2>&1 | tee "$SESSION_JSONL" +# Then parse $SESSION_JSONL for rich analysis +``` + +--- + +### Phase 1: Baseline (Current System) + +1. Create test issue in Portable Alpha with 3-4 tasks +2. Let keepalive run with current regex-only system +3. 
Record: + - Total rounds to actual completion + - Rounds to checkbox detection + - False negatives (work done, not detected) + +### Phase 2: LangChain Enhanced + +1. Push `feature/langchain-analysis` branch to Workflows +2. Update Portable Alpha to use: + ```yaml + uses: stranske/Workflows/.github/workflows/reusable-codex-run.yml@feature/langchain-analysis + ``` +3. Add `OPENAI_API_KEY` secret to Portable Alpha (fallback) +4. Create similar test issue +5. Record same metrics + +### Phase 3: Analysis + +| Metric | Regex-Only | LangChain | Improvement | +|--------|------------|-----------|-------------| +| Rounds to completion | ? | ? | ? | +| Detection accuracy | ? | ? | ? | +| False positives | ? | ? | ? | +| API cost per PR | $0 | ~$0.01 | -$0.01 | +| Time per round | ? | +2-3s | Negligible | + +--- + +## Implementation Steps + +### Step 1: Add LangChain dependencies +- Update `pyproject.toml` with optional `[langchain]` extras +- Create `tools/llm_provider.py` with fallback logic + +### Step 2: Port and adapt tools +- Copy tools from Trend Model Project (already retrieved to /tmp) +- Adapt to use `llm_provider.py` abstraction +- Add tests + +### Step 3: Workflow integration +- Add analysis step to `reusable-codex-run.yml` +- Wire analysis results to PR checkbox updates +- Wire analysis results to next-round prompt + +### Step 4: Consumer setup +- Update Portable Alpha workflow reference +- Add `OPENAI_API_KEY` secret +- Create test issue + +### Step 5: Run comparison tests +- Execute Phase 1 (baseline) +- Execute Phase 2 (enhanced) +- Document results + +### Step 6: Tune and finalize +- Decide on timing option (A/B/C/D) +- Merge to main +- Revert consumer to `@main` + +--- + +## Secrets Required + +| Secret | Repo | Purpose | +|--------|------|---------| +| `OPENAI_API_KEY` | Portable Alpha | Fallback LLM provider | +| `GITHUB_TOKEN` | Auto-provided | GitHub Models API (primary) | + +--- + +## Codex JSONL Event Schema Reference + +Based on 
[exec.md](https://github.com/openai/codex/blob/main/docs/exec.md) and source code analysis. + +### Thread/Turn Events +```json +{"type": "thread.started", "thread_id": "uuid", "timestamp": "..."} +{"type": "turn.started", "turn_id": "uuid", "thread_id": "uuid"} +{"type": "turn.completed", "turn_id": "uuid", "token_usage": {...}} +{"type": "turn.failed", "turn_id": "uuid", "error": "..."} +``` + +### Item Events +```json +{"type": "item.started", "item_id": "uuid", "item_type": "agent_message"} +{"type": "item.updated", "item_id": "uuid", "content": "..."} +{"type": "item.completed", "item_id": "uuid"} +``` + +### High-Value Item Types for Analysis + +**`agent_message`** - What Codex says: +```json +{ + "type": "item.completed", + "item_type": "agent_message", + "content": "I've completed the first two tasks..." +} +``` + +**`reasoning`** - What Codex is thinking: +```json +{ + "type": "item.completed", + "item_type": "reasoning", + "content": "The user wants me to fix the tests. I should first..." +} +``` + +**`command_execution`** - Shell commands: +```json +{ + "type": "item.completed", + "item_type": "command_execution", + "command": "pytest tests/", + "exit_code": 0, + "output": "..." +} +``` + +**`file_change`** - File modifications: +```json +{ + "type": "item.completed", + "item_type": "file_change", + "path": "src/module.py", + "change_type": "modified" +} +``` + +**`todo_list`** - Task tracking (if emitted): +```json +{ + "type": "item.completed", + "item_type": "todo_list", + "items": [ + {"task": "Fix test failures", "status": "completed"}, + {"task": "Update documentation", "status": "in_progress"} + ] +} +``` + +### Schema Versioning Notes + +⚠️ **Known breaking changes**: +- `item_type` was renamed to `type` in some events +- `assistant_message` renamed to `agent_message` +- Always use defensive parsing with fallbacks + +--- + +## Rollback Plan + +If LangChain integration causes issues: +1. 
Consumer repos: Change `@feature/langchain-analysis` back to `@main`
+2. No code changes needed in consumer
+3. Feature branch remains available for debugging
+
+---
+
+## Open Questions
+
+1. **~~Codex session logs~~**: Do we need full transcripts, or is the summary sufficient?
+   - ✅ **RESOLVED**: Multiple options identified! `--json` mode provides rich JSONL stream.
+   - Testing plan includes Phase 0 to evaluate data source options.
+
+2. **`todo_list` event**: Does Codex emit `todo_list` events that map to PR checkboxes?
+   - This could be the holy grail for direct checkbox synchronization
+   - Need to capture real session to verify event structure
+
+3. **Rate limits**: Does GitHub Models API have rate limits we need to handle?
+   - Need to test under load
+
+4. **Checkbox update permissions**: Can workflow token update PR body?
+   - Yes, `contents: write` and `pull-requests: write` already granted
+
+5. **JSONL schema stability**: How often do Codex event schemas change?
+   - Known issue [#4776](https://github.com/openai/codex/issues/4776) documents field renames
+   - Need defensive parsing with fallbacks
+
+---
+
+## Next Steps
+
+### Immediate (Data Source Evaluation)
+1. [ ] Modify workflow to capture `--json` output alongside summary
+2. [ ] Run Codex manually to capture sample JSONL session
+3. [ ] Analyze which event types contain task completion signals
+4. [ ] Verify `todo_list` event structure (if present)
+
+### Implementation
+5. [ ] Create `feature/langchain-analysis` branch
+6. [ ] Implement JSONL parser for Codex events
+7. [ ] Implement `llm_provider.py` with fallback chain
+8. [ ] Port the three analysis tools with JSONL support
+9. [ ] Add workflow integration
+
+### Testing
+10. [ ] Set up Portable Alpha for testing
+11. [ ] Run Phase 0 data source comparison
+12. [ ] Run Phase 1 baseline measurement
+13. [ ] Run Phase 2 LangChain measurement
+14. 
[ ] Document results and decide on timing option diff --git a/pyproject.toml b/pyproject.toml index dc785f8a6..6fe7ff8b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,13 @@ dev = [ "tomlkit>=0.13.0", ] +# LangChain integration for LLM-enhanced task analysis +# Install with: pip install -e ".[langchain]" +langchain = [ + "langchain-core>=0.3.0", + "langchain-openai>=0.3.0", +] + [tool.setuptools] # This repo is primarily automation + scripts; avoid setuptools trying to auto-discover # random top-level dirs as importable packages (which breaks editable installs). diff --git a/scripts/analyze_codex_session.py b/scripts/analyze_codex_session.py new file mode 100755 index 000000000..09f0f8602 --- /dev/null +++ b/scripts/analyze_codex_session.py @@ -0,0 +1,325 @@ +#!/usr/bin/env python3 +""" +Analyze Codex Session CLI + +Command-line interface for analyzing Codex session output to determine +task completion status. Designed to be called from GitHub Actions workflows. + +Usage: + python scripts/analyze_codex_session.py \ + --session-file codex-session-123.jsonl \ + --tasks "Fix bug" "Add tests" "Update docs" \ + --output json + + # Or with PR body file containing checkboxes + python scripts/analyze_codex_session.py \ + --session-file codex-session-123.jsonl \ + --pr-body-file pr_body.md \ + --output github-actions + +Exit codes: + 0 - Analysis completed successfully + 1 - Error during analysis + 2 - No session file found +""" + +from __future__ import annotations + +import argparse +import json +import logging +import os +import re +import sys +from pathlib import Path + +# Add parent to path for imports +sys.path.insert(0, str(Path(__file__).parent.parent)) + +from tools.codex_session_analyzer import AnalysisResult, analyze_session + +logger = logging.getLogger(__name__) + + +def extract_tasks_from_pr_body(pr_body: str) -> list[str]: + """ + Extract task descriptions from PR body checkboxes. 
+ + Looks for patterns like: + - [ ] Task description + - [x] Completed task + + Returns only unchecked tasks (the ones we're tracking). + """ + tasks = [] + + # Match both checked and unchecked boxes to get all tasks + # Pattern: - [ ] or - [x] followed by task text + checkbox_pattern = re.compile(r"^[\s]*-\s*\[([ xX])\]\s*(.+)$", re.MULTILINE) + + for match in checkbox_pattern.finditer(pr_body): + checked = match.group(1).lower() == "x" + task_text = match.group(2).strip() + + # Only track unchecked tasks + if not checked and task_text: + tasks.append(task_text) + + return tasks + + +def extract_all_tasks_from_pr_body(pr_body: str) -> dict[str, bool]: + """ + Extract all tasks with their current status. + + Returns: + Dict mapping task text to checked status + """ + tasks = {} + checkbox_pattern = re.compile(r"^[\s]*-\s*\[([ xX])\]\s*(.+)$", re.MULTILINE) + + for match in checkbox_pattern.finditer(pr_body): + checked = match.group(1).lower() == "x" + task_text = match.group(2).strip() + if task_text: + tasks[task_text] = checked + + return tasks + + +def update_pr_body_checkboxes(pr_body: str, completed_tasks: list[str]) -> str: + """ + Update PR body to check off completed tasks. 
+ + Args: + pr_body: Original PR body text + completed_tasks: List of task descriptions to mark complete + + Returns: + Updated PR body with checkboxes updated + """ + updated_body = pr_body + + for task in completed_tasks: + # Escape special regex characters in task + escaped_task = re.escape(task) + + # Pattern to match unchecked checkbox with this task + pattern = re.compile( + rf"^([\s]*-\s*)\[ \](\s*){escaped_task}", + re.MULTILINE, + ) + + # Replace with checked version + updated_body = pattern.sub(rf"\1[x]\2{task}", updated_body) + + return updated_body + + +def output_github_actions(result: AnalysisResult) -> None: + """Output results in GitHub Actions format.""" + github_output = os.environ.get("GITHUB_OUTPUT", "") + + # Print notices for visibility in logs + print(f"::notice::Analysis completed with {result.completion.provider_used}") + print(f"::notice::Confidence: {result.completion.confidence:.0%}") + + if result.completion.completed_tasks: + print(f"::notice::Completed tasks: {len(result.completion.completed_tasks)}") + for task in result.completion.completed_tasks: + print(f"::notice:: βœ“ {task[:80]}") + + if result.completion.in_progress_tasks: + print(f"::notice::In progress: {len(result.completion.in_progress_tasks)}") + + if result.completion.blocked_tasks: + print(f"::warning::Blocked tasks: {len(result.completion.blocked_tasks)}") + for task in result.completion.blocked_tasks: + print(f"::warning:: βœ— {task[:80]}") + + # Write to GITHUB_OUTPUT if available + if github_output: + with open(github_output, "a") as f: + f.write(f"provider={result.completion.provider_used}\n") + f.write(f"confidence={result.completion.confidence}\n") + f.write(f"completed-count={len(result.completion.completed_tasks)}\n") + f.write(f"in-progress-count={len(result.completion.in_progress_tasks)}\n") + f.write(f"blocked-count={len(result.completion.blocked_tasks)}\n") + f.write(f"has-completions={str(result.has_completions).lower()}\n") + 
f.write(f"has-progress={str(result.has_progress).lower()}\n") + f.write(f"is-stalled={str(result.is_stalled).lower()}\n") + + # Encode completed tasks as JSON for downstream use + completed_json = json.dumps(result.completion.completed_tasks) + f.write(f"completed-tasks={completed_json}\n") + + +def output_json(result: AnalysisResult, pretty: bool = False) -> None: + """Output results as JSON.""" + data = { + "provider": result.completion.provider_used, + "confidence": result.completion.confidence, + "completed_tasks": result.completion.completed_tasks, + "in_progress_tasks": result.completion.in_progress_tasks, + "blocked_tasks": result.completion.blocked_tasks, + "reasoning": result.completion.reasoning, + "data_source": result.data_source, + "input_length": result.input_length, + "analysis_text_length": result.analysis_text_length, + } + + if result.session: + data["session"] = { + "event_count": result.session.raw_event_count, + "message_count": len(result.session.agent_messages), + "command_count": len(result.session.commands), + "file_change_count": len(result.session.file_changes), + "todo_count": len(result.session.todo_items), + } + + if pretty: + print(json.dumps(data, indent=2)) + else: + print(json.dumps(data)) + + +def output_markdown(result: AnalysisResult) -> None: + """Output results as markdown summary.""" + print(result.get_summary()) + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Analyze Codex session output for task completion", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=__doc__, + ) + + parser.add_argument( + "--session-file", + required=True, + help="Path to Codex session JSONL or summary file", + ) + + parser.add_argument( + "--tasks", + nargs="*", + help="Task descriptions to track (alternative to --pr-body-file)", + ) + + parser.add_argument( + "--pr-body-file", + help="Path to file containing PR body with checkboxes", + ) + + parser.add_argument( + "--pr-body", + help="PR body text directly 
(alternative to --pr-body-file)", + ) + + parser.add_argument( + "--context", + help="Additional context for analysis", + ) + + parser.add_argument( + "--output", + choices=["json", "json-pretty", "markdown", "github-actions"], + default="json", + help="Output format (default: json)", + ) + + parser.add_argument( + "--update-pr-body", + action="store_true", + help="Output updated PR body with completed checkboxes", + ) + + parser.add_argument( + "--updated-body-file", + help="Write updated PR body to this file", + ) + + parser.add_argument( + "--verbose", + "-v", + action="store_true", + help="Enable verbose logging", + ) + + args = parser.parse_args() + + # Setup logging + logging.basicConfig( + level=logging.DEBUG if args.verbose else logging.INFO, + format="%(levelname)s: %(message)s", + ) + + # Check session file exists + session_path = Path(args.session_file) + if not session_path.exists(): + logger.error(f"Session file not found: {args.session_file}") + return 2 + + # Get session content + session_content = session_path.read_text() + + # Get tasks + tasks = [] + pr_body = None + + if args.tasks: + tasks = args.tasks + elif args.pr_body_file: + pr_body = Path(args.pr_body_file).read_text() + tasks = extract_tasks_from_pr_body(pr_body) + elif args.pr_body: + pr_body = args.pr_body + tasks = extract_tasks_from_pr_body(pr_body) + else: + logger.error("Must provide --tasks, --pr-body-file, or --pr-body") + return 1 + + if not tasks: + logger.warning("No tasks found to track") + # Still run analysis but with empty task list + + logger.info(f"Analyzing session ({len(session_content)} bytes) with {len(tasks)} tasks") + + # Run analysis + try: + result = analyze_session( + content=session_content, + tasks=tasks, + context=args.context, + ) + except Exception as e: + logger.error(f"Analysis failed: {e}") + return 1 + + # Output results + if args.output == "github-actions": + output_github_actions(result) + elif args.output == "json": + output_json(result) + elif 
args.output == "json-pretty": + output_json(result, pretty=True) + elif args.output == "markdown": + output_markdown(result) + + # Update PR body if requested + if args.update_pr_body and pr_body and result.completion.completed_tasks: + updated_body = update_pr_body_checkboxes(pr_body, result.completion.completed_tasks) + + if args.updated_body_file: + Path(args.updated_body_file).write_text(updated_body) + logger.info(f"Updated PR body written to {args.updated_body_file}") + else: + print("\n--- UPDATED PR BODY ---") + print(updated_body) + + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/templates/consumer-repo/.github/workflows/agents-keepalive-loop.yml b/templates/consumer-repo/.github/workflows/agents-keepalive-loop.yml index f070017f6..f228686a2 100644 --- a/templates/consumer-repo/.github/workflows/agents-keepalive-loop.yml +++ b/templates/consumer-repo/.github/workflows/agents-keepalive-loop.yml @@ -438,6 +438,18 @@ jobs: const beforeSha = '${{ needs.evaluate.outputs.head_sha }}'; const headSha = '${{ needs.run-codex.outputs.commit-sha }}'; + // Parse LLM completed tasks if available + let llmCompletedTasks = []; + const llmTasksJson = '${{ needs.run-codex.outputs.llm-completed-tasks || '[]' }}'; + try { + llmCompletedTasks = JSON.parse(llmTasksJson); + if (llmCompletedTasks.length > 0) { + core.info(`LLM analysis found ${llmCompletedTasks.length} completed task(s)`); + } + } catch (e) { + core.debug(`Failed to parse LLM tasks: ${e.message}`); + } + if (!prNumber || !beforeSha || !headSha) { core.info('Missing required inputs for task reconciliation'); return; @@ -447,12 +459,12 @@ jobs: core.info(`Comparing ${beforeSha.slice(0, 7)} β†’ ${headSha.slice(0, 7)}`); const result = await autoReconcileTasks({ - github, context, prNumber, baseSha: beforeSha, headSha, core + github, context, prNumber, baseSha: beforeSha, headSha, llmCompletedTasks, core }); if (result.updated) { core.info(`βœ… ${result.details}`); - 
core.notice(`Auto-checked ${result.tasksChecked} task(s) based on commit analysis`); + core.notice(`Auto-checked ${result.tasksChecked} task(s) based on analysis`); } else { core.info(`ℹ️ ${result.details}`); } @@ -488,5 +500,9 @@ jobs: agent_commit_sha: '${{ needs.run-codex.outputs.commit-sha }}', agent_files_changed: '${{ needs.run-codex.outputs.files-changed }}', agent_summary: process.env.CODEX_SUMMARY || '', + // LLM task analysis provider info + llm_provider: '${{ needs.run-codex.outputs.llm-provider || '' }}', + llm_confidence: '${{ needs.run-codex.outputs.llm-confidence || '' }}', + llm_analysis_run: '${{ needs.run-codex.outputs.llm-analysis-run }}' === 'true', }; await updateKeepaliveLoopSummary({ github, context, core, inputs }); diff --git a/tests/scripts/test_analyze_codex_session.py b/tests/scripts/test_analyze_codex_session.py new file mode 100644 index 000000000..0da7cb39e --- /dev/null +++ b/tests/scripts/test_analyze_codex_session.py @@ -0,0 +1,280 @@ +"""Tests for analyze_codex_session CLI script.""" + +from __future__ import annotations + +import json +import subprocess +import sys +from pathlib import Path +from unittest.mock import patch + +import pytest + +# Import functions directly for unit testing +sys.path.insert(0, str(Path(__file__).parent.parent.parent)) +from scripts.analyze_codex_session import ( + extract_all_tasks_from_pr_body, + extract_tasks_from_pr_body, + update_pr_body_checkboxes, +) + + +class TestExtractTasksFromPRBody: + """Tests for PR body task extraction.""" + + def test_extracts_unchecked_tasks(self) -> None: + pr_body = """ +## Tasks + +- [ ] Fix the bug +- [ ] Add tests +- [x] Update docs +""" + tasks = extract_tasks_from_pr_body(pr_body) + assert tasks == ["Fix the bug", "Add tests"] + + def test_handles_mixed_indentation(self) -> None: + pr_body = """ +- [ ] Task 1 + - [ ] Subtask 1a + - [ ] Sub-subtask +- [ ] Task 2 +""" + tasks = extract_tasks_from_pr_body(pr_body) + assert "Task 1" in tasks + assert "Subtask 1a" 
in tasks + assert "Task 2" in tasks + + def test_handles_uppercase_x(self) -> None: + pr_body = """ +- [X] Completed with uppercase +- [ ] Still pending +""" + tasks = extract_tasks_from_pr_body(pr_body) + assert tasks == ["Still pending"] + + def test_empty_body_returns_empty_list(self) -> None: + assert extract_tasks_from_pr_body("") == [] + + def test_no_checkboxes_returns_empty_list(self) -> None: + pr_body = """ +## Description +This PR fixes a bug. + +## Notes +- Item 1 +- Item 2 +""" + assert extract_tasks_from_pr_body(pr_body) == [] + + def test_extracts_from_multiple_sections(self) -> None: + pr_body = """ +## Tasks +- [ ] Task from tasks section + +## Acceptance Criteria +- [ ] Criterion 1 +- [ ] Criterion 2 +""" + tasks = extract_tasks_from_pr_body(pr_body) + assert len(tasks) == 3 + assert "Task from tasks section" in tasks + assert "Criterion 1" in tasks + + +class TestExtractAllTasksFromPRBody: + """Tests for extracting all tasks with status.""" + + def test_extracts_all_with_status(self) -> None: + pr_body = """ +- [ ] Unchecked task +- [x] Checked task +- [X] Also checked +""" + tasks = extract_all_tasks_from_pr_body(pr_body) + assert tasks == { + "Unchecked task": False, + "Checked task": True, + "Also checked": True, + } + + +class TestUpdatePRBodyCheckboxes: + """Tests for checkbox update logic.""" + + def test_checks_completed_task(self) -> None: + pr_body = "- [ ] Fix the bug\n- [ ] Add tests" + updated = update_pr_body_checkboxes(pr_body, ["Fix the bug"]) + assert "- [x] Fix the bug" in updated + assert "- [ ] Add tests" in updated + + def test_preserves_already_checked(self) -> None: + pr_body = "- [x] Already done\n- [ ] New task" + updated = update_pr_body_checkboxes(pr_body, ["New task"]) + assert "- [x] Already done" in updated + assert "- [x] New task" in updated + + def test_handles_special_characters_in_task(self) -> None: + pr_body = "- [ ] Fix bug (issue #123)" + updated = update_pr_body_checkboxes(pr_body, ["Fix bug (issue #123)"]) 
+ assert "- [x] Fix bug (issue #123)" in updated + + def test_handles_no_matches(self) -> None: + pr_body = "- [ ] Task A" + updated = update_pr_body_checkboxes(pr_body, ["Nonexistent task"]) + assert updated == pr_body + + def test_preserves_indentation(self) -> None: + pr_body = " - [ ] Indented task" + updated = update_pr_body_checkboxes(pr_body, ["Indented task"]) + assert " - [x] Indented task" in updated + + +class TestCLIScript: + """Integration tests for the CLI script.""" + + @pytest.fixture + def sample_session_file(self, tmp_path: Path) -> Path: + """Create a sample JSONL session file.""" + session_content = """{"type": "thread.started", "thread_id": "test123"} +{"type": "turn.started", "turn_id": "turn1"} +{"type": "item.completed", "item_type": "agent_message", "content": "I have fixed the bug in calculator.py. The tests now pass."} +{"type": "item.completed", "item_type": "command_execution", "command": "pytest", "exit_code": 0} +{"type": "turn.completed", "turn_id": "turn1"} +""" + session_file = tmp_path / "session.jsonl" + session_file.write_text(session_content) + return session_file + + @pytest.fixture + def sample_pr_body_file(self, tmp_path: Path) -> Path: + """Create a sample PR body file.""" + pr_body = """## Tasks +- [ ] Fix the bug +- [ ] Add tests +- [ ] Update documentation +""" + pr_body_file = tmp_path / "pr_body.md" + pr_body_file.write_text(pr_body) + return pr_body_file + + def test_cli_runs_with_task_args(self, sample_session_file: Path, tmp_path: Path) -> None: + """Test CLI with --tasks argument.""" + result = subprocess.run( + [ + sys.executable, + "scripts/analyze_codex_session.py", + "--session-file", + str(sample_session_file), + "--tasks", + "Fix the bug", + "Add tests", + "--output", + "json", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent, + ) + + # Should succeed (exit 0) + assert result.returncode == 0, f"stderr: {result.stderr}" + + # Output should be valid JSON + output = 
json.loads(result.stdout) + assert "provider" in output + assert "confidence" in output + + def test_cli_runs_with_pr_body_file( + self, sample_session_file: Path, sample_pr_body_file: Path + ) -> None: + """Test CLI with --pr-body-file argument.""" + result = subprocess.run( + [ + sys.executable, + "scripts/analyze_codex_session.py", + "--session-file", + str(sample_session_file), + "--pr-body-file", + str(sample_pr_body_file), + "--output", + "json", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent, + ) + + assert result.returncode == 0, f"stderr: {result.stderr}" + + def test_cli_returns_2_for_missing_session(self, tmp_path: Path) -> None: + """Test CLI returns exit code 2 for missing session file.""" + result = subprocess.run( + [ + sys.executable, + "scripts/analyze_codex_session.py", + "--session-file", + str(tmp_path / "nonexistent.jsonl"), + "--tasks", + "Some task", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent, + ) + + assert result.returncode == 2 + + def test_cli_markdown_output(self, sample_session_file: Path, tmp_path: Path) -> None: + """Test CLI with markdown output format.""" + result = subprocess.run( + [ + sys.executable, + "scripts/analyze_codex_session.py", + "--session-file", + str(sample_session_file), + "--tasks", + "Fix the bug", + "--output", + "markdown", + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent, + ) + + assert result.returncode == 0 + assert "**Analysis Summary**" in result.stdout + + def test_cli_update_pr_body_option( + self, sample_session_file: Path, sample_pr_body_file: Path, tmp_path: Path + ) -> None: + """Test CLI with --update-pr-body option.""" + updated_file = tmp_path / "updated_body.md" + + # Mock the LLM to return a known completion + with patch("tools.llm_provider.get_llm_provider") as mock_provider: + from tools.llm_provider import RegexFallbackProvider + + mock_provider.return_value = RegexFallbackProvider() + 
+ result = subprocess.run( + [ + sys.executable, + "scripts/analyze_codex_session.py", + "--session-file", + str(sample_session_file), + "--pr-body-file", + str(sample_pr_body_file), + "--output", + "json", + "--update-pr-body", + "--updated-body-file", + str(updated_file), + ], + capture_output=True, + text=True, + cwd=Path(__file__).parent.parent.parent, + ) + + assert result.returncode == 0 diff --git a/tests/tools/__init__.py b/tests/tools/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/tools/test_codex_jsonl_parser.py b/tests/tools/test_codex_jsonl_parser.py new file mode 100644 index 000000000..82cb1429a --- /dev/null +++ b/tests/tools/test_codex_jsonl_parser.py @@ -0,0 +1,261 @@ +"""Tests for tools/codex_jsonl_parser.py""" + +from tools.codex_jsonl_parser import ( + CodexSession, + CommandExecution, + FileChange, + TodoItem, + parse_codex_jsonl, +) + + +class TestCodexJSONLParser: + """Test JSONL parsing functionality.""" + + def test_parse_empty_content(self): + """Empty content returns empty session.""" + session = parse_codex_jsonl("") + assert session.raw_event_count == 0 + assert session.thread_id is None + assert len(session.agent_messages) == 0 + + def test_parse_thread_started(self): + """Thread started event sets thread_id.""" + jsonl = '{"type": "thread.started", "thread_id": "test-123"}' + session = parse_codex_jsonl(jsonl) + assert session.thread_id == "test-123" + assert session.raw_event_count == 1 + + def test_parse_turn_lifecycle(self): + """Turn events are tracked correctly.""" + jsonl = """ +{"type": "turn.started", "turn_id": "turn-1"} +{"type": "turn.completed", "turn_id": "turn-1", "token_usage": {"input_tokens": 100, "output_tokens": 50}} +""" + session = parse_codex_jsonl(jsonl) + assert len(session.turns) == 1 + assert session.turns[0].turn_id == "turn-1" + assert session.turns[0].completed is True + assert session.turns[0].input_tokens == 100 + assert session.turns[0].output_tokens == 50 + + def 
test_parse_turn_failed(self): + """Failed turns are tracked.""" + jsonl = """ +{"type": "turn.started", "turn_id": "turn-1"} +{"type": "turn.failed", "turn_id": "turn-1", "error": "Rate limited"} +""" + session = parse_codex_jsonl(jsonl) + assert len(session.turns) == 1 + assert session.turns[0].failed is True + assert session.turns[0].error == "Rate limited" + + def test_parse_agent_message_streaming(self): + """Agent messages with streaming updates are captured.""" + jsonl = """ +{"type": "item.started", "item_id": "msg-1", "item_type": "agent_message"} +{"type": "item.updated", "item_id": "msg-1", "content": "Hello "} +{"type": "item.updated", "item_id": "msg-1", "content": "world!"} +{"type": "item.completed", "item_id": "msg-1"} +""" + session = parse_codex_jsonl(jsonl) + assert len(session.agent_messages) == 1 + assert session.agent_messages[0] == "Hello world!" + + def test_parse_agent_message_old_schema(self): + """Old schema (assistant_message) is supported.""" + jsonl = '{"type": "item.completed", "item_type": "assistant_message", "content": "Done!"}' + session = parse_codex_jsonl(jsonl) + assert len(session.agent_messages) == 1 + assert session.agent_messages[0] == "Done!" 
+ + def test_parse_reasoning(self): + """Reasoning summaries are captured.""" + jsonl = '{"type": "item.completed", "item_type": "reasoning", "content": "I should fix the tests first."}' + session = parse_codex_jsonl(jsonl) + assert len(session.reasoning_summaries) == 1 + assert "fix the tests" in session.reasoning_summaries[0] + + def test_parse_command_execution(self): + """Command executions are tracked.""" + jsonl = '{"type": "item.completed", "item_type": "command_execution", "command": "pytest tests/", "exit_code": 0, "output": "1 passed"}' + session = parse_codex_jsonl(jsonl) + assert len(session.commands) == 1 + assert session.commands[0].command == "pytest tests/" + assert session.commands[0].exit_code == 0 + assert len(session.successful_commands) == 1 + assert len(session.failed_commands) == 0 + + def test_parse_failed_command(self): + """Failed commands are tracked separately.""" + jsonl = '{"type": "item.completed", "item_type": "command_execution", "command": "pytest", "exit_code": 1}' + session = parse_codex_jsonl(jsonl) + assert len(session.failed_commands) == 1 + assert session.failed_commands[0].exit_code == 1 + + def test_parse_file_change(self): + """File changes are tracked.""" + jsonl = '{"type": "item.completed", "item_type": "file_change", "path": "src/main.py", "change_type": "modified"}' + session = parse_codex_jsonl(jsonl) + assert len(session.file_changes) == 1 + assert session.file_changes[0].path == "src/main.py" + assert session.file_changes[0].change_type == "modified" + + def test_parse_todo_list(self): + """Todo list items are extracted.""" + jsonl = '{"type": "item.completed", "item_type": "todo_list", "items": [{"task": "Fix tests", "status": "completed"}, {"task": "Update docs", "status": "in_progress"}]}' + session = parse_codex_jsonl(jsonl) + assert len(session.todo_items) == 2 + assert session.todo_items[0].task == "Fix tests" + assert session.todo_items[0].status == "completed" + assert len(session.completed_todos) == 1 + + 
def test_parse_handles_invalid_json(self): + """Invalid JSON lines are logged but don't crash.""" + jsonl = """ +{"type": "thread.started", "thread_id": "test"} +not valid json +{"type": "turn.started", "turn_id": "turn-1"} +""" + session = parse_codex_jsonl(jsonl) + assert session.thread_id == "test" + assert len(session.turns) == 1 + assert len(session.parse_errors) == 1 + + def test_total_tokens(self): + """Token totals are calculated across turns.""" + jsonl = """ +{"type": "turn.started", "turn_id": "turn-1"} +{"type": "turn.completed", "turn_id": "turn-1", "token_usage": {"input_tokens": 100, "output_tokens": 50}} +{"type": "turn.started", "turn_id": "turn-2"} +{"type": "turn.completed", "turn_id": "turn-2", "token_usage": {"input_tokens": 200, "output_tokens": 100}} +""" + session = parse_codex_jsonl(jsonl) + assert session.total_input_tokens == 300 + assert session.total_output_tokens == 150 + + +class TestCodexSessionAnalysisText: + """Test analysis text generation.""" + + def test_get_analysis_text_with_messages(self): + """Analysis text includes agent messages.""" + session = CodexSession( + agent_messages=["I completed the task successfully."], + ) + text = session.get_analysis_text() + assert "Agent Messages" in text + assert "completed the task" in text + + def test_get_analysis_text_with_reasoning(self): + """Reasoning is included when requested.""" + session = CodexSession( + reasoning_summaries=["I should check the tests."], + ) + text = session.get_analysis_text(include_reasoning=True) + assert "Reasoning" in text + assert "check the tests" in text + + def test_get_analysis_text_without_reasoning(self): + """Reasoning can be excluded.""" + session = CodexSession( + reasoning_summaries=["Secret thoughts"], + ) + text = session.get_analysis_text(include_reasoning=False) + assert "Secret thoughts" not in text + + def test_get_analysis_text_with_todos(self): + """Todo items are formatted with status.""" + session = CodexSession( + todo_items=[ + 
TodoItem(task="Fix tests", status="completed"), + TodoItem(task="Update docs", status="in_progress"), + ], + ) + text = session.get_analysis_text() + assert "Todo List" in text + assert "βœ“ Fix tests" in text + assert "β†’ Update docs" in text + + def test_get_analysis_text_with_files(self): + """File changes are listed.""" + session = CodexSession( + file_changes=[ + FileChange(path="src/main.py", change_type="modified"), + FileChange(path="tests/test_main.py", change_type="added"), + ], + ) + text = session.get_analysis_text() + assert "Files Modified" in text + assert "modified: src/main.py" in text + assert "added: tests/test_main.py" in text + + def test_get_analysis_text_with_commands(self): + """Command summary is included.""" + session = CodexSession( + commands=[ + CommandExecution(command="pytest", exit_code=0, output=""), + CommandExecution(command="black .", exit_code=0, output=""), + CommandExecution(command="mypy", exit_code=1, output="error"), + ], + ) + text = session.get_analysis_text() + assert "Commands Executed" in text + assert "Total: 3" in text + assert "Successful: 2" in text + assert "Failed: 1" in text + + +class TestCompleteSession: + """Test parsing a complete realistic session.""" + + def test_parse_realistic_session(self): + """Parse a realistic multi-turn session.""" + jsonl = """ +{"type": "thread.started", "thread_id": "session-abc"} +{"type": "turn.started", "turn_id": "turn-1"} +{"type": "item.started", "item_id": "reason-1", "item_type": "reasoning"} +{"type": "item.updated", "item_id": "reason-1", "content": "The user wants me to fix tests. I'll run pytest first."} +{"type": "item.completed", "item_id": "reason-1"} +{"type": "item.completed", "item_type": "command_execution", "command": "pytest tests/", "exit_code": 1, "output": "2 failed"} +{"type": "item.started", "item_id": "msg-1", "item_type": "agent_message"} +{"type": "item.updated", "item_id": "msg-1", "content": "I found 2 failing tests. 
Let me fix them."} +{"type": "item.completed", "item_id": "msg-1"} +{"type": "item.completed", "item_type": "file_change", "path": "tests/test_calc.py", "change_type": "modified"} +{"type": "item.completed", "item_type": "command_execution", "command": "pytest tests/", "exit_code": 0, "output": "all passed"} +{"type": "item.started", "item_id": "msg-2", "item_type": "agent_message"} +{"type": "item.updated", "item_id": "msg-2", "content": "All tests pass now. The fix was to update the expected value."} +{"type": "item.completed", "item_id": "msg-2"} +{"type": "turn.completed", "turn_id": "turn-1", "token_usage": {"input_tokens": 500, "output_tokens": 200}} +""" + session = parse_codex_jsonl(jsonl) + + # Check overall structure + assert session.thread_id == "session-abc" + assert len(session.turns) == 1 + assert session.turns[0].completed + + # Check content + assert len(session.reasoning_summaries) == 1 + assert "run pytest" in session.reasoning_summaries[0] + + assert len(session.agent_messages) == 2 + assert "2 failing tests" in session.agent_messages[0] + assert "All tests pass" in session.agent_messages[1] + + assert len(session.commands) == 2 + assert len(session.successful_commands) == 1 + assert len(session.failed_commands) == 1 + + assert len(session.file_changes) == 1 + assert session.file_changes[0].path == "tests/test_calc.py" + + # Check tokens + assert session.total_input_tokens == 500 + assert session.total_output_tokens == 200 + + # Check analysis text + text = session.get_analysis_text() + assert "All tests pass" in text + assert "Files Modified" in text + assert "Commands Executed" in text diff --git a/tests/tools/test_llm_provider.py b/tests/tools/test_llm_provider.py new file mode 100644 index 000000000..714a10b9f --- /dev/null +++ b/tests/tools/test_llm_provider.py @@ -0,0 +1,232 @@ +"""Tests for tools/llm_provider.py""" + +import os +from unittest.mock import MagicMock, patch + +import pytest + +from tools.llm_provider import ( + 
CompletionAnalysis, + FallbackChainProvider, + GitHubModelsProvider, + OpenAIProvider, + RegexFallbackProvider, + check_providers, + get_llm_provider, +) + + +class TestProviderAvailability: + """Test provider availability checks.""" + + def test_github_models_available_with_token(self): + """GitHub Models is available when GITHUB_TOKEN is set.""" + with patch.dict(os.environ, {"GITHUB_TOKEN": "test-token"}): + provider = GitHubModelsProvider() + assert provider.is_available() is True + + def test_github_models_unavailable_without_token(self): + """GitHub Models is unavailable without GITHUB_TOKEN.""" + env = {k: v for k, v in os.environ.items() if k != "GITHUB_TOKEN"} + with patch.dict(os.environ, env, clear=True): + provider = GitHubModelsProvider() + assert provider.is_available() is False + + def test_openai_available_with_key(self): + """OpenAI is available when OPENAI_API_KEY is set.""" + with patch.dict(os.environ, {"OPENAI_API_KEY": "sk-test"}): + provider = OpenAIProvider() + assert provider.is_available() is True + + def test_openai_unavailable_without_key(self): + """OpenAI is unavailable without OPENAI_API_KEY.""" + env = {k: v for k, v in os.environ.items() if k != "OPENAI_API_KEY"} + with patch.dict(os.environ, env, clear=True): + provider = OpenAIProvider() + assert provider.is_available() is False + + def test_regex_always_available(self): + """Regex fallback is always available.""" + provider = RegexFallbackProvider() + assert provider.is_available() is True + + def test_check_providers_returns_dict(self): + """check_providers returns availability dict.""" + result = check_providers() + assert isinstance(result, dict) + assert "github-models" in result + assert "openai" in result + assert "regex-fallback" in result + assert result["regex-fallback"] is True + + +class TestRegexFallbackProvider: + """Test regex-based analysis.""" + + def test_detects_completion_keywords(self): + """Regex detects completion keywords.""" + provider = 
RegexFallbackProvider() + tasks = ["Fix the calculator tests"] + output = "I have completed fixing the calculator tests. They all pass now." + + result = provider.analyze_completion(output, tasks) + assert len(result.completed_tasks) == 1 + assert result.provider_used == "regex-fallback" + assert result.confidence < 0.5 # Low confidence for regex + + def test_detects_progress_keywords(self): + """Regex detects progress keywords.""" + provider = RegexFallbackProvider() + tasks = ["Update documentation"] + output = "I'm working on updating the documentation now." + + result = provider.analyze_completion(output, tasks) + assert len(result.in_progress_tasks) == 1 + + def test_detects_blocker_keywords(self): + """Regex detects blocker keywords.""" + provider = RegexFallbackProvider() + tasks = ["Deploy to production"] + output = "I'm blocked on the deploy - there's an error with credentials." + + result = provider.analyze_completion(output, tasks) + assert len(result.blocked_tasks) == 1 + + def test_no_false_positives_without_keywords(self): + """No detection without relevant keywords.""" + provider = RegexFallbackProvider() + tasks = ["Implement feature X"] + output = "Looking at the codebase structure." 
+ + result = provider.analyze_completion(output, tasks) + assert len(result.completed_tasks) == 0 + assert len(result.in_progress_tasks) == 0 + assert len(result.blocked_tasks) == 0 + + +class TestFallbackChainProvider: + """Test fallback chain behavior.""" + + def test_uses_first_available_provider(self): + """Chain uses first available provider.""" + mock_provider1 = MagicMock() + mock_provider1.name = "mock1" + mock_provider1.is_available.return_value = False + + mock_provider2 = MagicMock() + mock_provider2.name = "mock2" + mock_provider2.is_available.return_value = True + mock_provider2.analyze_completion.return_value = CompletionAnalysis( + completed_tasks=["task1"], + in_progress_tasks=[], + blocked_tasks=[], + confidence=0.9, + reasoning="test", + provider_used="mock2", + ) + + chain = FallbackChainProvider([mock_provider1, mock_provider2]) + result = chain.analyze_completion("output", ["task1"]) + + mock_provider1.analyze_completion.assert_not_called() + mock_provider2.analyze_completion.assert_called() + assert result.provider_used == "mock2" + + def test_falls_back_on_error(self): + """Chain falls back when provider raises error.""" + mock_provider1 = MagicMock() + mock_provider1.name = "mock1" + mock_provider1.is_available.return_value = True + mock_provider1.analyze_completion.side_effect = RuntimeError("API error") + + mock_provider2 = MagicMock() + mock_provider2.name = "mock2" + mock_provider2.is_available.return_value = True + mock_provider2.analyze_completion.return_value = CompletionAnalysis( + completed_tasks=[], + in_progress_tasks=[], + blocked_tasks=[], + confidence=0.5, + reasoning="fallback", + provider_used="mock2", + ) + + chain = FallbackChainProvider([mock_provider1, mock_provider2]) + result = chain.analyze_completion("output", ["task1"]) + + assert result.provider_used == "mock2" + + def test_raises_when_all_fail(self): + """Chain raises error when all providers fail.""" + mock_provider = MagicMock() + mock_provider.name = "mock" + 
mock_provider.is_available.return_value = True + mock_provider.analyze_completion.side_effect = RuntimeError("Failed") + + chain = FallbackChainProvider([mock_provider]) + + with pytest.raises(RuntimeError, match="All providers failed"): + chain.analyze_completion("output", ["task1"]) + + +class TestGetLLMProvider: + """Test get_llm_provider factory.""" + + def test_returns_fallback_chain(self): + """get_llm_provider returns a FallbackChainProvider.""" + provider = get_llm_provider() + assert isinstance(provider, FallbackChainProvider) + + def test_chain_always_available(self): + """Chain is always available (regex fallback).""" + provider = get_llm_provider() + assert provider.is_available() is True + + +class TestCompletionAnalysis: + """Test CompletionAnalysis dataclass.""" + + def test_dataclass_creation(self): + """CompletionAnalysis can be created.""" + analysis = CompletionAnalysis( + completed_tasks=["task1", "task2"], + in_progress_tasks=["task3"], + blocked_tasks=[], + confidence=0.85, + reasoning="Tasks 1 and 2 were completed based on output.", + provider_used="test", + ) + assert len(analysis.completed_tasks) == 2 + assert analysis.confidence == 0.85 + + +class TestGitHubModelsProvider: + """Test GitHub Models provider (mocked).""" + + def test_parse_response_valid_json(self): + """Parses valid JSON response.""" + provider = GitHubModelsProvider() + response = """ +Here's my analysis: +{ + "completed": ["task1"], + "in_progress": ["task2"], + "blocked": [], + "confidence": 0.9, + "reasoning": "Task 1 was explicitly marked done." +} +""" + result = provider._parse_response(response, ["task1", "task2"]) + assert result.completed_tasks == ["task1"] + assert result.in_progress_tasks == ["task2"] + assert result.confidence == 0.9 + + def test_parse_response_invalid_json(self): + """Handles invalid JSON gracefully.""" + provider = GitHubModelsProvider() + response = "I couldn't analyze this properly." 
+ + result = provider._parse_response(response, ["task1"]) + assert result.completed_tasks == [] + assert result.confidence == 0.0 + assert "parse" in result.reasoning.lower() diff --git a/tools/codex_jsonl_parser.py b/tools/codex_jsonl_parser.py new file mode 100644 index 000000000..60fab970e --- /dev/null +++ b/tools/codex_jsonl_parser.py @@ -0,0 +1,376 @@ +""" +Codex JSONL Event Parser + +Parses the JSONL event stream from `codex exec --json` for task completion analysis. + +Event types supported: +- thread.started / turn.started / turn.completed / turn.failed +- item.started / item.updated / item.completed +- Item types: agent_message, reasoning, command_execution, file_change, todo_list + +Usage: + from tools.codex_jsonl_parser import parse_codex_jsonl, CodexSession + + session = parse_codex_jsonl(jsonl_content) + print(session.agent_messages) + print(session.file_changes) +""" + +from __future__ import annotations + +import json +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +logger = logging.getLogger(__name__) + + +@dataclass +class CommandExecution: + """Represents a shell command executed by Codex.""" + + command: str + exit_code: int + output: str + duration_seconds: float | None = None + + +@dataclass +class FileChange: + """Represents a file modification by Codex.""" + + path: str + change_type: str # added, modified, deleted + content_preview: str | None = None + + +@dataclass +class TodoItem: + """Represents a task in Codex's todo list.""" + + task: str + status: str # completed, in_progress, not_started, blocked + + +@dataclass +class TurnInfo: + """Information about a conversation turn.""" + + turn_id: str + input_tokens: int = 0 + output_tokens: int = 0 + reasoning_tokens: int = 0 + completed: bool = False + failed: bool = False + error: str | None = None + + +@dataclass +class CodexSession: + """Parsed Codex session data from JSONL events.""" + + # Thread info + thread_id: str | None 
= None + + # Turns + turns: list[TurnInfo] = field(default_factory=list) + + # High-value content for analysis + agent_messages: list[str] = field(default_factory=list) + reasoning_summaries: list[str] = field(default_factory=list) + + # Concrete work evidence + commands: list[CommandExecution] = field(default_factory=list) + file_changes: list[FileChange] = field(default_factory=list) + + # Direct task mapping (if available) + todo_items: list[TodoItem] = field(default_factory=list) + + # Raw events (for debugging) + raw_event_count: int = 0 + parse_errors: list[str] = field(default_factory=list) + + @property + def total_input_tokens(self) -> int: + return sum(t.input_tokens for t in self.turns) + + @property + def total_output_tokens(self) -> int: + return sum(t.output_tokens for t in self.turns) + + @property + def successful_commands(self) -> list[CommandExecution]: + return [c for c in self.commands if c.exit_code == 0] + + @property + def failed_commands(self) -> list[CommandExecution]: + return [c for c in self.commands if c.exit_code != 0] + + @property + def completed_todos(self) -> list[TodoItem]: + return [t for t in self.todo_items if t.status == "completed"] + + def get_analysis_text(self, include_reasoning: bool = True) -> str: + """ + Get consolidated text suitable for LLM analysis. 
+ + Args: + include_reasoning: Whether to include reasoning summaries + + Returns: + Formatted text with key session information + """ + sections = [] + + # Agent messages (highest signal) + if self.agent_messages: + sections.append("## Agent Messages") + for msg in self.agent_messages: + sections.append(msg[:2000]) # Truncate long messages + sections.append("") + + # Reasoning (if requested) + if include_reasoning and self.reasoning_summaries: + sections.append("## Reasoning Summaries") + for reason in self.reasoning_summaries: + sections.append(reason[:1000]) + sections.append("") + + # Todo list (direct task mapping) + if self.todo_items: + sections.append("## Todo List") + for item in self.todo_items: + status_emoji = { + "completed": "βœ“", + "in_progress": "β†’", + "blocked": "βœ—", + "not_started": "β—‹", + }.get(item.status, "?") + sections.append(f"{status_emoji} {item.task}") + sections.append("") + + # File changes (concrete evidence) + if self.file_changes: + sections.append("## Files Modified") + for fc in self.file_changes: + sections.append(f"- {fc.change_type}: {fc.path}") + sections.append("") + + # Command summary + if self.commands: + sections.append("## Commands Executed") + sections.append(f"- Total: {len(self.commands)}") + sections.append(f"- Successful: {len(self.successful_commands)}") + sections.append(f"- Failed: {len(self.failed_commands)}") + if self.failed_commands: + sections.append("- Failed commands:") + for cmd in self.failed_commands[:3]: # Limit to first 3 + sections.append(f" - {cmd.command[:100]} (exit {cmd.exit_code})") + sections.append("") + + return "\n".join(sections) + + +class CodexJSONLParser: + """Parser for Codex JSONL event streams.""" + + def __init__(self): + self._session = CodexSession() + self._current_items: dict[str, dict] = {} # item_id -> item data + + def parse(self, jsonl_content: str) -> CodexSession: + """ + Parse JSONL content into a CodexSession. 
+ + Args: + jsonl_content: Raw JSONL string (one JSON object per line) + + Returns: + Parsed CodexSession + """ + for line_num, line in enumerate(jsonl_content.strip().split("\n"), 1): + line = line.strip() + if not line: + continue + + try: + event = json.loads(line) + self._process_event(event) + self._session.raw_event_count += 1 + except json.JSONDecodeError as e: + error_msg = f"Line {line_num}: JSON parse error: {e}" + logger.warning(error_msg) + self._session.parse_errors.append(error_msg) + except Exception as e: + error_msg = f"Line {line_num}: Processing error: {e}" + logger.warning(error_msg) + self._session.parse_errors.append(error_msg) + + return self._session + + def _process_event(self, event: dict[str, Any]) -> None: + """Process a single event.""" + event_type = event.get("type", "") + + # Thread events + if event_type == "thread.started": + self._session.thread_id = event.get("thread_id") + + # Turn events + elif event_type == "turn.started": + turn = TurnInfo(turn_id=event.get("turn_id", "")) + self._session.turns.append(turn) + + elif event_type == "turn.completed": + turn_id = event.get("turn_id") + usage = event.get("token_usage", {}) + for turn in self._session.turns: + if turn.turn_id == turn_id: + turn.completed = True + turn.input_tokens = usage.get("input_tokens", 0) + turn.output_tokens = usage.get("output_tokens", 0) + turn.reasoning_tokens = usage.get("reasoning_tokens", 0) + break + + elif event_type == "turn.failed": + turn_id = event.get("turn_id") + for turn in self._session.turns: + if turn.turn_id == turn_id: + turn.failed = True + turn.error = event.get("error") + break + + # Item events + elif event_type == "item.started": + item_id = event.get("item_id") + # Handle both old (item_type) and new (type in nested object) schemas + item_type = event.get("item_type") or event.get("item", {}).get("type") + if item_id: + self._current_items[item_id] = { + "type": item_type, + "content": "", + } + + elif event_type == "item.updated": 
+ item_id = event.get("item_id") + if item_id in self._current_items: + # Append content updates + content = event.get("content", "") + self._current_items[item_id]["content"] += content + + elif event_type == "item.completed": + item_id = event.get("item_id") + item_data = self._current_items.pop(item_id, None) + + if not item_data: + # Try to get item type from event itself + item_type = event.get("item_type") or event.get("item", {}).get("type") + item_data = {"type": item_type, "content": ""} + + item_type = item_data.get("type") + content = item_data.get("content", "") or event.get("content", "") + + self._handle_completed_item(item_type, content, event) + + def _handle_completed_item( + self, item_type: str | None, content: str, event: dict[str, Any] + ) -> None: + """Handle a completed item based on its type.""" + + # Handle schema variations (old: assistant_message, new: agent_message) + if item_type in ("agent_message", "assistant_message"): + if content: + self._session.agent_messages.append(content) + + elif item_type == "reasoning": + if content: + self._session.reasoning_summaries.append(content) + + elif item_type == "command_execution": + cmd = CommandExecution( + command=event.get("command", content), + exit_code=event.get("exit_code", 0), + output=event.get("output", ""), + duration_seconds=event.get("duration"), + ) + self._session.commands.append(cmd) + + elif item_type == "file_change": + fc = FileChange( + path=event.get("path", ""), + change_type=event.get("change_type", "modified"), + content_preview=content[:500] if content else None, + ) + self._session.file_changes.append(fc) + + elif item_type == "todo_list": + # Parse todo items from content or event + items = event.get("items", []) + if not items and content: + # Try to parse from content + import contextlib + + with contextlib.suppress(json.JSONDecodeError): + items = json.loads(content) + + for item in items: + if isinstance(item, dict): + todo = TodoItem( + task=item.get("task", ""), 
+ status=item.get("status", "not_started"), + ) + self._session.todo_items.append(todo) + + +def parse_codex_jsonl(jsonl_content: str) -> CodexSession: + """ + Parse Codex JSONL event stream. + + Args: + jsonl_content: Raw JSONL string from `codex exec --json` + + Returns: + Parsed CodexSession with all extracted information + """ + parser = CodexJSONLParser() + return parser.parse(jsonl_content) + + +def parse_codex_jsonl_file(file_path: str | Path) -> CodexSession: + """ + Parse Codex JSONL from a file. + + Args: + file_path: Path to JSONL file + + Returns: + Parsed CodexSession + """ + path = Path(file_path) + content = path.read_text() + return parse_codex_jsonl(content) + + +if __name__ == "__main__": + # Example usage + sample_jsonl = """ +{"type": "thread.started", "thread_id": "abc123"} +{"type": "turn.started", "turn_id": "turn1"} +{"type": "item.started", "item_id": "msg1", "item_type": "agent_message"} +{"type": "item.updated", "item_id": "msg1", "content": "I'll fix the test failures "} +{"type": "item.updated", "item_id": "msg1", "content": "in the calculator module."} +{"type": "item.completed", "item_id": "msg1"} +{"type": "item.completed", "item_type": "command_execution", "command": "pytest tests/", "exit_code": 0} +{"type": "item.completed", "item_type": "file_change", "path": "src/calc.py", "change_type": "modified"} +{"type": "turn.completed", "turn_id": "turn1", "token_usage": {"input_tokens": 1000, "output_tokens": 500}} +""" + + session = parse_codex_jsonl(sample_jsonl) + print(f"Thread ID: {session.thread_id}") + print(f"Events parsed: {session.raw_event_count}") + print(f"Agent messages: {len(session.agent_messages)}") + print(f"Commands: {len(session.commands)}") + print(f"File changes: {len(session.file_changes)}") + print(f"\nAnalysis text:\n{session.get_analysis_text()}") diff --git a/tools/codex_session_analyzer.py b/tools/codex_session_analyzer.py new file mode 100644 index 000000000..5878725b1 --- /dev/null +++ 
b/tools/codex_session_analyzer.py @@ -0,0 +1,271 @@ +""" +Codex Session Analyzer + +Analyzes Codex session output to determine task completion status. +Supports multiple data source options: +- Option A: Final summary only (--output-last-message) +- Option B: Full JSONL stream (--json) +- Option B subset: Filtered to high-value events only + +Usage: + from tools.codex_session_analyzer import analyze_session, AnalysisResult + + # From JSONL + result = analyze_session(jsonl_content, tasks, data_source="jsonl") + + # From summary + result = analyze_session(summary_text, tasks, data_source="summary") +""" + +from __future__ import annotations + +import logging +from dataclasses import dataclass +from typing import Literal + +from tools.codex_jsonl_parser import CodexSession, parse_codex_jsonl +from tools.llm_provider import CompletionAnalysis, get_llm_provider + +logger = logging.getLogger(__name__) + +DataSource = Literal["jsonl", "jsonl_filtered", "summary", "auto"] + + +@dataclass +class AnalysisResult: + """Complete analysis result with metadata.""" + + # Core analysis + completion: CompletionAnalysis + + # Session metadata (if JSONL was parsed) + session: CodexSession | None = None + + # Data source used + data_source: str = "unknown" + + # Statistics + input_length: int = 0 + analysis_text_length: int = 0 + + @property + def has_completions(self) -> bool: + """Check if any tasks were marked complete.""" + return len(self.completion.completed_tasks) > 0 + + @property + def has_progress(self) -> bool: + """Check if any work was done (completed or in progress).""" + return ( + len(self.completion.completed_tasks) > 0 or len(self.completion.in_progress_tasks) > 0 + ) + + @property + def is_stalled(self) -> bool: + """Check if session appears stalled (no progress, maybe blocked).""" + return not self.has_progress and len(self.completion.blocked_tasks) > 0 + + def get_checkbox_updates(self) -> dict[str, bool]: + """ + Get mapping of task -> checked status for PR body 
update. + + Returns: + Dict mapping task text to checkbox state (True = checked) + """ + updates = {} + for task in self.completion.completed_tasks: + updates[task] = True + # Don't uncheck anything - only mark completions + return updates + + def get_summary(self) -> str: + """Get human-readable summary of the analysis.""" + lines = [ + f"**Analysis Summary** (confidence: {self.completion.confidence:.0%})", + f"- Provider: {self.completion.provider_used}", + f"- Data source: {self.data_source}", + "", + ] + + if self.completion.completed_tasks: + lines.append("**Completed:**") + for task in self.completion.completed_tasks: + lines.append(f"- βœ“ {task}") + lines.append("") + + if self.completion.in_progress_tasks: + lines.append("**In Progress:**") + for task in self.completion.in_progress_tasks: + lines.append(f"- β†’ {task}") + lines.append("") + + if self.completion.blocked_tasks: + lines.append("**Blocked:**") + for task in self.completion.blocked_tasks: + lines.append(f"- βœ— {task}") + lines.append("") + + if self.completion.reasoning: + lines.append(f"**Analysis:** {self.completion.reasoning}") + + return "\n".join(lines) + + +def analyze_session( + content: str, + tasks: list[str], + data_source: DataSource = "auto", + include_reasoning: bool = True, + context: str | None = None, +) -> AnalysisResult: + """ + Analyze Codex session output to determine task completion. + + Args: + content: Session output (JSONL or summary text) + tasks: List of task descriptions from PR checkboxes + data_source: How to interpret content: + - "jsonl": Parse as full JSONL stream + - "jsonl_filtered": Parse JSONL, use only agent_message + reasoning + - "summary": Treat as plain text summary + - "auto": Auto-detect based on content + include_reasoning: Include reasoning summaries in analysis (for JSONL) + context: Additional context (PR description, etc.) 
+ + Returns: + AnalysisResult with completion status and metadata + """ + # Auto-detect data source + if data_source == "auto": + data_source = _detect_data_source(content) + logger.info(f"Auto-detected data source: {data_source}") + + session = None + analysis_text = content + + # Parse JSONL if applicable + if data_source in ("jsonl", "jsonl_filtered"): + try: + session = parse_codex_jsonl(content) + analysis_text = session.get_analysis_text( + include_reasoning=(data_source == "jsonl" and include_reasoning) + ) + logger.info( + f"Parsed JSONL: {session.raw_event_count} events, " + f"{len(session.agent_messages)} messages, " + f"{len(session.commands)} commands" + ) + except Exception as e: + logger.warning(f"Failed to parse as JSONL, falling back to summary: {e}") + data_source = "summary" + analysis_text = content + + # Get LLM provider and analyze + provider = get_llm_provider() + + try: + completion = provider.analyze_completion( + session_output=analysis_text, + tasks=tasks, + context=context, + ) + except Exception as e: + logger.error(f"Analysis failed: {e}") + # Return empty result on failure + completion = CompletionAnalysis( + completed_tasks=[], + in_progress_tasks=[], + blocked_tasks=[], + confidence=0.0, + reasoning=f"Analysis failed: {e}", + provider_used="error", + ) + + return AnalysisResult( + completion=completion, + session=session, + data_source=data_source, + input_length=len(content), + analysis_text_length=len(analysis_text), + ) + + +def _detect_data_source(content: str) -> DataSource: + """ + Auto-detect whether content is JSONL or plain text. 
+ + Args: + content: Raw content to analyze + + Returns: + Detected data source type + """ + # Check first few lines for JSON structure + lines = content.strip().split("\n")[:5] + json_lines = 0 + + for line in lines: + line = line.strip() + if line.startswith("{") and line.endswith("}"): + json_lines += 1 + + # If most lines look like JSON, treat as JSONL + if json_lines >= len(lines) * 0.5: + return "jsonl" + + return "summary" + + +def analyze_from_files( + session_file: str, + tasks_file: str | None = None, + tasks: list[str] | None = None, +) -> AnalysisResult: + """ + Convenience function to analyze from file paths. + + Args: + session_file: Path to session output file + tasks_file: Path to file with tasks (one per line) + tasks: List of tasks (alternative to tasks_file) + + Returns: + AnalysisResult + """ + from pathlib import Path + + content = Path(session_file).read_text() + + if tasks is None: + if tasks_file: + task_text = Path(tasks_file).read_text() + tasks = [t.strip() for t in task_text.split("\n") if t.strip()] + else: + raise ValueError("Either tasks or tasks_file must be provided") + + return analyze_session(content, tasks) + + +if __name__ == "__main__": + + logging.basicConfig(level=logging.INFO) + + # Example usage + sample_tasks = [ + "Fix test failures in calculator module", + "Update documentation", + "Add type hints", + ] + + sample_jsonl = """ +{"type": "thread.started", "thread_id": "abc123"} +{"type": "turn.started", "turn_id": "turn1"} +{"type": "item.completed", "item_type": "agent_message", "content": "I've completed fixing the test failures in the calculator module. The tests now pass. 
"""
LLM Provider Abstraction with Fallback Chain

Provides a unified interface for LLM calls with automatic fallback:
1. GitHub Models API (primary) - uses GITHUB_TOKEN
2. OpenAI API (fallback) - uses OPENAI_API_KEY
3. Regex patterns (last resort) - no API calls

Usage:
    from tools.llm_provider import get_llm_provider, LLMProvider

    provider = get_llm_provider()
    result = provider.analyze_completion(session_text, tasks)
"""

from __future__ import annotations

import json
import logging
import os
from abc import ABC, abstractmethod
from dataclasses import dataclass, replace

logger = logging.getLogger(__name__)

# GitHub Models API endpoint (OpenAI-compatible)
GITHUB_MODELS_BASE_URL = "https://models.inference.ai.azure.com"
DEFAULT_MODEL = "gpt-4o-mini"
# Maximum characters of session output embedded in a prompt (token-limit guard).
MAX_SESSION_CHARS = 8000


@dataclass
class CompletionAnalysis:
    """Result of task completion analysis."""

    completed_tasks: list[str]  # Task descriptions marked complete
    in_progress_tasks: list[str]  # Tasks currently being worked on
    blocked_tasks: list[str]  # Tasks that are blocked
    confidence: float  # 0.0 to 1.0
    reasoning: str  # Explanation of the analysis
    provider_used: str  # Which provider generated this


class LLMProvider(ABC):
    """Abstract base class for LLM providers."""

    @property
    @abstractmethod
    def name(self) -> str:
        """Provider name for logging."""

    @abstractmethod
    def is_available(self) -> bool:
        """Check if this provider can be used."""

    @abstractmethod
    def analyze_completion(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> CompletionAnalysis:
        """
        Analyze session output to determine task completion status.

        Args:
            session_output: Codex session output (summary or JSONL events)
            tasks: List of task descriptions from PR checkboxes
            context: Optional additional context (PR description, etc.)

        Returns:
            CompletionAnalysis with task status breakdown
        """


class GitHubModelsProvider(LLMProvider):
    """LLM provider using GitHub Models API (OpenAI-compatible)."""

    @property
    def name(self) -> str:
        return "github-models"

    def is_available(self) -> bool:
        return bool(os.environ.get("GITHUB_TOKEN"))

    def _get_client(self):
        """Build a LangChain ChatOpenAI client aimed at GitHub Models.

        Returns None when langchain_openai is not installed, so callers can
        degrade gracefully instead of crashing on import.
        """
        try:
            from langchain_openai import ChatOpenAI
        except ImportError:
            logger.warning("langchain_openai not installed")
            return None

        return ChatOpenAI(
            model=DEFAULT_MODEL,
            base_url=GITHUB_MODELS_BASE_URL,
            api_key=os.environ.get("GITHUB_TOKEN"),
            temperature=0.1,  # Low temperature for consistent analysis
        )

    def analyze_completion(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> CompletionAnalysis:
        """See LLMProvider.analyze_completion.

        Raises:
            RuntimeError: If the LangChain client cannot be constructed.
            Exception: Propagates API errors (logged before re-raising) so the
                fallback chain can try the next provider.
        """
        client = self._get_client()
        if not client:
            raise RuntimeError("LangChain OpenAI not available")

        prompt = self._build_analysis_prompt(session_output, tasks, context)

        try:
            response = client.invoke(prompt)
            return self._parse_response(response.content, tasks)
        except Exception:
            logger.exception("GitHub Models API error")
            raise

    def _build_analysis_prompt(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> str:
        """Build the completion-analysis prompt.

        Bug fixes vs. the earlier revision: the inline source comment
        "# Truncate to avoid token limits" was part of the f-string and was
        sent to the model as prompt text; and ``context`` was accepted but
        silently ignored. The truncation now happens outside the string and
        ``context`` is included in its own section when provided.
        """
        task_list = "\n".join(f"- [ ] {task}" for task in tasks)
        # Truncate the session output so the prompt stays within token limits.
        session_excerpt = session_output[:MAX_SESSION_CHARS]
        context_section = f"\n## Additional Context\n{context}\n" if context else ""

        return f"""Analyze this Codex session output and determine which tasks have been completed.

## Tasks to Track
{task_list}

## Session Output
{session_excerpt}
{context_section}
## Instructions
For each task, determine if it was:
- COMPLETED: Clear evidence the task was finished
- IN_PROGRESS: Work started but not finished
- BLOCKED: Cannot proceed due to an issue
- NOT_STARTED: No evidence of work on this task

Respond in JSON format:
{{
  "completed": ["task description 1", ...],
  "in_progress": ["task description 2", ...],
  "blocked": ["task description 3", ...],
  "confidence": 0.85,
  "reasoning": "Brief explanation of your analysis"
}}

Only include tasks in completed/in_progress/blocked if you have evidence. Be conservative - if unsure, don't mark as completed."""

    def _parse_response(self, content: str, tasks: list[str]) -> CompletionAnalysis:
        """Parse the LLM's (possibly fenced) JSON response into CompletionAnalysis.

        ``tasks`` is accepted for interface symmetry but not currently used;
        the response's own task lists are trusted as-is. On any parse failure
        an empty, zero-confidence analysis is returned instead of raising.
        """
        try:
            # Extract the outermost {...} span to tolerate markdown fences
            # or prose around the JSON payload.
            json_start = content.find("{")
            json_end = content.rfind("}") + 1
            if json_start >= 0 and json_end > json_start:
                data = json.loads(content[json_start:json_end])
            else:
                raise ValueError("No JSON found in response")

            # Tolerate non-numeric confidence values and clamp to the
            # documented 0.0-1.0 range.
            try:
                confidence = float(data.get("confidence", 0.5))
            except (TypeError, ValueError):
                confidence = 0.5
            confidence = min(max(confidence, 0.0), 1.0)

            return CompletionAnalysis(
                completed_tasks=data.get("completed", []),
                in_progress_tasks=data.get("in_progress", []),
                blocked_tasks=data.get("blocked", []),
                confidence=confidence,
                reasoning=data.get("reasoning", ""),
                provider_used=self.name,
            )
        except (json.JSONDecodeError, ValueError) as e:
            logger.warning("Failed to parse LLM response: %s", e)
            # Return empty analysis on parse failure
            return CompletionAnalysis(
                completed_tasks=[],
                in_progress_tasks=[],
                blocked_tasks=[],
                confidence=0.0,
                reasoning=f"Failed to parse response: {e}",
                provider_used=self.name,
            )


class OpenAIProvider(LLMProvider):
    """LLM provider using OpenAI API directly."""

    @property
    def name(self) -> str:
        return "openai"

    def is_available(self) -> bool:
        return bool(os.environ.get("OPENAI_API_KEY"))

    def _get_client(self):
        """Build a LangChain ChatOpenAI client, or None if the package is missing."""
        try:
            from langchain_openai import ChatOpenAI
        except ImportError:
            logger.warning("langchain_openai not installed")
            return None

        return ChatOpenAI(
            model=DEFAULT_MODEL,
            api_key=os.environ.get("OPENAI_API_KEY"),
            temperature=0.1,
        )

    def analyze_completion(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> CompletionAnalysis:
        """See LLMProvider.analyze_completion.

        Raises:
            RuntimeError: If the LangChain client cannot be constructed.
        """
        client = self._get_client()
        if not client:
            raise RuntimeError("LangChain OpenAI not available")

        # Reuse GitHubModelsProvider's prompt/parse logic; only the endpoint
        # and credentials differ between the two providers.
        github_provider = GitHubModelsProvider()
        prompt = github_provider._build_analysis_prompt(session_output, tasks, context)

        try:
            response = client.invoke(prompt)
            result = github_provider._parse_response(response.content, tasks)
            # Re-label the result with this provider's name.
            return replace(result, provider_used=self.name)
        except Exception:
            logger.exception("OpenAI API error")
            raise


class RegexFallbackProvider(LLMProvider):
    """Fallback provider using keyword matching (no API calls).

    NOTE(review): the *_PATTERNS regex lists below are part of the public
    surface but are not used by the current heuristic, which matches plain
    signal phrases instead — kept for external callers / future use.
    """

    # Patterns indicating task completion (currently unused; see class note).
    COMPLETION_PATTERNS = [
        r"(?:completed?|finished|done|implemented|fixed|resolved)\s+(?:the\s+)?(.+?)(?:\.|$)",
        r"βœ“\s+(.+?)(?:\.|$)",
        r"\[x\]\s+(.+?)(?:\.|$)",
        r"successfully\s+(?:completed?|implemented|fixed)\s+(.+?)(?:\.|$)",
    ]

    # Patterns indicating work in progress (currently unused; see class note).
    PROGRESS_PATTERNS = [
        r"(?:working on|started|beginning|implementing)\s+(.+?)(?:\.|$)",
        r"(?:in progress|ongoing):\s*(.+?)(?:\.|$)",
    ]

    # Patterns indicating blockers (currently unused; see class note).
    BLOCKER_PATTERNS = [
        r"(?:blocked|stuck|cannot|failed|error)\s+(?:on\s+)?(.+?)(?:\.|$)",
        r"(?:issue|problem|bug)\s+(?:with\s+)?(.+?)(?:\.|$)",
    ]

    # Plain-text signal phrases actually used by the keyword heuristic.
    _COMPLETION_SIGNALS = ["completed", "finished", "done", "fixed", "βœ“", "[x]"]
    _PROGRESS_SIGNALS = ["working on", "started", "implementing", "in progress"]
    _BLOCKER_SIGNALS = ["blocked", "stuck", "failed", "error", "cannot"]

    @property
    def name(self) -> str:
        return "regex-fallback"

    def is_available(self) -> bool:
        return True  # Always available

    @staticmethod
    def _matches(task_words: set[str], output_lower: str, signals: list[str]) -> bool:
        """True when any signal phrase AND any significant (>3 chars) task word
        both appear anywhere in the output. This is a loose co-occurrence
        check, not a proximity check — hence the provider's low confidence.
        """
        if not any(signal in output_lower for signal in signals):
            return False
        return any(word in output_lower for word in task_words if len(word) > 3)

    def analyze_completion(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> CompletionAnalysis:
        """Classify each task by keyword co-occurrence in the session output.

        ``context`` is accepted for interface compatibility but unused here.
        """
        output_lower = session_output.lower()
        completed: list[str] = []
        in_progress: list[str] = []
        blocked: list[str] = []

        for task in tasks:
            task_words = set(task.lower().split())

            # Precedence: completed > blocked > in-progress.
            if self._matches(task_words, output_lower, self._COMPLETION_SIGNALS):
                completed.append(task)
            elif self._matches(task_words, output_lower, self._BLOCKER_SIGNALS):
                blocked.append(task)
            elif self._matches(task_words, output_lower, self._PROGRESS_SIGNALS):
                in_progress.append(task)

        return CompletionAnalysis(
            completed_tasks=completed,
            in_progress_tasks=in_progress,
            blocked_tasks=blocked,
            confidence=0.3,  # Low confidence for keyword matching
            reasoning="Pattern-based analysis (no LLM available)",
            provider_used=self.name,
        )


class FallbackChainProvider(LLMProvider):
    """Provider that tries multiple providers in sequence until one succeeds."""

    def __init__(self, providers: list[LLMProvider]):
        self._providers = providers
        self._active_provider: LLMProvider | None = None

    @property
    def name(self) -> str:
        # Reflect the provider most recently attempted, if any.
        if self._active_provider:
            return f"fallback-chain({self._active_provider.name})"
        return "fallback-chain"

    def is_available(self) -> bool:
        return any(p.is_available() for p in self._providers)

    def analyze_completion(
        self,
        session_output: str,
        tasks: list[str],
        context: str | None = None,
    ) -> CompletionAnalysis:
        """Try each available provider in order; return the first success.

        Raises:
            RuntimeError: If every available provider fails, or none is
                available at all.
        """
        last_error: Exception | None = None

        for provider in self._providers:
            if not provider.is_available():
                logger.debug("Provider %s not available, skipping", provider.name)
                continue

            try:
                logger.info("Attempting analysis with %s", provider.name)
                self._active_provider = provider
                result = provider.analyze_completion(session_output, tasks, context)
                logger.info("Successfully analyzed with %s", provider.name)
                return result
            except Exception as e:  # deliberate broad catch: fall through to next provider
                logger.warning("Provider %s failed: %s", provider.name, e)
                last_error = e

        if last_error:
            raise RuntimeError(f"All providers failed. Last error: {last_error}")
        raise RuntimeError("No providers available")


def get_llm_provider() -> LLMProvider:
    """
    Get the best available LLM provider with fallback chain.

    Returns a FallbackChainProvider that tries:
    1. GitHub Models API (if GITHUB_TOKEN set)
    2. OpenAI API (if OPENAI_API_KEY set)
    3. Regex fallback (always available)
    """
    providers: list[LLMProvider] = [
        GitHubModelsProvider(),
        OpenAIProvider(),
        RegexFallbackProvider(),
    ]

    return FallbackChainProvider(providers)


def check_providers() -> dict[str, bool]:
    """Report availability of each concrete provider, keyed by provider name."""
    return {
        "github-models": GitHubModelsProvider().is_available(),
        "openai": OpenAIProvider().is_available(),
        "regex-fallback": True,  # never needs credentials
    }


if __name__ == "__main__":
    # Quick smoke test: print availability and the chain's current label.
    logging.basicConfig(level=logging.INFO)

    print("Provider availability:")
    for provider_name, available in check_providers().items():
        status = "βœ“" if available else "βœ—"
        print(f"  {status} {provider_name}")

    chain = get_llm_provider()
    print(f"\nActive provider chain: {chain.name}")