Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
156 changes: 149 additions & 7 deletions .github/scripts/__tests__/keepalive-loop.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1407,8 +1407,8 @@ test('analyzeTaskCompletion matches explicit file creation tasks', async () => {

const taskText = `
- [ ] Create \`agents-guard.test.js\` with tests for label validation
- [ ] Create \`keepalive-guard-utils.test.js\` covering pause label detection
- [ ] Unrelated task about documentation
- [ ] Write poetry about sunsets and rainbows
- [ ] Cook dinner recipes for Italian cuisine
`;

const result = await analyzeTaskCompletion({
Expand All @@ -1431,12 +1431,12 @@ test('analyzeTaskCompletion matches explicit file creation tasks', async () => {
assert.equal(guardMatch.confidence, 'high', 'Should be high confidence for exact file');
assert.ok(guardMatch.reason.includes('Exact file'), 'Reason should mention exact file match');

// Should NOT match keepalive-guard-utils since that file wasn't created
const keepaliveMatch = result.matches.find(m =>
m.task.toLowerCase().includes('keepalive-guard-utils.test.js')
// Should NOT match poetry task since it's completely unrelated
const poetryMatch = result.matches.find(m =>
m.task.toLowerCase().includes('poetry')
);
assert.ok(!keepaliveMatch || keepaliveMatch.confidence !== 'high',
'Should not match keepalive-guard-utils with high confidence');
assert.ok(!poetryMatch || poetryMatch.confidence !== 'high',
'Should not match unrelated poetry task with high confidence');
});

test('analyzeTaskCompletion returns empty for unrelated commits', async () => {
Expand Down Expand Up @@ -1482,6 +1482,148 @@ test('analyzeTaskCompletion returns empty for unrelated commits', async () => {
assert.equal(highConfidence.length, 0, 'Should not find high-confidence matches for unrelated commits');
});

test('analyzeTaskCompletion uses lowered 35% threshold with file match', async () => {
  // Fixture: one commit and one touched file whose wording overlaps the first
  // task below ("config", "schedule", "inputs" appear in the commit message;
  // "config" and "financing_model" appear in the file path). The second task
  // is a decoy that shares none of that wording.
  const files = [{ filename: 'src/config/financing_model.py' }];
  const commits = [
    { sha: 'abc123', commit: { message: 'feat: add schedule config inputs to validation' } },
  ];

  // Minimal GitHub client stand-in: only compareCommits and listFiles are
  // stubbed, each resolving to the fixture data above.
  const github = {
    rest: {
      repos: {
        compareCommits: async () => ({ data: { commits } }),
      },
      pulls: {
        listFiles: async () => ({ data: files }),
      },
    },
  };

  const taskText = `
- [ ] Add config support for financing model schedule inputs
- [ ] Completely unrelated database task
`;

  const { matches } = await analyzeTaskCompletion({
    github,
    context: { repo: { owner: 'test', repo: 'repo' } },
    prNumber: 1,
    baseSha: 'base123',
    headSha: 'head456',
    taskText,
    core: buildCore(),
  });

  // Locate the config/financing task among the reported matches.
  const isConfigFinancingTask = (m) => {
    const lowered = m.task.toLowerCase();
    return lowered.includes('config') && lowered.includes('financing');
  };
  const configMatch = matches.find(isConfigFinancingTask);

  // Keyword overlap above the lowered threshold plus a touched file is
  // expected to be reported as high confidence.
  assert.ok(configMatch, 'Should match config/financing task');
  assert.equal(configMatch.confidence, 'high', 'Should be high confidence with 35%+ match and file touch');
});

test('analyzeTaskCompletion gives high confidence for 25% keyword match with file match', async () => {
  // Fixture: a short commit message and one touched file that share only a
  // few words ("add", "wizard", "step") with a much longer task description.
  const commits = [{ sha: 'abc123', commit: { message: 'add wizard step' } }];
  const files = [{ filename: 'src/ui/wizard_step.py' }];

  // Stubbed GitHub client exposing just the two endpoints this test provides.
  const repos = {
    compareCommits: async () => ({ data: { commits } }),
  };
  const pulls = {
    listFiles: async () => ({ data: files }),
  };
  const github = { rest: { repos, pulls } };

  const taskText = `
- [ ] Add wizard step for sleeve suggestions with tooltips and validation
`;

  const analysis = await analyzeTaskCompletion({
    github,
    context: { repo: { owner: 'test', repo: 'repo' } },
    prNumber: 1,
    baseSha: 'base123',
    headSha: 'head456',
    taskText,
    core: buildCore(),
  });

  // A modest keyword overlap combined with a touched file is expected to be
  // reported as a high-confidence match.
  const mentionsWizard = (m) => m.task.toLowerCase().includes('wizard');
  const wizardMatch = analysis.matches.find(mentionsWizard);

  assert.ok(wizardMatch, 'Should match wizard task');
  assert.equal(wizardMatch.confidence, 'high', 'Should be high confidence with file match even at ~25% keywords');
});

test('analyzeTaskCompletion uses synonym expansion for better matching', async () => {
  // The task is phrased with "Implement" while the commit message uses "add".
  // NOTE(review): "config" and "validation" also appear verbatim in the commit
  // message, so this case does not isolate the synonym path on its own.
  const commitMessage = 'feat: add config validation logic';
  const commits = [{ sha: 'abc123', commit: { message: commitMessage } }];
  const files = [{ filename: 'src/config/validator.py' }];

  // GitHub client stub: both endpoints resolve to the fixtures above.
  const github = {
    rest: {
      repos: {
        compareCommits: async () => {
          return { data: { commits } };
        },
      },
      pulls: {
        listFiles: async () => {
          return { data: files };
        },
      },
    },
  };

  const taskText = `
- [ ] Implement config validation with proper error handling
`;

  const options = {
    github,
    context: { repo: { owner: 'test', repo: 'repo' } },
    prNumber: 1,
    baseSha: 'base123',
    headSha: 'head456',
    taskText,
    core: buildCore(),
  };
  const result = await analyzeTaskCompletion(options);

  // Pick out the config-validation task and require a high-confidence verdict.
  let configMatch;
  for (const candidate of result.matches) {
    if (candidate.task.toLowerCase().includes('config validation')) {
      configMatch = candidate;
      break;
    }
  }

  assert.ok(configMatch, 'Should match config validation task');
  assert.equal(configMatch.confidence, 'high', 'Should be high confidence with synonym matching');
});
Comment on lines +1580 to +1625
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test claims to test synonym expansion (task says "implement", commit says "add"), but it doesn't properly validate this feature. The commit message contains "add config validation logic" and the task is "Implement config validation with proper error handling". Both contain the words "config" and "validation" which match directly, so the test would pass with high confidence even if synonym expansion didn't work at all.

To properly test synonym expansion, you would need a test where the ONLY matching words are synonyms. For example:

  • Task: "implement feature"
  • Commit: "add feature"
  • Only "feature" matches directly; "implement" and "add" should match via synonyms.

Given the bidirectional synonym bug identified earlier (comment ID 001), this test might actually be passing for the wrong reasons.

Copilot uses AI. Check for mistakes.

test('autoReconcileTasks updates PR body for high-confidence matches', async () => {
const prBody = `## Tasks
- [ ] Add step summary output to keepalive loop
Expand Down
56 changes: 52 additions & 4 deletions .github/scripts/keepalive_loop.js
Original file line number Diff line number Diff line change
Expand Up @@ -1315,6 +1315,29 @@ async function analyzeTaskCompletion({ github, context, prNumber, baseSha, headS

log(`Analyzing ${commits.length} commits against ${taskLines.length} unchecked tasks`);

// Common action synonyms for better matching.
//
// SYNONYMS maps a word to the array of other words in its synonym group, so
// `SYNONYMS[word]` is either an array or undefined.
//
// Two properties are guaranteed by building the table from groups:
//   * Symmetry: every member of a group has an entry, so a lookup works
//     whichever member appears (e.g. both "fix" and "repair" have entries).
//   * No inherited keys: the table has a null prototype, so looking up words
//     such as "constructor" yields undefined instead of an inherited
//     Object.prototype member (which is truthy but not an array).
const SYNONYMS = (() => {
  const groups = [
    ['add', 'create', 'implement', 'introduce', 'build'],
    ['fix', 'repair', 'resolve', 'correct', 'patch'],
    ['update', 'modify', 'change', 'revise', 'edit'],
    ['remove', 'delete', 'drop', 'eliminate'],
    ['test', 'tests', 'testing', 'spec', 'specs'],
    ['config', 'configuration', 'settings', 'configure'],
    ['doc', 'docs', 'documentation', 'document'],
  ];
  const table = Object.create(null);
  for (const group of groups) {
    for (const word of group) {
      // Each word maps to its group peers, excluding itself.
      table[word] = group.filter(other => other !== word);
    }
  }
  return table;
})();

// Helper to split camelCase/PascalCase into words.
// Takes a string and returns an array of lowercase tokens; tokens of two
// characters or fewer are discarded.
function splitCamelCase(str) {
return str
// Insert a space at each lowercase -> uppercase boundary ("fooBar" -> "foo Bar").
.replace(/([a-z])([A-Z])/g, '$1 $2')
// Separate an uppercase run from a following capitalized word ("HTTPServer" -> "HTTP Server").
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
.toLowerCase()
// Whitespace, underscores and hyphens all act as word separators.
.split(/[\s_-]+/)
// Keep only tokens longer than two characters.
.filter(w => w.length > 2);
Comment on lines +1333 to +1338
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The splitCamelCase function filters out words with length <= 2 on line 1338, which could remove meaningful short words. For example:

  • "getUserID" would split into ["get", "user", "id"], but "id" would be filtered out
  • "setIO" would split into ["set", "io"], but "io" would be filtered out
  • "addUIElement" would split into ["add", "ui", "element"], but "ui" would be filtered out

While filtering out very short words might reduce noise, it could also remove legitimate technical acronyms and abbreviations that are meaningful in the context of task matching (e.g., "io", "id", "ui", "db", "os", "vm").

Suggested change
return str
.replace(/([a-z])([A-Z])/g, '$1 $2')
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
.toLowerCase()
.split(/[\s_-]+/)
.filter(w => w.length > 2);
// Common short technical tokens that are still meaningful and should not be filtered out
const SHORT_TECH_TOKENS = new Set([
'id', 'io', 'ui', 'db', 'os', 'vm',
'ci', 'cd', 'ip', 'db', 'api'
]);
return str
.replace(/([a-z])([A-Z])/g, '$1 $2')
.replace(/([A-Z]+)([A-Z][a-z])/g, '$1 $2')
.toLowerCase()
.split(/[\s_-]+/)
.filter(w => w.length > 2 || SHORT_TECH_TOKENS.has(w));

Copilot uses AI. Check for mistakes.
}

// Build keyword map from commits
const commitKeywords = new Set();
const commitMessages = commits
Expand All @@ -1324,12 +1347,31 @@ async function analyzeTaskCompletion({ github, context, prNumber, baseSha, headS
// Extract meaningful words from commit messages
const words = commitMessages.match(/\b[a-z_-]{3,}\b/g) || [];
words.forEach(w => commitKeywords.add(w));

// Also split camelCase words from commit messages
const camelWords = commits
.map(c => c.commit.message)
.join(' ')
.match(/[a-zA-Z][a-z]+[A-Z][a-zA-Z]*/g) || [];
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The regex pattern /[a-zA-Z][a-z]+[A-Z][a-zA-Z]*/g for detecting camelCase words is too restrictive and will miss several valid camelCase patterns:

  • It requires at least two characters before the first interior uppercase letter (one letter of either case followed by one or more lowercase letters), so it would miss patterns like "aB", "iPhone", or "eBay"
  • PascalCase words such as "UserService" or "HttpClient" do match, because the leading [a-zA-Z] accepts an uppercase first letter
  • It misses words that begin with an acronym, such as "IO" or "HTTPSConnection", because they contain no lowercase run followed by an uppercase letter

A more comprehensive pattern would be /[a-z]+[A-Z][a-zA-Z]*/g or /\b[a-z]+[A-Z][a-z]*|[A-Z][a-z]+[A-Z]/gi to catch various camelCase and PascalCase patterns.

Suggested change
.match(/[a-zA-Z][a-z]+[A-Z][a-zA-Z]*/g) || [];
.match(/\b(?:[a-z]+[A-Z][a-zA-Z]*|[A-Z][a-z]+[A-Z][a-zA-Z]*)\b/g) || [];

Copilot uses AI. Check for mistakes.
camelWords.forEach(w => splitCamelCase(w).forEach(part => commitKeywords.add(part)));

// Also extract from file paths
filesChanged.forEach(f => {
const parts = f.toLowerCase().replace(/[^a-z0-9_/-]/g, ' ').split(/[\s/]+/);
parts.forEach(p => p.length > 2 && commitKeywords.add(p));
// Extract camelCase from file names
const fileName = f.split('/').pop() || '';
splitCamelCase(fileName.replace(/\.[^.]+$/, '')).forEach(w => commitKeywords.add(w));
});

// Add synonyms for all commit keywords
const expandedKeywords = new Set(commitKeywords);
for (const keyword of commitKeywords) {
const synonymList = SYNONYMS[keyword];
if (synonymList) {
synonymList.forEach(syn => expandedKeywords.add(syn));
Comment on lines +1369 to +1372
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The synonym expansion logic is not bidirectional. When a keyword from commits is found (e.g., "add"), the code adds its synonyms (e.g., "create", "implement") to expandedKeywords. However, expansion only happens for words that are keys of SYNONYMS. If the commit contains a word that is listed only as a synonym value (e.g., "repair") and the task contains its base word ("fix"), this won't match: "repair" has no SYNONYMS entry, so "fix" is never added to expandedKeywords. (The "add"/"implement" pair does match, because SYNONYMS lists each of them under the other.)

The matching process filters taskWords against expandedKeywords, but expandedKeywords is only expanded based on what's in commitKeywords. This means the synonym mapping only works one way (commit keyword → task synonym) but not the other way (task keyword → commit synonym).

To fix this, when matching task words, you should also check if any task word has synonyms that exist in commitKeywords, or expand keywords bidirectionally.

Suggested change
for (const keyword of commitKeywords) {
const synonymList = SYNONYMS[keyword];
if (synonymList) {
synonymList.forEach(syn => expandedKeywords.add(syn));
// Expand synonym groups bidirectionally: if any word in a synonym group
// appears in commitKeywords, add the entire group to expandedKeywords.
if (SYNONYMS && typeof SYNONYMS === 'object') {
for (const [base, synonymList] of Object.entries(SYNONYMS)) {
const list = Array.isArray(synonymList) ? synonymList : [];
const groupWords = [base, ...list];
const hasIntersection = groupWords.some(word => commitKeywords.has(word));
if (hasIntersection) {
groupWords.forEach(word => expandedKeywords.add(word));
}

Copilot uses AI. Check for mistakes.
}
}

// Build module-to-test-file map for better test task matching
// e.g., tests/test_adapter_base.py -> ["adapter", "base", "adapters"]
Expand All @@ -1355,8 +1397,8 @@ async function analyzeTaskCompletion({ github, context, prNumber, baseSha, headS
const taskWords = taskLower.match(/\b[a-z_-]{3,}\b/g) || [];
const isTestTask = /\b(test|tests|unit\s*test|coverage)\b/i.test(task);

// Calculate overlap score
const matchingWords = taskWords.filter(w => commitKeywords.has(w));
// Calculate overlap score using expanded keywords (with synonyms)
const matchingWords = taskWords.filter(w => expandedKeywords.has(w));
const score = taskWords.length > 0 ? matchingWords.length / taskWords.length : 0;

// Extract explicit file references from task (e.g., `filename.js` or filename.test.js)
Expand Down Expand Up @@ -1415,11 +1457,17 @@ async function analyzeTaskCompletion({ github, context, prNumber, baseSha, headS
confidence = 'high';
reason = 'Test file created matching module reference';
matches.push({ task, reason, confidence });
} else if (score >= 0.5 && (fileMatch || commitMatch)) {
} else if (score >= 0.35 && (fileMatch || commitMatch)) {
// Lowered threshold from 0.5 to 0.35 to catch more legitimate completions
confidence = 'high';
reason = `${Math.round(score * 100)}% keyword match, ${fileMatch ? 'file match' : 'commit match'}`;
matches.push({ task, reason, confidence });
} else if (score >= 0.3 || fileMatch) {
} else if (score >= 0.25 && fileMatch) {
// File match with moderate keyword overlap is high confidence
confidence = 'high';
reason = `${Math.round(score * 100)}% keyword match with file match`;
Comment on lines +1465 to +1468
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Badge Stop marking low-overlap file matches as high confidence

Lowering the high-confidence cutoff to 25% when fileMatch is true means tasks that only share a single token with a touched file now get marked high and will be auto-checked in autoReconcileTasks. For example, touching .github/scripts/__tests__/agents-guard.test.js while a task says Create keepalive-guard-utils.test.js covering pause label detection produces a ~29% keyword match (create/test) plus a substring file match on guard, so this branch would now flag it high even though the referenced file was never created; prior logic kept this as medium and avoided auto-checking unrelated tasks.

Useful? React with 👍 / 👎.

matches.push({ task, reason, confidence });
} else if (score >= 0.2 || fileMatch) {
Comment on lines +1460 to +1470
Copy link

Copilot AI Dec 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The condition on line 1465 (score >= 0.25 && fileMatch) overlaps with the previous condition on line 1460 (score >= 0.35 && (fileMatch || commitMatch)): any score >= 0.35 with fileMatch matches line 1460 first, so line 1465 effectively only catches scores in the range [0.25, 0.35) with fileMatch.

Additionally, line 1470's condition score >= 0.2 || fileMatch would catch fileMatch cases when score < 0.25, but this creates an inconsistency:

  • score >= 0.35 with fileMatch → high confidence (line 1460)
  • 0.25 <= score < 0.35 with fileMatch → high confidence (line 1465)
  • score < 0.25 with fileMatch → medium confidence (line 1470)

This means a fileMatch alone (with score < 0.2) gets medium confidence, which seems reasonable, but the threshold boundaries should be documented more clearly to avoid confusion.

Copilot uses AI. Check for mistakes.
confidence = 'medium';
reason = `${Math.round(score * 100)}% keyword match${fileMatch ? ', file touched' : ''}`;
matches.push({ task, reason, confidence });
Expand Down
9 changes: 9 additions & 0 deletions .github/templates/keepalive-instruction.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,19 @@ Your objective is to satisfy the **Acceptance Criteria** by completing each **Ta
2. Commit meaningful source code (.py, .yml, .js, etc.)—not just status/docs updates.
3. **UPDATE THE CHECKBOXES** in the Tasks and Acceptance Criteria sections below to mark completed items.
4. Change `- [ ]` to `- [x]` for items you have completed and verified.
5. **In your final summary**, list completed tasks using the format: `✅ Completed: [exact task text]`

**CRITICAL - Checkbox Updates:**
When you complete a task or acceptance criterion, update its checkbox directly in this prompt file. Change the `[ ]` to `[x]` for completed items. The automation will read these checkboxes and update the PR's status summary.

**CRITICAL - Summary Format:**
At the end of your work, include explicit completion markers for each task you finished:
```
✅ Completed: Add validation for user input
✅ Completed: Write unit tests for validator module
```
This helps the automation accurately track which tasks were addressed in this round.

**Example:**
Before: `- [ ] Add validation for user input`
After: `- [x] Add validation for user input`
Expand Down
Loading