Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions .github/workflows/reusable-agents-verifier.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,11 @@ on:
required: false
type: string
default: 'github-models'
model2:
description: 'Second model for compare mode (defaults to model if not specified)'
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The description should clarify that in compare mode, the first model uses GitHub Models provider and the second model uses OpenAI provider. Consider updating to: "Second model for compare mode (uses OpenAI provider). Defaults to model if not specified for same-model cross-provider comparison."

Suggested change
description: 'Second model for compare mode (defaults to model if not specified)'
description: 'Second model for compare mode (uses OpenAI provider). Defaults to model if not specified for same-model cross-provider comparison.'

Copilot uses AI. Check for mistakes.
required: false
type: string
default: ''
secrets:
CODEX_AUTH_JSON:
required: false
Expand Down Expand Up @@ -398,6 +403,14 @@ jobs:
args+=(--diff-file "$diff_file")
fi

# Add model parameters if provided
if [ -n "${{ inputs.model }}" ]; then
args+=(--model "${{ inputs.model }}")
fi
if [ -n "${{ inputs.model2 }}" ]; then
args+=(--model2 "${{ inputs.model2 }}")
fi

# Run comparison (stderr to separate file to avoid corrupting JSON)
python .workflows-lib/scripts/langchain/pr_verifier.py "${args[@]}" > comparison.json 2> comparison-stderr.log || true

Expand Down
30 changes: 22 additions & 8 deletions scripts/langchain/pr_verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,7 +187,9 @@ def _get_llm_client(
)


def _get_llm_clients() -> list[tuple[object, str]]:
def _get_llm_clients(
model1: str | None = None, model2: str | None = None
) -> list[tuple[object, str]]:
try:
from langchain_openai import ChatOpenAI
except ImportError:
Expand All @@ -200,12 +202,16 @@ def _get_llm_clients() -> list[tuple[object, str]]:

from tools.llm_provider import DEFAULT_MODEL, GITHUB_MODELS_BASE_URL

# Use provided models or fall back to DEFAULT_MODEL
first_model = model1 or DEFAULT_MODEL
second_model = model2 or model1 or DEFAULT_MODEL

clients: list[tuple[object, str]] = []
if github_token:
clients.append(
(
ChatOpenAI(
model=DEFAULT_MODEL,
model=first_model,
base_url=GITHUB_MODELS_BASE_URL,
api_key=github_token,
temperature=0.1,
Comment on lines 209 to 217
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The provider label should include the model name for clarity in comparison reports. Currently it only shows "github-models" but should be formatted as "github-models/{first_model}" to match the pattern used in _get_llm_client() (lines 149, 162, 174, 186). This will help users understand which specific models are being compared in the comparison report.

Copilot uses AI. Check for mistakes.
Expand All @@ -217,7 +223,7 @@ def _get_llm_clients() -> list[tuple[object, str]]:
clients.append(
(
ChatOpenAI(
model=DEFAULT_MODEL,
model=second_model,
api_key=openai_token,
temperature=0.1,
),
Comment on lines 223 to 229
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The provider label should include the model name for clarity in comparison reports. Currently it only shows "openai" but should be formatted as "openai/{second_model}" to match the pattern used in _get_llm_client() (lines 149, 162, 174, 186). This will help users understand which specific models are being compared in the comparison report.

Copilot uses AI. Check for mistakes.
Expand All @@ -235,12 +241,14 @@ class ComparisonRunner:
clients: list[tuple[object, str]]

@classmethod
def from_environment(cls, context: str, diff: str | None) -> ComparisonRunner:
def from_environment(
cls, context: str, diff: str | None, model1: str | None = None, model2: str | None = None
) -> ComparisonRunner:
return cls(
context=context,
diff=diff,
prompt=_prepare_prompt(context, diff),
clients=_get_llm_clients(),
clients=_get_llm_clients(model1, model2),
)

def run_single(self, client: object, provider: str) -> EvaluationResult:
Expand Down Expand Up @@ -472,8 +480,10 @@ def evaluate_pr(
return _parse_llm_response(content, provider_name)


def evaluate_pr_multiple(context: str, diff: str | None = None) -> list[EvaluationResult]:
runner = ComparisonRunner.from_environment(context, diff)
def evaluate_pr_multiple(
context: str, diff: str | None = None, model1: str | None = None, model2: str | None = None
) -> list[EvaluationResult]:
runner = ComparisonRunner.from_environment(context, diff, model1, model2)
if not runner.clients:
return [_fallback_evaluation("LLM client unavailable (missing credentials or dependency).")]
results: list[EvaluationResult] = []
Expand Down Expand Up @@ -657,6 +667,10 @@ def main() -> None:
choices=["openai", "github-models"],
help="LLM provider: 'openai' (requires OPENAI_API_KEY) or 'github-models' (uses GITHUB_TOKEN).",
)
parser.add_argument(
"--model2",
help="Second LLM model for compare mode (defaults to --model if not specified).",
)
parser.add_argument(
"--create-issue",
action="store_true",
Expand All @@ -679,7 +693,7 @@ def main() -> None:
context = _load_text(args.context_file)
diff = _load_text(args.diff_file) if args.diff_file else None
if args.compare:
results = evaluate_pr_multiple(context, diff=diff)
results = evaluate_pr_multiple(context, diff=diff, model1=args.model, model2=args.model2)
report = format_comparison_report(results)
if args.output_file:
Path(args.output_file).write_text(report, encoding="utf-8")
Expand Down
12 changes: 11 additions & 1 deletion templates/consumer-repo/.github/workflows/agents-verifier.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ on:
required: false
type: string
default: 'gpt-4o-mini'
model2:
description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)'
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The example model "gpt-4.1" in the description is not a valid OpenAI model name. Consider using a valid example like "gpt-4-turbo" or "gpt-4o" instead.

Suggested change
description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)'
description: 'Second model for compare mode (e.g., gpt-5, gpt-4o)'

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The description should clarify that in compare mode, the first model uses GitHub Models provider and the second model uses OpenAI provider. This is important information for users to understand how the comparison will work. Consider updating the description to something like: "Second model for compare mode (uses OpenAI provider). Leave empty to compare same model across providers."

Suggested change
description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)'
description: 'Second model for compare mode (uses OpenAI provider). Leave empty to compare same model across providers.'

Copilot uses AI. Check for mistakes.
required: false
type: string
default: ''
provider:
description: 'LLM provider (OpenAI requires OPENAI_API_KEY secret)'
required: true
Expand All @@ -65,6 +70,7 @@ jobs:
should_run: ${{ steps.check.outputs.should_run }}
mode: ${{ steps.check.outputs.mode }}
model: ${{ steps.check.outputs.model }}
model2: ${{ steps.check.outputs.model2 }}
Copy link

Copilot AI Jan 7, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The model2 output is being declared here, but it's not being set in the label trigger path (lines 146-151 of the check step script). When the workflow is triggered via label (verify:checkbox, verify:evaluate, or verify:compare), the model2 output will be undefined. You need to add "core.setOutput('model2', '');" in the label trigger path alongside the other setOutput calls for model and provider.

Copilot uses AI. Check for mistakes.
provider: ${{ steps.check.outputs.provider }}
pr_number: ${{ steps.check.outputs.pr_number }}
steps:
Expand All @@ -78,6 +84,7 @@ jobs:
const prNumber = context.payload.inputs.pr_number;
const mode = context.payload.inputs.mode;
const model = context.payload.inputs.model;
const model2 = context.payload.inputs.model2 || '';
const provider = context.payload.inputs.provider;

// Verify PR exists and is merged
Expand All @@ -95,10 +102,11 @@ jobs:
}

const logMsg = `Manual dispatch: PR #${prNumber}, mode=${mode}`;
core.info(`${logMsg}, model=${model}, provider=${provider}`);
core.info(`${logMsg}, model=${model}, model2=${model2}, provider=${provider}`);
core.setOutput('should_run', 'true');
core.setOutput('mode', mode);
core.setOutput('model', model);
core.setOutput('model2', model2);
core.setOutput('provider', provider);
core.setOutput('pr_number', prNumber);
return;
Expand Down Expand Up @@ -155,4 +163,6 @@ jobs:
model: ${{ needs.check.outputs.model }}
# Provider selection (empty string uses default)
provider: ${{ needs.check.outputs.provider }}
# Second model for compare mode (empty string uses default)
model2: ${{ needs.check.outputs.model2 }}
secrets: inherit
4 changes: 2 additions & 2 deletions tests/scripts/test_pr_verifier_compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def test_evaluate_pr_multiple_runs_sequentially(monkeypatch) -> None:
monkeypatch.setattr(
pr_verifier.ComparisonRunner,
"from_environment",
lambda context, diff: runner,
lambda context, diff, model1=None, model2=None: runner,
)

results = pr_verifier.evaluate_pr_multiple("context")
Expand All @@ -53,7 +53,7 @@ def test_evaluate_pr_multiple_falls_back_when_no_clients(monkeypatch) -> None:
monkeypatch.setattr(
pr_verifier.ComparisonRunner,
"from_environment",
lambda context, diff: runner,
lambda context, diff, model1=None, model2=None: runner,
)

results = pr_verifier.evaluate_pr_multiple("context")
Expand Down
Loading