diff --git a/.github/workflows/reusable-agents-verifier.yml b/.github/workflows/reusable-agents-verifier.yml
index ce93aab35..9f675694e 100644
--- a/.github/workflows/reusable-agents-verifier.yml
+++ b/.github/workflows/reusable-agents-verifier.yml
@@ -35,6 +35,11 @@ on:
         required: false
         type: string
         default: 'github-models'
+      model2:
+        description: 'Second model for compare mode (defaults to model if not specified)'
+        required: false
+        type: string
+        default: ''
   secrets:
     CODEX_AUTH_JSON:
       required: false
@@ -398,6 +403,14 @@ jobs:
             args+=(--diff-file "$diff_file")
           fi

+          # Add model parameters if provided
+          if [ -n "${{ inputs.model }}" ]; then
+            args+=(--model "${{ inputs.model }}")
+          fi
+          if [ -n "${{ inputs.model2 }}" ]; then
+            args+=(--model2 "${{ inputs.model2 }}")
+          fi
+
           # Run comparison (stderr to separate file to avoid corrupting JSON)
           python .workflows-lib/scripts/langchain/pr_verifier.py "${args[@]}" > comparison.json 2> comparison-stderr.log || true

diff --git a/scripts/langchain/pr_verifier.py b/scripts/langchain/pr_verifier.py
index 1510e5589..80a5ac2fd 100755
--- a/scripts/langchain/pr_verifier.py
+++ b/scripts/langchain/pr_verifier.py
@@ -187,7 +187,9 @@ def _get_llm_client(
     )


-def _get_llm_clients() -> list[tuple[object, str]]:
+def _get_llm_clients(
+    model1: str | None = None, model2: str | None = None
+) -> list[tuple[object, str]]:
     try:
         from langchain_openai import ChatOpenAI
     except ImportError:
@@ -200,12 +202,16 @@ def _get_llm_clients() -> list[tuple[object, str]]:

     from tools.llm_provider import DEFAULT_MODEL, GITHUB_MODELS_BASE_URL

+    # Use provided models or fall back to DEFAULT_MODEL
+    first_model = model1 or DEFAULT_MODEL
+    second_model = model2 or model1 or DEFAULT_MODEL
+
     clients: list[tuple[object, str]] = []
     if github_token:
         clients.append(
             (
                 ChatOpenAI(
-                    model=DEFAULT_MODEL,
+                    model=first_model,
                     base_url=GITHUB_MODELS_BASE_URL,
                     api_key=github_token,
                     temperature=0.1,
@@ -217,7 +223,7 @@
         clients.append(
             (
                 ChatOpenAI(
-                    model=DEFAULT_MODEL,
+                    model=second_model,
                     api_key=openai_token,
                     temperature=0.1,
                 ),
@@ -235,12 +241,14 @@ class ComparisonRunner:
     clients: list[tuple[object, str]]

     @classmethod
-    def from_environment(cls, context: str, diff: str | None) -> ComparisonRunner:
+    def from_environment(
+        cls, context: str, diff: str | None, model1: str | None = None, model2: str | None = None
+    ) -> ComparisonRunner:
         return cls(
             context=context,
             diff=diff,
             prompt=_prepare_prompt(context, diff),
-            clients=_get_llm_clients(),
+            clients=_get_llm_clients(model1, model2),
         )

     def run_single(self, client: object, provider: str) -> EvaluationResult:
@@ -472,8 +480,10 @@ def evaluate_pr(
     return _parse_llm_response(content, provider_name)


-def evaluate_pr_multiple(context: str, diff: str | None = None) -> list[EvaluationResult]:
-    runner = ComparisonRunner.from_environment(context, diff)
+def evaluate_pr_multiple(
+    context: str, diff: str | None = None, model1: str | None = None, model2: str | None = None
+) -> list[EvaluationResult]:
+    runner = ComparisonRunner.from_environment(context, diff, model1, model2)
     if not runner.clients:
         return [_fallback_evaluation("LLM client unavailable (missing credentials or dependency).")]
     results: list[EvaluationResult] = []
@@ -657,6 +667,10 @@ def main() -> None:
         choices=["openai", "github-models"],
         help="LLM provider: 'openai' (requires OPENAI_API_KEY) or 'github-models' (uses GITHUB_TOKEN).",
     )
+    parser.add_argument(
+        "--model2",
+        help="Second LLM model for compare mode (defaults to --model if not specified).",
+    )
     parser.add_argument(
         "--create-issue",
         action="store_true",
@@ -679,7 +693,7 @@ def main() -> None:
     context = _load_text(args.context_file)
     diff = _load_text(args.diff_file) if args.diff_file else None
     if args.compare:
-        results = evaluate_pr_multiple(context, diff=diff)
+        results = evaluate_pr_multiple(context, diff=diff, model1=args.model, model2=args.model2)
         report = format_comparison_report(results)
         if args.output_file:
             Path(args.output_file).write_text(report, encoding="utf-8")
diff --git a/templates/consumer-repo/.github/workflows/agents-verifier.yml b/templates/consumer-repo/.github/workflows/agents-verifier.yml
index a59770989..a29803978 100644
--- a/templates/consumer-repo/.github/workflows/agents-verifier.yml
+++ b/templates/consumer-repo/.github/workflows/agents-verifier.yml
@@ -41,6 +41,11 @@ on:
         required: false
         type: string
         default: 'gpt-4o-mini'
+      model2:
+        description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)'
+        required: false
+        type: string
+        default: ''
       provider:
         description: 'LLM provider (OpenAI requires OPENAI_API_KEY secret)'
         required: true
@@ -65,6 +70,7 @@ jobs:
       should_run: ${{ steps.check.outputs.should_run }}
       mode: ${{ steps.check.outputs.mode }}
       model: ${{ steps.check.outputs.model }}
+      model2: ${{ steps.check.outputs.model2 }}
       provider: ${{ steps.check.outputs.provider }}
       pr_number: ${{ steps.check.outputs.pr_number }}
     steps:
@@ -78,6 +84,7 @@ jobs:
            const prNumber = context.payload.inputs.pr_number;
            const mode = context.payload.inputs.mode;
            const model = context.payload.inputs.model;
+           const model2 = context.payload.inputs.model2 || '';
            const provider = context.payload.inputs.provider;

            // Verify PR exists and is merged
@@ -95,10 +102,11 @@ jobs:
            }

            const logMsg = `Manual dispatch: PR #${prNumber}, mode=${mode}`;
-           core.info(`${logMsg}, model=${model}, provider=${provider}`);
+           core.info(`${logMsg}, model=${model}, model2=${model2}, provider=${provider}`);
            core.setOutput('should_run', 'true');
            core.setOutput('mode', mode);
            core.setOutput('model', model);
+           core.setOutput('model2', model2);
            core.setOutput('provider', provider);
            core.setOutput('pr_number', prNumber);
            return;
@@ -155,4 +163,6 @@ jobs:
      model: ${{ needs.check.outputs.model }}
      # Provider selection (empty string uses default)
      provider: ${{ needs.check.outputs.provider }}
+     # Second model for compare mode (empty string uses default)
+     model2: ${{ needs.check.outputs.model2 }}
    secrets: inherit
diff --git a/tests/scripts/test_pr_verifier_compare.py b/tests/scripts/test_pr_verifier_compare.py
index c56b848ab..c522c49f7 100644
--- a/tests/scripts/test_pr_verifier_compare.py
+++ b/tests/scripts/test_pr_verifier_compare.py
@@ -34,7 +34,7 @@ def test_evaluate_pr_multiple_runs_sequentially(monkeypatch) -> None:
     monkeypatch.setattr(
         pr_verifier.ComparisonRunner,
         "from_environment",
-        lambda context, diff: runner,
+        lambda context, diff, model1=None, model2=None: runner,
     )

     results = pr_verifier.evaluate_pr_multiple("context")
@@ -53,7 +53,7 @@ def test_evaluate_pr_multiple_falls_back_when_no_clients(monkeypatch) -> None:
     monkeypatch.setattr(
         pr_verifier.ComparisonRunner,
         "from_environment",
-        lambda context, diff: runner,
+        lambda context, diff, model1=None, model2=None: runner,
     )

     results = pr_verifier.evaluate_pr_multiple("context")
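
For reviewers who want to sanity-check the new fallback chain (the second model resolves as model2, else model1, else DEFAULT_MODEL), here is a minimal local sketch. It is not part of the change: it assumes pr_verifier is importable the same way the tests import it, and the model names and context string are placeholders.

# Sketch: exercise the two-model compare path added in this diff.
# Assumes GITHUB_TOKEN and/or OPENAI_API_KEY are exported so that
# _get_llm_clients() actually constructs clients.
import pr_verifier  # import path is an assumption; the tests access it the same way

results = pr_verifier.evaluate_pr_multiple(
    "PR context text...",   # placeholder context
    diff=None,
    model1="gpt-4o-mini",   # maps to the --model CLI flag
    model2="gpt-5",         # maps to the new --model2 CLI flag
)
print(pr_verifier.format_comparison_report(results))

# With model2 omitted, second_model = model2 or model1 or DEFAULT_MODEL
# resolves to model1, so both providers evaluate with "gpt-4o-mini".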