-
Notifications
You must be signed in to change notification settings - Fork 1
feat: Enable comparing two different models in verify:compare mode #629
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -187,7 +187,9 @@ def _get_llm_client( | |
| ) | ||
|
|
||
|
|
||
| def _get_llm_clients() -> list[tuple[object, str]]: | ||
| def _get_llm_clients( | ||
| model1: str | None = None, model2: str | None = None | ||
| ) -> list[tuple[object, str]]: | ||
| try: | ||
| from langchain_openai import ChatOpenAI | ||
| except ImportError: | ||
|
|
@@ -200,12 +202,16 @@ def _get_llm_clients() -> list[tuple[object, str]]: | |
|
|
||
| from tools.llm_provider import DEFAULT_MODEL, GITHUB_MODELS_BASE_URL | ||
|
|
||
| # Use provided models or fall back to DEFAULT_MODEL | ||
| first_model = model1 or DEFAULT_MODEL | ||
| second_model = model2 or model1 or DEFAULT_MODEL | ||
|
|
||
| clients: list[tuple[object, str]] = [] | ||
| if github_token: | ||
| clients.append( | ||
| ( | ||
| ChatOpenAI( | ||
| model=DEFAULT_MODEL, | ||
| model=first_model, | ||
| base_url=GITHUB_MODELS_BASE_URL, | ||
| api_key=github_token, | ||
| temperature=0.1, | ||
|
Comment on lines 209 to 217
|
||
|
|
@@ -217,7 +223,7 @@ def _get_llm_clients() -> list[tuple[object, str]]: | |
| clients.append( | ||
| ( | ||
| ChatOpenAI( | ||
| model=DEFAULT_MODEL, | ||
| model=second_model, | ||
| api_key=openai_token, | ||
| temperature=0.1, | ||
| ), | ||
|
Comment on lines 223 to 229
|
||
|
|
@@ -235,12 +241,14 @@ class ComparisonRunner: | |
| clients: list[tuple[object, str]] | ||
|
|
||
| @classmethod | ||
| def from_environment(cls, context: str, diff: str | None) -> ComparisonRunner: | ||
| def from_environment( | ||
| cls, context: str, diff: str | None, model1: str | None = None, model2: str | None = None | ||
| ) -> ComparisonRunner: | ||
| return cls( | ||
| context=context, | ||
| diff=diff, | ||
| prompt=_prepare_prompt(context, diff), | ||
| clients=_get_llm_clients(), | ||
| clients=_get_llm_clients(model1, model2), | ||
| ) | ||
|
|
||
| def run_single(self, client: object, provider: str) -> EvaluationResult: | ||
|
|
@@ -472,8 +480,10 @@ def evaluate_pr( | |
| return _parse_llm_response(content, provider_name) | ||
|
|
||
|
|
||
| def evaluate_pr_multiple(context: str, diff: str | None = None) -> list[EvaluationResult]: | ||
| runner = ComparisonRunner.from_environment(context, diff) | ||
| def evaluate_pr_multiple( | ||
| context: str, diff: str | None = None, model1: str | None = None, model2: str | None = None | ||
| ) -> list[EvaluationResult]: | ||
| runner = ComparisonRunner.from_environment(context, diff, model1, model2) | ||
| if not runner.clients: | ||
| return [_fallback_evaluation("LLM client unavailable (missing credentials or dependency).")] | ||
| results: list[EvaluationResult] = [] | ||
|
|
@@ -657,6 +667,10 @@ def main() -> None: | |
| choices=["openai", "github-models"], | ||
| help="LLM provider: 'openai' (requires OPENAI_API_KEY) or 'github-models' (uses GITHUB_TOKEN).", | ||
| ) | ||
| parser.add_argument( | ||
| "--model2", | ||
| help="Second LLM model for compare mode (defaults to --model if not specified).", | ||
| ) | ||
| parser.add_argument( | ||
| "--create-issue", | ||
| action="store_true", | ||
|
|
@@ -679,7 +693,7 @@ def main() -> None: | |
| context = _load_text(args.context_file) | ||
| diff = _load_text(args.diff_file) if args.diff_file else None | ||
| if args.compare: | ||
| results = evaluate_pr_multiple(context, diff=diff) | ||
| results = evaluate_pr_multiple(context, diff=diff, model1=args.model, model2=args.model2) | ||
| report = format_comparison_report(results) | ||
| if args.output_file: | ||
| Path(args.output_file).write_text(report, encoding="utf-8") | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -41,6 +41,11 @@ on: | |||||||||
| required: false | ||||||||||
| type: string | ||||||||||
| default: 'gpt-4o-mini' | ||||||||||
| model2: | ||||||||||
| description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)' | ||||||||||
|
||||||||||
| description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)' | |
| description: 'Second model for compare mode (e.g., gpt-5, gpt-4o)' |
Copilot
AI
Jan 7, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The description should clarify that in compare mode, the first model uses GitHub Models provider and the second model uses OpenAI provider. This is important information for users to understand how the comparison will work. Consider updating the description to something like: "Second model for compare mode (uses OpenAI provider). Leave empty to compare same model across providers."
| description: 'Second model for compare mode (e.g., gpt-5, gpt-4.1)' | |
| description: 'Second model for compare mode (uses OpenAI provider). Leave empty to compare same model across providers.' |
Copilot
AI
Jan 7, 2026
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The model2 output is being declared here, but it's not being set in the label trigger path (lines 146-151 of the check step script). When the workflow is triggered via label (verify:checkbox, verify:evaluate, or verify:compare), the model2 output will be undefined. You need to add "core.setOutput('model2', '');" in the label trigger path alongside the other setOutput calls for model and provider.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The description should clarify that in compare mode, the first model uses GitHub Models provider and the second model uses OpenAI provider. Consider updating to: "Second model for compare mode (uses OpenAI provider). Defaults to model if not specified for same-model cross-provider comparison."