diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml
new file mode 100644
index 0000000000..8405521810
--- /dev/null
+++ b/.github/workflows/model-results-comparison.yaml
@@ -0,0 +1,79 @@
+name: Model Results Comparison
+
+on:
+  pull_request:
+    types: [opened, synchronize, edited]
+    paths:
+      - 'results/**/*.json'
+  workflow_dispatch:
+    inputs:
+      reference_models:
+        description: 'Space-separated list of reference models for comparison'
+        required: true
+        type: string
+        default: 'intfloat/multilingual-e5-large google/gemini-embedding-001'
+      pull_request_number:
+        description: 'The pull request number to comment on (needed to post a comment on manual runs)'
+        required: false  # optional, so the workflow can also run without a PR context
+        type: string
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  compare-results:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Fetch origin main
+        run: git fetch origin main
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install dependencies
+        run: |
+          pip install git+https://github.com/embeddings-benchmark/mteb.git tabulate
+
+      - name: Generate model comparison
+        env:
+          REFERENCE_MODELS: ${{ github.event.inputs.reference_models || 'intfloat/multilingual-e5-large google/gemini-embedding-001' }}
+        run: |
+          python scripts/create_pr_results_comment.py --reference-models $REFERENCE_MODELS --output model-comparison.md
+
+      - name: Determine PR number
+        id: pr_info
+        run: |
+          if [ "${{ github.event_name }}" == "pull_request" ]; then
+            echo "pr_number=${{ github.event.number }}" >> $GITHUB_OUTPUT
+          elif [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ github.event.inputs.pull_request_number }}" ]; then
+            echo "pr_number=${{ github.event.inputs.pull_request_number }}" >> $GITHUB_OUTPUT
+          else
+            echo "pr_number=" >> $GITHUB_OUTPUT
+          fi
+
+      - name: Post PR comment
+        # Runs only when a PR number is available from the PR event or the workflow_dispatch input
+        if: steps.pr_info.outputs.pr_number != ''
+        env:
+          GITHUB_TOKEN: ${{ github.token }}
+        run: gh pr comment ${{ steps.pr_info.outputs.pr_number }} --body-file model-comparison.md --create-if-none --edit-last
+
+      - name: Upload comparison report
+        uses: actions/upload-artifact@v4
+        with:
+          name: model-comparison
+          path: model-comparison.md
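+
+# Manual run example (a sketch; the PR number and model are illustrative):
+#   gh workflow run model-results-comparison.yaml \
+#     -f reference_models='intfloat/multilingual-e5-large' \
+#     -f pull_request_number=1234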
diff --git a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json
index c264367483..8b2abb3922 100644
--- a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json
+++ b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json
@@ -134,4 +134,4 @@
     },
     "evaluation_time": 47.84240365028381,
     "kg_co2_emissions": null
-}
\ No newline at end of file
+}
diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py
index 9bd2d2de01..d1837ca798 100644
--- a/scripts/create_pr_results_comment.py
+++ b/scripts/create_pr_results_comment.py
@@ -3,19 +3,20 @@
 Usage:
     gh pr checkout {pr-number}
-    scripts/create_pr_results_comment.py [--models MODEL1 MODEL2 ...]
+    python scripts/create_pr_results_comment.py [--reference-models MODEL1 MODEL2 ...] [--output OUTPUT_FILE]
 
 Description:
     - Compares new model results (added in the current PR) against reference models.
-    - Outputs a Markdown table with results for each new model and highlights the best scores.
+    - Outputs a Markdown file with results for each new model and highlights the best scores.
     - By default, compares against: intfloat/multilingual-e5-large and google/gemini-embedding-001.
-    - You can specify reference models with the --models argument.
+    - You can specify reference models with the --reference-models argument.
 
 Arguments:
-    --models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001)
+    --reference-models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001)
+    --output: Output markdown file path (default: model-comparison.md)
 
 Example:
-    scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-model
+    python scripts/create_pr_results_comment.py --reference-models intfloat/multilingual-e5-large myorg/my-new-model
 """
 
 from __future__ import annotations
@@ -24,6 +25,7 @@
 import json
+import logging
 import os
 import subprocess
 from collections import defaultdict
 from pathlib import Path
 
@@ -32,33 +34,23 @@
 TaskName, ModelName = str, str
 
-
-repo_path = Path(__file__).parents[1]
-results_path = repo_path / "results"
-
-os.environ["MTEB_CACHE"] = str(repo_path.parent)
-
-
-default_reference_models = [
+# Default reference models to compare against
+REFERENCE_MODELS: list[str] = [
    "intfloat/multilingual-e5-large",
    "google/gemini-embedding-001",
 ]
 
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
 
-def get_diff_from_main() -> list[str]:
-    current_rev, origin_rev = subprocess.run(
-        ["git", "rev-parse", "main", "origin/main"],
-        cwd=repo_path,
-        capture_output=True,
-        check=True,
-        text=True,
-    ).stdout.splitlines()
+repo_path = Path(__file__).parents[1]
 
-    if current_rev != origin_rev:
-        raise ValueError(
-            f"Your main branch is not up-to-date ({current_rev} != {origin_rev}), please run `git fetch origin main`"
-        )
+os.environ["MTEB_CACHE"] = str(repo_path.parent)
 
+
+def get_diff_from_main() -> list[str]:
     differences = subprocess.run(
         ["git", "diff", "--name-only", "origin/main...HEAD"],
         cwd=repo_path,
@@ -91,66 +83,150 @@ def extract_new_models_and_tasks(
     return models
 
 
-def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame:
+def create_comparison_table(
+    model: str, tasks: list[str], reference_models: list[str]
+) -> pd.DataFrame:
+    models = [model] + reference_models
+    max_col_name = "Max result"
+    task_col_name = "task_name"
     results = mteb.load_results(models=models, tasks=tasks, download_latest=False)
+    results = results.join_revisions()
     df = results.to_dataframe()
-    # compute average pr. columns
-    model_names = [c for c in df.columns if c != "task_name"]
+    if df.empty:
+        raise ValueError(f"No results found for models {models} on tasks {tasks}")
 
-    row = pd.DataFrame(
+    df[max_col_name] = None
+    task_results = mteb.load_results(tasks=tasks, download_latest=False)
+    task_results = task_results.join_revisions()
+    max_dataframe = (
+        task_results.to_dataframe(format="long").groupby(task_col_name).max()
+    )
+    if not max_dataframe.empty:
+        for task_name, row in max_dataframe.iterrows():
+            df.loc[df[task_col_name] == task_name, max_col_name] = (
+                row["score"] / 100
+            )  # scores are in percentage
+
+    averages: dict[str, float | None] = {}
+    for col in models + [max_col_name]:
+        numeric = pd.to_numeric(df[col], errors="coerce")
+        avg = numeric.mean()
+        averages[col] = avg if not pd.isna(avg) else None
+
+    avg_row = pd.DataFrame(
         {
-            "task_name": ["**Average**"],
-            **{
-                model: df[model].mean() if model != "task_name" else None
-                for model in model_names
-            },
+            task_col_name: ["**Average**"],
+            **{col: [val] for col, val in averages.items()},
         }
     )
-    df = pd.concat([df, row], ignore_index=True)
-    return df
+    return pd.concat([df, avg_row], ignore_index=True)
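+
+# Illustrative sketch (not executed; the model and task names are examples):
+#   create_comparison_table(
+#       "myorg/my-new-model", ["Banking77Classification"], REFERENCE_MODELS
+#   )
+# yields one column per model plus a "Max result" column, with an
+# "**Average**" row appended at the bottom.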
 
 
-def highlight_max_bold(df, exclude_cols=["task_name"]):
-    # result_df = df.copy().astype(str)
-    # only 2 decimal places except for the excluded columns
+def highlight_max_bold(
+    df: pd.DataFrame, exclude_cols: list[str] = ["task_name"]
+) -> pd.DataFrame:
     result_df = df.copy()
-    result_df = result_df.applymap(lambda x: f"{x:.2f}" if isinstance(x, float) else x)
-
-    tmp_df = df.copy()
-    tmp_df = tmp_df.drop(columns=exclude_cols)
+    for col in result_df.columns:
+        if col not in exclude_cols:
+            result_df[col] = result_df[col].apply(
+                lambda x: f"{x:.2f}"
+                if isinstance(x, (int, float)) and pd.notna(x)
+                else x
+            )
+
+    tmp = df.drop(columns=exclude_cols)
     for idx in df.index:
-        max_col = tmp_df.loc[idx].idxmax()
-        result_df.loc[idx, max_col] = f"**{result_df.loc[idx, max_col]}**"
+        row = pd.to_numeric(tmp.loc[idx], errors="coerce")
+        if row.isna().all():
+            continue
+        max_col = row.idxmax()
+        if pd.notna(row[max_col]):
+            result_df.at[idx, max_col] = f"**{result_df.at[idx, max_col]}**"
 
     return result_df
 
 
+def generate_markdown_content(
+    model_tasks: dict[str, list[str]], reference_models: list[str]
+) -> str:
+    if not model_tasks:
+        return "# Model Results Comparison\n\nNo new model results found in this PR."
+
+    all_tasks = sorted({t for tasks in model_tasks.values() for t in tasks})
+    new_models = list(model_tasks.keys())
+
+    parts: list[str] = [
+        "# Model Results Comparison",
+        "",
+        f"**Reference models:** {', '.join(f'`{m}`' for m in reference_models)}",
+        f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}",
+        f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}",
+        "",
+    ]
+
+    for model_name, tasks in model_tasks.items():
+        parts.append(f"## Results for `{model_name}`")
+
+        df = create_comparison_table(model_name, tasks, reference_models)
+        bold_df = highlight_max_bold(df)
+        parts.append(bold_df.to_markdown(index=False))
+
+        parts.extend(["", "---", ""])
+
+    return "\n".join(parts)
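+
+# Sketch of the generated markdown (abridged):
+#   # Model Results Comparison
+#   **Reference models:** `intfloat/multilingual-e5-large`, `google/gemini-embedding-001`
+#   ## Results for `myorg/my-new-model`
+#   | task_name | myorg/my-new-model | ... | Max result |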
+
+
 def create_argparse() -> argparse.ArgumentParser:
+    """Create the argument parser for the script."""
     parser = argparse.ArgumentParser(
         description="Create PR comment with results comparison."
     )
     parser.add_argument(
-        "--models",
+        "--reference-models",
         nargs="+",
-        default=default_reference_models,
+        default=REFERENCE_MODELS,
         help="List of reference models to compare against (default: %(default)s)",
     )
+    parser.add_argument(
+        "--output",
+        type=Path,
+        default=Path("model-comparison.md"),
+        help="Output markdown file path",
+    )
     return parser
 
 
-def main(reference_models: list[str]):
+def main(reference_models: list[str], output_path: Path) -> None:
+    logger.info("Starting to create PR results comment...")
+    logger.info(f"Using reference models: {', '.join(reference_models)}")
     diff = get_diff_from_main()
-    new_additions = extract_new_models_and_tasks(diff)
-    for model, tasks in new_additions.items():
-        print(f"**Results for `{model}`**")
-        df = create_comparison_table(models=reference_models + [model], tasks=tasks)
-        bold_df = highlight_max_bold(df)
-        print(bold_df.to_markdown(index=False))
+    model_tasks = extract_new_models_and_tasks(diff)
+    markdown = generate_markdown_content(model_tasks, reference_models)
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+    output_path.write_text(markdown)
 
 
 if __name__ == "__main__":
     parser = create_argparse()
     args = parser.parse_args()
-    main(reference_models=args.models)
+    main(args.reference_models, args.output)
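+
+    # The CI workflow (model-results-comparison.yaml) runs this script and then
+    # posts the generated file as a PR comment via `gh pr comment --body-file`.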