From 7c75155321d8c063b325761eaf5760850a43b671 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:06:24 +0300 Subject: [PATCH 01/20] init automate script --- .../workflows/model-results-comparison.yaml | 70 +++++ .../MassiveIntentClassification.json | 4 +- scripts/pr_results_comment.py | 295 ++++++++++++++++++ 3 files changed, 367 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/model-results-comparison.yaml create mode 100644 scripts/pr_results_comment.py diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml new file mode 100644 index 0000000000..7b0f7be2e5 --- /dev/null +++ b/.github/workflows/model-results-comparison.yaml @@ -0,0 +1,70 @@ +name: Model Results Comparison + +on: + pull_request: + types: [opened, synchronize, edited] + paths: + - 'results/**/*.json' + +permissions: + contents: read + pull-requests: write + +jobs: + compare-results: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + pip install mteb + + - name: Get changed result files + id: changed-files + run: | + # Get list of changed JSON files in results directory + git fetch origin main + changed_files=$(git diff --name-only origin/main...HEAD | grep -E '^results/.*\.json$' | grep -v model_meta.json || true) + + if [ -z "$changed_files" ]; then + echo "No result files changed" + echo "has_changes=false" >> $GITHUB_OUTPUT + else + echo "Changed files:" + echo "$changed_files" + echo "has_changes=true" >> $GITHUB_OUTPUT + + # Convert to space-separated list for script + files_list=$(echo "$changed_files" | tr '\n' ' ') + echo "files_list=$files_list" >> $GITHUB_OUTPUT + fi + + - name: Generate model comparison + if: steps.changed-files.outputs.has_changes == 'true' + run: | + python scripts/create_pr_results_comment.py \ + ${{ steps.changed-files.outputs.files_list }} \ + --output model-comparison.md + + - name: Post PR comment + if: steps.changed-files.outputs.has_changes == 'true' + env: + GITHUB_TOKEN: ${{ github.token }} + run: gh pr comment ${{ github.event.number }} --body-file model-comparison.md --create-if-none --edit-last + + - name: Upload comparison report + if: steps.changed-files.outputs.has_changes == 'true' + uses: actions/upload-artifact@v3 + with: + name: model-comparison + path: model-comparison.md diff --git a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json index c264367483..a325c9a0d6 100644 --- a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json +++ b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json @@ -124,7 +124,7 @@ "f1_weighted": 0.776515 } ], - "main_score": 0.791157, + "main_score": 0.891157, "hf_subset": "ru", "languages": [ "rus-Cyrl" @@ -134,4 +134,4 @@ }, "evaluation_time": 47.84240365028381, "kg_co2_emissions": null -} \ No newline at end of file +} diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py new file mode 100644 index 0000000000..0a2fc8d50d --- /dev/null +++ b/scripts/pr_results_comment.py @@ -0,0 +1,295 @@ +""" +Script to generate a Markdown comparison table for model results 
from file paths. + +The script takes a list of result files and extracts: +- Model name from the folder structure (results/model_name/...) +- Task name from the filename (without .json extension) + +Usage: + python scripts/create_pr_results_comment.py file1.json file2.json --output results.md + +Arguments: + files: List of result files to process + --output: Output markdown file path (required) + +Example: + python scripts/create_pr_results_comment.py \ + results/my-new-model/revision/task1.json \ + results/my-new-model/revision/task2.json \ + results/another-model/revision/task1.json \ + --output comparison.md +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from collections import defaultdict +from pathlib import Path + +import mteb +import pandas as pd + +repo_path = Path(__file__).parents[1] +results_path = repo_path / "results" + +# Set MTEB cache +os.environ["MTEB_CACHE"] = str(repo_path.parent) + +# Hardcoded reference models to compare against +REFERENCE_MODELS = [ + "intfloat/multilingual-e5-large", + "google/gemini-embedding-001", +] + + +def extract_model_and_task_from_path(file_path: str) -> tuple[str, str]: + """ + Extract model name and task name from file path. + + Expected structure: results/model_name/revision/task_name.json + Returns: (model_name, task_name) + """ + path = Path(file_path) + + if not path.suffix == '.json': + raise ValueError(f"File must be a JSON file: {file_path}") + + task_name = path.stem + parts = path.parts + try: + results_idx = parts.index('results') + if results_idx + 1 < len(parts): + model_dir = parts[results_idx + 1].replace("__", "/") + + # Try to get model name from model_meta.json + model_meta_path = path.parent / "model_meta.json" + if model_meta_path.exists(): + try: + with open(model_meta_path) as f: + meta = json.load(f) + model_name = meta.get("name", model_dir) + except (json.JSONDecodeError, KeyError): + model_name = model_dir + else: + model_name = model_dir + + return model_name, task_name + else: + raise ValueError(f"Invalid path structure: {file_path}") + except ValueError: + raise ValueError(f"Path must contain 'results' directory: {file_path}") + + +def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: + """Group files by model and extract task names.""" + model_tasks = defaultdict(list) + + for file_path in file_paths: + try: + model_name, task_name = extract_model_and_task_from_path(file_path) + model_tasks[model_name].append(task_name) + print(f"✓ Found: {model_name} -> {task_name}") + except ValueError as e: + print(f"⚠ Warning: Skipping {file_path}: {e}") + continue + + # Remove duplicates and sort + for model in model_tasks: + model_tasks[model] = sorted(list(set(model_tasks[model]))) + + return dict(model_tasks) + + +def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: + """Create comparison table for given models and tasks.""" + try: + print(f"Loading results for models: {models}") + print(f"Tasks: {tasks}") + + results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + results = results.join_revisions() + df = results.to_dataframe() + + if df.empty: + raise ValueError("No results found for the specified models and tasks") + + # Compute average per columns + model_names = [c for c in df.columns if c != "task_name"] + + # Calculate averages only for numeric columns + averages = {} + for model in model_names: + if model in df.columns: + numeric_values = pd.to_numeric(df[model], errors='coerce') + 
avg_value = numeric_values.mean() + averages[model] = avg_value if not pd.isna(avg_value) else None + else: + averages[model] = None + + # Add average row + avg_row = pd.DataFrame({ + "task_name": ["**Average**"], + **{model: [avg_val] for model, avg_val in averages.items()}, + }) + + df = pd.concat([df, avg_row], ignore_index=True) + return df + except Exception as e: + print(f"❌ Error creating comparison table: {e}") + raise + + +def highlight_max_bold(df: pd.DataFrame, exclude_cols=["task_name"]) -> pd.DataFrame: + """Highlight maximum values in bold for each row.""" + result_df = df.copy() + + # Format numeric values to 2 decimal places + for col in result_df.columns: + if col not in exclude_cols: + result_df[col] = result_df[col].apply( + lambda x: f"{x:.2f}" if isinstance(x, (int, float)) and pd.notna(x) else str(x) + ) + + # Create a temporary dataframe for finding max values + tmp_df = df.copy() + tmp_df = tmp_df.drop(columns=exclude_cols) + + for idx in df.index: + # Skip rows with no numeric data + numeric_row = pd.to_numeric(tmp_df.loc[idx], errors='coerce') + if numeric_row.isna().all(): + continue + + max_col = numeric_row.idxmax() + if pd.notna(numeric_row[max_col]): + current_value = result_df.loc[idx, max_col] + result_df.loc[idx, max_col] = f"**{current_value}**" + + return result_df + + +def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: + """Generate the complete markdown content with comparison tables.""" + + if not model_tasks: + return "# Model Results Comparison\n\nNo valid model results found." + + # Get all unique tasks across all models + all_tasks = [] + for tasks in model_tasks.values(): + all_tasks.extend(tasks) + all_tasks = sorted(list(set(all_tasks))) + + # Get all models + new_models = list(model_tasks.keys()) + + markdown_parts = [ + "# Model Results Comparison", + "", + f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", + f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + "", + ] + + # Create comparison tables for each new model + for model_name, model_tasks_list in model_tasks.items(): + markdown_parts.extend([ + f"## Results for `{model_name}`", + ]) + + try: + # Compare this model against reference models + all_models = REFERENCE_MODELS + [model_name] + df = create_comparison_table(models=all_models, tasks=model_tasks_list) + bold_df = highlight_max_bold(df) + markdown_table = bold_df.to_markdown(index=False) + markdown_parts.append(markdown_table) + except Exception as e: + print(f"❌ Error generating comparison table for {model_name}: {e}") + + markdown_parts.extend(["", "---", ""]) + + # # Add global comparison if multiple models + # if len(new_models) > 1: + # markdown_parts.extend([ + # "## Overall Comparison", + # "", + # f"**All models compared:** {', '.join(f'`{m}`' for m in REFERENCE_MODELS + new_models)}", + # f"**Common tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + # "", + # ]) + # + # try: + # all_models = REFERENCE_MODELS + new_models + # df = create_comparison_table(models=all_models, tasks=all_tasks) + # bold_df = highlight_max_bold(df) + # markdown_table = bold_df.to_markdown(index=False) + # markdown_parts.append(markdown_table) + # except Exception as e: + # print(f"❌ Error generating overall comparison table: {e}") + + return "\n".join(markdown_parts) + + +def create_argparse() -> argparse.ArgumentParser: + """Create argument parser.""" + parser = argparse.ArgumentParser( + description="Generate markdown comparison table for model results from file paths." 
+ ) + parser.add_argument( + "files", + nargs="+", + help="List of result JSON files to process", + ) + parser.add_argument( + "--output", + type=Path, + required=True, + help="Output markdown file path (required)", + ) + return parser + + +def main(): + """Main function.""" + parser = create_argparse() + args = parser.parse_args() + + print(f"Processing {len(args.files)} files...") + + # Group files by model + try: + model_tasks = group_files_by_model(args.files) + print(f"\nFound {len(model_tasks)} models:") + for model, tasks in model_tasks.items(): + print(f" {model}: {len(tasks)} tasks ({', '.join(tasks)})") + except Exception as e: + print(f"❌ Error processing files: {e}") + raise e + + if not model_tasks: + print("❌ No valid model results found") + raise e + + # Generate markdown content + try: + markdown_content = generate_markdown_content(model_tasks) + except Exception as e: + print(f"❌ Error generating markdown: {e}") + raise e + + # Write to output file + try: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown_content) + print(f"✅ Markdown written to {args.output}") + except Exception as e: + print(f"❌ Error writing to {args.output}: {e}") + raise e + + +if __name__ == "__main__": + main() From f7a94a9d51219a44e858124f142fa652bc30e25a Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:08:07 +0300 Subject: [PATCH 02/20] bump versions --- .github/workflows/model-results-comparison.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 7b0f7be2e5..bcaef056e3 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -21,7 +21,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' @@ -64,7 +64,7 @@ jobs: - name: Upload comparison report if: steps.changed-files.outputs.has_changes == 'true' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: model-comparison path: model-comparison.md From fd574a9bef5b70593f41be697f49ae886756844c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:15:07 +0300 Subject: [PATCH 03/20] fix script name --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index bcaef056e3..2cb09487df 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -52,7 +52,7 @@ jobs: - name: Generate model comparison if: steps.changed-files.outputs.has_changes == 'true' run: | - python scripts/create_pr_results_comment.py \ + python scripts/pr_results_comment.py \ ${{ steps.changed-files.outputs.files_list }} \ --output model-comparison.md From 1065f7bc55b5a17d07ecf91f5ce77054a4195929 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:19:10 +0300 Subject: [PATCH 04/20] add tabulate --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 2cb09487df..86218f87cf 100644 --- 
a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | - pip install mteb + pip install mteb tabulate - name: Get changed result files id: changed-files From eec89fbbce6ffb52d380be920f23d5ac17d91d66 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:27:59 +0300 Subject: [PATCH 05/20] add resutls for test model --- .../MassiveIntentClassification.json | 2 +- .../1234321/AfriSeniClassification.json | 18 ++++++++++++++++++ .../1234321/SomeNewTask.json | 18 ++++++++++++++++++ .../1234321/model_meta.json | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 results/testuser__testmodel/1234321/AfriSeniClassification.json create mode 100644 results/testuser__testmodel/1234321/SomeNewTask.json create mode 100644 results/testuser__testmodel/1234321/model_meta.json diff --git a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json index a325c9a0d6..8b2abb3922 100644 --- a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json +++ b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json @@ -124,7 +124,7 @@ "f1_weighted": 0.776515 } ], - "main_score": 0.891157, + "main_score": 0.791157, "hf_subset": "ru", "languages": [ "rus-Cyrl" diff --git a/results/testuser__testmodel/1234321/AfriSeniClassification.json b/results/testuser__testmodel/1234321/AfriSeniClassification.json new file mode 100644 index 0000000000..92ff554112 --- /dev/null +++ b/results/testuser__testmodel/1234321/AfriSeniClassification.json @@ -0,0 +1,18 @@ +{ + "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", + "task_name": "AfriSentiClassification", + "mteb_version": "1.18.0", + "scores": { + "test": [ + { + "main_score": 0.1, + "hf_subset": "yor", + "languages": [ + "yor-Latn" + ] + } + ] + }, + "evaluation_time": 23.979591608047485, + "kg_co2_emissions": 0.000927284611091405 +} diff --git a/results/testuser__testmodel/1234321/SomeNewTask.json b/results/testuser__testmodel/1234321/SomeNewTask.json new file mode 100644 index 0000000000..07c05183a9 --- /dev/null +++ b/results/testuser__testmodel/1234321/SomeNewTask.json @@ -0,0 +1,18 @@ +{ + "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", + "task_name": "SomeNewTask", + "mteb_version": "1.18.0", + "scores": { + "test": [ + { + "main_score": 0.2, + "hf_subset": "eng", + "languages": [ + "eng-Latn" + ] + } + ] + }, + "evaluation_time": 23.979591608047485, + "kg_co2_emissions": 0.000927284611091405 +} diff --git a/results/testuser__testmodel/1234321/model_meta.json b/results/testuser__testmodel/1234321/model_meta.json new file mode 100644 index 0000000000..3ec3fd4df3 --- /dev/null +++ b/results/testuser__testmodel/1234321/model_meta.json @@ -0,0 +1 @@ +{"name": "testuser/testmodel", "revision": "1234321", "release_date": null, "languages": null, "n_parameters": null, "memory_usage_mb": null, "max_tokens": null, "embed_dim": null, "license": null, "open_weights": null, "public_training_code": null, "public_training_data": null, "framework": null, "reference": null, "similarity_fn_name": null, "use_instructions": null, "training_datasets": null, "adapted_from": null, "superseded_by": null, "modalities": ["text"], "loader": null} From 
42e02b153a66b77eb61a1e460b7b7f56466d4162 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:38:36 +0300 Subject: [PATCH 06/20] handle no result on task --- scripts/pr_results_comment.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 0a2fc8d50d..bdeefb4a48 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -103,13 +103,17 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: return dict(model_tasks) -def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: +def create_comparison_table(models: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: print(f"Loading results for models: {models}") print(f"Tasks: {tasks}") - results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + try: + results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + except Exception as e: + # if model in reference don't have results on task + results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() @@ -202,8 +206,7 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: try: # Compare this model against reference models - all_models = REFERENCE_MODELS + [model_name] - df = create_comparison_table(models=all_models, tasks=model_tasks_list) + df = create_comparison_table(model=model_name, tasks=model_tasks_list) bold_df = highlight_max_bold(df) markdown_table = bold_df.to_markdown(index=False) markdown_parts.append(markdown_table) @@ -212,25 +215,6 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: markdown_parts.extend(["", "---", ""]) - # # Add global comparison if multiple models - # if len(new_models) > 1: - # markdown_parts.extend([ - # "## Overall Comparison", - # "", - # f"**All models compared:** {', '.join(f'`{m}`' for m in REFERENCE_MODELS + new_models)}", - # f"**Common tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", - # "", - # ]) - # - # try: - # all_models = REFERENCE_MODELS + new_models - # df = create_comparison_table(models=all_models, tasks=all_tasks) - # bold_df = highlight_max_bold(df) - # markdown_table = bold_df.to_markdown(index=False) - # markdown_parts.append(markdown_table) - # except Exception as e: - # print(f"❌ Error generating overall comparison table: {e}") - return "\n".join(markdown_parts) From 9b4025dc50073df72c50c6800428878ca9feafeb Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:43:15 +0300 Subject: [PATCH 07/20] fix function --- scripts/pr_results_comment.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index bdeefb4a48..28c39e4896 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -103,17 +103,17 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: return dict(model_tasks) -def create_comparison_table(models: str, tasks: list[str]) -> pd.DataFrame: +def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: print(f"Loading results for models: {models}") print(f"Tasks: {tasks}") 
try: - results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) except Exception as e: # if model in reference don't have results on task - results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() @@ -206,7 +206,7 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: try: # Compare this model against reference models - df = create_comparison_table(model=model_name, tasks=model_tasks_list) + df = create_comparison_table(model_name, tasks=model_tasks_list) bold_df = highlight_max_bold(df) markdown_table = bold_df.to_markdown(index=False) markdown_parts.append(markdown_table) From e419cc89348b8422646941c8468a95786b0f764e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:48:44 +0300 Subject: [PATCH 08/20] fix function --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 28c39e4896..a82c3cef4d 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -106,7 +106,7 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: - print(f"Loading results for models: {models}") + print(f"Loading results for model: {model}") print(f"Tasks: {tasks}") try: From 6d41b5f1814617606268325f3f8ef8d81f280f4b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:56:44 +0300 Subject: [PATCH 09/20] remove testuser --- .../1234321/AfriSeniClassification.json | 18 ------------------ .../1234321/SomeNewTask.json | 18 ------------------ .../1234321/model_meta.json | 1 - 3 files changed, 37 deletions(-) delete mode 100644 results/testuser__testmodel/1234321/AfriSeniClassification.json delete mode 100644 results/testuser__testmodel/1234321/SomeNewTask.json delete mode 100644 results/testuser__testmodel/1234321/model_meta.json diff --git a/results/testuser__testmodel/1234321/AfriSeniClassification.json b/results/testuser__testmodel/1234321/AfriSeniClassification.json deleted file mode 100644 index 92ff554112..0000000000 --- a/results/testuser__testmodel/1234321/AfriSeniClassification.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", - "task_name": "AfriSentiClassification", - "mteb_version": "1.18.0", - "scores": { - "test": [ - { - "main_score": 0.1, - "hf_subset": "yor", - "languages": [ - "yor-Latn" - ] - } - ] - }, - "evaluation_time": 23.979591608047485, - "kg_co2_emissions": 0.000927284611091405 -} diff --git a/results/testuser__testmodel/1234321/SomeNewTask.json b/results/testuser__testmodel/1234321/SomeNewTask.json deleted file mode 100644 index 07c05183a9..0000000000 --- a/results/testuser__testmodel/1234321/SomeNewTask.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", - "task_name": "SomeNewTask", - "mteb_version": "1.18.0", - "scores": { - "test": [ - { - "main_score": 0.2, - "hf_subset": "eng", - "languages": [ - "eng-Latn" - ] - 
} - ] - }, - "evaluation_time": 23.979591608047485, - "kg_co2_emissions": 0.000927284611091405 -} diff --git a/results/testuser__testmodel/1234321/model_meta.json b/results/testuser__testmodel/1234321/model_meta.json deleted file mode 100644 index 3ec3fd4df3..0000000000 --- a/results/testuser__testmodel/1234321/model_meta.json +++ /dev/null @@ -1 +0,0 @@ -{"name": "testuser/testmodel", "revision": "1234321", "release_date": null, "languages": null, "n_parameters": null, "memory_usage_mb": null, "max_tokens": null, "embed_dim": null, "license": null, "open_weights": null, "public_training_code": null, "public_training_data": null, "framework": null, "reference": null, "similarity_fn_name": null, "use_instructions": null, "training_datasets": null, "adapted_from": null, "superseded_by": null, "modalities": ["text"], "loader": null} From 15d60e9160f65a0826890e75deafe7306a615975 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:03:28 +0300 Subject: [PATCH 10/20] fix script help --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index a82c3cef4d..36688ac6bc 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -6,7 +6,7 @@ - Task name from the filename (without .json extension) Usage: - python scripts/create_pr_results_comment.py file1.json file2.json --output results.md + python scripts/pr_results_comment.py file1.json file2.json --output results.md Arguments: files: List of result files to process From cb95fd2a93db8c634c5cdaa79b0cc31838d7a45b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:12:42 +0300 Subject: [PATCH 11/20] try to run only one model --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 36688ac6bc..1a75a1acc8 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -113,7 +113,7 @@ def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) except Exception as e: # if model in reference don't have results on task - results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model], tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() From e74dde05a1755f0c752832104a37bff26b6a221e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 13:25:44 +0300 Subject: [PATCH 12/20] install from sources --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 86218f87cf..c4962dbebc 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | - pip install mteb tabulate + pip install git+https://github.com/embeddings-benchmark/mteb.git tabulate - name: Get changed result files id: changed-files From d3d1b9bde7eb0d2b2b78f9f06d84a89a511dae81 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> 
Date: Fri, 13 Jun 2025 14:20:58 +0300 Subject: [PATCH 13/20] update script --- .../workflows/model-results-comparison.yaml | 51 ++-- scripts/create_pr_results_comment.py | 149 +++++++--- scripts/pr_results_comment.py | 279 ------------------ 3 files changed, 135 insertions(+), 344 deletions(-) delete mode 100644 scripts/pr_results_comment.py diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index c4962dbebc..d5d651f0a8 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -5,6 +5,17 @@ on: types: [opened, synchronize, edited] paths: - 'results/**/*.json' + workflow_dispatch: + inputs: + reference_models: + description: 'Space-separated list of reference models for comparison' + required: true + type: string + default: 'intfloat/multilingual-e5-large google/gemini-embedding-001' + pull_request_number: + description: 'The pull request number to comment on (required if triggered manually)' + required: false # Make it not strictly required if you want to run it without a PR context for other reasons + type: string permissions: contents: read @@ -29,41 +40,31 @@ jobs: run: | pip install git+https://github.com/embeddings-benchmark/mteb.git tabulate - - name: Get changed result files - id: changed-files + - name: Generate model comparison + env: + REFERENCE_MODELS: ${{ github.event.inputs.reference_models || 'intfloat/multilingual-e5-large google/gemini-embedding-001' }} run: | - # Get list of changed JSON files in results directory - git fetch origin main - changed_files=$(git diff --name-only origin/main...HEAD | grep -E '^results/.*\.json$' | grep -v model_meta.json || true) + python scripts/create_pr_results_comment.py --reference-models "$REFERENCE_MODELS" --output model-comparison.md - if [ -z "$changed_files" ]; then - echo "No result files changed" - echo "has_changes=false" >> $GITHUB_OUTPUT + - name: Determine PR Number + id: pr_info + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + echo "pr_number=${{ github.event.number }}" >> $GITHUB_OUTPUT + elif [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ github.event.inputs.pull_request_number }}" ]; then + echo "pr_number=${{ github.event.inputs.pull_request_number }}" >> $GITHUB_OUTPUT else - echo "Changed files:" - echo "$changed_files" - echo "has_changes=true" >> $GITHUB_OUTPUT - - # Convert to space-separated list for script - files_list=$(echo "$changed_files" | tr '\n' ' ') - echo "files_list=$files_list" >> $GITHUB_OUTPUT + echo "pr_number=" >> $GITHUB_OUTPUT fi - - name: Generate model comparison - if: steps.changed-files.outputs.has_changes == 'true' - run: | - python scripts/pr_results_comment.py \ - ${{ steps.changed-files.outputs.files_list }} \ - --output model-comparison.md - - name: Post PR comment - if: steps.changed-files.outputs.has_changes == 'true' + # This step will run if a PR number is available either from the PR event or workflow_dispatch input + if: steps.pr_info.outputs.pr_number != '' env: GITHUB_TOKEN: ${{ github.token }} - run: gh pr comment ${{ github.event.number }} --body-file model-comparison.md --create-if-none --edit-last + run: gh pr comment ${{ steps.pr_info.outputs.pr_number }} --body-file model-comparison.md --create-if-none --edit-last - name: Upload comparison report - if: steps.changed-files.outputs.has_changes == 'true' uses: actions/upload-artifact@v4 with: name: model-comparison diff --git a/scripts/create_pr_results_comment.py 
b/scripts/create_pr_results_comment.py index 9bd2d2de01..bf03aae47d 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -3,19 +3,20 @@ Usage: gh pr checkout {pr-number} - scripts/create_pr_results_comment.py [--models MODEL1 MODEL2 ...] + python scripts/create_pr_results_comment.py [--models MODEL1 MODEL2 ...] [--output OUTPUT_FILE] Description: - Compares new model results (added in the current PR) against reference models. - - Outputs a Markdown table with results for each new model and highlights the best scores. + - Outputs a Markdown file with results for each new model and highlights the best scores. - By default, compares against: intfloat/multilingual-e5-large and google/gemini-embedding-001. - You can specify reference models with the --models argument. Arguments: - --models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001) + --reference-models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001) + --output: Output markdown file path (default: model-comparison.md) Example: - scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-model + python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-modelm """ from __future__ import annotations @@ -24,6 +25,7 @@ import json import os import subprocess +import logging from collections import defaultdict from pathlib import Path @@ -32,19 +34,22 @@ TaskName, ModelName = str, str +# Default reference models to compare against +REFERENCE_MODELS: list[str] = [ + "intfloat/multilingual-e5-large", + "google/gemini-embedding-001", +] + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) repo_path = Path(__file__).parents[1] -results_path = repo_path / "results" os.environ["MTEB_CACHE"] = str(repo_path.parent) -default_reference_models = [ - "intfloat/multilingual-e5-large", - "google/gemini-embedding-001", -] - - def get_diff_from_main() -> list[str]: current_rev, origin_rev = subprocess.run( ["git", "rev-parse", "main", "origin/main"], @@ -91,66 +96,130 @@ def extract_new_models_and_tasks( return models -def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: +def create_comparison_table( + model: str, tasks: list[str], reference_models: list[str] +) -> pd.DataFrame: + models = [model] + reference_models + max_col_name = "Max result" + task_col_name = "task_name" results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + results = results.join_revisions() df = results.to_dataframe() - # compute average pr. 
columns - model_names = [c for c in df.columns if c != "task_name"] + if df.empty: + raise ValueError(f"No results found for models {models} on tasks {tasks}") - row = pd.DataFrame( + df[max_col_name] = None + task_results = mteb.load_results(tasks=tasks, download_latest=False) + task_results = task_results.join_revisions() + max_dataframe = task_results.to_dataframe(format="long").groupby(task_col_name).max() + if not max_dataframe.empty: + for task_name, row in max_dataframe.iterrows(): + df.loc[df[task_col_name] == task_name, max_col_name] = row["score"] / 100 # scores are in percentage + + averages: dict[str, float | None] = {} + for col in models+[max_col_name]: + numeric = pd.to_numeric(df[col], errors="coerce") + avg = numeric.mean() + averages[col] = avg if not pd.isna(avg) else None + + avg_row = pd.DataFrame( { - "task_name": ["**Average**"], - **{ - model: df[model].mean() if model != "task_name" else None - for model in model_names - }, + task_col_name: ["**Average**"], + **{col: [val] for col, val in averages.items()}, } ) - df = pd.concat([df, row], ignore_index=True) - return df + return pd.concat([df, avg_row], ignore_index=True) -def highlight_max_bold(df, exclude_cols=["task_name"]): - # result_df = df.copy().astype(str) - # only 2 decimal places except for the excluded columns +def highlight_max_bold( + df: pd.DataFrame, exclude_cols: list[str] = ["task_name"] +) -> pd.DataFrame: result_df = df.copy() - result_df = result_df.applymap(lambda x: f"{x:.2f}" if isinstance(x, float) else x) - tmp_df = df.copy() - tmp_df = tmp_df.drop(columns=exclude_cols) + for col in result_df.columns: + if col not in exclude_cols: + result_df[col] = result_df[col].apply( + lambda x: f"{x:.2f}" + if isinstance(x, (int, float)) and pd.notna(x) + else x + ) + + tmp = df.drop(columns=exclude_cols) for idx in df.index: - max_col = tmp_df.loc[idx].idxmax() - result_df.loc[idx, max_col] = f"**{result_df.loc[idx, max_col]}**" + row = pd.to_numeric(tmp.loc[idx], errors="coerce") + if row.isna().all(): + continue + max_col = row.idxmax() + if pd.notna(row[max_col]): + result_df.at[idx, max_col] = f"**{result_df.at[idx, max_col]}**" return result_df +def generate_markdown_content( + model_tasks: dict[str, list[str]], reference_models: list[str] +) -> str: + if not model_tasks: + return "# Model Results Comparison\n\nNo new model results found in this PR." + + all_tasks = sorted({t for tasks in model_tasks.values() for t in tasks}) + new_models = list(model_tasks.keys()) + + parts: list[str] = [ + "# Model Results Comparison", + "", + f"**Reference models:** {', '.join(f'`{m}`' for m in reference_models)}", + f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", + f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + "", + ] + + for model_name, tasks in model_tasks.items(): + parts.append(f"## Results for `{model_name}`") + + df = create_comparison_table(model_name, tasks, reference_models) + bold_df = highlight_max_bold(df) + parts.append(bold_df.to_markdown(index=False)) + + parts.extend(["", "---", ""]) + + return "\n".join(parts) + + def create_argparse() -> argparse.ArgumentParser: + """Create the argument parser for the script.""" parser = argparse.ArgumentParser( description="Create PR comment with results comparison." 
) parser.add_argument( - "--models", + "--reference-models", nargs="+", - default=default_reference_models, + default=REFERENCE_MODELS, help="List of reference models to compare against (default: %(default)s)", ) + parser.add_argument( + "--output", + type=Path, + default=Path("model-comparison.md"), + help="Output markdown file path", + ) return parser -def main(reference_models: list[str]): +def main(reference_models: list[str], output_path: Path) -> None: + logger.info("Starting to create PR results comment...") + logger.info(f"Using reference models: {', '.join(reference_models)}") diff = get_diff_from_main() - new_additions = extract_new_models_and_tasks(diff) - for model, tasks in new_additions.items(): - print(f"**Results for `{model}`**") - df = create_comparison_table(models=reference_models + [model], tasks=tasks) - bold_df = highlight_max_bold(df) - print(bold_df.to_markdown(index=False)) + model_tasks = extract_new_models_and_tasks(diff) + markdown = generate_markdown_content(model_tasks, reference_models) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(markdown) if __name__ == "__main__": parser = create_argparse() args = parser.parse_args() - main(reference_models=args.models) + main(args.reference_models, args.output) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py deleted file mode 100644 index 1a75a1acc8..0000000000 --- a/scripts/pr_results_comment.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Script to generate a Markdown comparison table for model results from file paths. - -The script takes a list of result files and extracts: -- Model name from the folder structure (results/model_name/...) -- Task name from the filename (without .json extension) - -Usage: - python scripts/pr_results_comment.py file1.json file2.json --output results.md - -Arguments: - files: List of result files to process - --output: Output markdown file path (required) - -Example: - python scripts/create_pr_results_comment.py \ - results/my-new-model/revision/task1.json \ - results/my-new-model/revision/task2.json \ - results/another-model/revision/task1.json \ - --output comparison.md -""" - -from __future__ import annotations - -import argparse -import json -import os -import sys -from collections import defaultdict -from pathlib import Path - -import mteb -import pandas as pd - -repo_path = Path(__file__).parents[1] -results_path = repo_path / "results" - -# Set MTEB cache -os.environ["MTEB_CACHE"] = str(repo_path.parent) - -# Hardcoded reference models to compare against -REFERENCE_MODELS = [ - "intfloat/multilingual-e5-large", - "google/gemini-embedding-001", -] - - -def extract_model_and_task_from_path(file_path: str) -> tuple[str, str]: - """ - Extract model name and task name from file path. 
- - Expected structure: results/model_name/revision/task_name.json - Returns: (model_name, task_name) - """ - path = Path(file_path) - - if not path.suffix == '.json': - raise ValueError(f"File must be a JSON file: {file_path}") - - task_name = path.stem - parts = path.parts - try: - results_idx = parts.index('results') - if results_idx + 1 < len(parts): - model_dir = parts[results_idx + 1].replace("__", "/") - - # Try to get model name from model_meta.json - model_meta_path = path.parent / "model_meta.json" - if model_meta_path.exists(): - try: - with open(model_meta_path) as f: - meta = json.load(f) - model_name = meta.get("name", model_dir) - except (json.JSONDecodeError, KeyError): - model_name = model_dir - else: - model_name = model_dir - - return model_name, task_name - else: - raise ValueError(f"Invalid path structure: {file_path}") - except ValueError: - raise ValueError(f"Path must contain 'results' directory: {file_path}") - - -def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: - """Group files by model and extract task names.""" - model_tasks = defaultdict(list) - - for file_path in file_paths: - try: - model_name, task_name = extract_model_and_task_from_path(file_path) - model_tasks[model_name].append(task_name) - print(f"✓ Found: {model_name} -> {task_name}") - except ValueError as e: - print(f"⚠ Warning: Skipping {file_path}: {e}") - continue - - # Remove duplicates and sort - for model in model_tasks: - model_tasks[model] = sorted(list(set(model_tasks[model]))) - - return dict(model_tasks) - - -def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: - """Create comparison table for given models and tasks.""" - try: - print(f"Loading results for model: {model}") - print(f"Tasks: {tasks}") - - try: - results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) - except Exception as e: - # if model in reference don't have results on task - results = mteb.load_results(models=[model], tasks=tasks, download_latest=False) - results = results.join_revisions() - df = results.to_dataframe() - - if df.empty: - raise ValueError("No results found for the specified models and tasks") - - # Compute average per columns - model_names = [c for c in df.columns if c != "task_name"] - - # Calculate averages only for numeric columns - averages = {} - for model in model_names: - if model in df.columns: - numeric_values = pd.to_numeric(df[model], errors='coerce') - avg_value = numeric_values.mean() - averages[model] = avg_value if not pd.isna(avg_value) else None - else: - averages[model] = None - - # Add average row - avg_row = pd.DataFrame({ - "task_name": ["**Average**"], - **{model: [avg_val] for model, avg_val in averages.items()}, - }) - - df = pd.concat([df, avg_row], ignore_index=True) - return df - except Exception as e: - print(f"❌ Error creating comparison table: {e}") - raise - - -def highlight_max_bold(df: pd.DataFrame, exclude_cols=["task_name"]) -> pd.DataFrame: - """Highlight maximum values in bold for each row.""" - result_df = df.copy() - - # Format numeric values to 2 decimal places - for col in result_df.columns: - if col not in exclude_cols: - result_df[col] = result_df[col].apply( - lambda x: f"{x:.2f}" if isinstance(x, (int, float)) and pd.notna(x) else str(x) - ) - - # Create a temporary dataframe for finding max values - tmp_df = df.copy() - tmp_df = tmp_df.drop(columns=exclude_cols) - - for idx in df.index: - # Skip rows with no numeric data - numeric_row = pd.to_numeric(tmp_df.loc[idx], 
errors='coerce') - if numeric_row.isna().all(): - continue - - max_col = numeric_row.idxmax() - if pd.notna(numeric_row[max_col]): - current_value = result_df.loc[idx, max_col] - result_df.loc[idx, max_col] = f"**{current_value}**" - - return result_df - - -def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: - """Generate the complete markdown content with comparison tables.""" - - if not model_tasks: - return "# Model Results Comparison\n\nNo valid model results found." - - # Get all unique tasks across all models - all_tasks = [] - for tasks in model_tasks.values(): - all_tasks.extend(tasks) - all_tasks = sorted(list(set(all_tasks))) - - # Get all models - new_models = list(model_tasks.keys()) - - markdown_parts = [ - "# Model Results Comparison", - "", - f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", - f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", - "", - ] - - # Create comparison tables for each new model - for model_name, model_tasks_list in model_tasks.items(): - markdown_parts.extend([ - f"## Results for `{model_name}`", - ]) - - try: - # Compare this model against reference models - df = create_comparison_table(model_name, tasks=model_tasks_list) - bold_df = highlight_max_bold(df) - markdown_table = bold_df.to_markdown(index=False) - markdown_parts.append(markdown_table) - except Exception as e: - print(f"❌ Error generating comparison table for {model_name}: {e}") - - markdown_parts.extend(["", "---", ""]) - - return "\n".join(markdown_parts) - - -def create_argparse() -> argparse.ArgumentParser: - """Create argument parser.""" - parser = argparse.ArgumentParser( - description="Generate markdown comparison table for model results from file paths." - ) - parser.add_argument( - "files", - nargs="+", - help="List of result JSON files to process", - ) - parser.add_argument( - "--output", - type=Path, - required=True, - help="Output markdown file path (required)", - ) - return parser - - -def main(): - """Main function.""" - parser = create_argparse() - args = parser.parse_args() - - print(f"Processing {len(args.files)} files...") - - # Group files by model - try: - model_tasks = group_files_by_model(args.files) - print(f"\nFound {len(model_tasks)} models:") - for model, tasks in model_tasks.items(): - print(f" {model}: {len(tasks)} tasks ({', '.join(tasks)})") - except Exception as e: - print(f"❌ Error processing files: {e}") - raise e - - if not model_tasks: - print("❌ No valid model results found") - raise e - - # Generate markdown content - try: - markdown_content = generate_markdown_content(model_tasks) - except Exception as e: - print(f"❌ Error generating markdown: {e}") - raise e - - # Write to output file - try: - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(markdown_content) - print(f"✅ Markdown written to {args.output}") - except Exception as e: - print(f"❌ Error writing to {args.output}: {e}") - raise e - - -if __name__ == "__main__": - main() From a28be8f3583198a4ff29c4430594597bf73a9b0c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:21:18 +0300 Subject: [PATCH 14/20] format --- scripts/create_pr_results_comment.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index bf03aae47d..9dbb93c4c7 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -113,13 +113,17 @@ def 
create_comparison_table( df[max_col_name] = None task_results = mteb.load_results(tasks=tasks, download_latest=False) task_results = task_results.join_revisions() - max_dataframe = task_results.to_dataframe(format="long").groupby(task_col_name).max() + max_dataframe = ( + task_results.to_dataframe(format="long").groupby(task_col_name).max() + ) if not max_dataframe.empty: for task_name, row in max_dataframe.iterrows(): - df.loc[df[task_col_name] == task_name, max_col_name] = row["score"] / 100 # scores are in percentage + df.loc[df[task_col_name] == task_name, max_col_name] = ( + row["score"] / 100 + ) # scores are in percentage averages: dict[str, float | None] = {} - for col in models+[max_col_name]: + for col in models + [max_col_name]: numeric = pd.to_numeric(df[col], errors="coerce") avg = numeric.mean() averages[col] = avg if not pd.isna(avg) else None From 6767970aa6d2909445d60a89ffef56bf25d170b2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:24:16 +0300 Subject: [PATCH 15/20] fix typo --- scripts/create_pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index 9dbb93c4c7..d79f5cb104 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -16,7 +16,7 @@ --output: Output markdown file path (default: model-comparison.md) Example: - python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-modelm + python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-model """ from __future__ import annotations From c66edbd3e1296d7a8c116073b8045fd59604782e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:21 +0300 Subject: [PATCH 16/20] fetch main --- .github/workflows/model-results-comparison.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index d5d651f0a8..f89ca05b84 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -31,6 +31,9 @@ jobs: with: fetch-depth: 0 + - name: Fetch origin main + run: git fetch origin main + - name: Set up Python uses: actions/setup-python@v5 with: From a7ed069571514100bf33d157e9ed2cbc12f455b5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:33:50 +0300 Subject: [PATCH 17/20] fetch main in script --- scripts/create_pr_results_comment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index d79f5cb104..014ae69834 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -51,6 +51,13 @@ def get_diff_from_main() -> list[str]: + subprocess.run( + ["git", "fetch", "origin", "main"], + cwd=repo_path, + check=True, + text=True, + ) + current_rev, origin_rev = subprocess.run( ["git", "rev-parse", "main", "origin/main"], cwd=repo_path, From dc2d78786820aa12966432a2696ce843d3bda46d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:38:05 +0300 Subject: [PATCH 18/20] remove revision check --- scripts/create_pr_results_comment.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git 
a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index 014ae69834..d1837ca798 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -51,26 +51,6 @@ def get_diff_from_main() -> list[str]: - subprocess.run( - ["git", "fetch", "origin", "main"], - cwd=repo_path, - check=True, - text=True, - ) - - current_rev, origin_rev = subprocess.run( - ["git", "rev-parse", "main", "origin/main"], - cwd=repo_path, - capture_output=True, - check=True, - text=True, - ).stdout.splitlines() - - if current_rev != origin_rev: - raise ValueError( - f"Your main branch is not up-to-date ({current_rev} != {origin_rev}), please run `git fetch origin main`" - ) - differences = subprocess.run( ["git", "diff", "--name-only", "origin/main...HEAD"], cwd=repo_path, From c67ecc5a189c99a02937b36cb5274c536dcd8eee Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:44:11 +0300 Subject: [PATCH 19/20] fix reference models arg --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index f89ca05b84..4f1b0d117b 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -47,7 +47,7 @@ jobs: env: REFERENCE_MODELS: ${{ github.event.inputs.reference_models || 'intfloat/multilingual-e5-large google/gemini-embedding-001' }} run: | - python scripts/create_pr_results_comment.py --reference-models "$REFERENCE_MODELS" --output model-comparison.md + python scripts/create_pr_results_comment.py --reference-models $REFERENCE_MODELS --output model-comparison.md - name: Determine PR Number id: pr_info From f40d643dd00b9ead3ba2334b048c12f1b15df696 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 15 Jun 2025 21:55:18 +0300 Subject: [PATCH 20/20] bump python version --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 4f1b0d117b..8405521810 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -37,7 +37,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10' - name: Install dependencies run: |
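
Note: the comparison flow that scripts/create_pr_results_comment.py ends up with in this series can be exercised locally with a short sketch like the one below. It is a sketch only, assuming an installed mteb plus tabulate, a checkout of this results repository, the same mteb/pandas calls the script itself uses (load_results, join_revisions, to_dataframe), and placeholder model/task names rather than results added in this PR.

# Sketch only: assumes mteb + tabulate are installed and that this is run from the
# root of the results repository, so MTEB_CACHE can point at its parent directory
# (the same convention the script uses via Path(__file__).parents[1].parent).
from __future__ import annotations

import os
from pathlib import Path

import mteb
import pandas as pd

os.environ["MTEB_CACHE"] = str(Path.cwd().parent)  # assumption: cwd is the results repo root

REFERENCE_MODELS = ["intfloat/multilingual-e5-large", "google/gemini-embedding-001"]
new_model = "myorg/my-new-model"            # placeholder, not a model added in this PR
tasks = ["MassiveIntentClassification"]     # placeholder task list

# Same call chain as create_comparison_table(): load local results, collapse revisions, pivot wide.
results = mteb.load_results(models=[new_model] + REFERENCE_MODELS, tasks=tasks, download_latest=False)
df = results.join_revisions().to_dataframe()

# Append the "**Average**" row and print the Markdown table the workflow posts as a PR comment.
score_cols = [c for c in df.columns if c != "task_name"]
averages = {c: [pd.to_numeric(df[c], errors="coerce").mean()] for c in score_cols}
df = pd.concat([df, pd.DataFrame({"task_name": ["**Average**"], **averages})], ignore_index=True)
print(df.to_markdown(index=False))

The printed table corresponds to the Markdown the workflow writes to model-comparison.md and posts with gh pr comment --body-file model-comparison.md --create-if-none --edit-last.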