From 7c75155321d8c063b325761eaf5760850a43b671 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:06:24 +0300 Subject: [PATCH 01/20] init automate script --- .../workflows/model-results-comparison.yaml | 70 +++++ .../MassiveIntentClassification.json | 4 +- scripts/pr_results_comment.py | 295 ++++++++++++++++++ 3 files changed, 367 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/model-results-comparison.yaml create mode 100644 scripts/pr_results_comment.py diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml new file mode 100644 index 0000000000..7b0f7be2e5 --- /dev/null +++ b/.github/workflows/model-results-comparison.yaml @@ -0,0 +1,70 @@ +name: Model Results Comparison + +on: + pull_request: + types: [opened, synchronize, edited] + paths: + - 'results/**/*.json' + +permissions: + contents: read + pull-requests: write + +jobs: + compare-results: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: '3.9' + + - name: Install dependencies + run: | + pip install mteb + + - name: Get changed result files + id: changed-files + run: | + # Get list of changed JSON files in results directory + git fetch origin main + changed_files=$(git diff --name-only origin/main...HEAD | grep -E '^results/.*\.json$' | grep -v model_meta.json || true) + + if [ -z "$changed_files" ]; then + echo "No result files changed" + echo "has_changes=false" >> $GITHUB_OUTPUT + else + echo "Changed files:" + echo "$changed_files" + echo "has_changes=true" >> $GITHUB_OUTPUT + + # Convert to space-separated list for script + files_list=$(echo "$changed_files" | tr '\n' ' ') + echo "files_list=$files_list" >> $GITHUB_OUTPUT + fi + + - name: Generate model comparison + if: steps.changed-files.outputs.has_changes == 'true' + run: | + python scripts/create_pr_results_comment.py \ + ${{ steps.changed-files.outputs.files_list }} \ + --output model-comparison.md + + - name: Post PR comment + if: steps.changed-files.outputs.has_changes == 'true' + env: + GITHUB_TOKEN: ${{ github.token }} + run: gh pr comment ${{ github.event.number }} --body-file model-comparison.md --create-if-none --edit-last + + - name: Upload comparison report + if: steps.changed-files.outputs.has_changes == 'true' + uses: actions/upload-artifact@v3 + with: + name: model-comparison + path: model-comparison.md diff --git a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json index c264367483..a325c9a0d6 100644 --- a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json +++ b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json @@ -124,7 +124,7 @@ "f1_weighted": 0.776515 } ], - "main_score": 0.791157, + "main_score": 0.891157, "hf_subset": "ru", "languages": [ "rus-Cyrl" @@ -134,4 +134,4 @@ }, "evaluation_time": 47.84240365028381, "kg_co2_emissions": null -} \ No newline at end of file +} diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py new file mode 100644 index 0000000000..0a2fc8d50d --- /dev/null +++ b/scripts/pr_results_comment.py @@ -0,0 +1,295 @@ +""" +Script to generate a Markdown comparison table for model results 
from file paths. + +The script takes a list of result files and extracts: +- Model name from the folder structure (results/model_name/...) +- Task name from the filename (without .json extension) + +Usage: + python scripts/create_pr_results_comment.py file1.json file2.json --output results.md + +Arguments: + files: List of result files to process + --output: Output markdown file path (required) + +Example: + python scripts/create_pr_results_comment.py \ + results/my-new-model/revision/task1.json \ + results/my-new-model/revision/task2.json \ + results/another-model/revision/task1.json \ + --output comparison.md +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from collections import defaultdict +from pathlib import Path + +import mteb +import pandas as pd + +repo_path = Path(__file__).parents[1] +results_path = repo_path / "results" + +# Set MTEB cache +os.environ["MTEB_CACHE"] = str(repo_path.parent) + +# Hardcoded reference models to compare against +REFERENCE_MODELS = [ + "intfloat/multilingual-e5-large", + "google/gemini-embedding-001", +] + + +def extract_model_and_task_from_path(file_path: str) -> tuple[str, str]: + """ + Extract model name and task name from file path. + + Expected structure: results/model_name/revision/task_name.json + Returns: (model_name, task_name) + """ + path = Path(file_path) + + if not path.suffix == '.json': + raise ValueError(f"File must be a JSON file: {file_path}") + + task_name = path.stem + parts = path.parts + try: + results_idx = parts.index('results') + if results_idx + 1 < len(parts): + model_dir = parts[results_idx + 1].replace("__", "/") + + # Try to get model name from model_meta.json + model_meta_path = path.parent / "model_meta.json" + if model_meta_path.exists(): + try: + with open(model_meta_path) as f: + meta = json.load(f) + model_name = meta.get("name", model_dir) + except (json.JSONDecodeError, KeyError): + model_name = model_dir + else: + model_name = model_dir + + return model_name, task_name + else: + raise ValueError(f"Invalid path structure: {file_path}") + except ValueError: + raise ValueError(f"Path must contain 'results' directory: {file_path}") + + +def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: + """Group files by model and extract task names.""" + model_tasks = defaultdict(list) + + for file_path in file_paths: + try: + model_name, task_name = extract_model_and_task_from_path(file_path) + model_tasks[model_name].append(task_name) + print(f"✓ Found: {model_name} -> {task_name}") + except ValueError as e: + print(f"⚠ Warning: Skipping {file_path}: {e}") + continue + + # Remove duplicates and sort + for model in model_tasks: + model_tasks[model] = sorted(list(set(model_tasks[model]))) + + return dict(model_tasks) + + +def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: + """Create comparison table for given models and tasks.""" + try: + print(f"Loading results for models: {models}") + print(f"Tasks: {tasks}") + + results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + results = results.join_revisions() + df = results.to_dataframe() + + if df.empty: + raise ValueError("No results found for the specified models and tasks") + + # Compute average per columns + model_names = [c for c in df.columns if c != "task_name"] + + # Calculate averages only for numeric columns + averages = {} + for model in model_names: + if model in df.columns: + numeric_values = pd.to_numeric(df[model], errors='coerce') + 
avg_value = numeric_values.mean() + averages[model] = avg_value if not pd.isna(avg_value) else None + else: + averages[model] = None + + # Add average row + avg_row = pd.DataFrame({ + "task_name": ["**Average**"], + **{model: [avg_val] for model, avg_val in averages.items()}, + }) + + df = pd.concat([df, avg_row], ignore_index=True) + return df + except Exception as e: + print(f"❌ Error creating comparison table: {e}") + raise + + +def highlight_max_bold(df: pd.DataFrame, exclude_cols=["task_name"]) -> pd.DataFrame: + """Highlight maximum values in bold for each row.""" + result_df = df.copy() + + # Format numeric values to 2 decimal places + for col in result_df.columns: + if col not in exclude_cols: + result_df[col] = result_df[col].apply( + lambda x: f"{x:.2f}" if isinstance(x, (int, float)) and pd.notna(x) else str(x) + ) + + # Create a temporary dataframe for finding max values + tmp_df = df.copy() + tmp_df = tmp_df.drop(columns=exclude_cols) + + for idx in df.index: + # Skip rows with no numeric data + numeric_row = pd.to_numeric(tmp_df.loc[idx], errors='coerce') + if numeric_row.isna().all(): + continue + + max_col = numeric_row.idxmax() + if pd.notna(numeric_row[max_col]): + current_value = result_df.loc[idx, max_col] + result_df.loc[idx, max_col] = f"**{current_value}**" + + return result_df + + +def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: + """Generate the complete markdown content with comparison tables.""" + + if not model_tasks: + return "# Model Results Comparison\n\nNo valid model results found." + + # Get all unique tasks across all models + all_tasks = [] + for tasks in model_tasks.values(): + all_tasks.extend(tasks) + all_tasks = sorted(list(set(all_tasks))) + + # Get all models + new_models = list(model_tasks.keys()) + + markdown_parts = [ + "# Model Results Comparison", + "", + f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", + f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + "", + ] + + # Create comparison tables for each new model + for model_name, model_tasks_list in model_tasks.items(): + markdown_parts.extend([ + f"## Results for `{model_name}`", + ]) + + try: + # Compare this model against reference models + all_models = REFERENCE_MODELS + [model_name] + df = create_comparison_table(models=all_models, tasks=model_tasks_list) + bold_df = highlight_max_bold(df) + markdown_table = bold_df.to_markdown(index=False) + markdown_parts.append(markdown_table) + except Exception as e: + print(f"❌ Error generating comparison table for {model_name}: {e}") + + markdown_parts.extend(["", "---", ""]) + + # # Add global comparison if multiple models + # if len(new_models) > 1: + # markdown_parts.extend([ + # "## Overall Comparison", + # "", + # f"**All models compared:** {', '.join(f'`{m}`' for m in REFERENCE_MODELS + new_models)}", + # f"**Common tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + # "", + # ]) + # + # try: + # all_models = REFERENCE_MODELS + new_models + # df = create_comparison_table(models=all_models, tasks=all_tasks) + # bold_df = highlight_max_bold(df) + # markdown_table = bold_df.to_markdown(index=False) + # markdown_parts.append(markdown_table) + # except Exception as e: + # print(f"❌ Error generating overall comparison table: {e}") + + return "\n".join(markdown_parts) + + +def create_argparse() -> argparse.ArgumentParser: + """Create argument parser.""" + parser = argparse.ArgumentParser( + description="Generate markdown comparison table for model results from file paths." 
+ ) + parser.add_argument( + "files", + nargs="+", + help="List of result JSON files to process", + ) + parser.add_argument( + "--output", + type=Path, + required=True, + help="Output markdown file path (required)", + ) + return parser + + +def main(): + """Main function.""" + parser = create_argparse() + args = parser.parse_args() + + print(f"Processing {len(args.files)} files...") + + # Group files by model + try: + model_tasks = group_files_by_model(args.files) + print(f"\nFound {len(model_tasks)} models:") + for model, tasks in model_tasks.items(): + print(f" {model}: {len(tasks)} tasks ({', '.join(tasks)})") + except Exception as e: + print(f"❌ Error processing files: {e}") + raise e + + if not model_tasks: + print("❌ No valid model results found") + raise e + + # Generate markdown content + try: + markdown_content = generate_markdown_content(model_tasks) + except Exception as e: + print(f"❌ Error generating markdown: {e}") + raise e + + # Write to output file + try: + args.output.parent.mkdir(parents=True, exist_ok=True) + args.output.write_text(markdown_content) + print(f"✅ Markdown written to {args.output}") + except Exception as e: + print(f"❌ Error writing to {args.output}: {e}") + raise e + + +if __name__ == "__main__": + main() From f7a94a9d51219a44e858124f142fa652bc30e25a Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:08:07 +0300 Subject: [PATCH 02/20] bump versions --- .github/workflows/model-results-comparison.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 7b0f7be2e5..bcaef056e3 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -21,7 +21,7 @@ jobs: fetch-depth: 0 - name: Set up Python - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: '3.9' @@ -64,7 +64,7 @@ jobs: - name: Upload comparison report if: steps.changed-files.outputs.has_changes == 'true' - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: model-comparison path: model-comparison.md From fd574a9bef5b70593f41be697f49ae886756844c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:15:07 +0300 Subject: [PATCH 03/20] fix script name --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index bcaef056e3..2cb09487df 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -52,7 +52,7 @@ jobs: - name: Generate model comparison if: steps.changed-files.outputs.has_changes == 'true' run: | - python scripts/create_pr_results_comment.py \ + python scripts/pr_results_comment.py \ ${{ steps.changed-files.outputs.files_list }} \ --output model-comparison.md From 1065f7bc55b5a17d07ecf91f5ce77054a4195929 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:19:10 +0300 Subject: [PATCH 04/20] add tabulate --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 2cb09487df..86218f87cf 100644 --- 
a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | - pip install mteb + pip install mteb tabulate - name: Get changed result files id: changed-files From eec89fbbce6ffb52d380be920f23d5ac17d91d66 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:27:59 +0300 Subject: [PATCH 05/20] add resutls for test model --- .../MassiveIntentClassification.json | 2 +- .../1234321/AfriSeniClassification.json | 18 ++++++++++++++++++ .../1234321/SomeNewTask.json | 18 ++++++++++++++++++ .../1234321/model_meta.json | 1 + 4 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 results/testuser__testmodel/1234321/AfriSeniClassification.json create mode 100644 results/testuser__testmodel/1234321/SomeNewTask.json create mode 100644 results/testuser__testmodel/1234321/model_meta.json diff --git a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json index a325c9a0d6..8b2abb3922 100644 --- a/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json +++ b/results/ai-forever__FRIDA/7292217af9a9e6dbf07048f76b434ad1e2aa8b76/MassiveIntentClassification.json @@ -124,7 +124,7 @@ "f1_weighted": 0.776515 } ], - "main_score": 0.891157, + "main_score": 0.791157, "hf_subset": "ru", "languages": [ "rus-Cyrl" diff --git a/results/testuser__testmodel/1234321/AfriSeniClassification.json b/results/testuser__testmodel/1234321/AfriSeniClassification.json new file mode 100644 index 0000000000..92ff554112 --- /dev/null +++ b/results/testuser__testmodel/1234321/AfriSeniClassification.json @@ -0,0 +1,18 @@ +{ + "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", + "task_name": "AfriSentiClassification", + "mteb_version": "1.18.0", + "scores": { + "test": [ + { + "main_score": 0.1, + "hf_subset": "yor", + "languages": [ + "yor-Latn" + ] + } + ] + }, + "evaluation_time": 23.979591608047485, + "kg_co2_emissions": 0.000927284611091405 +} diff --git a/results/testuser__testmodel/1234321/SomeNewTask.json b/results/testuser__testmodel/1234321/SomeNewTask.json new file mode 100644 index 0000000000..07c05183a9 --- /dev/null +++ b/results/testuser__testmodel/1234321/SomeNewTask.json @@ -0,0 +1,18 @@ +{ + "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", + "task_name": "SomeNewTask", + "mteb_version": "1.18.0", + "scores": { + "test": [ + { + "main_score": 0.2, + "hf_subset": "eng", + "languages": [ + "eng-Latn" + ] + } + ] + }, + "evaluation_time": 23.979591608047485, + "kg_co2_emissions": 0.000927284611091405 +} diff --git a/results/testuser__testmodel/1234321/model_meta.json b/results/testuser__testmodel/1234321/model_meta.json new file mode 100644 index 0000000000..3ec3fd4df3 --- /dev/null +++ b/results/testuser__testmodel/1234321/model_meta.json @@ -0,0 +1 @@ +{"name": "testuser/testmodel", "revision": "1234321", "release_date": null, "languages": null, "n_parameters": null, "memory_usage_mb": null, "max_tokens": null, "embed_dim": null, "license": null, "open_weights": null, "public_training_code": null, "public_training_data": null, "framework": null, "reference": null, "similarity_fn_name": null, "use_instructions": null, "training_datasets": null, "adapted_from": null, "superseded_by": null, "modalities": ["text"], "loader": null} From 
42e02b153a66b77eb61a1e460b7b7f56466d4162 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:38:36 +0300 Subject: [PATCH 06/20] handle no result on task --- scripts/pr_results_comment.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 0a2fc8d50d..bdeefb4a48 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -103,13 +103,17 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: return dict(model_tasks) -def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: +def create_comparison_table(models: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: print(f"Loading results for models: {models}") print(f"Tasks: {tasks}") - results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + try: + results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + except Exception as e: + # if model in reference don't have results on task + results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() @@ -202,8 +206,7 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: try: # Compare this model against reference models - all_models = REFERENCE_MODELS + [model_name] - df = create_comparison_table(models=all_models, tasks=model_tasks_list) + df = create_comparison_table(model=model_name, tasks=model_tasks_list) bold_df = highlight_max_bold(df) markdown_table = bold_df.to_markdown(index=False) markdown_parts.append(markdown_table) @@ -212,25 +215,6 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: markdown_parts.extend(["", "---", ""]) - # # Add global comparison if multiple models - # if len(new_models) > 1: - # markdown_parts.extend([ - # "## Overall Comparison", - # "", - # f"**All models compared:** {', '.join(f'`{m}`' for m in REFERENCE_MODELS + new_models)}", - # f"**Common tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", - # "", - # ]) - # - # try: - # all_models = REFERENCE_MODELS + new_models - # df = create_comparison_table(models=all_models, tasks=all_tasks) - # bold_df = highlight_max_bold(df) - # markdown_table = bold_df.to_markdown(index=False) - # markdown_parts.append(markdown_table) - # except Exception as e: - # print(f"❌ Error generating overall comparison table: {e}") - return "\n".join(markdown_parts) From 9b4025dc50073df72c50c6800428878ca9feafeb Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:43:15 +0300 Subject: [PATCH 07/20] fix function --- scripts/pr_results_comment.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index bdeefb4a48..28c39e4896 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -103,17 +103,17 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: return dict(model_tasks) -def create_comparison_table(models: str, tasks: list[str]) -> pd.DataFrame: +def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: print(f"Loading results for models: {models}") print(f"Tasks: {tasks}") 
try: - results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) except Exception as e: # if model in reference don't have results on task - results = mteb.load_results(models=[models]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() @@ -206,7 +206,7 @@ def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: try: # Compare this model against reference models - df = create_comparison_table(model=model_name, tasks=model_tasks_list) + df = create_comparison_table(model_name, tasks=model_tasks_list) bold_df = highlight_max_bold(df) markdown_table = bold_df.to_markdown(index=False) markdown_parts.append(markdown_table) From e419cc89348b8422646941c8468a95786b0f764e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:48:44 +0300 Subject: [PATCH 08/20] fix function --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 28c39e4896..a82c3cef4d 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -106,7 +106,7 @@ def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: """Create comparison table for given models and tasks.""" try: - print(f"Loading results for models: {models}") + print(f"Loading results for model: {model}") print(f"Tasks: {tasks}") try: From 6d41b5f1814617606268325f3f8ef8d81f280f4b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Wed, 11 Jun 2025 23:56:44 +0300 Subject: [PATCH 09/20] remove testuser --- .../1234321/AfriSeniClassification.json | 18 ------------------ .../1234321/SomeNewTask.json | 18 ------------------ .../1234321/model_meta.json | 1 - 3 files changed, 37 deletions(-) delete mode 100644 results/testuser__testmodel/1234321/AfriSeniClassification.json delete mode 100644 results/testuser__testmodel/1234321/SomeNewTask.json delete mode 100644 results/testuser__testmodel/1234321/model_meta.json diff --git a/results/testuser__testmodel/1234321/AfriSeniClassification.json b/results/testuser__testmodel/1234321/AfriSeniClassification.json deleted file mode 100644 index 92ff554112..0000000000 --- a/results/testuser__testmodel/1234321/AfriSeniClassification.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", - "task_name": "AfriSentiClassification", - "mteb_version": "1.18.0", - "scores": { - "test": [ - { - "main_score": 0.1, - "hf_subset": "yor", - "languages": [ - "yor-Latn" - ] - } - ] - }, - "evaluation_time": 23.979591608047485, - "kg_co2_emissions": 0.000927284611091405 -} diff --git a/results/testuser__testmodel/1234321/SomeNewTask.json b/results/testuser__testmodel/1234321/SomeNewTask.json deleted file mode 100644 index 07c05183a9..0000000000 --- a/results/testuser__testmodel/1234321/SomeNewTask.json +++ /dev/null @@ -1,18 +0,0 @@ -{ - "dataset_revision": "b52e930385cf5ed7f063072c3f7bd17b599a16cf", - "task_name": "SomeNewTask", - "mteb_version": "1.18.0", - "scores": { - "test": [ - { - "main_score": 0.2, - "hf_subset": "eng", - "languages": [ - "eng-Latn" - ] - 
} - ] - }, - "evaluation_time": 23.979591608047485, - "kg_co2_emissions": 0.000927284611091405 -} diff --git a/results/testuser__testmodel/1234321/model_meta.json b/results/testuser__testmodel/1234321/model_meta.json deleted file mode 100644 index 3ec3fd4df3..0000000000 --- a/results/testuser__testmodel/1234321/model_meta.json +++ /dev/null @@ -1 +0,0 @@ -{"name": "testuser/testmodel", "revision": "1234321", "release_date": null, "languages": null, "n_parameters": null, "memory_usage_mb": null, "max_tokens": null, "embed_dim": null, "license": null, "open_weights": null, "public_training_code": null, "public_training_data": null, "framework": null, "reference": null, "similarity_fn_name": null, "use_instructions": null, "training_datasets": null, "adapted_from": null, "superseded_by": null, "modalities": ["text"], "loader": null} From 15d60e9160f65a0826890e75deafe7306a615975 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:03:28 +0300 Subject: [PATCH 10/20] fix script help --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index a82c3cef4d..36688ac6bc 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -6,7 +6,7 @@ - Task name from the filename (without .json extension) Usage: - python scripts/create_pr_results_comment.py file1.json file2.json --output results.md + python scripts/pr_results_comment.py file1.json file2.json --output results.md Arguments: files: List of result files to process From cb95fd2a93db8c634c5cdaa79b0cc31838d7a45b Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 00:12:42 +0300 Subject: [PATCH 11/20] try to run only one model --- scripts/pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py index 36688ac6bc..1a75a1acc8 100644 --- a/scripts/pr_results_comment.py +++ b/scripts/pr_results_comment.py @@ -113,7 +113,7 @@ def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) except Exception as e: # if model in reference don't have results on task - results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) + results = mteb.load_results(models=[model], tasks=tasks, download_latest=False) results = results.join_revisions() df = results.to_dataframe() From e74dde05a1755f0c752832104a37bff26b6a221e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Thu, 12 Jun 2025 13:25:44 +0300 Subject: [PATCH 12/20] install from sources --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 86218f87cf..c4962dbebc 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -27,7 +27,7 @@ jobs: - name: Install dependencies run: | - pip install mteb tabulate + pip install git+https://github.com/embeddings-benchmark/mteb.git tabulate - name: Get changed result files id: changed-files From d3d1b9bde7eb0d2b2b78f9f06d84a89a511dae81 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> 
Date: Fri, 13 Jun 2025 14:20:58 +0300 Subject: [PATCH 13/20] update script --- .../workflows/model-results-comparison.yaml | 51 ++-- scripts/create_pr_results_comment.py | 149 +++++++--- scripts/pr_results_comment.py | 279 ------------------ 3 files changed, 135 insertions(+), 344 deletions(-) delete mode 100644 scripts/pr_results_comment.py diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index c4962dbebc..d5d651f0a8 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -5,6 +5,17 @@ on: types: [opened, synchronize, edited] paths: - 'results/**/*.json' + workflow_dispatch: + inputs: + reference_models: + description: 'Space-separated list of reference models for comparison' + required: true + type: string + default: 'intfloat/multilingual-e5-large google/gemini-embedding-001' + pull_request_number: + description: 'The pull request number to comment on (required if triggered manually)' + required: false # Make it not strictly required if you want to run it without a PR context for other reasons + type: string permissions: contents: read @@ -29,41 +40,31 @@ jobs: run: | pip install git+https://github.com/embeddings-benchmark/mteb.git tabulate - - name: Get changed result files - id: changed-files + - name: Generate model comparison + env: + REFERENCE_MODELS: ${{ github.event.inputs.reference_models || 'intfloat/multilingual-e5-large google/gemini-embedding-001' }} run: | - # Get list of changed JSON files in results directory - git fetch origin main - changed_files=$(git diff --name-only origin/main...HEAD | grep -E '^results/.*\.json$' | grep -v model_meta.json || true) + python scripts/create_pr_results_comment.py --reference-models "$REFERENCE_MODELS" --output model-comparison.md - if [ -z "$changed_files" ]; then - echo "No result files changed" - echo "has_changes=false" >> $GITHUB_OUTPUT + - name: Determine PR Number + id: pr_info + run: | + if [ "${{ github.event_name }}" == "pull_request" ]; then + echo "pr_number=${{ github.event.number }}" >> $GITHUB_OUTPUT + elif [ "${{ github.event_name }}" == "workflow_dispatch" ] && [ -n "${{ github.event.inputs.pull_request_number }}" ]; then + echo "pr_number=${{ github.event.inputs.pull_request_number }}" >> $GITHUB_OUTPUT else - echo "Changed files:" - echo "$changed_files" - echo "has_changes=true" >> $GITHUB_OUTPUT - - # Convert to space-separated list for script - files_list=$(echo "$changed_files" | tr '\n' ' ') - echo "files_list=$files_list" >> $GITHUB_OUTPUT + echo "pr_number=" >> $GITHUB_OUTPUT fi - - name: Generate model comparison - if: steps.changed-files.outputs.has_changes == 'true' - run: | - python scripts/pr_results_comment.py \ - ${{ steps.changed-files.outputs.files_list }} \ - --output model-comparison.md - - name: Post PR comment - if: steps.changed-files.outputs.has_changes == 'true' + # This step will run if a PR number is available either from the PR event or workflow_dispatch input + if: steps.pr_info.outputs.pr_number != '' env: GITHUB_TOKEN: ${{ github.token }} - run: gh pr comment ${{ github.event.number }} --body-file model-comparison.md --create-if-none --edit-last + run: gh pr comment ${{ steps.pr_info.outputs.pr_number }} --body-file model-comparison.md --create-if-none --edit-last - name: Upload comparison report - if: steps.changed-files.outputs.has_changes == 'true' uses: actions/upload-artifact@v4 with: name: model-comparison diff --git a/scripts/create_pr_results_comment.py 
b/scripts/create_pr_results_comment.py index 9bd2d2de01..bf03aae47d 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -3,19 +3,20 @@ Usage: gh pr checkout {pr-number} - scripts/create_pr_results_comment.py [--models MODEL1 MODEL2 ...] + python scripts/create_pr_results_comment.py [--models MODEL1 MODEL2 ...] [--output OUTPUT_FILE] Description: - Compares new model results (added in the current PR) against reference models. - - Outputs a Markdown table with results for each new model and highlights the best scores. + - Outputs a Markdown file with results for each new model and highlights the best scores. - By default, compares against: intfloat/multilingual-e5-large and google/gemini-embedding-001. - You can specify reference models with the --models argument. Arguments: - --models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001) + --reference-models: List of reference models to compare against (default: intfloat/multilingual-e5-large google/gemini-embedding-001) + --output: Output markdown file path (default: model-comparison.md) Example: - scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-model + python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-modelm """ from __future__ import annotations @@ -24,6 +25,7 @@ import json import os import subprocess +import logging from collections import defaultdict from pathlib import Path @@ -32,19 +34,22 @@ TaskName, ModelName = str, str +# Default reference models to compare against +REFERENCE_MODELS: list[str] = [ + "intfloat/multilingual-e5-large", + "google/gemini-embedding-001", +] + +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) +logger = logging.getLogger(__name__) repo_path = Path(__file__).parents[1] -results_path = repo_path / "results" os.environ["MTEB_CACHE"] = str(repo_path.parent) -default_reference_models = [ - "intfloat/multilingual-e5-large", - "google/gemini-embedding-001", -] - - def get_diff_from_main() -> list[str]: current_rev, origin_rev = subprocess.run( ["git", "rev-parse", "main", "origin/main"], @@ -91,66 +96,130 @@ def extract_new_models_and_tasks( return models -def create_comparison_table(models: list[str], tasks: list[str]) -> pd.DataFrame: +def create_comparison_table( + model: str, tasks: list[str], reference_models: list[str] +) -> pd.DataFrame: + models = [model] + reference_models + max_col_name = "Max result" + task_col_name = "task_name" results = mteb.load_results(models=models, tasks=tasks, download_latest=False) + results = results.join_revisions() df = results.to_dataframe() - # compute average pr. 
columns - model_names = [c for c in df.columns if c != "task_name"] + if df.empty: + raise ValueError(f"No results found for models {models} on tasks {tasks}") - row = pd.DataFrame( + df[max_col_name] = None + task_results = mteb.load_results(tasks=tasks, download_latest=False) + task_results = task_results.join_revisions() + max_dataframe = task_results.to_dataframe(format="long").groupby(task_col_name).max() + if not max_dataframe.empty: + for task_name, row in max_dataframe.iterrows(): + df.loc[df[task_col_name] == task_name, max_col_name] = row["score"] / 100 # scores are in percentage + + averages: dict[str, float | None] = {} + for col in models+[max_col_name]: + numeric = pd.to_numeric(df[col], errors="coerce") + avg = numeric.mean() + averages[col] = avg if not pd.isna(avg) else None + + avg_row = pd.DataFrame( { - "task_name": ["**Average**"], - **{ - model: df[model].mean() if model != "task_name" else None - for model in model_names - }, + task_col_name: ["**Average**"], + **{col: [val] for col, val in averages.items()}, } ) - df = pd.concat([df, row], ignore_index=True) - return df + return pd.concat([df, avg_row], ignore_index=True) -def highlight_max_bold(df, exclude_cols=["task_name"]): - # result_df = df.copy().astype(str) - # only 2 decimal places except for the excluded columns +def highlight_max_bold( + df: pd.DataFrame, exclude_cols: list[str] = ["task_name"] +) -> pd.DataFrame: result_df = df.copy() - result_df = result_df.applymap(lambda x: f"{x:.2f}" if isinstance(x, float) else x) - tmp_df = df.copy() - tmp_df = tmp_df.drop(columns=exclude_cols) + for col in result_df.columns: + if col not in exclude_cols: + result_df[col] = result_df[col].apply( + lambda x: f"{x:.2f}" + if isinstance(x, (int, float)) and pd.notna(x) + else x + ) + + tmp = df.drop(columns=exclude_cols) for idx in df.index: - max_col = tmp_df.loc[idx].idxmax() - result_df.loc[idx, max_col] = f"**{result_df.loc[idx, max_col]}**" + row = pd.to_numeric(tmp.loc[idx], errors="coerce") + if row.isna().all(): + continue + max_col = row.idxmax() + if pd.notna(row[max_col]): + result_df.at[idx, max_col] = f"**{result_df.at[idx, max_col]}**" return result_df +def generate_markdown_content( + model_tasks: dict[str, list[str]], reference_models: list[str] +) -> str: + if not model_tasks: + return "# Model Results Comparison\n\nNo new model results found in this PR." + + all_tasks = sorted({t for tasks in model_tasks.values() for t in tasks}) + new_models = list(model_tasks.keys()) + + parts: list[str] = [ + "# Model Results Comparison", + "", + f"**Reference models:** {', '.join(f'`{m}`' for m in reference_models)}", + f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", + f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", + "", + ] + + for model_name, tasks in model_tasks.items(): + parts.append(f"## Results for `{model_name}`") + + df = create_comparison_table(model_name, tasks, reference_models) + bold_df = highlight_max_bold(df) + parts.append(bold_df.to_markdown(index=False)) + + parts.extend(["", "---", ""]) + + return "\n".join(parts) + + def create_argparse() -> argparse.ArgumentParser: + """Create the argument parser for the script.""" parser = argparse.ArgumentParser( description="Create PR comment with results comparison." 
) parser.add_argument( - "--models", + "--reference-models", nargs="+", - default=default_reference_models, + default=REFERENCE_MODELS, help="List of reference models to compare against (default: %(default)s)", ) + parser.add_argument( + "--output", + type=Path, + default=Path("model-comparison.md"), + help="Output markdown file path", + ) return parser -def main(reference_models: list[str]): +def main(reference_models: list[str], output_path: Path) -> None: + logger.info("Starting to create PR results comment...") + logger.info(f"Using reference models: {', '.join(reference_models)}") diff = get_diff_from_main() - new_additions = extract_new_models_and_tasks(diff) - for model, tasks in new_additions.items(): - print(f"**Results for `{model}`**") - df = create_comparison_table(models=reference_models + [model], tasks=tasks) - bold_df = highlight_max_bold(df) - print(bold_df.to_markdown(index=False)) + model_tasks = extract_new_models_and_tasks(diff) + markdown = generate_markdown_content(model_tasks, reference_models) + + output_path.parent.mkdir(parents=True, exist_ok=True) + output_path.write_text(markdown) if __name__ == "__main__": parser = create_argparse() args = parser.parse_args() - main(reference_models=args.models) + main(args.reference_models, args.output) diff --git a/scripts/pr_results_comment.py b/scripts/pr_results_comment.py deleted file mode 100644 index 1a75a1acc8..0000000000 --- a/scripts/pr_results_comment.py +++ /dev/null @@ -1,279 +0,0 @@ -""" -Script to generate a Markdown comparison table for model results from file paths. - -The script takes a list of result files and extracts: -- Model name from the folder structure (results/model_name/...) -- Task name from the filename (without .json extension) - -Usage: - python scripts/pr_results_comment.py file1.json file2.json --output results.md - -Arguments: - files: List of result files to process - --output: Output markdown file path (required) - -Example: - python scripts/create_pr_results_comment.py \ - results/my-new-model/revision/task1.json \ - results/my-new-model/revision/task2.json \ - results/another-model/revision/task1.json \ - --output comparison.md -""" - -from __future__ import annotations - -import argparse -import json -import os -import sys -from collections import defaultdict -from pathlib import Path - -import mteb -import pandas as pd - -repo_path = Path(__file__).parents[1] -results_path = repo_path / "results" - -# Set MTEB cache -os.environ["MTEB_CACHE"] = str(repo_path.parent) - -# Hardcoded reference models to compare against -REFERENCE_MODELS = [ - "intfloat/multilingual-e5-large", - "google/gemini-embedding-001", -] - - -def extract_model_and_task_from_path(file_path: str) -> tuple[str, str]: - """ - Extract model name and task name from file path. 
- - Expected structure: results/model_name/revision/task_name.json - Returns: (model_name, task_name) - """ - path = Path(file_path) - - if not path.suffix == '.json': - raise ValueError(f"File must be a JSON file: {file_path}") - - task_name = path.stem - parts = path.parts - try: - results_idx = parts.index('results') - if results_idx + 1 < len(parts): - model_dir = parts[results_idx + 1].replace("__", "/") - - # Try to get model name from model_meta.json - model_meta_path = path.parent / "model_meta.json" - if model_meta_path.exists(): - try: - with open(model_meta_path) as f: - meta = json.load(f) - model_name = meta.get("name", model_dir) - except (json.JSONDecodeError, KeyError): - model_name = model_dir - else: - model_name = model_dir - - return model_name, task_name - else: - raise ValueError(f"Invalid path structure: {file_path}") - except ValueError: - raise ValueError(f"Path must contain 'results' directory: {file_path}") - - -def group_files_by_model(file_paths: list[str]) -> dict[str, list[str]]: - """Group files by model and extract task names.""" - model_tasks = defaultdict(list) - - for file_path in file_paths: - try: - model_name, task_name = extract_model_and_task_from_path(file_path) - model_tasks[model_name].append(task_name) - print(f"✓ Found: {model_name} -> {task_name}") - except ValueError as e: - print(f"⚠ Warning: Skipping {file_path}: {e}") - continue - - # Remove duplicates and sort - for model in model_tasks: - model_tasks[model] = sorted(list(set(model_tasks[model]))) - - return dict(model_tasks) - - -def create_comparison_table(model: str, tasks: list[str]) -> pd.DataFrame: - """Create comparison table for given models and tasks.""" - try: - print(f"Loading results for model: {model}") - print(f"Tasks: {tasks}") - - try: - results = mteb.load_results(models=[model]+REFERENCE_MODELS, tasks=tasks, download_latest=False) - except Exception as e: - # if model in reference don't have results on task - results = mteb.load_results(models=[model], tasks=tasks, download_latest=False) - results = results.join_revisions() - df = results.to_dataframe() - - if df.empty: - raise ValueError("No results found for the specified models and tasks") - - # Compute average per columns - model_names = [c for c in df.columns if c != "task_name"] - - # Calculate averages only for numeric columns - averages = {} - for model in model_names: - if model in df.columns: - numeric_values = pd.to_numeric(df[model], errors='coerce') - avg_value = numeric_values.mean() - averages[model] = avg_value if not pd.isna(avg_value) else None - else: - averages[model] = None - - # Add average row - avg_row = pd.DataFrame({ - "task_name": ["**Average**"], - **{model: [avg_val] for model, avg_val in averages.items()}, - }) - - df = pd.concat([df, avg_row], ignore_index=True) - return df - except Exception as e: - print(f"❌ Error creating comparison table: {e}") - raise - - -def highlight_max_bold(df: pd.DataFrame, exclude_cols=["task_name"]) -> pd.DataFrame: - """Highlight maximum values in bold for each row.""" - result_df = df.copy() - - # Format numeric values to 2 decimal places - for col in result_df.columns: - if col not in exclude_cols: - result_df[col] = result_df[col].apply( - lambda x: f"{x:.2f}" if isinstance(x, (int, float)) and pd.notna(x) else str(x) - ) - - # Create a temporary dataframe for finding max values - tmp_df = df.copy() - tmp_df = tmp_df.drop(columns=exclude_cols) - - for idx in df.index: - # Skip rows with no numeric data - numeric_row = pd.to_numeric(tmp_df.loc[idx], 
errors='coerce') - if numeric_row.isna().all(): - continue - - max_col = numeric_row.idxmax() - if pd.notna(numeric_row[max_col]): - current_value = result_df.loc[idx, max_col] - result_df.loc[idx, max_col] = f"**{current_value}**" - - return result_df - - -def generate_markdown_content(model_tasks: dict[str, list[str]]) -> str: - """Generate the complete markdown content with comparison tables.""" - - if not model_tasks: - return "# Model Results Comparison\n\nNo valid model results found." - - # Get all unique tasks across all models - all_tasks = [] - for tasks in model_tasks.values(): - all_tasks.extend(tasks) - all_tasks = sorted(list(set(all_tasks))) - - # Get all models - new_models = list(model_tasks.keys()) - - markdown_parts = [ - "# Model Results Comparison", - "", - f"**New models evaluated:** {', '.join(f'`{m}`' for m in new_models)}", - f"**Tasks:** {', '.join(f'`{t}`' for t in all_tasks)}", - "", - ] - - # Create comparison tables for each new model - for model_name, model_tasks_list in model_tasks.items(): - markdown_parts.extend([ - f"## Results for `{model_name}`", - ]) - - try: - # Compare this model against reference models - df = create_comparison_table(model_name, tasks=model_tasks_list) - bold_df = highlight_max_bold(df) - markdown_table = bold_df.to_markdown(index=False) - markdown_parts.append(markdown_table) - except Exception as e: - print(f"❌ Error generating comparison table for {model_name}: {e}") - - markdown_parts.extend(["", "---", ""]) - - return "\n".join(markdown_parts) - - -def create_argparse() -> argparse.ArgumentParser: - """Create argument parser.""" - parser = argparse.ArgumentParser( - description="Generate markdown comparison table for model results from file paths." - ) - parser.add_argument( - "files", - nargs="+", - help="List of result JSON files to process", - ) - parser.add_argument( - "--output", - type=Path, - required=True, - help="Output markdown file path (required)", - ) - return parser - - -def main(): - """Main function.""" - parser = create_argparse() - args = parser.parse_args() - - print(f"Processing {len(args.files)} files...") - - # Group files by model - try: - model_tasks = group_files_by_model(args.files) - print(f"\nFound {len(model_tasks)} models:") - for model, tasks in model_tasks.items(): - print(f" {model}: {len(tasks)} tasks ({', '.join(tasks)})") - except Exception as e: - print(f"❌ Error processing files: {e}") - raise e - - if not model_tasks: - print("❌ No valid model results found") - raise e - - # Generate markdown content - try: - markdown_content = generate_markdown_content(model_tasks) - except Exception as e: - print(f"❌ Error generating markdown: {e}") - raise e - - # Write to output file - try: - args.output.parent.mkdir(parents=True, exist_ok=True) - args.output.write_text(markdown_content) - print(f"✅ Markdown written to {args.output}") - except Exception as e: - print(f"❌ Error writing to {args.output}: {e}") - raise e - - -if __name__ == "__main__": - main() From a28be8f3583198a4ff29c4430594597bf73a9b0c Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:21:18 +0300 Subject: [PATCH 14/20] format --- scripts/create_pr_results_comment.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index bf03aae47d..9dbb93c4c7 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -113,13 +113,17 @@ def 
create_comparison_table( df[max_col_name] = None task_results = mteb.load_results(tasks=tasks, download_latest=False) task_results = task_results.join_revisions() - max_dataframe = task_results.to_dataframe(format="long").groupby(task_col_name).max() + max_dataframe = ( + task_results.to_dataframe(format="long").groupby(task_col_name).max() + ) if not max_dataframe.empty: for task_name, row in max_dataframe.iterrows(): - df.loc[df[task_col_name] == task_name, max_col_name] = row["score"] / 100 # scores are in percentage + df.loc[df[task_col_name] == task_name, max_col_name] = ( + row["score"] / 100 + ) # scores are in percentage averages: dict[str, float | None] = {} - for col in models+[max_col_name]: + for col in models + [max_col_name]: numeric = pd.to_numeric(df[col], errors="coerce") avg = numeric.mean() averages[col] = avg if not pd.isna(avg) else None From 6767970aa6d2909445d60a89ffef56bf25d170b2 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:24:16 +0300 Subject: [PATCH 15/20] fix typo --- scripts/create_pr_results_comment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index 9dbb93c4c7..d79f5cb104 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -16,7 +16,7 @@ --output: Output markdown file path (default: model-comparison.md) Example: - python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-modelm + python scripts/create_pr_results_comment.py --models intfloat/multilingual-e5-large myorg/my-new-model """ from __future__ import annotations From c66edbd3e1296d7a8c116073b8045fd59604782e Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:27:21 +0300 Subject: [PATCH 16/20] fetch main --- .github/workflows/model-results-comparison.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index d5d651f0a8..f89ca05b84 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -31,6 +31,9 @@ jobs: with: fetch-depth: 0 + - name: Fetch origin main + run: git fetch origin main + - name: Set up Python uses: actions/setup-python@v5 with: From a7ed069571514100bf33d157e9ed2cbc12f455b5 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:33:50 +0300 Subject: [PATCH 17/20] fetch main in script --- scripts/create_pr_results_comment.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index d79f5cb104..014ae69834 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -51,6 +51,13 @@ def get_diff_from_main() -> list[str]: + subprocess.run( + ["git", "fetch", "origin", "main"], + cwd=repo_path, + check=True, + text=True, + ) + current_rev, origin_rev = subprocess.run( ["git", "rev-parse", "main", "origin/main"], cwd=repo_path, From dc2d78786820aa12966432a2696ce843d3bda46d Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:38:05 +0300 Subject: [PATCH 18/20] remove revision check --- scripts/create_pr_results_comment.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git 
a/scripts/create_pr_results_comment.py b/scripts/create_pr_results_comment.py index 014ae69834..d1837ca798 100644 --- a/scripts/create_pr_results_comment.py +++ b/scripts/create_pr_results_comment.py @@ -51,26 +51,6 @@ def get_diff_from_main() -> list[str]: - subprocess.run( - ["git", "fetch", "origin", "main"], - cwd=repo_path, - check=True, - text=True, - ) - - current_rev, origin_rev = subprocess.run( - ["git", "rev-parse", "main", "origin/main"], - cwd=repo_path, - capture_output=True, - check=True, - text=True, - ).stdout.splitlines() - - if current_rev != origin_rev: - raise ValueError( - f"Your main branch is not up-to-date ({current_rev} != {origin_rev}), please run `git fetch origin main`" - ) - differences = subprocess.run( ["git", "diff", "--name-only", "origin/main...HEAD"], cwd=repo_path, From c67ecc5a189c99a02937b36cb5274c536dcd8eee Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Fri, 13 Jun 2025 14:44:11 +0300 Subject: [PATCH 19/20] fix reference models arg --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index f89ca05b84..4f1b0d117b 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -47,7 +47,7 @@ jobs: env: REFERENCE_MODELS: ${{ github.event.inputs.reference_models || 'intfloat/multilingual-e5-large google/gemini-embedding-001' }} run: | - python scripts/create_pr_results_comment.py --reference-models "$REFERENCE_MODELS" --output model-comparison.md + python scripts/create_pr_results_comment.py --reference-models $REFERENCE_MODELS --output model-comparison.md - name: Determine PR Number id: pr_info From f40d643dd00b9ead3ba2334b048c12f1b15df696 Mon Sep 17 00:00:00 2001 From: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Date: Sun, 15 Jun 2025 21:55:18 +0300 Subject: [PATCH 20/20] bump python version --- .github/workflows/model-results-comparison.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/model-results-comparison.yaml b/.github/workflows/model-results-comparison.yaml index 4f1b0d117b..8405521810 100644 --- a/.github/workflows/model-results-comparison.yaml +++ b/.github/workflows/model-results-comparison.yaml @@ -37,7 +37,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v5 with: - python-version: '3.9' + python-version: '3.10' - name: Install dependencies run: |
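
Note: the comparison flow that scripts/create_pr_results_comment.py ends up with in this series can be exercised locally with a short sketch like the one below. It is a sketch only, assuming an installed mteb plus tabulate, a checkout of this results repository, the same mteb/pandas calls the script itself uses (load_results, join_revisions, to_dataframe), and placeholder model/task names rather than results added in this PR.

# Sketch only: assumes mteb + tabulate are installed and that this is run from the
# root of the results repository, so MTEB_CACHE can point at its parent directory
# (the same convention the script uses via Path(__file__).parents[1].parent).
from __future__ import annotations

import os
from pathlib import Path

import mteb
import pandas as pd

os.environ["MTEB_CACHE"] = str(Path.cwd().parent)  # assumption: cwd is the results repo root

REFERENCE_MODELS = ["intfloat/multilingual-e5-large", "google/gemini-embedding-001"]
new_model = "myorg/my-new-model"            # placeholder, not a model added in this PR
tasks = ["MassiveIntentClassification"]     # placeholder task list

# Same call chain as create_comparison_table(): load local results, collapse revisions, pivot wide.
results = mteb.load_results(models=[new_model] + REFERENCE_MODELS, tasks=tasks, download_latest=False)
df = results.join_revisions().to_dataframe()

# Append the "**Average**" row and print the Markdown table the workflow posts as a PR comment.
score_cols = [c for c in df.columns if c != "task_name"]
averages = {c: [pd.to_numeric(df[c], errors="coerce").mean()] for c in score_cols}
df = pd.concat([df, pd.DataFrame({"task_name": ["**Average**"], **averages})], ignore_index=True)
print(df.to_markdown(index=False))

The printed table corresponds to the Markdown the workflow writes to model-comparison.md and posts with gh pr comment --body-file model-comparison.md --create-if-none --edit-last.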