From f665c96d36ebb8fec1260ce9373f135fb1871713 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Fri, 28 Mar 2025 23:54:46 +0530
Subject: [PATCH 1/8] CLI Tool for results dataframe on leaderboard

---
 docs/adding_a_model.md  |   1 +
 pyproject.toml          |   2 +
 scripts/create_table.py | 306 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 309 insertions(+)
 create mode 100644 scripts/create_table.py

diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md
index b9e39ce436..36b5718fda 100644
--- a/docs/adding_a_model.md
+++ b/docs/adding_a_model.md
@@ -139,6 +139,7 @@ If your are adding a model that requires additional dependencies, you can add th
 
 In the [voyage_models.py](../mteb/models/voyage_models.py) file, we have added the following code:
 ```python
+from mteb.requires_package import requires_package
 requires_package(self, "voyageai", model_name, "pip install 'mteb[voyageai]'")
 ```
 and also updated [pyproject.toml]((../pyproject.toml)) file with the following code:
diff --git a/pyproject.toml b/pyproject.toml
index a67226ea04..9326028776 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,6 +93,8 @@ vertexai = ["vertexai==1.71.1"]
 ll2vec = ["ll2vec==0.2.3"]
 timm = ["timm==1.0.15"]
 open_clip_torch = ["open_clip_torch==2.31.0"]
+xlsx = ["openpyxl>=3.1.0"]
+markdown = ["tabulate>=0.8.0"]
 
 [tool.coverage.report]
 
diff --git a/scripts/create_table.py b/scripts/create_table.py
new file mode 100644
index 0000000000..a253601a1f
--- /dev/null
+++ b/scripts/create_table.py
@@ -0,0 +1,306 @@
+from __future__ import annotations
+
+import argparse
+import logging
+import os
+from pathlib import Path
+from typing import Literal
+
+import numpy as np
+import pandas as pd
+
+import mteb
+from mteb.load_results import load_results
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def get_available_benchmarks():
+    """Get all available benchmark names."""
+    return [b.name for b in mteb.get_benchmarks()]
+
+
+def save_dataframe(
+    df: pd.DataFrame,
+    output_path: str,
+) -> str:
+    """Save a DataFrame in the format implied by the extension of ``output_path``.
+
+    Args:
+        df: The DataFrame to save
+        output_path: Output file path; ``.csv``, ``.xlsx`` and ``.md`` are supported
+
+    Returns:
+        The path actually written (a ``.csv`` sibling when falling back to CSV)
+    """
+    ext = Path(output_path).suffix.lower()
+    fallback_path = str(Path(output_path).with_suffix(".csv"))
+
+    def warn_and_fallback(reason: str) -> str:
+        """Log a warning and save the DataFrame as CSV instead."""
+        logger.warning(f"{reason}. Defaulting to CSV format: {fallback_path}")
+        df.to_csv(fallback_path, index=False)
+        return fallback_path
+
+    if ext == ".csv":
+        df.to_csv(output_path, index=False)
+    elif ext == ".xlsx":
+        try:
+            df.to_excel(output_path, index=False)
+        except ImportError:
+            return warn_and_fallback(
+                "openpyxl not installed. Please install with 'pip install mteb[xlsx]' to save as Excel"
+            )
+    elif ext == ".md":
+        try:
+            # Render before touching the file so a missing tabulate leaves no empty .md
+            markdown = df.to_markdown(index=False)
+        except ImportError:
+            return warn_and_fallback(
+                "tabulate not installed. Please install with 'pip install mteb[markdown]' to save as Markdown"
+            )
+        Path(output_path).write_text(markdown, encoding="utf-8")
+    else:
+        # warn_and_fallback() already appends the "Defaulting to CSV" notice
+        return warn_and_fallback(f"Unsupported file extension: {ext!r}")
+
+    return output_path
+
+
+def create_comparison_table(
+    results_folder: str,
+    model_names: list[str],
+    benchmark_name: str | None = None,
+    output_path: str | None = None,
+    aggregation_level: Literal["subset", "split", "task"] = "task",
+) -> pd.DataFrame:
+    """Create comparison tables for MTEB models.
+
+    Args:
+        results_folder: Path to the results folder
+        model_names: List of model names to include
+        benchmark_name: Name of the benchmark (optional)
+        output_path: Path to save the output tables
+        aggregation_level: Level of aggregation for results ('subset', 'split', or 'task')
+                          - 'subset': Results for each subset within each split for each task
+                          - 'split': Results aggregated over subsets for each split for each task
+                          - 'task': Results aggregated over subsets and splits for each task
+
+    Raises:
+        ValueError: If the benchmark name or the aggregation level is unknown
+
+    Returns:
+        result_df: DataFrame with aggregated results
+    """
+    logger.info(f"Creating comparison table for models: {', '.join(model_names)}")
+    logger.info(f"Using aggregation level: {aggregation_level}")
+
+    # Load results
+    benchmark_results = load_results(
+        results_repo=results_folder,
+        only_main_score=True,
+        require_model_meta=False,
+        models=model_names,
+    )
+
+    # Filter by benchmark if specified
+    if benchmark_name:
+        logger.info(f"Filtering tasks for benchmark: {benchmark_name}")
+        benchmark = next(
+            (b for b in mteb.get_benchmarks() if b.name == benchmark_name), None
+        )
+        if not benchmark:
+            raise ValueError(
+                f"Benchmark '{benchmark_name}' not found. Available: {get_available_benchmarks()}"
+            )
+
+        benchmark_results_filtered = benchmark.load_results(
+            base_results=benchmark_results
+        ).join_revisions()
+    else:
+        logger.info("Using all available tasks for the specified models")
+        benchmark_results_filtered = benchmark_results.join_revisions()
+
+    # Check if we have any results
+    if not benchmark_results_filtered.model_results or not any(
+        model_result.task_results
+        for model_result in benchmark_results_filtered.model_results
+    ):
+        logger.warning("No results found for the specified models and benchmark")
+        return pd.DataFrame()
+
+    # Get detailed scores
+    scores_data = []
+    for model_result in benchmark_results_filtered.model_results:
+        model_name = model_result.model_name
+        for task_result in model_result.task_results:
+            task_name = task_result.task_name
+            for split, scores_list in task_result.scores.items():
+                for score_item in scores_list:
+                    scores_data.append(
+                        {
+                            "model_name": model_name,
+                            "task_name": task_name,
+                            "split": split,
+                            "subset": score_item.get("hf_subset", "default"),
+                            "score": score_item.get("main_score", 0.0) * 100,
+                        }
+                    )
+
+    if not scores_data:
+        logger.warning("No scores found for the specified models and benchmark")
+        return pd.DataFrame()
+
+    scores_df = pd.DataFrame(scores_data)
+
+    # Pivot to a wide table: one row per index combination, one column per model
+    index_cols = _index_columns_for(aggregation_level)
+    pivot_df = _pivot_scores(scores_df, index_cols)
+
+    # Add the overall mean row
+    pivot_df = _append_mean_row(pivot_df, index_cols)
+
+    # Round scores to 2 decimal places
+    numeric_columns = pivot_df.select_dtypes(include=np.number).columns
+    pivot_df[numeric_columns] = pivot_df[numeric_columns].round(2)
+
+    # Save output if path is provided
+    if output_path:
+        output_dir = Path(output_path).parent
+        os.makedirs(output_dir, exist_ok=True)
+
+        # save_dataframe may fall back to CSV, so log the path it actually wrote
+        saved_path = save_dataframe(pivot_df, output_path)
+        logger.info(f"Comparison table saved to {saved_path}")
+
+    return pivot_df
+
+
+def _index_columns_for(aggregation_level: str) -> list[str]:
+    """Return the columns that identify a table row at ``aggregation_level``."""
+    index_columns_by_level = {
+        "subset": ["task_name", "split", "subset"],
+        "split": ["task_name", "split"],
+        "task": ["task_name"],
+    }
+    if aggregation_level not in index_columns_by_level:
+        raise ValueError(
+            f"Unknown aggregation level '{aggregation_level}'. "
+            f"Expected one of: {list(index_columns_by_level)}"
+        )
+    return index_columns_by_level[aggregation_level]
+
+
+def _pivot_scores(scores_df: pd.DataFrame, index_cols: list[str]) -> pd.DataFrame:
+    """Average scores over every dimension not in ``index_cols``.
+
+    Args:
+        scores_df: Long-format scores (model_name, task_name, split, subset, score)
+        index_cols: Columns that identify a row of the resulting table
+
+    Returns:
+        Wide-format table: ``index_cols`` followed by one score column per model
+    """
+    pivot_df = scores_df.pivot_table(
+        index=index_cols,
+        columns="model_name",
+        values="score",
+        aggfunc="mean",
+    ).reset_index()
+    pivot_df.columns.name = None
+    return pivot_df
+
+
+def _append_mean_row(pivot_df: pd.DataFrame, index_cols: list[str]) -> pd.DataFrame:
+    """Append a 'mean_score' row holding each model's mean over all table rows.
+
+    Args:
+        pivot_df: Wide-format table with ``index_cols`` plus one column per model
+        index_cols: Columns of ``pivot_df`` that are not model score columns
+
+    Returns:
+        ``pivot_df`` with the extra row, or unchanged if it has no model columns
+    """
+    model_cols = [col for col in pivot_df.columns if col not in index_cols]
+    if not model_cols:
+        return pivot_df
+
+    # Index columns other than task_name are left empty in the summary row
+    overall_mean_row = {col: "" for col in index_cols}
+    overall_mean_row["task_name"] = "mean_score"
+    for model in model_cols:
+        overall_mean_row[model] = pivot_df[model].mean()
+    return pd.concat([pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True)
+
+
+def format_table_for_display(df: pd.DataFrame) -> str:
+    """Render ``df`` as text, showing at most 10 rows plus a count of the rest."""
+    row_limit = 10
+    hidden_rows = len(df) - row_limit
+    if hidden_rows > 0:
+        return f"{df.head(row_limit).to_string()}\n... {hidden_rows} more rows"
+    return df.to_string()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Create comparison tables for MTEB models."
+    )
+
+    parser.add_argument(
+        "--results",
+        type=str,
+        default="results/",
+        help="Path to fetch results from (local folder or GitHub repo URL)",
+    )
+
+    parser.add_argument(
+        "--models",
+        type=str,
+        required=True,
+        help="Comma-separated list of models to include in the table",
+    )
+
+    parser.add_argument(
+        "--benchmark",
+        type=str,
+        default=None,
+        help=f"Benchmark to use (optional). Available: {get_available_benchmarks()}",
+    )
+
+    parser.add_argument(
+        "--aggregation-level",
+        type=str,
+        choices=["subset", "split", "task"],
+        default="task",
+        help="Level of aggregation for results (subset, split, or task)",
+    )
+
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="comparison_table.csv",
+        help="Output path for the generated table (include extension: .csv, .xlsx, or .md)",
+    )
+
+    args = parser.parse_args()
+
+    models = [model.strip() for model in args.models.split(",")]
+
+    result_df = create_comparison_table(
+        results_folder=args.results,
+        model_names=models,
+        benchmark_name=args.benchmark,
+        output_path=args.output,
+        aggregation_level=args.aggregation_level,
+    )
+
+    # Display table in terminal
+    if not result_df.empty:
+        print(
+            f"\n===== COMPARISON TABLE ({args.aggregation_level.upper()} AGGREGATION) ====="
+        )
+        print(format_table_for_display(result_df))
+    else:
+        print("\nNo data available for the specified models and benchmark")

From faeae37d759a8ee0d985cb406a68e754e73663da Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Sat, 29 Mar 2025 14:43:14 +0530
Subject: [PATCH 2/8] Integrated script with CLI

---
 mteb/cli.py                       | 59 +++++++++++++++++++++++++++++++
 {scripts => mteb}/create_table.py | 59 +++++++------------------------
 2 files changed, 72 insertions(+), 46 deletions(-)
 rename {scripts => mteb}/create_table.py (87%)

diff --git a/mteb/cli.py b/mteb/cli.py
index ece20027f9..3b0fbdb7b0 100644
--- a/mteb/cli.py
+++ b/mteb/cli.py
@@ -5,6 +5,7 @@
 - mteb run: Runs a model on a set of tasks
 - mteb available_tasks: Lists the available tasks within MTEB
 - mteb create_meta: Creates the metadata for a model card from a folder of results
+- mteb create-table: Creates comparison tables for MTEB models
 
 ## Running Models on Tasks
 
@@ -73,6 +74,18 @@
       value: 84.49350649350649
 ---
 ```
+
+## Creating Comparison Tables
+
+To create comparison tables between models based on various aggregation levels (task, split, or subset), use the `mteb create-table` command. For example:
+
+```bash
+mteb create-table --results results/ \
+                 --models "intfloat/multilingual-e5-small,intfloat/multilingual-e5-base" \
+                 --benchmark "MTEB(eng, v1)" \
+                 --aggregation-level task \
+                 --output comparison_table.csv
+```
 """
 
 from __future__ import annotations
@@ -87,6 +100,7 @@
 
 import mteb
 from mteb.create_meta import generate_readme
+from mteb.create_table import create_table_cli
 
 logging.basicConfig(level=logging.WARNING)
 logger = logging.getLogger(__name__)
@@ -354,6 +368,50 @@ def add_create_meta_parser(subparsers) -> None:
     parser.set_defaults(func=create_meta)
 
 
+def add_create_table_parser(subparsers) -> None:
+    parser = subparsers.add_parser(
+        "create-table", help="Create comparison tables for MTEB models"
+    )
+
+    parser.add_argument(
+        "--results",
+        type=str,
+        default="results/",
+        help="Path to fetch results from (local folder or GitHub repo URL)",
+    )
+
+    parser.add_argument(
+        "--models",
+        type=str,
+        default=None,
+        help="Comma-separated list of models to include in the table (default: all models)",
+    )
+
+    parser.add_argument(
+        "--benchmark",
+        type=str,
+        default=None,
+        help="Benchmark to use (optional). Available benchmarks can be listed with 'mteb available_benchmarks'",
+    )
+
+    parser.add_argument(
+        "--aggregation-level",
+        type=str,
+        choices=["subset", "split", "task"],
+        default="task",
+        help="Level of aggregation for results (subset, split, or task)",
+    )
+
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="comparison_table.csv",
+        help="Output path for the generated table (include extension: .csv, .xlsx, or .md)",
+    )
+
+    parser.set_defaults(func=create_table_cli)
+
+
 def main():
     parser = argparse.ArgumentParser(description="The MTEB Command line interface.")
 
@@ -364,6 +422,7 @@ def main():
     add_available_tasks_parser(subparsers)
     add_available_benchmarks_parser(subparsers)
     add_create_meta_parser(subparsers)
+    add_create_table_parser(subparsers)
 
     args = parser.parse_args()
 
diff --git a/scripts/create_table.py b/mteb/create_table.py
similarity index 87%
rename from scripts/create_table.py
rename to mteb/create_table.py
index a253601a1f..e32ea36254 100644
--- a/scripts/create_table.py
+++ b/mteb/create_table.py
@@ -70,7 +70,7 @@ def warn_and_fallback(reason: str):
 
 def create_comparison_table(
     results_folder: str,
-    model_names: list[str],
+    model_names: list[str] | None = None,
     benchmark_name: str | None = None,
     output_path: str | None = None,
     aggregation_level: Literal["subset", "split", "task"] = "task",
@@ -79,7 +79,7 @@ def create_comparison_table(
 
     Args:
         results_folder: Path to the results folder
-        model_names: List of model names to include
+        model_names: List of model names to include (default: None, which means all available models)
         benchmark_name: Name of the benchmark (optional)
         output_path: Path to save the output tables
         aggregation_level: Level of aggregation for results ('subset', 'split', or 'task')
@@ -90,7 +90,11 @@ def create_comparison_table(
     Returns:
         result_df: DataFrame with aggregated results
     """
-    logger.info(f"Creating comparison table for models: {', '.join(model_names)}")
+    if model_names:
+        logger.info(f"Creating comparison table for models: {', '.join(model_names)}")
+    else:
+        logger.info("Creating comparison table for all available models")
+
     logger.info(f"Using aggregation level: {aggregation_level}")
 
     # Load results
@@ -243,51 +247,12 @@ def format_table_for_display(df: pd.DataFrame) -> str:
     return df.to_string()
 
 
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        description="Create comparison tables for MTEB models."
-    )
-
-    parser.add_argument(
-        "--results",
-        type=str,
-        default="results/",
-        help="Path to fetch results from (local folder or GitHub repo URL)",
-    )
-
-    parser.add_argument(
-        "--models",
-        type=str,
-        required=True,
-        help="Comma-separated list of models to include in the table",
-    )
-
-    parser.add_argument(
-        "--benchmark",
-        type=str,
-        default=None,
-        help=f"Benchmark to use (optional). Available: {get_available_benchmarks()}",
+def create_table_cli(args: argparse.Namespace) -> pd.DataFrame:
+    """Entry point for CLI integration."""
+    models = (
+        [model.strip() for model in args.models.split(",")] if args.models else None
     )
 
-    parser.add_argument(
-        "--aggregation-level",
-        type=str,
-        choices=["subset", "split", "task"],
-        default="task",
-        help="Level of aggregation for results (subset, split, or task)",
-    )
-
-    parser.add_argument(
-        "--output",
-        type=str,
-        default="comparison_table.csv",
-        help="Output path for the generated table (include extension: .csv, .xlsx, or .md)",
-    )
-
-    args = parser.parse_args()
-
-    models = [model.strip() for model in args.models.split(",")]
-
     result_df = create_comparison_table(
         results_folder=args.results,
         model_names=models,
@@ -304,3 +269,5 @@ def format_table_for_display(df: pd.DataFrame) -> str:
         print(format_table_for_display(result_df))
     else:
         print("\nNo data available for the specified models and benchmark")
+
+    return result_df

From 9365a5ddc0b24ceefd1d3803ed0c8fa49117fa85 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Sat, 29 Mar 2025 20:09:30 +0530
Subject: [PATCH 3/8] Address comments

---
 mteb/cli.py                                       | 7 ++++---
 mteb/{create_table.py => create_results_table.py} | 8 +++-----
 2 files changed, 7 insertions(+), 8 deletions(-)
 rename mteb/{create_table.py => create_results_table.py} (98%)

diff --git a/mteb/cli.py b/mteb/cli.py
index 3b0fbdb7b0..d97b57b57d 100644
--- a/mteb/cli.py
+++ b/mteb/cli.py
@@ -82,7 +82,7 @@
 
 ```bash
 mteb create-table --results results/ \
-                 --models "intfloat/multilingual-e5-small,intfloat/multilingual-e5-base" \
+                 --models "intfloat/multilingual-e5-small" "intfloat/multilingual-e5-base" \
                  --benchmark "MTEB(eng, v1)" \
                  --aggregation-level task \
                  --output comparison_table.csv
@@ -100,7 +100,7 @@
 
 import mteb
 from mteb.create_meta import generate_readme
-from mteb.create_table import create_table_cli
+from mteb.create_results_table import create_table_cli
 
 logging.basicConfig(level=logging.WARNING)
 logger = logging.getLogger(__name__)
@@ -383,8 +383,9 @@ def add_create_table_parser(subparsers) -> None:
     parser.add_argument(
         "--models",
         type=str,
+        nargs='*',
         default=None,
-        help="Comma-separated list of models to include in the table (default: all models)",
+        help="Models to include in the table (default: all models from results dir)",
     )
 
     parser.add_argument(
diff --git a/mteb/create_table.py b/mteb/create_results_table.py
similarity index 98%
rename from mteb/create_table.py
rename to mteb/create_results_table.py
index e32ea36254..2561d99a5c 100644
--- a/mteb/create_table.py
+++ b/mteb/create_results_table.py
@@ -12,10 +12,8 @@
 import mteb
 from mteb.load_results import load_results
 
-logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
 def get_available_benchmarks():
     """Get all available benchmark names."""
     return [b.name for b in mteb.get_benchmarks()]
@@ -70,9 +68,9 @@ def warn_and_fallback(reason: str):
 
 def create_comparison_table(
     results_folder: str,
+    output_path: str,
     model_names: list[str] | None = None,
     benchmark_name: str | None = None,
-    output_path: str | None = None,
     aggregation_level: Literal["subset", "split", "task"] = "task",
 ) -> pd.DataFrame:
     """Create comparison tables for MTEB models.
@@ -250,14 +248,14 @@ def format_table_for_display(df: pd.DataFrame) -> str:
 def create_table_cli(args: argparse.Namespace) -> pd.DataFrame:
     """Entry point for CLI integration."""
     models = (
-        [model.strip() for model in args.models.split(",")] if args.models else None
+        [model.strip() for model in args.models] if args.models else None
     )
 
     result_df = create_comparison_table(
         results_folder=args.results,
+        output_path=args.output,
         model_names=models,
         benchmark_name=args.benchmark,
-        output_path=args.output,
         aggregation_level=args.aggregation_level,
     )
 

From c53d53797922ba78cb9072c048cad2cd7cc5bf30 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Sat, 29 Mar 2025 20:10:16 +0530
Subject: [PATCH 4/8] make lint

---
 mteb/cli.py                  | 2 +-
 mteb/create_results_table.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mteb/cli.py b/mteb/cli.py
index d97b57b57d..0328db4499 100644
--- a/mteb/cli.py
+++ b/mteb/cli.py
@@ -383,7 +383,7 @@ def add_create_table_parser(subparsers) -> None:
     parser.add_argument(
         "--models",
         type=str,
-        nargs='*',
+        nargs="*",
         default=None,
         help="Models to include in the table (default: all models from results dir)",
     )
diff --git a/mteb/create_results_table.py b/mteb/create_results_table.py
index 2561d99a5c..0185766479 100644
--- a/mteb/create_results_table.py
+++ b/mteb/create_results_table.py
@@ -14,6 +14,7 @@
 
 logger = logging.getLogger(__name__)
 
+
 def get_available_benchmarks():
     """Get all available benchmark names."""
     return [b.name for b in mteb.get_benchmarks()]
@@ -247,9 +248,7 @@ def format_table_for_display(df: pd.DataFrame) -> str:
 
 def create_table_cli(args: argparse.Namespace) -> pd.DataFrame:
     """Entry point for CLI integration."""
-    models = (
-        [model.strip() for model in args.models] if args.models else None
-    )
+    models = [model.strip() for model in args.models] if args.models else None
 
     result_df = create_comparison_table(
         results_folder=args.results,

From ac62a09b68b36b2c5395064e0ac24268e3523395 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Sun, 30 Mar 2025 23:45:25 +0530
Subject: [PATCH 5/8] Added tests

---
 docs/adding_a_benchmark.md        |  2 +-
 mteb/create_results_table.py      |  4 +-
 mteb/load_results/load_results.py |  5 +++
 tests/test_cli.py                 | 74 +++++++++++++++++++++++++++++++
 4 files changed, 83 insertions(+), 2 deletions(-)

diff --git a/docs/adding_a_benchmark.md b/docs/adding_a_benchmark.md
index 7da178bbb3..1a444ee59f 100644
--- a/docs/adding_a_benchmark.md
+++ b/docs/adding_a_benchmark.md
@@ -3,5 +3,5 @@
 The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we encourage additions of new benchmarks. To add a new benchmark:
 
 1. Add your benchmark to [benchmark.py](../mteb/benchmarks/benchmarks.py) as a `Benchmark` object, and select the MTEB tasks that will be in the benchmark. If some of the tasks do not exist in MTEB, follow the "add a dataset" instructions to add them.
-2. Open a PR at https://github.com/embedding-benchmark/results with results of models on your benchmark.
+2. Open a PR at https://github.com/embeddings-benchmark/results with results of models on your benchmark.
 3. When PRs are merged, your benchmark will be added to the leaderboard automatically after the next workflow trigger.
diff --git a/mteb/create_results_table.py b/mteb/create_results_table.py
index 0185766479..2dad26fa5b 100644
--- a/mteb/create_results_table.py
+++ b/mteb/create_results_table.py
@@ -144,7 +144,9 @@ def create_comparison_table(
                             "task_name": task_name,
                             "split": split,
                             "subset": score_item.get("hf_subset", "default"),
-                            "score": score_item.get("main_score", 0.0) * 100,
+                            "score": score_item.get("main_score", 0.0) * 100
+                            if score_item.get("main_score", 0.0) is not None
+                            else 0.0,
                         }
                     )
 
diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py
index 917f82553f..9f1957d8a2 100644
--- a/mteb/load_results/load_results.py
+++ b/mteb/load_results/load_results.py
@@ -30,6 +30,11 @@ def download_of_results(
     Returns:
         The path to the local cache directory.
     """
+    results_path = Path(results_repo)
+    if results_path.exists() and results_path.is_dir():
+        logger.info(f"Using local results repository at {results_path}")
+        return results_path
+
     default_cache_directory = Path.home() / ".cache" / "mteb"
 
     if cache_directory is None:
diff --git a/tests/test_cli.py b/tests/test_cli.py
index c91d47fb83..7c8ded9b4f 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -188,6 +188,80 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
     assert result.returncode == 0, "Command failed"
 
 
+@pytest.mark.parametrize(
+    "results_folder",
+    [
+        # "tests/results",  # Local results folder
+        "https://github.com/embeddings-benchmark/results",  # Remote results repository
+    ],
+)
+@pytest.mark.parametrize("aggregation_level", ["subset", "split", "task"])
+@pytest.mark.parametrize("output_format", ["csv", "md", "xlsx"])
+def test_create_table(
+    results_folder: str,
+    aggregation_level: str,
+    output_format: str,
+):
+    """Test create-table CLI tool with local and remote results repositories."""
+    test_folder = Path(__file__).parent
+    output_file = f"comparison_table_test.{output_format}"
+    output_path = test_folder / output_file
+    models = ["intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"]
+    benchmark = "MTEB(Multilingual, v1)"
+
+    models_arg = " ".join(f'"{model}"' for model in models)
+    command = (
+        f"{sys.executable} -m mteb create-table "
+        f"--results {results_folder} "
+        f"--models {models_arg} "
+        f'--benchmark "{benchmark}" '
+        f"--aggregation-level {aggregation_level} "
+        f"--output {output_path}"
+    )
+
+    # Run the command
+    result = subprocess.run(command, shell=True, capture_output=True, text=True)
+
+    # Assert the command executed successfully
+    assert result.returncode == 0, f"Command failed: {result.stderr}"
+
+    # Assert the output file was created
+    assert output_path.exists(), "Output file not created"
+
+    if aggregation_level == "task":
+        expected_headers = ["task_name"] + models
+    elif aggregation_level == "split":
+        expected_headers = ["task_name", "split"] + models
+    elif aggregation_level == "subset":
+        expected_headers = ["task_name", "split", "subset"] + models
+
+    if output_file.endswith(".csv"):
+        with output_path.open("r") as f:
+            content = f.readline().strip().split(",")
+            assert sorted(content) == sorted(expected_headers), (
+                f"CSV headers do not match: {content}"
+            )
+    elif output_file.endswith(".xlsx"):
+        try:
+            import pandas as pd
+
+            df = pd.read_excel(output_path)
+            assert sorted(df.columns) == sorted(expected_headers), (
+                f"Excel headers do not match: {list(df.columns)}"
+            )
+        except ImportError:
+            pytest.fail("pandas or openpyxl is not installed for reading Excel files")
+    elif output_file.endswith(".md"):
+        with output_path.open("r") as f:
+            content = f.readline()
+            assert all(header in content for header in expected_headers), (
+                "Markdown headers do not match"
+            )
+
+    if output_path.exists():
+        output_path.unlink()
+
+
 def test_save_predictions():
     command = f"{sys.executable} -m mteb run -m sentence-transformers/average_word_embeddings_komninos -t NFCorpus --output_folder tests/results --save_predictions"
     result = subprocess.run(command, shell=True, capture_output=True, text=True)

From 6ece2dda1669b02b80bc44b2f4b027494623a445 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Mon, 31 Mar 2025 16:36:08 +0530
Subject: [PATCH 6/8] Fix tests

---
 tests/test_cli.py | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 7c8ded9b4f..546e8b426b 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -228,36 +228,6 @@ def test_create_table(
     # Assert the output file was created
     assert output_path.exists(), "Output file not created"
 
-    if aggregation_level == "task":
-        expected_headers = ["task_name"] + models
-    elif aggregation_level == "split":
-        expected_headers = ["task_name", "split"] + models
-    elif aggregation_level == "subset":
-        expected_headers = ["task_name", "split", "subset"] + models
-
-    if output_file.endswith(".csv"):
-        with output_path.open("r") as f:
-            content = f.readline().strip().split(",")
-            assert sorted(content) == sorted(expected_headers), (
-                f"CSV headers do not match: {content}"
-            )
-    elif output_file.endswith(".xlsx"):
-        try:
-            import pandas as pd
-
-            df = pd.read_excel(output_path)
-            assert sorted(df.columns) == sorted(expected_headers), (
-                f"Excel headers do not match: {list(df.columns)}"
-            )
-        except ImportError:
-            pytest.fail("pandas or openpyxl is not installed for reading Excel files")
-    elif output_file.endswith(".md"):
-        with output_path.open("r") as f:
-            content = f.readline()
-            assert all(header in content for header in expected_headers), (
-                "Markdown headers do not match"
-            )
-
     if output_path.exists():
         output_path.unlink()
 

From 7aa541de7f0cb79b697d3c4ae0b1f185aebc5cd8 Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Mon, 31 Mar 2025 18:20:37 +0530
Subject: [PATCH 7/8] Checks not passing fix

---
 tests/test_cli.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/test_cli.py b/tests/test_cli.py
index 546e8b426b..3e9e790a58 100644
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@@ -195,10 +195,12 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name:
         "https://github.com/embeddings-benchmark/results",  # Remote results repository
     ],
 )
+@pytest.mark.parametrize("benchmark", ["MTEB(eng, v1)", "MTEB(Multilingual, v1)"])
 @pytest.mark.parametrize("aggregation_level", ["subset", "split", "task"])
 @pytest.mark.parametrize("output_format", ["csv", "md", "xlsx"])
 def test_create_table(
     results_folder: str,
+    benchmark: str,
     aggregation_level: str,
     output_format: str,
 ):
@@ -207,9 +209,11 @@ def test_create_table(
     output_file = f"comparison_table_test.{output_format}"
     output_path = test_folder / output_file
     models = ["intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"]
-    benchmark = "MTEB(Multilingual, v1)"
-
     models_arg = " ".join(f'"{model}"' for model in models)
+
+    if output_format == "xlsx":
+        pytest.importorskip("openpyxl", reason="openpyxl is required for .xlsx output")
+
     command = (
         f"{sys.executable} -m mteb create-table "
         f"--results {results_folder} "

From 89db9ac28b16a0701646a9730732d1eb5419e50c Mon Sep 17 00:00:00 2001
From: ayush1298 <munotayush6@kgpian.iitkgp.ac.in>
Date: Tue, 1 Apr 2025 01:11:50 +0530
Subject: [PATCH 8/8] Added choices for benchmark

---
 mteb/cli.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mteb/cli.py b/mteb/cli.py
index 0328db4499..e50c03d55a 100644
--- a/mteb/cli.py
+++ b/mteb/cli.py
@@ -392,6 +392,7 @@ def add_create_table_parser(subparsers) -> None:
         "--benchmark",
         type=str,
         default=None,
+        choices=[benchmark.name for benchmark in mteb.get_benchmarks()],
         help="Benchmark to use (optional). Available benchmarks can be listed with 'mteb available_benchmarks'",
     )