CLI Tool for results dataframe on leaderboard #2454
Closed
Changes from all commits (9 commits)
f665c96  CLI Tool for results dataframe on leaderboard (ayush1298)
faeae37  Integrated script with CLI (ayush1298)
9365a5d  Address comments (ayush1298)
c53d537  make lint (ayush1298)
ac62a09  Added tests (ayush1298)
6ece2dd  Fix tests (ayush1298)
7aa541d  Checks not passing fix (ayush1298)
89db9ac  Added choices for benchmark (ayush1298)
0c30202  Merge branch 'main' into CLI_Tool (ayush1298)
New file (272 lines):
```python
from __future__ import annotations

import argparse
import logging
import os
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd

import mteb
from mteb.load_results import load_results

logger = logging.getLogger(__name__)


def get_available_benchmarks():
    """Get all available benchmark names."""
    return [b.name for b in mteb.get_benchmarks()]


def save_dataframe(
    df: pd.DataFrame,
    output_path: str,
):
    """Save a DataFrame to the specified format based on file extension.

    Args:
        df: The DataFrame to save
        output_path: Path for the output file, extension determines format

    Returns:
        str: The full path to the saved file
    """
    ext = Path(output_path).suffix.lower()
    fallback_path = str(Path(output_path).with_suffix(".csv"))

    def warn_and_fallback(reason: str):
        """Logs a warning and saves the DataFrame as CSV instead."""
        logger.warning(f"{reason}. Defaulting to CSV format: {fallback_path}")
        df.to_csv(fallback_path, index=False)
        return fallback_path

    if ext == ".csv":
        df.to_csv(output_path, index=False)
    elif ext == ".xlsx":
        try:
            df.to_excel(output_path, index=False)
        except ImportError:
            return warn_and_fallback(
                "openpyxl not installed. Please install with 'pip install mteb[xlsx]' to save as Excel."
            )
    elif ext == ".md":
        try:
            with open(output_path, "w") as f:
                f.write(df.to_markdown(index=False))
        except ImportError:
            return warn_and_fallback(
                "tabulate not installed. Please install with 'pip install mteb[markdown]' to save as Markdown."
            )
    else:
        return warn_and_fallback(
            f"Unsupported file extension: {ext}, defaulting to CSV"
        )

    return output_path
```
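As a quick illustration of the fallback behaviour, here is a minimal, hypothetical usage sketch (the file names and DataFrame contents are placeholders; it assumes the module above is importable):

```python
import pandas as pd

# Hypothetical scores table, purely to exercise save_dataframe.
df = pd.DataFrame({"task_name": ["STS12"], "my-model": [78.31]})

save_dataframe(df, "table.csv")   # written as CSV
save_dataframe(df, "table.xlsx")  # Excel if openpyxl is available, else falls back to table.csv
save_dataframe(df, "table.md")    # Markdown if tabulate is available, else falls back to table.csv
save_dataframe(df, "table.json")  # unsupported extension: warns and writes table.csv
```

In every fallback case the function returns the path it actually wrote, so callers can report the real output location.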
```python
def create_comparison_table(
    results_folder: str,
    output_path: str,
    model_names: list[str] | None = None,
    benchmark_name: str | None = None,
    aggregation_level: Literal["subset", "split", "task"] = "task",
) -> pd.DataFrame:
    """Create comparison tables for MTEB models.

    Args:
        results_folder: Path to the results folder
        output_path: Path to save the output tables
        model_names: List of model names to include (default: None, which means all available models)
        benchmark_name: Name of the benchmark (optional)
        aggregation_level: Level of aggregation for results ('subset', 'split', or 'task')
            - 'subset': Results for each subset within each split for each task
            - 'split': Results aggregated over subsets for each split for each task
            - 'task': Results aggregated over subsets and splits for each task

    Returns:
        result_df: DataFrame with aggregated results
    """
    if model_names:
        logger.info(f"Creating comparison table for models: {', '.join(model_names)}")
    else:
        logger.info("Creating comparison table for all available models")

    logger.info(f"Using aggregation level: {aggregation_level}")

    # Load results
    benchmark_results = load_results(
        results_repo=results_folder,
        only_main_score=True,
        require_model_meta=False,
        models=model_names,
    )

    # Filter by benchmark if specified
    if benchmark_name:
        logger.info(f"Filtering tasks for benchmark: {benchmark_name}")
        benchmark = next(
            (b for b in mteb.get_benchmarks() if b.name == benchmark_name), None
        )
        if not benchmark:
            raise ValueError(
                f"Benchmark '{benchmark_name}' not found. Available: {get_available_benchmarks()}"
            )

        benchmark_results_filtered = benchmark.load_results(
            base_results=benchmark_results
        ).join_revisions()
    else:
        logger.info("Using all available tasks for the specified models")
        benchmark_results_filtered = benchmark_results.join_revisions()

    # Check if we have any results
    if not benchmark_results_filtered.model_results or not any(
        model_result.task_results
        for model_result in benchmark_results_filtered.model_results
    ):
        logger.warning("No results found for the specified models and benchmark")
        return pd.DataFrame()

    # Get detailed scores
    scores_data = []
    for model_result in benchmark_results_filtered.model_results:
        model_name = model_result.model_name
        for task_result in model_result.task_results:
            task_name = task_result.task_name
            for split, scores_list in task_result.scores.items():
                for score_item in scores_list:
                    scores_data.append(
                        {
                            "model_name": model_name,
                            "task_name": task_name,
                            "split": split,
                            "subset": score_item.get("hf_subset", "default"),
                            "score": score_item.get("main_score", 0.0) * 100
                            if score_item.get("main_score", 0.0) is not None
                            else 0.0,
                        }
                    )

    if not scores_data:
        logger.warning("No scores found for the specified models and benchmark")
        return pd.DataFrame()

    scores_df = pd.DataFrame(scores_data)

    # Create the appropriate table based on aggregation level
    if aggregation_level == "subset":
        # For subset level, show raw data at task/split/subset level (no aggregation)
        pivot_df = scores_df.pivot_table(
            index=["task_name", "split", "subset"],
            columns="model_name",
            values="score",
            aggfunc="mean",
        ).reset_index()

    elif aggregation_level == "split":
        # For split level, aggregate across subsets for each task/split combination
        agg_df = (
            scores_df.groupby(["model_name", "task_name", "split"])["score"]
            .mean()
            .reset_index()
        )
        pivot_df = agg_df.pivot_table(
            index=["task_name", "split"],
            columns="model_name",
            values="score",
            aggfunc="mean",
        ).reset_index()

    elif aggregation_level == "task":
        # For task level, aggregate across both subsets and splits for each task
        agg_df = (
            scores_df.groupby(["model_name", "task_name"])["score"].mean().reset_index()
        )
        pivot_df = agg_df.pivot_table(
            index=["task_name"],
            columns="model_name",
            values="score",
            aggfunc="mean",
        ).reset_index()

    pivot_df.columns.name = None
    model_cols = [
        col for col in pivot_df.columns if col not in ["task_name", "split", "subset"]
    ]
    if model_cols:
        # Create mean row based on aggregation level
        if aggregation_level == "subset":
            # Add an empty row for overall mean
            overall_mean_row = {"task_name": "mean_score", "split": "", "subset": ""}
            for model in model_cols:
                overall_mean_row[model] = pivot_df[model].mean()
            pivot_df = pd.concat(
                [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True
            )

        elif aggregation_level == "split":
            overall_mean_row = {"task_name": "mean_score", "split": ""}
            for model in model_cols:
                overall_mean_row[model] = pivot_df[model].mean()
            pivot_df = pd.concat(
                [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True
            )

        elif aggregation_level == "task":
            # Add overall mean row
            overall_mean_row = {"task_name": "mean_score"}
            for model in model_cols:
                overall_mean_row[model] = pivot_df[model].mean()
            pivot_df = pd.concat(
                [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True
            )

    # Round scores to 2 decimal places
    numeric_columns = pivot_df.select_dtypes(include=np.number).columns
    pivot_df[numeric_columns] = pivot_df[numeric_columns].round(2)

    # Save output if path is provided
    if output_path:
        output_dir = Path(output_path).parent
        os.makedirs(output_dir, exist_ok=True)

        save_dataframe(pivot_df, output_path)
        logger.info(f"Comparison table saved to {output_path}")

    return pivot_df
```
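A hedged sketch of calling `create_comparison_table` directly from Python; the results folder, model names, and benchmark name below are placeholders and would need to match what actually exists on disk and in `get_available_benchmarks()`:

```python
# Placeholder paths, model names, and benchmark name, shown only to illustrate the call.
df = create_comparison_table(
    results_folder="results",        # local clone of a results repository
    output_path="comparison.csv",    # extension picks the output format
    model_names=["model-org/model-a", "model-org/model-b"],
    benchmark_name="MTEB(eng, v1)",  # must be one of get_available_benchmarks()
    aggregation_level="task",        # "subset", "split", or "task"
)
print(df.head())
```

With `aggregation_level="task"` the returned frame has one row per task plus a final `mean_score` row averaging each model column.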
```python
def format_table_for_display(df: pd.DataFrame) -> str:
    """Format a DataFrame for terminal display."""
    max_rows = 10
    if len(df) > max_rows:
        display_df = df.head(max_rows)
        return f"{display_df.to_string()}\n... {len(df) - max_rows} more rows"
    return df.to_string()


def create_table_cli(args: argparse.Namespace) -> pd.DataFrame:
    """Entry point for CLI integration."""
    models = [model.strip() for model in args.models] if args.models else None

    result_df = create_comparison_table(
        results_folder=args.results,
        output_path=args.output,
        model_names=models,
        benchmark_name=args.benchmark,
        aggregation_level=args.aggregation_level,
    )

    # Display table in terminal
    if not result_df.empty:
        print(
            f"\n===== COMPARISON TABLE ({args.aggregation_level.upper()} AGGREGATION) ====="
        )
        print(format_table_for_display(result_df))
    else:
        print("\nNo data available for the specified models and benchmark")

    return result_df
```
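The argument parser itself lives in mteb's CLI module rather than in this file, so the flag names below are inferred from the attributes `create_table_cli` reads (`args.results`, `args.output`, `args.models`, `args.benchmark`, `args.aggregation_level`) and should be treated as an assumption, not the actual interface:

```python
def build_parser() -> argparse.ArgumentParser:
    # Sketch of a parser satisfying the attributes create_table_cli expects;
    # the real flag names are defined in mteb's CLI, not here.
    parser = argparse.ArgumentParser(
        description="Create a comparison table from MTEB results."
    )
    parser.add_argument("--results", required=True, help="Path to the results folder")
    parser.add_argument("--output", required=True, help="Output file (.csv, .xlsx, or .md)")
    parser.add_argument("--models", nargs="+", default=None, help="Model names to include")
    parser.add_argument("--benchmark", default=None, choices=get_available_benchmarks())
    parser.add_argument(
        "--aggregation_level", default="task", choices=["subset", "split", "task"]
    )
    return parser


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    create_table_cli(build_parser().parse_args())
```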