import argparse
import logging
import os
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd

logger = logging.getLogger(__name__)

# Index columns of the comparison table for every supported aggregation level.
_INDEX_COLUMNS = {
    "subset": ["task_name", "split", "subset"],
    "split": ["task_name", "split"],
    "task": ["task_name"],
}


# --------------------------------------------------------------------------
# mteb/cli.py
# --------------------------------------------------------------------------
def add_create_table_parser(subparsers) -> None:
    r"""Register the ``create-table`` sub-command on *subparsers*.

    The sub-command builds comparison tables between models at a chosen
    aggregation level (task, split, or subset) and dispatches to
    ``create_table_cli``.

    Example::

        mteb create-table --results results/ \
            --models "intfloat/multilingual-e5-small" "intfloat/multilingual-e5-base" \
            --benchmark "MTEB(eng, v1)" \
            --aggregation-level task \
            --output comparison_table.csv

    Args:
        subparsers: The object returned by ``ArgumentParser.add_subparsers()``.
    """
    import mteb  # resolved at call time; only needed to enumerate benchmark names

    parser = subparsers.add_parser(
        "create-table", help="Create comparison tables for MTEB models"
    )

    parser.add_argument(
        "--results",
        type=str,
        default="results/",
        help="Path to fetch results from (local folder or GitHub repo URL)",
    )

    parser.add_argument(
        "--models",
        type=str,
        nargs="*",
        default=None,
        help="Models to include in the table (default: all models from results dir)",
    )

    parser.add_argument(
        "--benchmark",
        type=str,
        default=None,
        choices=[benchmark.name for benchmark in mteb.get_benchmarks()],
        help="Benchmark to use (optional). Available benchmarks can be listed with 'mteb available_benchmarks'",
    )

    parser.add_argument(
        "--aggregation-level",
        type=str,
        choices=["subset", "split", "task"],
        default="task",
        help="Level of aggregation for results (subset, split, or task)",
    )

    parser.add_argument(
        "--output",
        type=str,
        default="comparison_table.csv",
        help="Output path for the generated table (include extension: .csv, .xlsx, or .md)",
    )

    parser.set_defaults(func=create_table_cli)


# --------------------------------------------------------------------------
# mteb/create_results_table.py
# --------------------------------------------------------------------------
def get_available_benchmarks() -> list[str]:
    """Return the names of all benchmarks reported by ``mteb.get_benchmarks()``."""
    import mteb

    return [b.name for b in mteb.get_benchmarks()]


def save_dataframe(
    df: pd.DataFrame,
    output_path: str,
) -> str:
    """Save a DataFrame in the format selected by the file extension.

    ``.csv``, ``.xlsx`` and ``.md`` are supported. If the extension is not
    supported, or the optional dependency needed for the requested format
    cannot be imported, the table is written as CSV next to the requested
    path (same name, ``.csv`` suffix) and a warning is logged.

    Args:
        df: The DataFrame to save.
        output_path: Path for the output file; its extension determines the format.

    Returns:
        The path of the file that was actually written. This differs from
        ``output_path`` when the CSV fallback was used.
    """
    suffix = Path(output_path).suffix.lower()
    fallback_path = str(Path(output_path).with_suffix(".csv"))

    def warn_and_fallback(reason: str) -> str:
        """Log *reason* as a warning and save the DataFrame as CSV instead."""
        logger.warning(f"{reason} Defaulting to CSV format: {fallback_path}")
        df.to_csv(fallback_path, index=False)
        return fallback_path

    if suffix == ".csv":
        df.to_csv(output_path, index=False)
    elif suffix == ".xlsx":
        try:
            df.to_excel(output_path, index=False)
        except ImportError:
            return warn_and_fallback(
                "openpyxl not installed. Please install with 'pip install mteb[xlsx]' to save as Excel."
            )
    elif suffix == ".md":
        # Render before opening the file: if the optional dependency is missing
        # we must not leave an empty .md file behind next to the CSV fallback.
        try:
            markdown = df.to_markdown(index=False)
        except ImportError:
            return warn_and_fallback(
                "tabulate not installed. Please install with 'pip install mteb[markdown]' to save as Markdown."
            )
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(markdown)
    else:
        return warn_and_fallback(f"Unsupported file extension: {suffix!r}.")

    return output_path


def _collect_scores(benchmark_results) -> list[dict]:
    """Flatten model/task/split/subset main scores into one record per score entry."""
    records = []
    for model_result in benchmark_results.model_results:
        for task_result in model_result.task_results:
            for split, scores_list in task_result.scores.items():
                for score_item in scores_list:
                    main_score = score_item.get("main_score", 0.0)
                    records.append(
                        {
                            "model_name": model_result.model_name,
                            "task_name": task_result.task_name,
                            "split": split,
                            "subset": score_item.get("hf_subset", "default"),
                            # Scores are reported as percentages; a missing or
                            # None main score is counted as 0.0.
                            "score": 0.0 if main_score is None else main_score * 100,
                        }
                    )
    return records


def _build_comparison_frame(
    scores_df: pd.DataFrame, aggregation_level: str
) -> pd.DataFrame:
    """Pivot scores to one column per model, append a mean row and round to 2 decimals."""
    index_cols = _INDEX_COLUMNS[aggregation_level]

    # The mean over each (index columns, model) group aggregates across
    # whatever finer levels are not part of the index.
    pivot_df = scores_df.pivot_table(
        index=index_cols,
        columns="model_name",
        values="score",
        aggfunc="mean",
    ).reset_index()
    pivot_df.columns.name = None

    model_cols = [col for col in pivot_df.columns if col not in index_cols]
    if model_cols:
        # Summary row: per-model mean over all rows of the table; the
        # remaining index columns are left blank.
        mean_row = {col: "" for col in index_cols}
        mean_row["task_name"] = "mean_score"
        mean_row.update(pivot_df[model_cols].mean().to_dict())
        pivot_df = pd.concat(
            [pivot_df, pd.DataFrame([mean_row])], ignore_index=True
        )

    numeric_columns = pivot_df.select_dtypes(include=np.number).columns
    pivot_df[numeric_columns] = pivot_df[numeric_columns].round(2)
    return pivot_df


def create_comparison_table(
    results_folder: str,
    output_path: str,
    model_names: list[str] | None = None,
    benchmark_name: str | None = None,
    aggregation_level: Literal["subset", "split", "task"] = "task",
) -> pd.DataFrame:
    """Create a comparison table for MTEB models.

    Args:
        results_folder: Location of the results, forwarded to ``load_results``
            as ``results_repo``.
        output_path: Where to save the table; a falsy value skips saving.
            The extension selects the format (see ``save_dataframe``).
        model_names: Models to include (default: None, meaning all available models).
        benchmark_name: Name of the benchmark used to filter tasks (optional).
        aggregation_level: Level of aggregation for results:
            - 'subset': one row per task/split/subset
            - 'split': scores averaged over subsets, one row per task/split
            - 'task': scores averaged over subsets and splits, one row per task

    Returns:
        DataFrame with one column per model, scores scaled to percentages and
        rounded to 2 decimals, plus a trailing ``mean_score`` row. An empty
        DataFrame is returned when no results or scores are found.

    Raises:
        ValueError: If ``aggregation_level`` is not one of the supported
            levels, or ``benchmark_name`` does not match a known benchmark.
    """
    import mteb
    from mteb.load_results import load_results

    # Fail fast, before any results are loaded.
    if aggregation_level not in _INDEX_COLUMNS:
        raise ValueError(
            f"Unknown aggregation level '{aggregation_level}'. "
            f"Expected one of: {sorted(_INDEX_COLUMNS)}"
        )

    if model_names:
        logger.info(f"Creating comparison table for models: {', '.join(model_names)}")
    else:
        logger.info("Creating comparison table for all available models")

    logger.info(f"Using aggregation level: {aggregation_level}")

    # Load results
    benchmark_results = load_results(
        results_repo=results_folder,
        only_main_score=True,
        require_model_meta=False,
        models=model_names,
    )

    # Filter by benchmark if specified
    if benchmark_name:
        logger.info(f"Filtering tasks for benchmark: {benchmark_name}")
        benchmark = next(
            (b for b in mteb.get_benchmarks() if b.name == benchmark_name), None
        )
        if benchmark is None:
            raise ValueError(
                f"Benchmark '{benchmark_name}' not found. Available: {get_available_benchmarks()}"
            )
        benchmark_results_filtered = benchmark.load_results(
            base_results=benchmark_results
        ).join_revisions()
    else:
        logger.info("Using all available tasks for the specified models")
        benchmark_results_filtered = benchmark_results.join_revisions()

    # Check if we have any results
    if not benchmark_results_filtered.model_results or not any(
        model_result.task_results
        for model_result in benchmark_results_filtered.model_results
    ):
        logger.warning("No results found for the specified models and benchmark")
        return pd.DataFrame()

    scores_data = _collect_scores(benchmark_results_filtered)
    if not scores_data:
        logger.warning("No scores found for the specified models and benchmark")
        return pd.DataFrame()

    pivot_df = _build_comparison_frame(pd.DataFrame(scores_data), aggregation_level)

    # Save output if path is provided
    if output_path:
        os.makedirs(Path(output_path).parent, exist_ok=True)
        # save_dataframe may fall back to CSV, so report the path it returns.
        saved_path = save_dataframe(pivot_df, output_path)
        logger.info(f"Comparison table saved to {saved_path}")

    return pivot_df


def format_table_for_display(df: pd.DataFrame, max_rows: int = 10) -> str:
    """Format a DataFrame for terminal display.

    Args:
        df: The table to render.
        max_rows: Maximum number of rows to print; when the table is longer,
            only the first ``max_rows`` rows are shown followed by a line
            stating how many rows were left out.

    Returns:
        The rendered table as a string.
    """
    hidden_rows = len(df) - max_rows
    if hidden_rows > 0:
        return f"{df.head(max_rows).to_string()}\n... {hidden_rows} more rows"
    return df.to_string()


def create_table_cli(args: argparse.Namespace) -> pd.DataFrame:
    """Entry point for CLI integration.

    Reads ``results``, ``output``, ``models``, ``benchmark`` and
    ``aggregation_level`` from *args*, builds (and saves) the comparison
    table, and prints it to the terminal.

    Returns:
        The DataFrame produced by ``create_comparison_table``.
    """
    models = [model.strip() for model in args.models] if args.models else None

    result_df = create_comparison_table(
        results_folder=args.results,
        output_path=args.output,
        model_names=models,
        benchmark_name=args.benchmark,
        aggregation_level=args.aggregation_level,
    )

    # Display table in terminal
    if not result_df.empty:
        print(
            f"\n===== COMPARISON TABLE ({args.aggregation_level.upper()} AGGREGATION) ====="
        )
        print(format_table_for_display(result_df))
    else:
        print("\nNo data available for the specified models and benchmark")

    return result_df
@pytest.mark.parametrize(
    "results_folder",
    [
        # "tests/results",  # Local results folder
        "https://github.com/embeddings-benchmark/results",  # Remote results repository
    ],
)
@pytest.mark.parametrize("benchmark", ["MTEB(eng, v1)", "MTEB(Multilingual, v1)"])
@pytest.mark.parametrize("aggregation_level", ["subset", "split", "task"])
@pytest.mark.parametrize("output_format", ["csv", "md", "xlsx"])
def test_create_table(
    results_folder: str,
    benchmark: str,
    aggregation_level: str,
    output_format: str,
):
    """Run ``mteb create-table`` and check that the requested output file is written."""
    # Without the optional writer the CLI falls back to CSV, so the requested
    # file would never appear; skip instead of failing in that case.
    if output_format == "xlsx":
        pytest.importorskip("openpyxl", reason="openpyxl is required for .xlsx output")
    elif output_format == "md":
        pytest.importorskip("tabulate", reason="tabulate is required for .md output")

    # One file per parametrization so runs cannot observe each other's output.
    benchmark_tag = "".join(ch if ch.isalnum() else "_" for ch in benchmark)
    output_path = Path(__file__).parent / (
        f"comparison_table_test_{benchmark_tag}_{aggregation_level}.{output_format}"
    )
    models = ["intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"]

    # Argument list instead of a shell string: benchmark names contain spaces
    # and parentheses, which need no quoting this way.
    command = [
        sys.executable,
        "-m",
        "mteb",
        "create-table",
        "--results",
        results_folder,
        "--models",
        *models,
        "--benchmark",
        benchmark,
        "--aggregation-level",
        aggregation_level,
        "--output",
        str(output_path),
    ]

    # A leftover file from an earlier run must not satisfy the assertion below.
    output_path.unlink(missing_ok=True)
    try:
        result = subprocess.run(command, capture_output=True, text=True)

        # Assert the command executed successfully
        assert result.returncode == 0, f"Command failed: {result.stderr}"

        # Assert the output file was created
        assert output_path.exists(), "Output file not created"
    finally:
        # Clean up even when one of the assertions fails.
        output_path.unlink(missing_ok=True)