From f665c96d36ebb8fec1260ce9373f135fb1871713 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Fri, 28 Mar 2025 23:54:46 +0530 Subject: [PATCH 1/8] CLI Tool for results dataframe on leaderboard --- docs/adding_a_model.md | 1 + pyproject.toml | 2 + scripts/create_table.py | 306 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 309 insertions(+) create mode 100644 scripts/create_table.py diff --git a/docs/adding_a_model.md b/docs/adding_a_model.md index b9e39ce436..36b5718fda 100644 --- a/docs/adding_a_model.md +++ b/docs/adding_a_model.md @@ -139,6 +139,7 @@ If your are adding a model that requires additional dependencies, you can add th In the [voyage_models.py](../mteb/models/voyage_models.py) file, we have added the following code: ```python +from mteb.requires_package import requires_package requires_package(self, "voyageai", model_name, "pip install 'mteb[voyageai]'") ``` and also updated [pyproject.toml]((../pyproject.toml)) file with the following code: diff --git a/pyproject.toml b/pyproject.toml index a67226ea04..9326028776 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,6 +93,8 @@ vertexai = ["vertexai==1.71.1"] ll2vec = ["ll2vec==0.2.3"] timm = ["timm==1.0.15"] open_clip_torch = ["open_clip_torch==2.31.0"] +xlsx = ["openpyxl>=3.1.0"] +markdown = ["tabulate>=0.8.0"] [tool.coverage.report] diff --git a/scripts/create_table.py b/scripts/create_table.py new file mode 100644 index 0000000000..a253601a1f --- /dev/null +++ b/scripts/create_table.py @@ -0,0 +1,306 @@ +from __future__ import annotations + +import argparse +import logging +import os +from pathlib import Path +from typing import Literal + +import numpy as np +import pandas as pd + +import mteb +from mteb.load_results import load_results + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + + +def get_available_benchmarks(): + """Get all available benchmark names.""" + return [b.name for b in mteb.get_benchmarks()] + + +def save_dataframe( + df: pd.DataFrame, + output_path: str, +): + """Save a DataFrame to the specified format based on file extension. + + Args: + df: The DataFrame to save + output_path: Path for the output file, extension determines format + + Returns: + str: The full path to the saved file + """ + ext = Path(output_path).suffix.lower() + fallback_path = str(Path(output_path).with_suffix(".csv")) + + def warn_and_fallback(reason: str): + """Logs a warning and saves the DataFrame as CSV instead.""" + logger.warning(f"{reason}. Defaulting to CSV format: {fallback_path}") + df.to_csv(fallback_path, index=False) + return fallback_path + + if ext == ".csv": + df.to_csv(output_path, index=False) + elif ext == ".xlsx": + try: + df.to_excel(output_path, index=False) + except ImportError: + return warn_and_fallback( + "openpyxl not installed. Please install with 'pip install mteb[xlsx]' to save as Excel." + ) + elif ext == ".md": + try: + with open(output_path, "w") as f: + f.write(df.to_markdown(index=False)) + except ImportError: + return warn_and_fallback( + "tabulate not installed. Please install with 'pip install mteb[markdown]' to save as Markdown." + ) + else: + return warn_and_fallback( + f"Unsupported file extension: {ext}, defaulting to CSV" + ) + + return output_path + + +def create_comparison_table( + results_folder: str, + model_names: list[str], + benchmark_name: str | None = None, + output_path: str | None = None, + aggregation_level: Literal["subset", "split", "task"] = "task", +) -> pd.DataFrame: + """Create comparison tables for MTEB models. + + Args: + results_folder: Path to the results folder + model_names: List of model names to include + benchmark_name: Name of the benchmark (optional) + output_path: Path to save the output tables + aggregation_level: Level of aggregation for results ('subset', 'split', or 'task') + - 'subset': Results for each subset within each split for each task + - 'split': Results aggregated over subsets for each split for each task + - 'task': Results aggregated over subsets and splits for each task + + Returns: + result_df: DataFrame with aggregated results + """ + logger.info(f"Creating comparison table for models: {', '.join(model_names)}") + logger.info(f"Using aggregation level: {aggregation_level}") + + # Load results + benchmark_results = load_results( + results_repo=results_folder, + only_main_score=True, + require_model_meta=False, + models=model_names, + ) + + # Filter by benchmark if specified + if benchmark_name: + logger.info(f"Filtering tasks for benchmark: {benchmark_name}") + benchmark = next( + (b for b in mteb.get_benchmarks() if b.name == benchmark_name), None + ) + if not benchmark: + raise ValueError( + f"Benchmark '{benchmark_name}' not found. Available: {get_available_benchmarks()}" + ) + + benchmark_results_filtered = benchmark.load_results( + base_results=benchmark_results + ).join_revisions() + else: + logger.info("Using all available tasks for the specified models") + benchmark_results_filtered = benchmark_results.join_revisions() + + # Check if we have any results + if not benchmark_results_filtered.model_results or not any( + model_result.task_results + for model_result in benchmark_results_filtered.model_results + ): + logger.warning("No results found for the specified models and benchmark") + return pd.DataFrame() + + # Get detailed scores + scores_data = [] + for model_result in benchmark_results_filtered.model_results: + model_name = model_result.model_name + for task_result in model_result.task_results: + task_name = task_result.task_name + for split, scores_list in task_result.scores.items(): + for score_item in scores_list: + scores_data.append( + { + "model_name": model_name, + "task_name": task_name, + "split": split, + "subset": score_item.get("hf_subset", "default"), + "score": score_item.get("main_score", 0.0) * 100, + } + ) + + if not scores_data: + logger.warning("No scores found for the specified models and benchmark") + return pd.DataFrame() + + scores_df = pd.DataFrame(scores_data) + + # Create the appropriate table based on aggregation level + if aggregation_level == "subset": + # For subset level, show raw data at task/split/subset level (no aggregation) + pivot_df = scores_df.pivot_table( + index=["task_name", "split", "subset"], + columns="model_name", + values="score", + aggfunc="mean", + ).reset_index() + + elif aggregation_level == "split": + # For split level, aggregate across subsets for each task/split combination + agg_df = ( + scores_df.groupby(["model_name", "task_name", "split"])["score"] + .mean() + .reset_index() + ) + pivot_df = agg_df.pivot_table( + index=["task_name", "split"], + columns="model_name", + values="score", + aggfunc="mean", + ).reset_index() + + elif aggregation_level == "task": + # For task level, aggregate across both subsets and splits for each task + agg_df = ( + scores_df.groupby(["model_name", "task_name"])["score"].mean().reset_index() + ) + pivot_df = agg_df.pivot_table( + index=["task_name"], + columns="model_name", + values="score", + aggfunc="mean", + ).reset_index() + + pivot_df.columns.name = None + model_cols = [ + col for col in pivot_df.columns if col not in ["task_name", "split", "subset"] + ] + if model_cols: + # Create mean row based on aggregation level + if aggregation_level == "subset": + # Add an empty row for overall mean + overall_mean_row = {"task_name": "mean_score", "split": "", "subset": ""} + for model in model_cols: + overall_mean_row[model] = pivot_df[model].mean() + pivot_df = pd.concat( + [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True + ) + + elif aggregation_level == "split": + overall_mean_row = {"task_name": "mean_score", "split": ""} + for model in model_cols: + overall_mean_row[model] = pivot_df[model].mean() + pivot_df = pd.concat( + [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True + ) + + elif aggregation_level == "task": + # Add overall mean row + overall_mean_row = {"task_name": "mean_score"} + for model in model_cols: + overall_mean_row[model] = pivot_df[model].mean() + pivot_df = pd.concat( + [pivot_df, pd.DataFrame([overall_mean_row])], ignore_index=True + ) + + # Round scores to 2 decimal places + numeric_columns = pivot_df.select_dtypes(include=np.number).columns + pivot_df[numeric_columns] = pivot_df[numeric_columns].round(2) + + # Save output if path is provided + if output_path: + output_dir = Path(output_path).parent + os.makedirs(output_dir, exist_ok=True) + + save_dataframe(pivot_df, output_path) + logger.info(f"Comparison table saved to {output_path}") + + return pivot_df + + +def format_table_for_display(df: pd.DataFrame) -> str: + """Format a DataFrame for terminal display.""" + max_rows = 10 + if len(df) > max_rows: + display_df = df.head(max_rows) + return f"{display_df.to_string()}\n... {len(df) - max_rows} more rows" + return df.to_string() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Create comparison tables for MTEB models." + ) + + parser.add_argument( + "--results", + type=str, + default="results/", + help="Path to fetch results from (local folder or GitHub repo URL)", + ) + + parser.add_argument( + "--models", + type=str, + required=True, + help="Comma-separated list of models to include in the table", + ) + + parser.add_argument( + "--benchmark", + type=str, + default=None, + help=f"Benchmark to use (optional). Available: {get_available_benchmarks()}", + ) + + parser.add_argument( + "--aggregation-level", + type=str, + choices=["subset", "split", "task"], + default="task", + help="Level of aggregation for results (subset, split, or task)", + ) + + parser.add_argument( + "--output", + type=str, + default="comparison_table.csv", + help="Output path for the generated table (include extension: .csv, .xlsx, or .md)", + ) + + args = parser.parse_args() + + models = [model.strip() for model in args.models.split(",")] + + result_df = create_comparison_table( + results_folder=args.results, + model_names=models, + benchmark_name=args.benchmark, + output_path=args.output, + aggregation_level=args.aggregation_level, + ) + + # Display table in terminal + if not result_df.empty: + print( + f"\n===== COMPARISON TABLE ({args.aggregation_level.upper()} AGGREGATION) =====" + ) + print(format_table_for_display(result_df)) + else: + print("\nNo data available for the specified models and benchmark") From faeae37d759a8ee0d985cb406a68e754e73663da Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Sat, 29 Mar 2025 14:43:14 +0530 Subject: [PATCH 2/8] Integrated script with CLI --- mteb/cli.py | 59 +++++++++++++++++++++++++++++++ {scripts => mteb}/create_table.py | 59 +++++++------------------------ 2 files changed, 72 insertions(+), 46 deletions(-) rename {scripts => mteb}/create_table.py (87%) diff --git a/mteb/cli.py b/mteb/cli.py index ece20027f9..3b0fbdb7b0 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -5,6 +5,7 @@ - mteb run: Runs a model on a set of tasks - mteb available_tasks: Lists the available tasks within MTEB - mteb create_meta: Creates the metadata for a model card from a folder of results +- mteb create-table: Creates comparison tables for MTEB models ## Running Models on Tasks @@ -73,6 +74,18 @@ value: 84.49350649350649 --- ``` + + +## Creating Comparison Tables + +To create comparison tables between models based on various aggregation levels (task, split, or subset), use the `mteb create-table` command. For example: + +```bash +mteb create-table --results results/ \ + --models "intfloat/multilingual-e5-small,intfloat/multilingual-e5-base" \ + --benchmark "MTEB(eng, v1)" \ + --aggregation-level task \ + --output comparison_table.csv """ from __future__ import annotations @@ -87,6 +100,7 @@ import mteb from mteb.create_meta import generate_readme +from mteb.create_table import create_table_cli logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) @@ -354,6 +368,50 @@ def add_create_meta_parser(subparsers) -> None: parser.set_defaults(func=create_meta) +def add_create_table_parser(subparsers) -> None: + parser = subparsers.add_parser( + "create-table", help="Create comparison tables for MTEB models" + ) + + parser.add_argument( + "--results", + type=str, + default="results/", + help="Path to fetch results from (local folder or GitHub repo URL)", + ) + + parser.add_argument( + "--models", + type=str, + default=None, + help="Comma-separated list of models to include in the table (default: all models)", + ) + + parser.add_argument( + "--benchmark", + type=str, + default=None, + help="Benchmark to use (optional). Available benchmarks can be listed with 'mteb available_benchmarks'", + ) + + parser.add_argument( + "--aggregation-level", + type=str, + choices=["subset", "split", "task"], + default="task", + help="Level of aggregation for results (subset, split, or task)", + ) + + parser.add_argument( + "--output", + type=str, + default="comparison_table.csv", + help="Output path for the generated table (include extension: .csv, .xlsx, or .md)", + ) + + parser.set_defaults(func=create_table_cli) + + def main(): parser = argparse.ArgumentParser(description="The MTEB Command line interface.") @@ -364,6 +422,7 @@ def main(): add_available_tasks_parser(subparsers) add_available_benchmarks_parser(subparsers) add_create_meta_parser(subparsers) + add_create_table_parser(subparsers) args = parser.parse_args() diff --git a/scripts/create_table.py b/mteb/create_table.py similarity index 87% rename from scripts/create_table.py rename to mteb/create_table.py index a253601a1f..e32ea36254 100644 --- a/scripts/create_table.py +++ b/mteb/create_table.py @@ -70,7 +70,7 @@ def warn_and_fallback(reason: str): def create_comparison_table( results_folder: str, - model_names: list[str], + model_names: list[str] | None = None, benchmark_name: str | None = None, output_path: str | None = None, aggregation_level: Literal["subset", "split", "task"] = "task", @@ -79,7 +79,7 @@ def create_comparison_table( Args: results_folder: Path to the results folder - model_names: List of model names to include + model_names: List of model names to include (default: None, which means all available models) benchmark_name: Name of the benchmark (optional) output_path: Path to save the output tables aggregation_level: Level of aggregation for results ('subset', 'split', or 'task') @@ -90,7 +90,11 @@ def create_comparison_table( Returns: result_df: DataFrame with aggregated results """ - logger.info(f"Creating comparison table for models: {', '.join(model_names)}") + if model_names: + logger.info(f"Creating comparison table for models: {', '.join(model_names)}") + else: + logger.info("Creating comparison table for all available models") + logger.info(f"Using aggregation level: {aggregation_level}") # Load results @@ -243,51 +247,12 @@ def format_table_for_display(df: pd.DataFrame) -> str: return df.to_string() -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Create comparison tables for MTEB models." - ) - - parser.add_argument( - "--results", - type=str, - default="results/", - help="Path to fetch results from (local folder or GitHub repo URL)", - ) - - parser.add_argument( - "--models", - type=str, - required=True, - help="Comma-separated list of models to include in the table", - ) - - parser.add_argument( - "--benchmark", - type=str, - default=None, - help=f"Benchmark to use (optional). Available: {get_available_benchmarks()}", +def create_table_cli(args: argparse.Namespace) -> pd.DataFrame: + """Entry point for CLI integration.""" + models = ( + [model.strip() for model in args.models.split(",")] if args.models else None ) - parser.add_argument( - "--aggregation-level", - type=str, - choices=["subset", "split", "task"], - default="task", - help="Level of aggregation for results (subset, split, or task)", - ) - - parser.add_argument( - "--output", - type=str, - default="comparison_table.csv", - help="Output path for the generated table (include extension: .csv, .xlsx, or .md)", - ) - - args = parser.parse_args() - - models = [model.strip() for model in args.models.split(",")] - result_df = create_comparison_table( results_folder=args.results, model_names=models, @@ -304,3 +269,5 @@ def format_table_for_display(df: pd.DataFrame) -> str: print(format_table_for_display(result_df)) else: print("\nNo data available for the specified models and benchmark") + + return result_df From 9365a5ddc0b24ceefd1d3803ed0c8fa49117fa85 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Sat, 29 Mar 2025 20:09:30 +0530 Subject: [PATCH 3/8] Address comments --- mteb/cli.py | 7 ++++--- mteb/{create_table.py => create_results_table.py} | 8 +++----- 2 files changed, 7 insertions(+), 8 deletions(-) rename mteb/{create_table.py => create_results_table.py} (98%) diff --git a/mteb/cli.py b/mteb/cli.py index 3b0fbdb7b0..d97b57b57d 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -82,7 +82,7 @@ ```bash mteb create-table --results results/ \ - --models "intfloat/multilingual-e5-small,intfloat/multilingual-e5-base" \ + --models "intfloat/multilingual-e5-small" "intfloat/multilingual-e5-base" \ --benchmark "MTEB(eng, v1)" \ --aggregation-level task \ --output comparison_table.csv @@ -100,7 +100,7 @@ import mteb from mteb.create_meta import generate_readme -from mteb.create_table import create_table_cli +from mteb.create_results_table import create_table_cli logging.basicConfig(level=logging.WARNING) logger = logging.getLogger(__name__) @@ -383,8 +383,9 @@ def add_create_table_parser(subparsers) -> None: parser.add_argument( "--models", type=str, + nargs='*', default=None, - help="Comma-separated list of models to include in the table (default: all models)", + help="Models to include in the table (default: all models from results dir)", ) parser.add_argument( diff --git a/mteb/create_table.py b/mteb/create_results_table.py similarity index 98% rename from mteb/create_table.py rename to mteb/create_results_table.py index e32ea36254..2561d99a5c 100644 --- a/mteb/create_table.py +++ b/mteb/create_results_table.py @@ -12,10 +12,8 @@ import mteb from mteb.load_results import load_results -logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) - def get_available_benchmarks(): """Get all available benchmark names.""" return [b.name for b in mteb.get_benchmarks()] @@ -70,9 +68,9 @@ def warn_and_fallback(reason: str): def create_comparison_table( results_folder: str, + output_path: str, model_names: list[str] | None = None, benchmark_name: str | None = None, - output_path: str | None = None, aggregation_level: Literal["subset", "split", "task"] = "task", ) -> pd.DataFrame: """Create comparison tables for MTEB models. @@ -250,14 +248,14 @@ def format_table_for_display(df: pd.DataFrame) -> str: def create_table_cli(args: argparse.Namespace) -> pd.DataFrame: """Entry point for CLI integration.""" models = ( - [model.strip() for model in args.models.split(",")] if args.models else None + [model.strip() for model in args.models] if args.models else None ) result_df = create_comparison_table( results_folder=args.results, + output_path=args.output, model_names=models, benchmark_name=args.benchmark, - output_path=args.output, aggregation_level=args.aggregation_level, ) From c53d53797922ba78cb9072c048cad2cd7cc5bf30 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Sat, 29 Mar 2025 20:10:16 +0530 Subject: [PATCH 4/8] make lint --- mteb/cli.py | 2 +- mteb/create_results_table.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mteb/cli.py b/mteb/cli.py index d97b57b57d..0328db4499 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -383,7 +383,7 @@ def add_create_table_parser(subparsers) -> None: parser.add_argument( "--models", type=str, - nargs='*', + nargs="*", default=None, help="Models to include in the table (default: all models from results dir)", ) diff --git a/mteb/create_results_table.py b/mteb/create_results_table.py index 2561d99a5c..0185766479 100644 --- a/mteb/create_results_table.py +++ b/mteb/create_results_table.py @@ -14,6 +14,7 @@ logger = logging.getLogger(__name__) + def get_available_benchmarks(): """Get all available benchmark names.""" return [b.name for b in mteb.get_benchmarks()] @@ -247,9 +248,7 @@ def format_table_for_display(df: pd.DataFrame) -> str: def create_table_cli(args: argparse.Namespace) -> pd.DataFrame: """Entry point for CLI integration.""" - models = ( - [model.strip() for model in args.models] if args.models else None - ) + models = [model.strip() for model in args.models] if args.models else None result_df = create_comparison_table( results_folder=args.results, From ac62a09b68b36b2c5395064e0ac24268e3523395 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Sun, 30 Mar 2025 23:45:25 +0530 Subject: [PATCH 5/8] Added tests --- docs/adding_a_benchmark.md | 2 +- mteb/create_results_table.py | 4 +- mteb/load_results/load_results.py | 5 +++ tests/test_cli.py | 74 +++++++++++++++++++++++++++++++ 4 files changed, 83 insertions(+), 2 deletions(-) diff --git a/docs/adding_a_benchmark.md b/docs/adding_a_benchmark.md index 7da178bbb3..1a444ee59f 100644 --- a/docs/adding_a_benchmark.md +++ b/docs/adding_a_benchmark.md @@ -3,5 +3,5 @@ The MTEB Leaderboard is available [here](https://huggingface.co/spaces/mteb/leaderboard) and we encourage additions of new benchmarks. To add a new benchmark: 1. Add your benchmark to [benchmark.py](../mteb/benchmarks/benchmarks.py) as a `Benchmark` object, and select the MTEB tasks that will be in the benchmark. If some of the tasks do not exist in MTEB, follow the "add a dataset" instructions to add them. -2. Open a PR at https://github.com/embedding-benchmark/results with results of models on your benchmark. +2. Open a PR at https://github.com/embeddings-benchmark/results with results of models on your benchmark. 3. When PRs are merged, your benchmark will be added to the leaderboard automatically after the next workflow trigger. diff --git a/mteb/create_results_table.py b/mteb/create_results_table.py index 0185766479..2dad26fa5b 100644 --- a/mteb/create_results_table.py +++ b/mteb/create_results_table.py @@ -144,7 +144,9 @@ def create_comparison_table( "task_name": task_name, "split": split, "subset": score_item.get("hf_subset", "default"), - "score": score_item.get("main_score", 0.0) * 100, + "score": score_item.get("main_score", 0.0) * 100 + if score_item.get("main_score", 0.0) is not None + else 0.0, } ) diff --git a/mteb/load_results/load_results.py b/mteb/load_results/load_results.py index 917f82553f..9f1957d8a2 100644 --- a/mteb/load_results/load_results.py +++ b/mteb/load_results/load_results.py @@ -30,6 +30,11 @@ def download_of_results( Returns: The path to the local cache directory. """ + results_path = Path(results_repo) + if results_path.exists() and results_path.is_dir(): + logger.info(f"Using local results repository at {results_path}") + return results_path + default_cache_directory = Path.home() / ".cache" / "mteb" if cache_directory is None: diff --git a/tests/test_cli.py b/tests/test_cli.py index c91d47fb83..7c8ded9b4f 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -188,6 +188,80 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: assert result.returncode == 0, "Command failed" +@pytest.mark.parametrize( + "results_folder", + [ + # "tests/results", # Local results folder + "https://github.com/embeddings-benchmark/results", # Remote results repository + ], +) +@pytest.mark.parametrize("aggregation_level", ["subset", "split", "task"]) +@pytest.mark.parametrize("output_format", ["csv", "md", "xlsx"]) +def test_create_table( + results_folder: str, + aggregation_level: str, + output_format: str, +): + """Test create-table CLI tool with local and remote results repositories.""" + test_folder = Path(__file__).parent + output_file = f"comparison_table_test.{output_format}" + output_path = test_folder / output_file + models = ["intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"] + benchmark = "MTEB(Multilingual, v1)" + + models_arg = " ".join(f'"{model}"' for model in models) + command = ( + f"{sys.executable} -m mteb create-table " + f"--results {results_folder} " + f"--models {models_arg} " + f'--benchmark "{benchmark}" ' + f"--aggregation-level {aggregation_level} " + f"--output {output_path}" + ) + + # Run the command + result = subprocess.run(command, shell=True, capture_output=True, text=True) + + # Assert the command executed successfully + assert result.returncode == 0, f"Command failed: {result.stderr}" + + # Assert the output file was created + assert output_path.exists(), "Output file not created" + + if aggregation_level == "task": + expected_headers = ["task_name"] + models + elif aggregation_level == "split": + expected_headers = ["task_name", "split"] + models + elif aggregation_level == "subset": + expected_headers = ["task_name", "split", "subset"] + models + + if output_file.endswith(".csv"): + with output_path.open("r") as f: + content = f.readline().strip().split(",") + assert sorted(content) == sorted(expected_headers), ( + f"CSV headers do not match: {content}" + ) + elif output_file.endswith(".xlsx"): + try: + import pandas as pd + + df = pd.read_excel(output_path) + assert sorted(df.columns) == sorted(expected_headers), ( + f"Excel headers do not match: {list(df.columns)}" + ) + except ImportError: + pytest.fail("pandas or openpyxl is not installed for reading Excel files") + elif output_file.endswith(".md"): + with output_path.open("r") as f: + content = f.readline() + assert all(header in content for header in expected_headers), ( + "Markdown headers do not match" + ) + + if output_path.exists(): + output_path.unlink() + + def test_save_predictions(): command = f"{sys.executable} -m mteb run -m sentence-transformers/average_word_embeddings_komninos -t NFCorpus --output_folder tests/results --save_predictions" result = subprocess.run(command, shell=True, capture_output=True, text=True) From 6ece2dda1669b02b80bc44b2f4b027494623a445 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Mon, 31 Mar 2025 16:36:08 +0530 Subject: [PATCH 6/8] Fix tests --- tests/test_cli.py | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 7c8ded9b4f..546e8b426b 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -228,36 +228,6 @@ def test_create_table( # Assert the output file was created assert output_path.exists(), "Output file not created" - if aggregation_level == "task": - expected_headers = ["task_name"] + models - elif aggregation_level == "split": - expected_headers = ["task_name", "split"] + models - elif aggregation_level == "subset": - expected_headers = ["task_name", "split", "subset"] + models - - if output_file.endswith(".csv"): - with output_path.open("r") as f: - content = f.readline().strip().split(",") - assert sorted(content) == sorted(expected_headers), ( - f"CSV headers do not match: {content}" - ) - elif output_file.endswith(".xlsx"): - try: - import pandas as pd - - df = pd.read_excel(output_path) - assert sorted(df.columns) == sorted(expected_headers), ( - f"Excel headers do not match: {list(df.columns)}" - ) - except ImportError: - pytest.fail("pandas or openpyxl is not installed for reading Excel files") - elif output_file.endswith(".md"): - with output_path.open("r") as f: - content = f.readline() - assert all(header in content for header in expected_headers), ( - "Markdown headers do not match" - ) - if output_path.exists(): output_path.unlink() From 7aa541de7f0cb79b697d3c4ae0b1f185aebc5cd8 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Mon, 31 Mar 2025 18:20:37 +0530 Subject: [PATCH 7/8] Checks not passing fix --- tests/test_cli.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index 546e8b426b..3e9e790a58 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -195,10 +195,12 @@ def test_create_meta_from_existing(existing_readme_name: str, gold_readme_name: "https://github.com/embeddings-benchmark/results", # Remote results repository ], ) +@pytest.mark.parametrize("benchmark", ["MTEB(eng, v1)", "MTEB(Multilingual, v1)"]) @pytest.mark.parametrize("aggregation_level", ["subset", "split", "task"]) @pytest.mark.parametrize("output_format", ["csv", "md", "xlsx"]) def test_create_table( results_folder: str, + benchmark: str, aggregation_level: str, output_format: str, ): @@ -207,9 +209,11 @@ def test_create_table( output_file = f"comparison_table_test.{output_format}" output_path = test_folder / output_file models = ["intfloat/multilingual-e5-small", "intfloat/multilingual-e5-base"] - benchmark = "MTEB(Multilingual, v1)" - models_arg = " ".join(f'"{model}"' for model in models) + + if output_format == "xlsx": + pytest.importorskip("openpyxl", reason="openpyxl is required for .xlsx output") + command = ( f"{sys.executable} -m mteb create-table " f"--results {results_folder} " From 89db9ac28b16a0701646a9730732d1eb5419e50c Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Tue, 1 Apr 2025 01:11:50 +0530 Subject: [PATCH 8/8] Added choices for benchmark --- mteb/cli.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/cli.py b/mteb/cli.py index 0328db4499..e50c03d55a 100644 --- a/mteb/cli.py +++ b/mteb/cli.py @@ -392,6 +392,7 @@ def add_create_table_parser(subparsers) -> None: "--benchmark", type=str, default=None, + choices=[benchmark.name for benchmark in mteb.get_benchmarks()], help="Benchmark to use (optional). Available benchmarks can be listed with 'mteb available_benchmarks'", )