diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py
index 08d3c4ef1d..fba9eae7e1 100644
--- a/mteb/benchmarks/_create_table.py
+++ b/mteb/benchmarks/_create_table.py
@@ -254,3 +254,128 @@ def _create_per_task_table_from_benchmark_results(
     )
     return per_task
+
+
+def _create_summary_table_mean_public_private(
+    benchmark_results: BenchmarkResults,
+) -> pd.DataFrame:
+    """Create a summary table with separate public and private task means from BenchmarkResults.
+
+    Returns a DataFrame with one row per model containing summary statistics
+    and task type averages.
+
+    Args:
+        benchmark_results: BenchmarkResults object containing model results
+
+    Returns:
+        DataFrame with model summaries, ready for styling in the leaderboard
+    """
+    data = benchmark_results.to_dataframe(format="long")
+
+    if data.empty:
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+    public_task_name = benchmark_results.filter_tasks(is_public=True).task_names
+    private_task_name = benchmark_results.filter_tasks(is_public=False).task_names
+    # Convert to DataFrame and pivot
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
+
+    # Remove models with no scores
+    to_remove = per_task.isna().all(axis="columns")
+    if to_remove.all():
+        no_results_frame = pd.DataFrame(
+            {"No results": ["You can try relaxing your criteria"]}
+        )
+        return no_results_frame
+
+    models_to_remove = list(per_task[to_remove].index)
+    per_task = per_task.drop(models_to_remove, axis=0)
+
+    # Calculate means by task type
+    mean_per_type = _get_means_per_types(per_task)
+    mean_per_type = mean_per_type.pivot(
+        index="model_name", columns="task_type", values="score"
+    )
+    mean_per_type.columns = [
+        _split_on_capital(column) for column in mean_per_type.columns
+    ]
+
+    # Calculate overall means
+    public_mean = per_task[public_task_name].mean(skipna=False, axis=1)
+    private_mean = per_task[private_task_name].mean(skipna=False, axis=1)
+
+    # Build joint table (mean_per_type is already restricted to the kept models)
+    joint_table = mean_per_type.copy()
+    joint_table.insert(0, "mean(public)", public_mean)
+    joint_table.insert(1, "mean(private)", private_mean)
+    joint_table["borda_rank"] = _get_borda_rank(per_task)
+    joint_table = joint_table.sort_values("borda_rank", ascending=True)
+    joint_table = joint_table.reset_index()
+
+    # Add model metadata
+    model_metas = joint_table["model_name"].map(_failsafe_get_model_meta)
+    joint_table = joint_table[model_metas.notna()]
+    joint_table["model_link"] = model_metas.map(lambda m: m.reference)
+
+    # Insert model metadata columns
+    joint_table.insert(
+        1,
+        "Max Tokens",
+        model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
+    )
+    joint_table.insert(
+        1,
+        "Embedding Dimensions",
+        model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
+    )
+    joint_table.insert(
+        1,
+        "Number of Parameters",
+        model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
+    )
+    joint_table.insert(
+        1,
+        "Memory Usage (MB)",
+        model_metas.map(
+            lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
+        ),
+    )
+
+    # Add zero-shot percentage
+    tasks = get_tasks(tasks=list(data["task_name"].unique()))
+    joint_table.insert(
+        1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
+    )
+    joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)
+
+    # Clean up model names (remove HF organization)
+    joint_table["model_name"] = joint_table["model_name"].map(
+        lambda name: name.split("/")[-1]
+    )
+
+    # Add markdown links to model names
+    name_w_link = (
+        "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
+    )
+    joint_table["model_name"] = joint_table["model_name"].mask(
+        joint_table["model_link"].notna(), name_w_link
+    )
+    joint_table = joint_table.drop(columns=["model_link"])
+
+    # Rename columns
+    rename_dict = {
+        "model_name": "Model",
+        "mean(public)": "Mean (Public)",
+        "mean(private)": "Mean (Private)",
+    }
+    # For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
+    if "Retrieval" in joint_table.columns:
+        rename_dict["Retrieval"] = "Mean (Task)"
+    joint_table = joint_table.rename(columns=rename_dict)
+
+    # Move borda rank to front
+    joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))
+
+    return joint_table
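Reviewer note, not part of the patch: a minimal sketch (toy scores, made-up task and model names) of the skipna=False behaviour the new helper relies on. A model that is missing any public-task score gets NaN for its public mean rather than a flattering partial average.

# --- Illustrative aside (not part of the patch) ----------------------------
# Toy data only; the task and model names below are placeholders.
import pandas as pd

per_task = pd.DataFrame(
    {
        "PublicTaskA": [0.61, 0.58],
        "PublicTaskB": [0.72, None],   # model-b has no score on this public task
        "PrivateTaskC": [0.55, 0.50],
    },
    index=["model-a", "model-b"],
)
public_tasks = ["PublicTaskA", "PublicTaskB"]
private_tasks = ["PrivateTaskC"]

# skipna=False: any missing public score propagates to NaN for that model.
print(per_task[public_tasks].mean(skipna=False, axis=1))   # model-b -> NaN
print(per_task[private_tasks].mean(skipna=False, axis=1))  # both models get a value
# ----------------------------------------------------------------------------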
diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py
index e48b455ad3..c6a570be8b 100644
--- a/mteb/benchmarks/benchmark.py
+++ b/mteb/benchmarks/benchmark.py
@@ -10,6 +10,7 @@
 from mteb.benchmarks._create_table import (
     _create_per_task_table_from_benchmark_results,
     _create_summary_table_from_benchmark_results,
+    _create_summary_table_mean_public_private,
 )
 from mteb.load_results.load_results import load_results
@@ -89,3 +90,13 @@ def _create_per_task_table(
     ) -> pd.DataFrame:
         """Create per-task table. Called by the leaderboard app."""
         return _create_per_task_table_from_benchmark_results(benchmark_results)
+
+
+class RtebBenchmark(Benchmark):
+    """Benchmark variant whose summary table reports separate public and private task means."""
+
+    def _create_summary_table(
+        self, benchmark_results: BenchmarkResults
+    ) -> pd.DataFrame:
+        """Create summary table. Called by the leaderboard app."""
+        return _create_summary_table_mean_public_private(benchmark_results)
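Reviewer note, not part of the patch: roughly how the leaderboard ends up exercising this override. Only load_results, filter_tasks, task metadata names, and _create_summary_table are taken from this diff; the import paths and the pre-filtering step are assumptions about the surrounding code, not an exact excerpt.

# --- Illustrative aside (not part of the patch) ----------------------------
# The table helpers only ever call benchmark._create_summary_table(results),
# so RtebBenchmark can swap in the public/private summary without app changes.
from mteb.benchmarks.benchmarks.rteb_benchmarks import RTEB_MAIN
from mteb.load_results.load_results import load_results

all_results = load_results()
# Restrict loaded results to the benchmark's tasks (the app is assumed to do
# an equivalent filtering step before building tables).
rteb_results = all_results.filter_tasks(
    task_names=[task.metadata.name for task in RTEB_MAIN.tasks]
)
summary = RTEB_MAIN._create_summary_table(rteb_results)
# When results are present, columns include "Rank (Borda)", "Model",
# "Mean (Public)" and "Mean (Private)".
print(summary.head())
# ----------------------------------------------------------------------------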
diff --git a/mteb/benchmarks/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py
index b24941eddf..e6a7022e5f 100644
--- a/mteb/benchmarks/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks/benchmarks.py
@@ -445,6 +445,7 @@
 CoIR = Benchmark(
     name="CoIR",
     display_name="Code Information Retrieval",
+    icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
     tasks=get_tasks(
         tasks=[
             "AppsRetrieval",
@@ -476,7 +477,7 @@
 RAR_b = Benchmark(
     name="RAR-b",
-    display_name="Reasoning retrieval",
+    display_name="Reasoning as Retrieval",
     tasks=get_tasks(
         tasks=[
             "ARCChallenge",
@@ -1163,6 +1164,7 @@
 BRIGHT = Benchmark(
     name="BRIGHT",
+    display_name="Reasoning Retrieval",
     tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
     description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
 BRIGHT is the first text retrieval
@@ -1238,6 +1240,8 @@
 BEIR = Benchmark(
     name="BEIR",
+    display_name="BEIR",
+    icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
     tasks=get_tasks(
         tasks=[
             "TRECCOVID",
@@ -1578,6 +1582,8 @@
 BEIR_NL = Benchmark(
     name="BEIR-NL",
+    display_name="BEIR-NL",
+    icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
     tasks=get_tasks(
         tasks=[
             "ArguAna-NL",
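Reviewer note, not part of the patch: the display_name and icon additions above are presentation-only fields on a Benchmark definition. A hypothetical entry using the same pattern; the name, icon URL, and description below are placeholders, and the two task names are only examples.

# --- Illustrative aside (not part of the patch) ----------------------------
from mteb.benchmarks.benchmark import Benchmark
from mteb.overview import get_tasks

EXAMPLE_BENCHMARK = Benchmark(
    name="Example(beta)",                      # placeholder registry name
    display_name="Example Retrieval",          # shown in the leaderboard selector
    icon="https://example.com/icon.svg",       # placeholder icon URL
    tasks=get_tasks(tasks=["ArguAna", "SciFact"]),
    description="Placeholder benchmark illustrating the display_name and icon fields.",
)
# ----------------------------------------------------------------------------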
diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py
index d124b4c208..bf845d4f66 100644
--- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py
+++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py
@@ -1,7 +1,7 @@
 # RTEB Benchmarks - Retrieval Embedding Benchmark
 from __future__ import annotations
 
-from mteb.benchmarks.benchmark import Benchmark
+from mteb.benchmarks.benchmark import RtebBenchmark
 from mteb.overview import get_tasks
 
 RTEB_CITATION = r"""@article{rteb2024,
@@ -10,9 +10,9 @@
   year = {2024},
 }"""
 
-RTEB_MAIN = Benchmark(
+RTEB_MAIN = RtebBenchmark(
     name="RTEB(beta)",
-    display_name="RTEB Retrieval Embedding Benchmark",
+    display_name="RTEB Multilingual",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg",
     tasks=get_tasks(
         tasks=[
@@ -48,12 +48,12 @@
             "JapaneseLegal1Retrieval",
         ],
     ),
-    description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.",
+    description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across multiple languages.",
     citation=RTEB_CITATION,
     contacts=["fzowl"],
 )
 
-RTEB_ENGLISH = Benchmark(
+RTEB_ENGLISH = RtebBenchmark(
     name="RTEB(eng, beta)",
     display_name="RTEB English",
     icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
     tasks=get_tasks(
         tasks=[
@@ -88,8 +88,8 @@
     contacts=["fzowl"],
 )
 
-RTEB_FRENCH = Benchmark(
-    name="RTEB(fr, beta)",
+RTEB_FRENCH = RtebBenchmark(
+    name="RTEB(fra, beta)",
     display_name="RTEB French",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
     tasks=get_tasks(
@@ -106,7 +106,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_GERMAN = Benchmark(
+RTEB_GERMAN = RtebBenchmark(
     name="RTEB(deu, beta)",
     display_name="RTEB German",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
     tasks=get_tasks(
@@ -124,7 +124,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_JAPANESE = Benchmark(
+RTEB_JAPANESE = RtebBenchmark(
     name="RTEB(jpn, beta)",
     display_name="RTEB Japanese",
     icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
     tasks=get_tasks(
@@ -140,7 +140,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_FINANCE = Benchmark(
+RTEB_FINANCE = RtebBenchmark(
     name="RTEB(fin, beta)",
     display_name="RTEB Finance",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-price-tag.svg",
     tasks=get_tasks(
@@ -161,7 +161,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_LEGAL = Benchmark(
+RTEB_LEGAL = RtebBenchmark(
     name="RTEB(Law, beta)",
     display_name="RTEB Legal",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
     tasks=get_tasks(
@@ -182,7 +182,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_CODE = Benchmark(
+RTEB_CODE = RtebBenchmark(
     name="RTEB(Code, beta)",
     display_name="RTEB Code",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
     tasks=get_tasks(
@@ -204,7 +204,7 @@
     contacts=["fzowl"],
 )
 
-RTEB_HEALTHCARE = Benchmark(
+RTEB_HEALTHCARE = RtebBenchmark(
     name="RTEB(Health, beta)",
     display_name="RTEB Healthcare",
     icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
     tasks=get_tasks(
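Reviewer note, not part of the patch: a short usage sketch, assuming the renamed RTEB entries are registered in the benchmark registry like any other Benchmark, e.g. after the RTEB(fr, beta) -> RTEB(fra, beta) rename.

# --- Illustrative aside (not part of the patch) ----------------------------
import mteb

rteb_french = mteb.get_benchmark("RTEB(fra, beta)")
print(rteb_french.display_name)                         # "RTEB French"
print([task.metadata.name for task in rteb_french.tasks])
# ----------------------------------------------------------------------------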
diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 88f56ffd22..e162b01aeb 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import argparse
 import itertools
 import json
 import logging
@@ -19,9 +18,9 @@
 from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE
 from mteb.custom_validators import MODALITIES
 from mteb.leaderboard.benchmark_selector import (
-    BENCHMARK_ENTRIES,
     DEFAULT_BENCHMARK_NAME,
-    RTEB_BENCHMARK_ENTRIES,
+    GP_BENCHMARK_ENTRIES,
+    R_BENCHMARK_ENTRIES,
     make_selector,
 )
 from mteb.leaderboard.figures import performance_size_plot, radar_chart
@@ -123,6 +122,7 @@ def update_task_info(task_names: str) -> gr.DataFrame:
             "reference",
             "main_score",
             "modalities",
+            "is_public",
         ]
     )
     df["languages"] = df["languages"].map(format_list)
@@ -138,6 +138,7 @@
             "domains": "Domains",
             "main_score": "Metric",
             "modalities": "Modality",
+            "is_public": "Public",
         }
     )
     df = df.drop(columns="reference")
@@ -195,23 +196,7 @@
     return list(models_to_keep)
 
 
-def get_startup_arguments():
-    parser = argparse.ArgumentParser()
-
-    # Add a Boolean flag parameter
-    parser.add_argument(
-        "--show_rteb",
-        action="store_true",
-        help="If set, display RTEB results; otherwise show default results.",
-    )
-
-    return parser.parse_args()
-
-
 def get_leaderboard_app() -> gr.Blocks:
-    args = get_startup_arguments()
-    show_rteb = args.show_rteb
-
     logger.info("Loading all benchmark results")
     all_results = load_results()
 
@@ -309,12 +294,10 @@ def get_leaderboard_app() -> gr.Blocks:
             visible=True,
             width="18%",
         ):
-            if show_rteb:
-                benchmark_select, column = make_selector(
-                    BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES
-                )
-            else:
-                benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
+            benchmark_select, column = make_selector(
+                GP_BENCHMARK_ENTRIES + R_BENCHMARK_ENTRIES
+            )
+
             gr.Markdown(
                 """
             ## Embedding Leaderboard
diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py
index 3395a9ecf9..148dcbb114 100644
--- a/mteb/leaderboard/benchmark_selector.py
+++ b/mteb/leaderboard/benchmark_selector.py
@@ -29,9 +29,9 @@ class MenuEntry:
     open: bool = False
 
 
-BENCHMARK_ENTRIES = [
+GP_BENCHMARK_ENTRIES = [
     MenuEntry(
-        name="Select Benchmark",
+        name="General Purpose",
        description="",
        open=False,
        benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"])
@@ -44,13 +44,11 @@ class MenuEntry:
                 "MIEB(eng)",
                 "MIEB(lite)",
                 "MIEB(Img)",
-                "VisualDocumentRetrieval",
-                "JinaVDR",
             ]
         ),
     ),
     MenuEntry(
-        "Domain-Specific",
+        "Domain-Specific ",
         mteb.get_benchmarks(
             [
                 "MTEB(Code, v1)",
@@ -88,17 +86,8 @@ class MenuEntry:
         "Miscellaneous",
         # All of these are retrieval benchmarks
         mteb.get_benchmarks(
             [
-                "BEIR",
-                "BEIR-NL",
-                "NanoBEIR",
-                "BRIGHT",
-                "BRIGHT (long)",
                 "BuiltBench(eng)",
-                "CoIR",
-                "FollowIR",
-                "LongEmbed",
                 "MINERSBitextMining",
-                "RAR-b",
             ]
         ),
     ),
@@ -106,24 +95,58 @@ class MenuEntry:
     ),
 ]
 
-RTEB_BENCHMARK_ENTRIES = [
+R_BENCHMARK_ENTRIES = [
     MenuEntry(
-        name="RTEB (Retrieval)",
+        name="Retrieval",
         description=None,
         open=False,
         benchmarks=[
             RTEB_MAIN,
+            RTEB_ENGLISH,
+            MenuEntry(
+                "Image",
+                description=None,
+                open=False,
+                benchmarks=[
+                    mteb.get_benchmark("VisualDocumentRetrieval"),
+                    mteb.get_benchmark("JinaVDR"),
+                ],
+            ),
             MenuEntry(
                 "Domain-Specific",
                 description=None,
                 open=False,
-                benchmarks=[RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE],
+                benchmarks=[
+                    RTEB_FINANCE,
+                    RTEB_LEGAL,
+                    RTEB_CODE,
+                    mteb.get_benchmark("CoIR"),
+                    RTEB_HEALTHCARE,
+                    mteb.get_benchmark("FollowIR"),
+                    mteb.get_benchmark("LongEmbed"),
+                    mteb.get_benchmark("BRIGHT"),
+                ],
             ),
             MenuEntry(
                 "Language-specific",
                 description=None,
                 open=False,
-                benchmarks=[RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN],
+                benchmarks=[
+                    RTEB_FRENCH,
+                    RTEB_GERMAN,
+                    mteb.get_benchmark("BEIR"),
+                    mteb.get_benchmark("BEIR-NL"),
+                ],
+            ),
+            MenuEntry(
+                "Miscellaneous",
+                mteb.get_benchmarks(
+                    [
+                        "NanoBEIR",
+                        "BRIGHT (long)",
+                        "RAR-b",
+                    ]
+                ),
             ),
         ],
     )
@@ -229,5 +252,5 @@ def _render_benchmark_item(
 
 if __name__ == "__main__":
     with gr.Blocks() as b:
-        selector = make_selector(BENCHMARK_ENTRIES)
+        selector = make_selector(GP_BENCHMARK_ENTRIES)
         b.launch()
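Reviewer note, not part of the patch: a sketch of how a further group could be appended to the reorganised selector. MenuEntry accepts either Benchmark objects or nested MenuEntry groups; the group name below is a placeholder, and the gr.Blocks wrapper mirrors the module's __main__ block rather than the full app.

# --- Illustrative aside (not part of the patch) ----------------------------
import gradio as gr

import mteb
from mteb.leaderboard.benchmark_selector import (
    GP_BENCHMARK_ENTRIES,
    R_BENCHMARK_ENTRIES,
    MenuEntry,
    make_selector,
)

EXTRA_ENTRIES = [
    MenuEntry(
        name="Experimental",  # placeholder group name
        description=None,
        open=False,
        benchmarks=[mteb.get_benchmark("BuiltBench(eng)")],
    ),
]

with gr.Blocks() as demo:
    # The app concatenates entry lists the same way in get_leaderboard_app().
    benchmark_select, column = make_selector(
        GP_BENCHMARK_ENTRIES + R_BENCHMARK_ENTRIES + EXTRA_ENTRIES
    )
# ----------------------------------------------------------------------------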
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index 623e508caa..732e10d803 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import gradio as gr
 import matplotlib.pyplot as plt
 import numpy as np
@@ -7,6 +9,10 @@
 from matplotlib.colors import LinearSegmentedColormap
 from pandas.api.types import is_numeric_dtype
 
+if TYPE_CHECKING:
+    from mteb.benchmarks.benchmark import Benchmark
+    from mteb.load_results.benchmark_results import BenchmarkResults
+
 
 def format_scores(score: float) -> float:
     return round(score * 100, 2)
@@ -57,14 +63,14 @@ def create_light_green_cmap():
 
 
 def apply_summary_styling_from_benchmark(
-    benchmark_instance, benchmark_results
+    benchmark_instance: Benchmark, benchmark_results: BenchmarkResults
 ) -> gr.DataFrame:
     """Apply styling to summary table created by the benchmark instance's _create_summary_table method.
 
     This supports polymorphism - different benchmark classes can have different table generation logic.
 
     Args:
-        benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.)
+        benchmark_instance: The benchmark instance
         benchmark_results: BenchmarkResults object containing model results (may be pre-filtered)
 
     Returns:
@@ -82,14 +88,14 @@
 
 
 def apply_per_task_styling_from_benchmark(
-    benchmark_instance, benchmark_results
+    benchmark_instance: Benchmark, benchmark_results: BenchmarkResults
 ) -> gr.DataFrame:
     """Apply styling to per-task table created by the benchmark instance's _create_per_task_table method.
 
     This supports polymorphism - different benchmark classes can have different table generation logic.
 
     Args:
-        benchmark_instance: The benchmark instance (could be Benchmark, RTEBBenchmark, etc.)
+        benchmark_instance: The benchmark instance
        benchmark_results: BenchmarkResults object containing model results (may be pre-filtered)
 
     Returns:
diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py
index 4c83d3b156..823d96c0ff 100644
--- a/mteb/load_results/benchmark_results.py
+++ b/mteb/load_results/benchmark_results.py
@@ -107,26 +107,29 @@ def filter_tasks(
         domains: list[TASK_DOMAIN] | None = None,
         task_types: list[TASK_TYPE] | None = None,
         modalities: list[MODALITIES] | None = None,
+        is_public: bool | None = None,
     ) -> ModelResult:
         # TODO: v2 see filter_tasks in BenchmarkResults - but can be moved to a private function or removed
         new_task_results = []
         for task_result in self.task_results:
-            if (task_names is not None) and (task_result.task_name not in task_names):
+            task_meta = task_result.task.metadata
+            if (task_names is not None) and (task_meta.name not in task_names):
                 continue
             if languages is not None:
-                task_languages = task_result.languages
+                task_languages = task_meta.languages
                 if not any(lang in task_languages for lang in languages):
                     continue
             if domains is not None:
-                task_domains = task_result.domains
+                task_domains = task_meta.domains if task_meta.domains else []
                 if not any(domain in task_domains for domain in domains):
                     continue
-            if (task_types is not None) and (task_result.task_type not in task_types):
+            if (task_types is not None) and (task_meta.type not in task_types):
                 continue
             if modalities is not None:
-                task_modalities = getattr(task_result, "modalities", [])
-                if not any(modality in task_modalities for modality in modalities):
+                if not any(modality in task_meta.modalities for modality in modalities):
                     continue
+            if (is_public is not None) and (task_meta.is_public is not is_public):
+                continue
             new_task_results.append(task_result)
         return type(self).model_construct(
             model_name=self.model_name,
@@ -395,6 +398,7 @@ def filter_tasks(
         domains: list[TASK_DOMAIN] | None = None,
         task_types: list[TASK_TYPE] | None = None,  # type: ignore
         modalities: list[MODALITIES] | None = None,
+        is_public: bool | None = None,
     ) -> BenchmarkResults:
         # TODO: Same as filter_models
         model_results = [
@@ -404,6 +408,7 @@
                 domains=domains,
                 task_types=task_types,
                 modalities=modalities,
+                is_public=is_public,
             )
             for res in self.model_results
         ]
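Reviewer note, not part of the patch: a usage sketch of the new is_public filter, splitting loaded results into the open and held-out portions the same way _create_summary_table_mean_public_private does above.

# --- Illustrative aside (not part of the patch) ----------------------------
from mteb.load_results.load_results import load_results

results = load_results()
public_results = results.filter_tasks(is_public=True)
private_results = results.filter_tasks(is_public=False)
# task_names is used the same way in _create_summary_table_mean_public_private.
print(len(public_results.task_names), "public tasks")
print(len(private_results.task_names), "private tasks")
# ----------------------------------------------------------------------------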