130 changes: 130 additions & 0 deletions mteb/benchmarks/_create_table.py
@@ -254,3 +254,133 @@ def _create_per_task_table_from_benchmark_results(
)

return per_task


def _create_summary_table_mean_public_private(
benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
"""Create summary table from BenchmarkResults.

Returns a DataFrame with one row per model containing separate mean scores
over public and private tasks, plus per-task-type averages.

Args:
benchmark_results: BenchmarkResults object containing model results

Returns:
DataFrame with model summaries, ready for styling in the leaderboard
"""
print("all tasks:", benchmark_results.task_names)
data = benchmark_results.to_dataframe(format="long")

if data.empty:
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame
public_task_name = benchmark_results.filter_tasks(is_public=True).task_names
print("Public tasks:", public_task_name)
private_task_name = benchmark_results.filter_tasks(is_public=False).task_names
print("Private tasks:", private_task_name)
# Pivot the long-format results into a per-model x per-task score matrix
per_task = data.pivot(index="model_name", columns="task_name", values="score")
print(per_task.columns)

# Remove models with no scores
to_remove = per_task.isna().all(axis="columns")
if to_remove.all():
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)

# Calculate means by task type
mean_per_type = _get_means_per_types(per_task)
mean_per_type = mean_per_type.pivot(
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
_split_on_capital(column) for column in mean_per_type.columns
]

# Calculate overall means
public_mean = per_task[public_task_name].mean(skipna=False, axis=1)
private_mean = per_task[private_task_name].mean(skipna=False, axis=1)

# Build joint table
joint_table = mean_per_type.copy()
joint_table = joint_table.drop(models_to_remove, axis=0, errors="ignore")
joint_table.insert(0, "mean(public)", public_mean)
joint_table.insert(1, "mean(private)", private_mean)
joint_table["borda_rank"] = _get_borda_rank(per_task)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
joint_table = joint_table.reset_index()

# Add model metadata
model_metas = joint_table["model_name"].map(_failsafe_get_model_meta)
joint_table = joint_table[model_metas.notna()]
joint_table["model_link"] = model_metas.map(lambda m: m.reference)

# Insert model metadata columns
joint_table.insert(
1,
"Max Tokens",
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
)
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
)
joint_table.insert(
1,
"Number of Parameters",
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
)
joint_table.insert(
1,
"Memory Usage (MB)",
model_metas.map(
lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
),
)

# Add zero-shot percentage
tasks = get_tasks(tasks=list(data["task_name"].unique()))
joint_table.insert(
1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
)
joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)

# Clean up model names (remove HF organization)
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
)

# Add markdown links to model names
name_w_link = (
"[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
)
joint_table["model_name"] = joint_table["model_name"].mask(
joint_table["model_link"].notna(), name_w_link
)
joint_table = joint_table.drop(columns=["model_link"])

# Rename columns
rename_dict = {
"model_name": "Model",
"mean(public)": "Mean (Public)",
"mean(private)": "Mean (Private)",
}
# For RTEB: all tasks are Retrieval type, so Retrieval column = Mean (Task)
if "Retrieval" in joint_table.columns:
rename_dict["Retrieval"] = "Mean (Task)"
joint_table = joint_table.rename(columns=rename_dict)

# Move borda rank to front
joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))

return joint_table
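
Aside (not part of the diff): the sketch below illustrates the key behavioural choice in the new summary, namely that the public/private means are taken with `skipna=False`, so a model missing any score in a group gets `NaN` for that group's mean rather than a partial average. All model and task names here are hypothetical.

```python
# Minimal sketch of the public/private mean split on a toy score matrix.
import pandas as pd

per_task = pd.DataFrame(
    {
        "LegalPublicRetrieval": [0.60, 0.55],
        "FinancePublicRetrieval": [0.70, None],  # model-b has no score for this public task
        "HealthPrivateRetrieval": [0.50, 0.45],
    },
    index=pd.Index(["model-a", "model-b"], name="model_name"),
)
public_tasks = ["LegalPublicRetrieval", "FinancePublicRetrieval"]
private_tasks = ["HealthPrivateRetrieval"]

# skipna=False: a single missing public score makes the whole public mean NaN.
summary = pd.DataFrame(
    {
        "mean(public)": per_task[public_tasks].mean(skipna=False, axis=1),
        "mean(private)": per_task[private_tasks].mean(skipna=False, axis=1),
    }
)
print(summary)
# model-a -> 0.65 / 0.50; model-b -> NaN / 0.45
```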
9 changes: 9 additions & 0 deletions mteb/benchmarks/benchmark.py
@@ -10,6 +10,7 @@
from mteb.benchmarks._create_table import (
_create_per_task_table_from_benchmark_results,
_create_summary_table_from_benchmark_results,
_create_summary_table_mean_public_private,
)
from mteb.load_results.load_results import load_results

@@ -89,3 +90,11 @@ def _create_per_task_table(
) -> pd.DataFrame:
"""Create per-task table. Called by the leaderboard app."""
return _create_per_task_table_from_benchmark_results(benchmark_results)


class RtebBenchmark(Benchmark):
def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
"""Create summary table. Called by the leaderboard app."""
return _create_summary_table_mean_public_private(benchmark_results)
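
Aside (an assumption about call flow, not shown in this diff): the leaderboard app presumably only calls `_create_summary_table` on whichever benchmark is selected, so the RTEB-specific layout comes purely from subclass dispatch, roughly:

```python
# Sketch only: `benchmark_results` is assumed to be a BenchmarkResults already
# restricted to the benchmark's tasks; how the app builds it is not shown here.
def build_summary(benchmark: Benchmark, benchmark_results: BenchmarkResults) -> pd.DataFrame:
    # Benchmark      -> _create_summary_table_from_benchmark_results(...)
    # RtebBenchmark  -> _create_summary_table_mean_public_private(...)
    return benchmark._create_summary_table(benchmark_results)
```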
8 changes: 7 additions & 1 deletion mteb/benchmarks/benchmarks/benchmarks.py
@@ -445,6 +445,7 @@
CoIR = Benchmark(
name="CoIR",
display_name="Code Information Retrieval",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
tasks=get_tasks(
tasks=[
"AppsRetrieval",
@@ -476,7 +477,7 @@

RAR_b = Benchmark(
name="RAR-b",
display_name="Reasoning retrieval",
display_name="Reasoning as retrieval",
tasks=get_tasks(
tasks=[
"ARCChallenge",
@@ -1163,6 +1164,7 @@

BRIGHT = Benchmark(
name="BRIGHT",
display_name="Reasoning Retrieval",
tasks=get_tasks(tasks=["BrightRetrieval"], eval_splits=["standard"]),
description="""BRIGHT: A Realistic and Challenging Benchmark for Reasoning-Intensive Retrieval.
BRIGHT is the first text retrieval
@@ -1238,6 +1240,8 @@

BEIR = Benchmark(
name="BEIR",
display_name="BEIR",
icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
tasks=get_tasks(
tasks=[
"TRECCOVID",
@@ -1578,6 +1582,8 @@

BEIR_NL = Benchmark(
name="BEIR-NL",
display_name="BEIR-NL",
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/nl.svg",
tasks=get_tasks(
tasks=[
"ArguAna-NL",
26 changes: 13 additions & 13 deletions mteb/benchmarks/benchmarks/rteb_benchmarks.py
@@ -1,7 +1,7 @@
# RTEB Benchmarks - Retrieval Embedding Benchmark
from __future__ import annotations

from mteb.benchmarks.benchmark import Benchmark
from mteb.benchmarks.benchmark import RtebBenchmark
from mteb.overview import get_tasks

RTEB_CITATION = r"""@article{rteb2024,
@@ -10,9 +10,9 @@
year = {2024},
}"""

RTEB_MAIN = Benchmark(
RTEB_MAIN = RtebBenchmark(
name="RTEB(beta)",
display_name="RTEB Retrieval Embedding Benchmark",
display_name="RTEB Multilingual",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg",
tasks=get_tasks(
tasks=[
@@ -48,12 +48,12 @@
"JapaneseLegal1Retrieval",
],
),
description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across English, French, German, and Japanese languages.",
description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 29 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts across mutliple languages.",
citation=RTEB_CITATION,
contacts=["fzowl"],
)

RTEB_ENGLISH = Benchmark(
RTEB_ENGLISH = RtebBenchmark(
name="RTEB(eng, beta)",
display_name="RTEB English",
icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
@@ -88,8 +88,8 @@
contacts=["fzowl"],
)

RTEB_FRENCH = Benchmark(
name="RTEB(fr, beta)",
RTEB_FRENCH = RtebBenchmark(
name="RTEB(fra, beta)",
display_name="RTEB French",
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
tasks=get_tasks(
@@ -106,7 +106,7 @@
contacts=["fzowl"],
)

RTEB_GERMAN = Benchmark(
RTEB_GERMAN = RtebBenchmark(
name="RTEB(deu, beta)",
display_name="RTEB German",
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
@@ -124,7 +124,7 @@
contacts=["fzowl"],
)

RTEB_JAPANESE = Benchmark(
RTEB_JAPANESE = RtebBenchmark(
name="RTEB(jpn, beta)",
display_name="RTEB Japanese",
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
@@ -140,7 +140,7 @@
contacts=["fzowl"],
)

RTEB_FINANCE = Benchmark(
RTEB_FINANCE = RtebBenchmark(
name="RTEB(fin, beta)",
display_name="RTEB Finance",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-price-tag.svg",
@@ -161,7 +161,7 @@
contacts=["fzowl"],
)

RTEB_LEGAL = Benchmark(
RTEB_LEGAL = RtebBenchmark(
name="RTEB(Law, beta)",
display_name="RTEB Legal",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
@@ -182,7 +182,7 @@
contacts=["fzowl"],
)

RTEB_CODE = Benchmark(
RTEB_CODE = RtebBenchmark(
name="RTEB(Code, beta)",
display_name="RTEB Code",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
@@ -204,7 +204,7 @@
contacts=["fzowl"],
)

RTEB_HEALTHCARE = Benchmark(
RTEB_HEALTHCARE = RtebBenchmark(
name="RTEB(Health, beta)",
display_name="RTEB Healthcare",
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
33 changes: 8 additions & 25 deletions mteb/leaderboard/app.py
@@ -1,6 +1,5 @@
from __future__ import annotations

import argparse
import itertools
import json
import logging
@@ -19,9 +18,9 @@
from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE
from mteb.custom_validators import MODALITIES
from mteb.leaderboard.benchmark_selector import (
BENCHMARK_ENTRIES,
DEFAULT_BENCHMARK_NAME,
RTEB_BENCHMARK_ENTRIES,
GP_BENCHMARK_ENTRIES,
R_BENCHMARK_ENTRIES,
make_selector,
)
from mteb.leaderboard.figures import performance_size_plot, radar_chart
@@ -123,6 +122,7 @@ def update_task_info(task_names: str) -> gr.DataFrame:
"reference",
"main_score",
"modalities",
"is_public",
]
)
df["languages"] = df["languages"].map(format_list)
@@ -138,6 +138,7 @@ def update_task_info(task_names: str) -> gr.DataFrame:
"domains": "Domains",
"main_score": "Metric",
"modalities": "Modality",
"is_public": "Public",
}
)
df = df.drop(columns="reference")
@@ -195,23 +196,7 @@ def filter_models(
return list(models_to_keep)


def get_startup_arguments():
parser = argparse.ArgumentParser()

# Add a Boolean flag parameter
parser.add_argument(
"--show_rteb",
action="store_true",
help="If set, display RTEB results; otherwise show default results.",
)

return parser.parse_args()


def get_leaderboard_app() -> gr.Blocks:
args = get_startup_arguments()
show_rteb = args.show_rteb

logger.info("Loading all benchmark results")
all_results = load_results()

@@ -309,12 +294,10 @@ def get_leaderboard_app() -> gr.Blocks:
visible=True,
width="18%",
):
if show_rteb:
benchmark_select, column = make_selector(
BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES
)
else:
benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
benchmark_select, column = make_selector(
GP_BENCHMARK_ENTRIES + R_BENCHMARK_ENTRIES
)

gr.Markdown(
"""
## Embedding Leaderboard