256 changes: 256 additions & 0 deletions mteb/benchmarks/_create_table.py
@@ -0,0 +1,256 @@
from __future__ import annotations

import math
import re
from collections import defaultdict

import numpy as np
import pandas as pd

from mteb.load_results.benchmark_results import BenchmarkResults
from mteb.overview import get_task, get_tasks


def _borda_count(scores: pd.Series) -> pd.Series:
n = len(scores)
ranks = scores.rank(method="average", ascending=False)
counts = n - ranks
return counts


def _get_borda_rank(score_table: pd.DataFrame) -> pd.Series:
borda_counts = score_table.apply(_borda_count, axis="index")
mean_borda = borda_counts.sum(axis=1)
return mean_borda.rank(method="min", ascending=False).astype(int)


def _split_on_capital(s: str) -> str:
"""Splits on capital letters and joins with spaces"""
return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))


def _format_n_parameters(n_parameters) -> str:
if (n_parameters is None) or (not int(n_parameters)):
return "Unknown"
n_thousand = int(n_parameters // 1e3)
if n_thousand < 1:
return str(int(n_parameters))
n_zeros = math.log10(n_thousand)
if n_zeros >= 6:
return str(n_thousand // (10**6)) + "B"
if n_zeros >= 3:
return str(n_thousand // (10**3)) + "M"
return str(n_thousand) + "K"


def _format_max_tokens(max_tokens: float | None) -> str:
if max_tokens is None:
return "Unknown"
if max_tokens == np.inf:
return "Infinite"
return str(int(max_tokens))


def _failsafe_get_model_meta(model_name):
try:
from mteb.models.overview import get_model_meta

return get_model_meta(model_name)
except Exception:
return None
Comment on lines +54 to +60
Collaborator (Author):

I noticed a strange issue after moving the code into _create_table.py. Placing this line at the top:

from mteb.models.overview import get_model_meta

causes a circular import, even though it works fine in benchmark.py. For now, I applied lazy loading inside the function to avoid the problem. Do you have a better solution?
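
For reference, a minimal sketch of the lazy-import workaround combined with a TYPE_CHECKING guard for the annotation; the ModelMeta import path below is an assumption, not the actual module layout:

```python
# Sketch only: the deferred import breaks the cycle because the inner import
# runs at call time, after both modules have finished initializing.
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Only evaluated by type checkers, so it cannot trigger the circular import.
    from mteb.model_meta import ModelMeta  # hypothetical import path


def _failsafe_get_model_meta(model_name: str) -> ModelMeta | None:
    try:
        from mteb.models.overview import get_model_meta

        return get_model_meta(model_name)
    except Exception:
        return None
```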

Collaborator (Author):

Hi @KennethEnevoldsen, could you please take a look when you have time.



def _get_means_per_types(per_task: pd.DataFrame):
task_names_per_type = defaultdict(list)
for task_name in per_task.columns:
task_type = get_task(task_name).metadata.type
task_names_per_type[task_type].append(task_name)
records = []
for task_type, tasks in task_names_per_type.items():
for model_name, scores in per_task.iterrows():
records.append(
dict(
model_name=model_name,
task_type=task_type,
score=scores[tasks].mean(skipna=False),
)
)
return pd.DataFrame.from_records(records)


def _create_summary_table_from_benchmark_results(
benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
"""Create summary table from BenchmarkResults.

Returns a DataFrame with one row per model containing summary statistics
and task type averages.

Args:
benchmark_results: BenchmarkResults object containing model results

Returns:
DataFrame with model summaries, ready for styling in the leaderboard
"""
data = benchmark_results.to_dataframe(format="long")

if data.empty:
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

    # Pivot into a wide model-by-task score table
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Remove models with no scores
to_remove = per_task.isna().all(axis="columns")
if to_remove.all():
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)

# Calculate means by task type
mean_per_type = _get_means_per_types(per_task)
mean_per_type = mean_per_type.pivot(
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
_split_on_capital(column) for column in mean_per_type.columns
]

# Calculate overall means
typed_mean = mean_per_type.mean(skipna=False, axis=1)
overall_mean = per_task.mean(skipna=False, axis=1)

# Build joint table
joint_table = mean_per_type.copy()
joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean", overall_mean)
joint_table.insert(1, "mean_by_task_type", typed_mean)
joint_table["borda_rank"] = _get_borda_rank(per_task)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
joint_table = joint_table.reset_index()

# Add model metadata
model_metas = joint_table["model_name"].map(_failsafe_get_model_meta)
joint_table = joint_table[model_metas.notna()]
joint_table["model_link"] = model_metas.map(lambda m: m.reference)

# Insert model metadata columns
joint_table.insert(
1,
"Max Tokens",
model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
)
joint_table.insert(
1,
"Embedding Dimensions",
model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
)
joint_table.insert(
1,
"Number of Parameters",
model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
)
joint_table.insert(
1,
"Memory Usage (MB)",
model_metas.map(
lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
),
)

# Add zero-shot percentage
tasks = get_tasks(tasks=list(data["task_name"].unique()))
joint_table.insert(
1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
)
joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)

# Clean up model names (remove HF organization)
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
)

# Add markdown links to model names
name_w_link = (
"[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
)
joint_table["model_name"] = joint_table["model_name"].mask(
joint_table["model_link"].notna(), name_w_link
)
joint_table = joint_table.drop(columns=["model_link"])

# Rename columns
joint_table = joint_table.rename(
columns={
"model_name": "Model",
"mean_by_task_type": "Mean (TaskType)",
"mean": "Mean (Task)",
}
)

# Move borda rank to front
joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))

return joint_table


def _create_per_task_table_from_benchmark_results(
benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
"""Create per-task table from BenchmarkResults.

Returns a DataFrame with one row per model and one column per task.

Args:
benchmark_results: BenchmarkResults object containing model results

Returns:
DataFrame with per-task scores, ready for styling in the leaderboard
"""
# Get scores in long format
data = benchmark_results.to_dataframe(format="long")

if data.empty:
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

    # Pivot into a wide model-by-task score table
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Remove models with no scores
to_remove = per_task.isna().all(axis="columns")
if to_remove.all():
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
)
return no_results_frame

models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)

# Add borda rank and sort
per_task["borda_rank"] = _get_borda_rank(per_task)
per_task = per_task.sort_values("borda_rank", ascending=True)
per_task = per_task.drop(columns=["borda_rank"])
per_task = per_task.reset_index()

# Clean up model names (remove HF organization)
per_task["model_name"] = per_task["model_name"].map(
lambda name: name.split("/")[-1]
)
per_task = per_task.rename(
columns={
"model_name": "Model",
}
)

return per_task
17 changes: 17 additions & 0 deletions mteb/benchmarks/benchmark.py
@@ -4,8 +4,13 @@
from dataclasses import dataclass
from typing import TYPE_CHECKING, Annotated

import pandas as pd
from pydantic import AnyUrl, BeforeValidator, TypeAdapter

from mteb.benchmarks._create_table import (
_create_per_task_table_from_benchmark_results,
_create_summary_table_from_benchmark_results,
)
from mteb.load_results.load_results import load_results

if TYPE_CHECKING:
@@ -72,3 +77,15 @@ def load_results(
results = base_results.select_tasks(self.tasks)
self.results_cache[base_results] = results
return results

def _create_summary_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
"""Create summary table. Called by the leaderboard app."""
return _create_summary_table_from_benchmark_results(benchmark_results)

def _create_per_task_table(
self, benchmark_results: BenchmarkResults
) -> pd.DataFrame:
"""Create per-task table. Called by the leaderboard app."""
return _create_per_task_table_from_benchmark_results(benchmark_results)
70 changes: 54 additions & 16 deletions mteb/leaderboard/app.py
@@ -23,7 +23,10 @@
make_selector,
)
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import create_tables
from mteb.leaderboard.table import (
apply_per_task_styling_from_benchmark,
apply_summary_styling_from_benchmark,
)
from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ

logger = logging.getLogger(__name__)
@@ -218,10 +221,21 @@ def get_leaderboard_app() -> gr.Blocks:
max_model_size=MAX_MODEL_SIZE,
zero_shot_setting="allow_all",
)
default_filtered_scores = [
entry for entry in default_scores if entry["model_name"] in filtered_models
]

# Filter BenchmarkResults based on default filtered models (as required by Kenneth)
filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
filtered_benchmark_results = default_results.select_models(filtered_model_names)

summary_table, per_task_table = create_tables(
[entry for entry in default_scores if entry["model_name"] in filtered_models]
summary_table = apply_summary_styling_from_benchmark(
default_benchmark, filtered_benchmark_results
)
per_task_table = apply_per_task_styling_from_benchmark(
default_benchmark, filtered_benchmark_results
)

lang_select = gr.Dropdown(
LANGUAGE,
value=sorted(default_results.languages),
@@ -751,19 +765,43 @@ def update_tables(
tasks = set(tasks)
benchmark = mteb.get_benchmark(benchmark_name)
benchmark_tasks = {task.metadata.name for task in benchmark.tasks}
if (benchmark_tasks != tasks) or (models_to_keep is not None):
filtered_scores = []
for entry in scores:
if entry["task_name"] not in tasks:
continue
if (models_to_keep is not None) and (
entry["model_name"] not in models_to_keep
):
continue
filtered_scores.append(entry)
else:
filtered_scores = scores
summary, per_task = create_tables(filtered_scores)

# Extract filtered model and task names from scores (respects UI filters)
filtered_model_names = set()
filtered_task_names = set()

for entry in scores:
if entry["task_name"] not in tasks:
continue
if (models_to_keep is not None) and (
entry["model_name"] not in models_to_keep
):
continue
filtered_model_names.add(entry["model_name"])
filtered_task_names.add(entry["task_name"])

# Create filtered BenchmarkResults as required by Kenneth
Contributor:

Suggested change (drop this comment line):
# Create filtered BenchmarkResults as required by Kenneth
Contributor:

not sure I understand?

Collaborator (Author), replying to "not sure I understand?":

Maybe one possible reason is that the original create_tables function returned both summary_table and per_task in a single pass. In the updated version, these two DataFrames are computed separately, which introduces some duplicated processing and may cause the slowdown.
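
For what it's worth, a rough sketch of how both tables could again come from a single pass over the scores. It is heavily simplified (the real summary builder also adds model metadata, per-type means, and the borda rank), so treat it as a shape, not a drop-in replacement:

```python
import pandas as pd

from mteb.load_results.benchmark_results import BenchmarkResults


def _create_tables(benchmark_results: BenchmarkResults) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Sketch: build both leaderboard tables from one long-format pass."""
    data = benchmark_results.to_dataframe(format="long")
    # Pivot once; both tables start from the same wide model-by-task frame.
    per_task = data.pivot(index="model_name", columns="task_name", values="score")
    # Drop models without any scores once, instead of once per table.
    per_task = per_task.dropna(axis=0, how="all")

    # Simplified summary: only the overall mean per model.
    summary = per_task.mean(axis=1, skipna=False).to_frame("Mean (Task)").reset_index()
    per_task_table = per_task.reset_index().rename(columns={"model_name": "Model"})
    return summary, per_task_table
```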

benchmark_results = all_benchmark_results[benchmark_name]
filtered_benchmark_results = benchmark_results

# Apply task filtering if needed
if filtered_task_names != benchmark_tasks:
filtered_benchmark_results = filtered_benchmark_results.filter_tasks(
task_names=list(filtered_task_names)
)

# Apply model filtering if needed
if filtered_model_names:
filtered_benchmark_results = filtered_benchmark_results.select_models(
list(filtered_model_names)
)

summary = apply_summary_styling_from_benchmark(
benchmark, filtered_benchmark_results
)
per_task = apply_per_task_styling_from_benchmark(
benchmark, filtered_benchmark_results
)
elapsed = time.time() - start_time
logger.debug(f"update_tables callback: {elapsed}s")
return summary, per_task