5 changes: 2 additions & 3 deletions mteb/benchmarks/benchmarks.py
@@ -7,9 +7,7 @@
from pydantic import AnyUrl, BeforeValidator, TypeAdapter

from mteb.abstasks.AbsTask import AbsTask
from mteb.load_results.benchmark_results import (
BenchmarkResults,
)
from mteb.load_results.benchmark_results import BenchmarkResults
from mteb.load_results.load_results import load_results
from mteb.overview import get_tasks

@@ -63,6 +61,7 @@ def load_results(
base_results = load_results()
return base_results.select_tasks(self.tasks)


MTEB_MAIN_EN = Benchmark(
name="MTEB(eng)",
tasks=get_tasks(
104 changes: 57 additions & 47 deletions mteb/leaderboard/app.py
@@ -7,7 +7,7 @@
from gradio_rangeslider import RangeSlider

import mteb
from mteb.leaderboard.table import scores_to_table
from mteb.leaderboard.table import scores_to_tables


def load_results():
@@ -27,7 +27,7 @@ def load_results():

benchmarks = mteb.get_benchmarks()

default_benchmark = mteb.get_benchmark("MTEB(multilingual)")
default_benchmark = mteb.get_benchmark("MTEB(Multilingual)")
default_results = default_benchmark.load_results(base_results=all_results)

benchmark_select = gr.Dropdown(
@@ -60,6 +60,7 @@ def load_results():
task_select = gr.Dropdown(
default_results.task_names,
value=default_results.task_names,
allow_custom_value=True,
multiselect=True,
label="Task",
info="Select specific tasks to include",
@@ -79,44 +80,46 @@
""",
)
with gr.Group():
availability = gr.Radio(
[
("Only Open", True),
("Only Proprietary", False),
("Both", None),
],
value=None,
label="Availability",
interactive=True,
)
compatibility = gr.CheckboxGroup(
[
(
"Should be sentence-transformers compatible",
"sbert_compatible",
with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
with gr.Column():
availability = gr.Radio(
[
("Only Open", True),
("Only Proprietary", False),
("Both", None),
],
value=None,
label="Availability",
interactive=True,
)
compatibility = gr.CheckboxGroup(
[
(
"Should be sentence-transformers compatible",
"sbert_compatible",
)
],
value=[],
label="Compatibility",
interactive=True,
)
instructions = gr.Radio(
[
("Only Instruction-tuned", True),
("Only non-instruction", False),
("Both", None),
],
value=None,
label="Instructions",
interactive=True,
)
model_size = RangeSlider(
minimum=0,
maximum=8000,
value=(0, 8000),
label="Model Size (#M Parameters)",
interactive=True,
)
],
value=[],
label="Compatibility",
interactive=True,
)
instructions = gr.Radio(
[
("Only Instruction-tuned", True),
("Only non-instruction", False),
("Both", None),
],
value=None,
label="Instructions",
interactive=True,
)
model_size = RangeSlider(
minimum=0,
maximum=8000,
value=(0, 8000),
label="Model Size (#M Parameters)",
interactive=True,
)
with gr.Column(scale=2):
gr.Markdown(
"""
@@ -126,7 +129,7 @@ def load_results():
"""
)
with gr.Group():
with gr.Row(elem_classes="overflow-y-scroll h-80"):
with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
with gr.Column():
benchmark_select.render()
with gr.Accordion("Select Languages", open=False):
@@ -135,13 +138,20 @@
type_select.render()
with gr.Accordion("Select Domains", open=False):
domain_select.render()
# with gr.Accordion("Add and remove tasks:", open=False):
task_select.render()
scores = gr.State(default_results.get_scores(format="long"))
dataframe = gr.DataFrame(
scores_to_table,
inputs=[scores],
)
with gr.Accordion("Add and remove tasks:", open=False):
task_select.render()
default_scores = default_results.get_scores(format="long")
scores = gr.State(default_scores)
summary, per_task = scores_to_tables(default_scores)
with gr.Tab("Summary"):
summary_table = gr.DataFrame(summary)
with gr.Tab("Performance per task"):
per_task_table = gr.DataFrame(per_task)

@gr.on(inputs=[scores], outputs=[summary_table, per_task_table])
def update_tables(scores):
summary, per_task = scores_to_tables(scores)
return summary, per_task

@gr.on(
inputs=[benchmark_select],
50 changes: 47 additions & 3 deletions mteb/leaderboard/table.py
@@ -1,12 +1,19 @@
from __future__ import annotations

import gradio as gr
import numpy as np
import pandas as pd

from mteb.overview import get_task


def scores_to_table(scores_long: list[dict]):
def format_scores(score: float) -> float:
return score * 100


def scores_to_tables(scores_long: list[dict]):
if not scores_long:
return gr.DataFrame(), gr.DataFrame()
data = pd.DataFrame.from_records(scores_long)
data["task_type"] = data["task_name"].map(
lambda task_name: get_task(task_name).metadata.type
@@ -27,19 +34,56 @@ def scores_to_table(scores_long: list[dict]):
per_task = data.pivot(
index=["model_name", "model_revision"], columns="task_name", values="score"
)
to_remove = per_task.isna().any(axis="columns")
overall_mean = (
data.groupby(["model_name", "model_revision"])[["score"]]
.agg(np.nanmean)
.rename(columns={"score": "mean"})
)
joint_table = overall_mean.join([typed_mean, mean_per_type, per_task]).reset_index()
per_task = per_task[~to_remove]
mean_per_type = mean_per_type[~to_remove]
overall_mean = overall_mean[~to_remove]
mean_rank = per_task.rank(ascending=False, numeric_only=True).mean(
axis=1, skipna=True
)
joint_table = overall_mean.join([typed_mean, mean_per_type])
joint_table.insert(0, "mean_rank", mean_rank)
joint_table = joint_table.reset_index()
joint_table = joint_table.sort_values("mean", ascending=False)
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
)
joint_table = joint_table.rename(
columns={
"model_name": "Model",
"mean_by_task_type": "Mean by Task Type",
"mean": "Mean",
"mean_rank": "Mean Rank",
}
)
joint_table = joint_table.drop(columns=["model_revision"])
return joint_table
joint_table.insert(
0, "Rank", joint_table["Mean"].rank(ascending=False).map(int).map(str)
)
per_task = per_task.rename(
columns={
"model_name": "Model",
}
)
per_task = per_task.reset_index().drop(columns=["model_revision"])
numerics = joint_table.select_dtypes("number").columns
to_format = ["Mean", "Mean by Task Type", *mean_per_type.columns]
joint_table[to_format] = joint_table[to_format].map(format_scores)
joint_table = joint_table.style.highlight_max(
subset=to_format,
props="font-weight: bold",
).format("{:.2f}", subset=numerics)
joint_table = joint_table.highlight_min(
subset=["Mean Rank"], props="font-weight: bold"
)
numerics = per_task.select_dtypes("number").columns
per_task[numerics] = per_task[numerics].map(format_scores)
per_task = per_task.style.highlight_max(
subset=numerics, props="font-weight: bold"
).format("{:.2f}", subset=numerics)
return joint_table, per_task
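
A minimal usage sketch of the new scores_to_tables helper, assuming long-format records carry model_name, model_revision, task_name and score keys (as the pivot above implies) and task names that get_task can resolve; the model names and scores below are invented for illustration:

from mteb.leaderboard.table import scores_to_tables

# Two models scored on two real MTEB tasks; the values are made up.
scores_long = [
    {"model_name": "org/model-a", "model_revision": "main", "task_name": "Banking77Classification", "score": 0.81},
    {"model_name": "org/model-a", "model_revision": "main", "task_name": "STS12", "score": 0.74},
    {"model_name": "org/model-b", "model_revision": "main", "task_name": "Banking77Classification", "score": 0.79},
    {"model_name": "org/model-b", "model_revision": "main", "task_name": "STS12", "score": 0.77},
]

summary, per_task = scores_to_tables(scores_long)
# Both return values are pandas Stylers: the summary carries Rank, Mean, Mean Rank
# and per-type means rescaled to 0-100; the per-task table has one column per task.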