16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

import mteb
from mteb.caching import json_cache
from mteb.leaderboard.figures import performance_size_plot
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
)
citation = gr.Markdown(update_citation, inputs=[benchmark_select])
with gr.Column():
plot = gr.Plot(performance_size_plot, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all tasks in the benchmark*"
)
with gr.Tab("Performance-Size Plot"):
plot = gr.Plot(performance_size_plot, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all tasks in the benchmark*"
)
with gr.Tab("Top 5 Radar Chart"):
radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all task types in the benchmark*"
)
with gr.Tab("Summary"):
summary_table.render()
with gr.Tab("Performance per task"):
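The new layout wraps each figure in its own `gr.Tab`, and each `gr.Plot` takes a function as its value together with `inputs=[summary_table]`, so the figure re-renders whenever the summary table changes. A minimal, self-contained sketch of that pattern (not part of the PR; `toy_plot`, `table`, and the sample data are invented, and it assumes a Gradio version that supports callable component values with `inputs=`):

```python
import gradio as gr
import pandas as pd
import plotly.graph_objects as go


def toy_plot(df: pd.DataFrame) -> go.Figure:
    # Throwaway bar chart built from whatever table is currently displayed.
    return go.Figure(go.Bar(x=df["Model"], y=df["Score"]))


with gr.Blocks() as demo:
    table = gr.Dataframe(pd.DataFrame({"Model": ["a", "b"], "Score": [0.5, 0.7]}))
    with gr.Tab("Plot A"):
        gr.Plot(toy_plot, inputs=[table])  # recomputed when `table` changes
    with gr.Tab("Plot B"):
        gr.Plot(toy_plot, inputs=[table])

demo.launch()
```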
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
margin=dict(b=20, t=10, l=20, r=10), # noqa
)
return fig


TOP_N = 5
task_types = [
"BitextMining",
"Classification",
"MultilabelClassification",
"Clustering",
"PairClassification",
"Reranking",
"Retrieval",
"STS",
"Summarization",
# "InstructionRetrieval",
# Not displayed, because the scores are negative,
# doesn't work well with the radar chart.
"Speed",
]

line_colors = [
"#EE4266",
"#00a6ed",
"#ECA72C",
"#B42318",
"#3CBBB1",
]
fill_colors = [
"rgba(238,66,102,0.2)",
"rgba(0,166,237,0.2)",
"rgba(236,167,44,0.2)",
"rgba(180,35,24,0.2)",
"rgba(60,187,177,0.2)",
]


def radar_chart(df: pd.DataFrame) -> go.Figure:
df = df.copy()
df["Model"] = df["Model"].map(parse_model_name)
# Remove whitespace
task_type_columns = [
column for column in df.columns if "".join(column.split()) in task_types
]
df = df[["Model", *task_type_columns]].set_index("Model")
df = df.replace("", np.nan)
df = df.dropna()
df = df.head(TOP_N)
df = df.iloc[::-1]
fig = go.Figure()
for i, (model_name, row) in enumerate(df.iterrows()):
fig.add_trace(
go.Scatterpolar(
name=model_name,
r=[row[task_type] for task_type in task_type_columns]
+ [row[task_type_columns[0]]],
theta=task_type_columns + [task_type_columns[0]],
showlegend=True,
mode="lines",
line=dict(width=2, color=line_colors[i]),
fill="toself",
fillcolor=fill_colors[i],
)
)
fig.update_layout(
font=dict(size=16, color="black"), # noqa
template="plotly_white",
polar=dict(
radialaxis=dict(
visible=True,
gridcolor="black",
linecolor="rgba(0,0,0,0)",
gridwidth=1,
showticklabels=False,
ticks="",
),
angularaxis=dict(
gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
),
),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.6,
xanchor="left",
x=-0.05,
entrywidthmode="fraction",
entrywidth=1 / 5,
),
)
return fig
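`radar_chart` closes each polygon by appending the first task type's score and angle to the end of `r` and `theta`. A standalone Plotly sketch of that closing trick, using toy scores rather than leaderboard data:

```python
# Toy illustration (not from the PR): repeating the first point so the
# Scatterpolar outline and fill form a complete polygon.
import plotly.graph_objects as go

task_types = ["Classification", "Clustering", "Retrieval", "STS"]
scores = [0.71, 0.48, 0.55, 0.80]

fig = go.Figure(
    go.Scatterpolar(
        r=scores + scores[:1],              # repeat first value at the end
        theta=task_types + task_types[:1],  # repeat first category at the end
        mode="lines",
        fill="toself",
    )
)
fig.update_layout(template="plotly_white")
fig.show()
```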
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
task_names_per_type = defaultdict(list)
for task_name, task_type in zip(df["task_name"], df["task_type"]):
task_names_per_type[task_type].append(task_name)
groups = df.groupby(["model_name", "model_revision"])
groups = df.groupby("model_name")
records = []
for (model_name, model_revision), group_data in groups:
for (model_name), group_data in groups:
name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
for task_type, task_names in task_names_per_type.items():
type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
records.append(
dict( # noqa
model_name=model_name,
model_revision=model_revision,
task_type=task_type,
score=type_mean,
)
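Dropping `model_revision` from the `groupby` also changes what the loop unpacks: grouping by a list of columns yields tuple keys, while grouping by a single column yields scalar keys. A toy pandas example of the difference (not taken from the PR):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m2"],
        "model_revision": ["r1", "r1", "r2"],
        "score": [0.5, 0.6, 0.7],
    }
)

# List of keys -> each group key is a tuple, so the old loop unpacked two values.
for (name, revision), group in df.groupby(["model_name", "model_revision"]):
    print(name, revision, len(group))

# Single key -> each group key is a scalar, matching the new loop.
for name, group in df.groupby("model_name"):
    print(name, len(group))
```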
@@ -125,24 +124,23 @@ def scores_to_tables(
)
mean_per_type = get_means_per_types(data)
mean_per_type = mean_per_type.pivot(
index=["model_name", "model_revision"], columns="task_type", values="score"
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
split_on_capital(column) for column in mean_per_type.columns
]
per_task = data.pivot(
index=["model_name", "model_revision"], columns="task_name", values="score"
)
per_task = data.pivot(index="model_name", columns="task_name", values="score")
to_remove = per_task.isna().all(axis="columns")
if search_query:
names = per_task.index.get_level_values("model_name")
names = pd.Series(names, index=per_task.index)
to_remove |= ~names.str.contains(search_query, regex=True)
models_to_remove = list(per_task[to_remove].index)
typed_mean = mean_per_type.mean(skipna=False, axis=1)
overall_mean = per_task.mean(skipna=False, axis=1)
joint_table = mean_per_type.copy()
per_task = per_task[~to_remove]
joint_table = joint_table[~to_remove]
per_task = per_task.drop(models_to_remove, axis=0)
joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean", overall_mean)
joint_table.insert(1, "mean_by_task_type", typed_mean)
joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@ def scores_to_tables(
model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
per_task = per_task.loc[
joint_table.set_index(["model_name", "model_revision"]).index
]
joint_table = joint_table.drop(columns=["model_revision"])
per_task = per_task.loc[joint_table.set_index("model_name").index]
# Removing HF organization from model
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@
"mean": "Mean (Task)",
}
)
per_task = per_task.reset_index().drop(columns=["model_revision"])
per_task = per_task.reset_index()
per_task["model_name"] = per_task["model_name"].map(
lambda name: name.split("/")[-1]
)
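With the pivot index reduced to `model_name`, `per_task` and `joint_table` carry a flat index of model names, so fully-missing models can be collected as plain labels and removed with `drop` instead of boolean masking over a MultiIndex. A toy sketch of that flat-index workflow (invented data, not from the PR):

```python
import pandas as pd

data = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m2", "m2"],
        "task_name": ["STS", "Retrieval", "STS", "Retrieval"],
        "score": [0.8, 0.6, None, None],
    }
)

# Single-column index -> per_task gets a flat Index of model names, not a MultiIndex.
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Models with no score on any task become plain labels that can be dropped by name.
to_remove = per_task.isna().all(axis="columns")
models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)
print(per_task)  # only "m1" remains
```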