diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index c51dc7a502..a6dd1c7325 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -9,7 +9,7 @@
 import mteb
 from mteb.caching import json_cache
-from mteb.leaderboard.figures import performance_size_plot
+from mteb.leaderboard.figures import performance_size_plot, radar_chart
 from mteb.leaderboard.table import scores_to_tables
@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
             )
             citation = gr.Markdown(update_citation, inputs=[benchmark_select])
         with gr.Column():
-            plot = gr.Plot(performance_size_plot, inputs=[summary_table])
-            gr.Markdown(
-                "*We only display models that have been run on all tasks in the benchmark*"
-            )
+            with gr.Tab("Performance-Size Plot"):
+                plot = gr.Plot(performance_size_plot, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all tasks in the benchmark*"
+                )
+            with gr.Tab("Top 5 Radar Chart"):
+                radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
+                gr.Markdown(
+                    "*We only display models that have been run on all task types in the benchmark*"
+                )
     with gr.Tab("Summary"):
         summary_table.render()
     with gr.Tab("Performance per task"):
diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py
index 373bcd00c6..9f3e73f7a4 100644
--- a/mteb/leaderboard/figures.py
+++ b/mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
         margin=dict(b=20, t=10, l=20, r=10),  # noqa
     )
     return fig
+
+
+TOP_N = 5
+task_types = [
+    "BitextMining",
+    "Classification",
+    "MultilabelClassification",
+    "Clustering",
+    "PairClassification",
+    "Reranking",
+    "Retrieval",
+    "STS",
+    "Summarization",
+    # "InstructionRetrieval",
+    # Not displayed, because the scores are negative,
+    # doesn't work well with the radar chart.
+    "Speed",
+]
+
+line_colors = [
+    "#EE4266",
+    "#00a6ed",
+    "#ECA72C",
+    "#B42318",
+    "#3CBBB1",
+]
+fill_colors = [
+    "rgba(238,66,102,0.2)",
+    "rgba(0,166,237,0.2)",
+    "rgba(236,167,44,0.2)",
+    "rgba(180,35,24,0.2)",
+    "rgba(60,187,177,0.2)",
+]
+
+
+def radar_chart(df: pd.DataFrame) -> go.Figure:
+    df = df.copy()
+    df["Model"] = df["Model"].map(parse_model_name)
+    # Remove whitespace
+    task_type_columns = [
+        column for column in df.columns if "".join(column.split()) in task_types
+    ]
+    df = df[["Model", *task_type_columns]].set_index("Model")
+    df = df.replace("", np.nan)
+    df = df.dropna()
+    df = df.head(TOP_N)
+    df = df.iloc[::-1]
+    fig = go.Figure()
+    for i, (model_name, row) in enumerate(df.iterrows()):
+        fig.add_trace(
+            go.Scatterpolar(
+                name=model_name,
+                r=[row[task_type] for task_type in task_type_columns]
+                + [row[task_type_columns[0]]],
+                theta=task_type_columns + [task_type_columns[0]],
+                showlegend=True,
+                mode="lines",
+                line=dict(width=2, color=line_colors[i]),
+                fill="toself",
+                fillcolor=fill_colors[i],
+            )
+        )
+    fig.update_layout(
+        font=dict(size=16, color="black"),  # noqa
+        template="plotly_white",
+        polar=dict(
+            radialaxis=dict(
+                visible=True,
+                gridcolor="black",
+                linecolor="rgba(0,0,0,0)",
+                gridwidth=1,
+                showticklabels=False,
+                ticks="",
+            ),
+            angularaxis=dict(
+                gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
+            ),
+        ),
+        legend=dict(
+            orientation="h",
+            yanchor="bottom",
+            y=-0.6,
+            xanchor="left",
+            x=-0.05,
+            entrywidthmode="fraction",
+            entrywidth=1 / 5,
+        ),
+    )
+    return fig
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index 9856493c74..2e5a3e3633 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
     task_names_per_type = defaultdict(list)
     for task_name, task_type in zip(df["task_name"], df["task_type"]):
         task_names_per_type[task_type].append(task_name)
-    groups = df.groupby(["model_name", "model_revision"])
+    groups = df.groupby("model_name")
     records = []
-    for (model_name, model_revision), group_data in groups:
+    for (model_name), group_data in groups:
         name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
         for task_type, task_names in task_names_per_type.items():
             type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
             records.append(
                 dict(  # noqa
                     model_name=model_name,
-                    model_revision=model_revision,
                     task_type=task_type,
                     score=type_mean,
                 )
@@ -125,24 +124,23 @@ def scores_to_tables(
     )
     mean_per_type = get_means_per_types(data)
     mean_per_type = mean_per_type.pivot(
-        index=["model_name", "model_revision"], columns="task_type", values="score"
+        index="model_name", columns="task_type", values="score"
     )
     mean_per_type.columns = [
         split_on_capital(column) for column in mean_per_type.columns
     ]
-    per_task = data.pivot(
-        index=["model_name", "model_revision"], columns="task_name", values="score"
-    )
+    per_task = data.pivot(index="model_name", columns="task_name", values="score")
     to_remove = per_task.isna().all(axis="columns")
     if search_query:
         names = per_task.index.get_level_values("model_name")
         names = pd.Series(names, index=per_task.index)
         to_remove |= ~names.str.contains(search_query, regex=True)
+    models_to_remove = list(per_task[to_remove].index)
     typed_mean = mean_per_type.mean(skipna=False, axis=1)
     overall_mean = per_task.mean(skipna=False, axis=1)
     joint_table = mean_per_type.copy()
-    per_task = per_task[~to_remove]
-    joint_table = joint_table[~to_remove]
+    per_task = per_task.drop(models_to_remove, axis=0)
+    joint_table = joint_table.drop(models_to_remove, axis=0)
     joint_table.insert(0, "mean", overall_mean)
     joint_table.insert(1, "mean_by_task_type", typed_mean)
     joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@ def scores_to_tables(
         model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
     )
     joint_table = joint_table.sort_values("borda_rank", ascending=True)
-    per_task = per_task.loc[
-        joint_table.set_index(["model_name", "model_revision"]).index
-    ]
-    joint_table = joint_table.drop(columns=["model_revision"])
+    per_task = per_task.loc[joint_table.set_index("model_name").index]
     # Removing HF organization from model
     joint_table["model_name"] = joint_table["model_name"].map(
         lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@ def scores_to_tables(
             "mean": "Mean (Task)",
         }
     )
-    per_task = per_task.reset_index().drop(columns=["model_revision"])
+    per_task = per_task.reset_index()
     per_task["model_name"] = per_task["model_name"].map(
         lambda name: name.split("/")[-1]
     )