16 changes: 11 additions & 5 deletions mteb/leaderboard/app.py
@@ -9,7 +9,7 @@

import mteb
from mteb.caching import json_cache
from mteb.leaderboard.figures import performance_size_plot
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import scores_to_tables


@@ -218,10 +218,16 @@ def update_task_info(task_names: str) -> gr.DataFrame:
)
citation = gr.Markdown(update_citation, inputs=[benchmark_select])
with gr.Column():
plot = gr.Plot(performance_size_plot, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all tasks in the benchmark*"
)
with gr.Tab("Performance-Size Plot"):
plot = gr.Plot(performance_size_plot, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all tasks in the benchmark*"
)
with gr.Tab("Top 5 Radar Chart"):
radar_plot = gr.Plot(radar_chart, inputs=[summary_table])
gr.Markdown(
"*We only display models that have been run on all task types in the benchmark*"
)
with gr.Tab("Summary"):
summary_table.render()
with gr.Tab("Performance per task"):
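The new layout wraps each figure in its own `gr.Tab`, and each `gr.Plot` takes a function as its value together with `inputs=[summary_table]`, so the figure re-renders whenever the summary table changes. A minimal, self-contained sketch of that pattern (not part of the PR; `toy_plot`, `table`, and the sample data are invented, and it assumes a Gradio version that supports callable component values with `inputs=`):

```python
import gradio as gr
import pandas as pd
import plotly.graph_objects as go


def toy_plot(df: pd.DataFrame) -> go.Figure:
    # Throwaway bar chart built from whatever table is currently displayed.
    return go.Figure(go.Bar(x=df["Model"], y=df["Score"]))


with gr.Blocks() as demo:
    table = gr.Dataframe(pd.DataFrame({"Model": ["a", "b"], "Score": [0.5, 0.7]}))
    with gr.Tab("Plot A"):
        gr.Plot(toy_plot, inputs=[table])  # recomputed when `table` changes
    with gr.Tab("Plot B"):
        gr.Plot(toy_plot, inputs=[table])

demo.launch()
```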
89 changes: 89 additions & 0 deletions mteb/leaderboard/figures.py
@@ -97,3 +97,92 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
margin=dict(b=20, t=10, l=20, r=10), # noqa
)
return fig


TOP_N = 5
task_types = [
"BitextMining",
"Classification",
"MultilabelClassification",
"Clustering",
"PairClassification",
"Reranking",
"Retrieval",
"STS",
"Summarization",
# "InstructionRetrieval",
# Not displayed, because the scores are negative,
# doesn't work well with the radar chart.
"Speed",
]

line_colors = [
"#EE4266",
"#00a6ed",
"#ECA72C",
"#B42318",
"#3CBBB1",
]
fill_colors = [
"rgba(238,66,102,0.2)",
"rgba(0,166,237,0.2)",
"rgba(236,167,44,0.2)",
"rgba(180,35,24,0.2)",
"rgba(60,187,177,0.2)",
]


def radar_chart(df: pd.DataFrame) -> go.Figure:
df = df.copy()
df["Model"] = df["Model"].map(parse_model_name)
# Remove whitespace
task_type_columns = [
column for column in df.columns if "".join(column.split()) in task_types
]
df = df[["Model", *task_type_columns]].set_index("Model")
df = df.replace("", np.nan)
df = df.dropna()
df = df.head(TOP_N)
df = df.iloc[::-1]
fig = go.Figure()
for i, (model_name, row) in enumerate(df.iterrows()):
fig.add_trace(
go.Scatterpolar(
name=model_name,
r=[row[task_type] for task_type in task_type_columns]
+ [row[task_type_columns[0]]],
theta=task_type_columns + [task_type_columns[0]],
showlegend=True,
mode="lines",
line=dict(width=2, color=line_colors[i]),
fill="toself",
fillcolor=fill_colors[i],
)
)
fig.update_layout(
font=dict(size=16, color="black"), # noqa
template="plotly_white",
polar=dict(
radialaxis=dict(
visible=True,
gridcolor="black",
linecolor="rgba(0,0,0,0)",
gridwidth=1,
showticklabels=False,
ticks="",
),
angularaxis=dict(
gridcolor="black", gridwidth=1.5, linecolor="rgba(0,0,0,0)"
),
),
legend=dict(
orientation="h",
yanchor="bottom",
y=-0.6,
xanchor="left",
x=-0.05,
entrywidthmode="fraction",
entrywidth=1 / 5,
),
)
return fig
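`radar_chart` closes each polygon by appending the first task type's score and angle to the end of `r` and `theta`. A standalone Plotly sketch of that closing trick, using toy scores rather than leaderboard data:

```python
# Toy illustration (not from the PR): repeating the first point so the
# Scatterpolar outline and fill form a complete polygon.
import plotly.graph_objects as go

task_types = ["Classification", "Clustering", "Retrieval", "STS"]
scores = [0.71, 0.48, 0.55, 0.80]

fig = go.Figure(
    go.Scatterpolar(
        r=scores + scores[:1],              # repeat first value at the end
        theta=task_types + task_types[:1],  # repeat first category at the end
        mode="lines",
        fill="toself",
    )
)
fig.update_layout(template="plotly_white")
fig.show()
```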
23 changes: 9 additions & 14 deletions mteb/leaderboard/table.py
@@ -80,9 +80,9 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
task_names_per_type = defaultdict(list)
for task_name, task_type in zip(df["task_name"], df["task_type"]):
task_names_per_type[task_type].append(task_name)
groups = df.groupby(["model_name", "model_revision"])
groups = df.groupby("model_name")
records = []
for (model_name, model_revision), group_data in groups:
for (model_name), group_data in groups:
name_to_score = dict(zip(group_data["task_name"], group_data["score"]))
for task_type, task_names in task_names_per_type.items():
type_mean = np.mean(
@@ -91,7 +91,6 @@ def get_means_per_types(df: pd.DataFrame) -> pd.DataFrame:
records.append(
dict( # noqa
model_name=model_name,
model_revision=model_revision,
task_type=task_type,
score=type_mean,
)
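Dropping `model_revision` from the `groupby` also changes what the loop unpacks: grouping by a list of columns yields tuple keys, while grouping by a single column yields scalar keys. A toy pandas example of the difference (not taken from the PR):

```python
import pandas as pd

df = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m2"],
        "model_revision": ["r1", "r1", "r2"],
        "score": [0.5, 0.6, 0.7],
    }
)

# List of keys -> each group key is a tuple, so the old loop unpacked two values.
for (name, revision), group in df.groupby(["model_name", "model_revision"]):
    print(name, revision, len(group))

# Single key -> each group key is a scalar, matching the new loop.
for name, group in df.groupby("model_name"):
    print(name, len(group))
```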
@@ -125,24 +124,23 @@ def scores_to_tables(
)
mean_per_type = get_means_per_types(data)
mean_per_type = mean_per_type.pivot(
index=["model_name", "model_revision"], columns="task_type", values="score"
index="model_name", columns="task_type", values="score"
)
mean_per_type.columns = [
split_on_capital(column) for column in mean_per_type.columns
]
per_task = data.pivot(
index=["model_name", "model_revision"], columns="task_name", values="score"
)
per_task = data.pivot(index="model_name", columns="task_name", values="score")
to_remove = per_task.isna().all(axis="columns")
if search_query:
names = per_task.index.get_level_values("model_name")
names = pd.Series(names, index=per_task.index)
to_remove |= ~names.str.contains(search_query, regex=True)
models_to_remove = list(per_task[to_remove].index)
typed_mean = mean_per_type.mean(skipna=False, axis=1)
overall_mean = per_task.mean(skipna=False, axis=1)
joint_table = mean_per_type.copy()
per_task = per_task[~to_remove]
joint_table = joint_table[~to_remove]
per_task = per_task.drop(models_to_remove, axis=0)
joint_table = joint_table.drop(models_to_remove, axis=0)
joint_table.insert(0, "mean", overall_mean)
joint_table.insert(1, "mean_by_task_type", typed_mean)
joint_table["borda_rank"] = get_borda_rank(per_task)
@@ -166,10 +164,7 @@ def scores_to_tables(
model_metas.map(lambda m: format_n_parameters(m.n_parameters)),
)
joint_table = joint_table.sort_values("borda_rank", ascending=True)
per_task = per_task.loc[
joint_table.set_index(["model_name", "model_revision"]).index
]
joint_table = joint_table.drop(columns=["model_revision"])
per_task = per_task.loc[joint_table.set_index("model_name").index]
# Removing HF organization from model
joint_table["model_name"] = joint_table["model_name"].map(
lambda name: name.split("/")[-1]
@@ -189,7 +184,7 @@
"mean": "Mean (Task)",
}
)
per_task = per_task.reset_index().drop(columns=["model_revision"])
per_task = per_task.reset_index()
per_task["model_name"] = per_task["model_name"].map(
lambda name: name.split("/")[-1]
)
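With the pivot index reduced to `model_name`, `per_task` and `joint_table` carry a flat index of model names, so fully-missing models can be collected as plain labels and removed with `drop` instead of boolean masking over a MultiIndex. A toy sketch of that flat-index workflow (invented data, not from the PR):

```python
import pandas as pd

data = pd.DataFrame(
    {
        "model_name": ["m1", "m1", "m2", "m2"],
        "task_name": ["STS", "Retrieval", "STS", "Retrieval"],
        "score": [0.8, 0.6, None, None],
    }
)

# Single-column index -> per_task gets a flat Index of model names, not a MultiIndex.
per_task = data.pivot(index="model_name", columns="task_name", values="score")

# Models with no score on any task become plain labels that can be dropped by name.
to_remove = per_task.isna().all(axis="columns")
models_to_remove = list(per_task[to_remove].index)
per_task = per_task.drop(models_to_remove, axis=0)
print(per_task)  # only "m1" remains
```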