From e17e9ce3755001de3cea3b9a855f518e4bbf894c Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 17:01:13 +0800 Subject: [PATCH 1/7] feat - Combine Plots and Tables into a Single Tab #3009 --- mteb/leaderboard/app.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 2dcb4d96be..03b8f04083 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -292,7 +292,7 @@ def get_leaderboard_app() -> gr.Blocks: scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(scale=1): + with gr.Column(): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], @@ -301,17 +301,6 @@ def get_leaderboard_app() -> gr.Blocks: citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(scale=2): - with gr.Tab("Performance per Model Size"): - plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 - gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" - ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 - gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" - ) with gr.Accordion("Customize this Benchmark", open=False): with gr.Column(): @@ -402,6 +391,18 @@ def get_leaderboard_app() -> gr.Blocks: open=False, ): gr.Markdown(FAQ) + + with gr.Tab("Performance per Model Size"): + plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all tasks in the benchmark*" + ) + with gr.Tab("Performance per Task Type (Radar Chart)"): + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all task types in the benchmark*" + ) + with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") From d7b1f3b1ae32f3b0a1d6825e9cf540724f9dc265 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:09:01 +0800 Subject: [PATCH 2/7] feat - Resize the plot to make it more readable --- mteb/leaderboard/app.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 03b8f04083..193fa4b514 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -392,16 +392,19 @@ def get_leaderboard_app() -> gr.Blocks: ): gr.Markdown(FAQ) - with gr.Tab("Performance per Model Size"): + with gr.Tab("Performance per Model Size") as plot_tab: plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all tasks in the benchmark*" ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) + + with gr.Tab("Performance per Task Type (Radar Chart)") as radar_plot_tab: + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all task types in the 
benchmark*" ) + radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) with gr.Tab("Performance per task"): per_task_table.render() From 6d9c5b910f28660ef17b6dcaef35415a3e12faf2 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:09:45 +0800 Subject: [PATCH 3/7] feat - Remove the (radar chart) --- mteb/leaderboard/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 193fa4b514..f497447280 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -399,7 +399,7 @@ def get_leaderboard_app() -> gr.Blocks: ) plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) - with gr.Tab("Performance per Task Type (Radar Chart)") as radar_plot_tab: + with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all task types in the benchmark*" From ee56b742db9daf855aceb40506280ff59781703a Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:10:59 +0800 Subject: [PATCH 4/7] feat - Add a comment stating that it only shows the Top 5 models in the table. --- mteb/leaderboard/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index f497447280..53b0539f82 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -395,14 +395,14 @@ def get_leaderboard_app() -> gr.Blocks: with gr.Tab("Performance per Model Size") as plot_tab: plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" + "*We only display TOP 5 models that have been run on all tasks in the benchmark*" ) plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" + "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) From b327a3d9af3c1e85ff38591244f7e57563479c45 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 19 Aug 2025 21:52:55 +0800 Subject: [PATCH 5/7] feat - adjust layout --- mteb/leaderboard/app.py | 168 ++++++++++++++++++++-------------------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 53b0539f82..2d0f0f2228 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -28,7 +28,6 @@ logger = logging.getLogger(__name__) - LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages}) ALL_MODELS = {meta.name for meta in mteb.get_model_metas()} @@ -54,8 +53,9 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: } ) base_url = request.request.base_url + md = "You can also share this benchmark using the following link:\n" url = f"{base_url}?{params}" - md = f"```\n{url}\n```" + md += f"```\n{url}\n```" return md @@ -73,7 +73,8 @@ def download_table(table: pd.DataFrame) -> str: def update_citation(benchmark_name: str) -> str: benchmark = mteb.get_benchmark(benchmark_name) if benchmark.citation is not None: - 
citation = f"```bibtex\n{benchmark.citation}\n```" + citation = "To cite this work, please use the following reference:\n" + citation += f"```bibtex\n{benchmark.citation}\n```" else: citation = "" return citation @@ -292,92 +293,93 @@ def get_leaderboard_app() -> gr.Blocks: scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(): + with gr.Column(scale=1): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], ) - with gr.Accordion("Cite this benchmark:", open=False): + + with gr.Column(scale=1): + with gr.Accordion("Cite and share this benchmark:", open=False): citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 - with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Accordion("Customize this Benchmark", open=False): - with gr.Column(): - with gr.Row(): - type_select.render() - with gr.Row(): - domain_select.render() - with gr.Row(): - modality_select.render() - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - lang_select.render() - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - task_select.render() - - with gr.Accordion("Advanced Model Filters", open=False): - with gr.Group(): - with gr.Row(elem_classes=""): + with gr.Accordion("Customize this Benchmark", open=False, ): with gr.Column(): - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", + with gr.Row(): + type_select.render() + with gr.Row(): + domain_select.render() + with gr.Row(): + modality_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + lang_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + task_select.render() + + with gr.Accordion("Advanced Model Filters", open=False): + with gr.Group(): + with gr.Row(elem_classes=""): + with gr.Column(): + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + + max_model_size = gr.Radio( + [ + ("<100M", 100), + ("<500M", 500), + ("<1B", 1000), + ("<5B", 5000), + ("<10B", 10000), + (">10B", MAX_MODEL_SIZE), + ], + value=MAX_MODEL_SIZE, + label="Model Parameters", + interactive=True, ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", - interactive=True, - ) - with gr.Column(): - zero_shot = gr.Radio( - [ - ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", 
"remove_unknown"), - ("Allow All", "allow_all"), - ], - value="allow_all", - label="Zero-shot", - interactive=True, - ) - - max_model_size = gr.Radio( - [ - ("<100M", 100), - ("<500M", 500), - ("<1B", 1000), - ("<5B", 5000), - ("<10B", 10000), - (">10B", MAX_MODEL_SIZE), - ], - value=MAX_MODEL_SIZE, - label="Model Parameters", - interactive=True, - ) with gr.Tab("Summary"): summary_table.render() @@ -387,8 +389,8 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Accordion( - "Frequently Asked Questions", - open=False, + "Frequently Asked Questions", + open=False, ): gr.Markdown(FAQ) @@ -400,7 +402,7 @@ def get_leaderboard_app() -> gr.Blocks: plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) with gr.Tab("Performance per Task Type") as radar_plot_tab: - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) From 77a6f1345a0a9ef31249d442144d6adc7909cf97 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 22 Aug 2025 16:27:05 +0200 Subject: [PATCH 6/7] Update mteb/leaderboard/app.py --- mteb/leaderboard/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 2d0f0f2228..6c04c6a9c5 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -300,7 +300,7 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Column(scale=1): - with gr.Accordion("Cite and share this benchmark:", open=False): + with gr.Accordion("Cite and share this benchmark", open=False): citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) From ef5c57c1e3a4734b0cdd135179ecb1929a3c56fe Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 22 Aug 2025 16:46:46 +0200 Subject: [PATCH 7/7] format --- mteb/leaderboard/app.py | 17 ++++++++++++----- mteb/leaderboard/benchmark_selector.py | 1 - 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 6c04c6a9c5..3c0921ab05 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -304,7 +304,10 @@ def get_leaderboard_app() -> gr.Blocks: citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Accordion("Customize this Benchmark", open=False, ): + with gr.Accordion( + "Customize this Benchmark", + open=False, + ): with gr.Column(): with gr.Row(): type_select.render() @@ -389,8 +392,8 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Accordion( - "Frequently Asked Questions", - open=False, + "Frequently Asked Questions", + open=False, ): gr.Markdown(FAQ) @@ -399,14 +402,18 @@ def get_leaderboard_app() -> gr.Blocks: gr.Markdown( "*We only display TOP 5 models that have been run on all tasks in the benchmark*" ) - plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) + plot_tab.select( + performance_size_plot, inputs=[summary_table], outputs=[plot] + ) with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) - radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) + radar_plot_tab.select( + 
radar_chart, inputs=[summary_table], outputs=[radar_plot] + ) with gr.Tab("Performance per task"): per_task_table.render() diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 4906e9126d..4ac31ccc28 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -7,7 +7,6 @@ import mteb from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual from mteb import Benchmark -from mteb.benchmarks.benchmarks import MTEB_multilingual DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name
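
Note for reviewers: patch 2 introduces (and patch 7 merely reformats) the `Tab.select` re-render hook for the plots that patch 1 moved out of the side column and into their own tabs, presumably so that a plot first drawn while its tab was hidden is laid out correctly once the tab is actually opened (the rationale given in patch 2: "Resize the plot to make it more readable"). The sketch below is a minimal, self-contained illustration of that pattern and is not part of the patch itself; `make_plot`, the sample data, and the component names are hypothetical stand-ins for `performance_size_plot` / `radar_chart` and `summary_table` in `mteb/leaderboard/app.py`.

```python
# Minimal sketch (illustrative only): re-render a plot when its tab is selected.
# Assumes gradio and plotly are installed; make_plot and the sample data are
# hypothetical stand-ins for performance_size_plot / radar_chart and summary_table.
import gradio as gr
import plotly.express as px


def make_plot(table: list[list[float]]):
    # Build a simple scatter of (model size, score) pairs from a 2-column table.
    sizes = [row[0] for row in table]
    scores = [row[1] for row in table]
    return px.scatter(
        x=sizes, y=scores, labels={"x": "Model size (M params)", "y": "Score"}
    )


with gr.Blocks() as demo:
    # Shared state standing in for the leaderboard's summary_table component.
    table_state = gr.State([[100, 0.61], [560, 0.66], [7000, 0.72]])

    with gr.Tab("Summary"):
        gr.Markdown("The summary table would render here.")

    with gr.Tab("Performance per Model Size") as plot_tab:
        # A callable value with `inputs=` is evaluated on load, as in the patch.
        plot = gr.Plot(make_plot, inputs=[table_state])
    # Re-render the plot whenever its tab is selected, so it reflects any state
    # changes (and gets a correct layout) after being hidden behind another tab.
    plot_tab.select(make_plot, inputs=[table_state], outputs=[plot])

if __name__ == "__main__":
    demo.launch()
```

The overall design mirrors the patch series: keeping the plots as sibling tabs of the summary and per-task tables replaces the two-column layout removed in patch 1, while the `select` handlers refresh each plot only when a user opens its tab.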