diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index e3833b5ce3..225ecb44ca 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -20,6 +20,7 @@
from mteb.benchmarks.benchmarks import MTEB_multilingual
from mteb.custom_validators import MODALITIES
from mteb.languages import ISO_TO_LANGUAGE
+from mteb.leaderboard.benchmark_selector import BENCHMARK_ENTRIES, make_selector
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import create_tables
@@ -104,7 +105,7 @@ def update_description(
benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str]
) -> str:
benchmark = mteb.get_benchmark(benchmark_name)
- description = f"## {benchmark.name}\n{benchmark.description}\n"
+ description = f"{benchmark.description}\n"
n_languages = len(languages)
n_task_types = len(task_types)
n_tasks = len(benchmark.tasks)
@@ -156,7 +157,13 @@ def update_task_info(task_names: str) -> gr.DataFrame:
}
)
df = df.drop(columns="reference")
- return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1))
+ return gr.DataFrame(
+ df,
+ datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
+ show_copy_button=True,
+ show_fullscreen_button=True,
+ show_search="filter",
+ )
# Model sizes in million parameters
@@ -235,13 +242,6 @@ def get_leaderboard_app() -> gr.Blocks:
summary_table, per_task_table = create_tables(
[entry for entry in default_scores if entry["model_name"] in filtered_models]
)
-
- benchmark_select = gr.Dropdown(
- [bench.name for bench in benchmarks],
- value=default_benchmark.name,
- label="Prebuilt Benchmarks",
- info="Select one of our expert-selected benchmarks from MTEB publications.",
- )
lang_select = gr.Dropdown(
ISO_TO_LANGUAGE,
value=sorted(default_results.languages),
@@ -284,116 +284,46 @@ def get_leaderboard_app() -> gr.Blocks:
"""
- with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo:
+ with gr.Blocks(
+ fill_width=True,
+ theme=gr.themes.Soft(
+ font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+ ),
+ head=head,
+ ) as demo:
+ with gr.Sidebar(
+ position="left",
+ label="Benchmark Selection and Customization",
+ visible=True,
+ width="25%",
+ ):
+ gr.Markdown("## Select Benchmark")
+ benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
gr.Markdown(
"""
## Embedding Leaderboard
- This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️
-
- > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated.
+ This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️
"""
)
+ gr.Markdown(
+                lambda name: (
+                    f"<center> <h1> {name} </h1> </center>"
+                ),
+ inputs=benchmark_select,
+ )
- with gr.Row():
- with gr.Column(scale=5):
- gr.Markdown(
- "### Benchmarks\n"
- "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs."
- )
- with gr.Group():
- with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
- with gr.Column():
- benchmark_select.render()
- with gr.Accordion("Select Languages", open=False):
- lang_select.render()
- with gr.Accordion("Select Task Types", open=False):
- type_select.render()
- with gr.Accordion("Select Domains", open=False):
- domain_select.render()
- with gr.Accordion("Select Modalities", open=False):
- modality_select.render()
- with gr.Accordion("Add and remove tasks:", open=False):
- task_select.render()
- with gr.Column(scale=8):
- gr.Markdown(
- """
- ### Model Selection
- Select models to rank based on an assortment of criteria.
- """,
- )
- with gr.Group():
- with gr.Row():
- searchbar = gr.Textbox(
- label="Search Models",
- info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)",
- interactive=True,
- )
- compatibility = gr.CheckboxGroup(
- [
- (
- "Should be sentence-transformers compatible",
- "Sentence Transformers",
- )
- ],
- value=[],
- label="Compatibility",
- interactive=True,
- )
- with gr.Row(elem_classes=""):
- with gr.Column():
- availability = gr.Radio(
- [
- ("Only Open", True),
- ("Only Proprietary", False),
- ("Both", None),
- ],
- value=None,
- label="Availability",
- interactive=True,
- )
- instructions = gr.Radio(
- [
- ("Only Instruction-tuned", True),
- ("Only non-instruction", False),
- ("Both", None),
- ],
- value=None,
- label="Instructions",
- interactive=True,
- )
- with gr.Column():
- zero_shot = gr.Radio(
- [
- (
- "Only Zero-shot",
- "only_zero_shot",
- ),
- ("Remove Unknown", "remove_unknown"),
- ("Allow All", "allow_all"),
- ],
- value="allow_all",
- label="Zero-shot",
- interactive=True,
- )
- model_size = RangeSlider(
- minimum=MIN_MODEL_SIZE,
- maximum=MAX_MODEL_SIZE,
- value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE),
- label="Model Size (#M Parameters)",
- )
scores = gr.State(default_scores)
models = gr.State(filtered_models)
with gr.Row():
- with gr.Column():
+ with gr.Column(scale=1):
description = gr.Markdown( # noqa: F841
update_description,
inputs=[benchmark_select, lang_select, type_select, domain_select],
)
- citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841
+ with gr.Accordion("Cite this benchmark:", open=False):
+ citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841
with gr.Accordion("Share this benchmark:", open=False):
gr.Markdown(produce_benchmark_link, inputs=[benchmark_select])
- with gr.Column():
+ with gr.Column(scale=2):
with gr.Tab("Performance per Model Size"):
plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841
gr.Markdown(
@@ -404,6 +334,76 @@ def get_leaderboard_app() -> gr.Blocks:
gr.Markdown(
"*We only display models that have been run on all task types in the benchmark*"
)
+
+ with gr.Accordion("Customize this Benchmark", open=False):
+ with gr.Column():
+ with gr.Row():
+ type_select.render()
+ with gr.Row():
+ domain_select.render()
+ with gr.Row():
+ modality_select.render()
+ with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+ lang_select.render()
+ with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+ task_select.render()
+
+ with gr.Accordion("Advanced Model Filters", open=False):
+ with gr.Group():
+ with gr.Row(elem_classes=""):
+ with gr.Column():
+ compatibility = gr.CheckboxGroup(
+ [
+ (
+ "Should be sentence-transformers compatible",
+ "Sentence Transformers",
+ )
+ ],
+ value=[],
+ label="Compatibility",
+ interactive=True,
+ )
+ availability = gr.Radio(
+ [
+ ("Only Open", True),
+ ("Only Proprietary", False),
+ ("Both", None),
+ ],
+ value=None,
+ label="Availability",
+ interactive=True,
+ )
+ instructions = gr.Radio(
+ [
+ ("Only Instruction-tuned", True),
+ ("Only non-instruction", False),
+ ("Both", None),
+ ],
+ value=None,
+ label="Instructions",
+ interactive=True,
+ )
+ with gr.Column():
+ zero_shot = gr.Radio(
+ [
+ (
+ "Only Zero-shot",
+ "only_zero_shot",
+ ),
+ ("Remove Unknown", "remove_unknown"),
+ ("Allow All", "allow_all"),
+ ],
+ value="allow_all",
+ label="Zero-shot",
+ interactive=True,
+ )
+ model_size = RangeSlider(
+ minimum=MIN_MODEL_SIZE,
+ maximum=MAX_MODEL_SIZE,
+ value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE),
+ label="Model Size (#M Parameters)",
+ )
+
with gr.Tab("Summary"):
summary_table.render()
download_summary = gr.DownloadButton("Download Table")
@@ -512,7 +512,7 @@ def on_benchmark_select(benchmark_name):
elapsed = time.time() - start_time
benchmark_results = all_benchmark_results[benchmark_name]
scores = benchmark_results.get_scores(format="long")
- logger.info(f"on_benchmark_select callback: {elapsed}s")
+ logger.debug(f"on_benchmark_select callback: {elapsed}s")
return (
languages,
domains,
@@ -543,10 +543,12 @@ def on_benchmark_select(benchmark_name):
)
def update_scores_on_lang_change(benchmark_name, languages):
start_time = time.time()
+ if not len(languages):
+ return []
benchmark_results = all_benchmark_results[benchmark_name]
scores = benchmark_results.get_scores(languages=languages, format="long")
elapsed = time.time() - start_time
- logger.info(f"update_scores callback: {elapsed}s")
+ logger.debug(f"update_scores callback: {elapsed}s")
return scores
lang_select.input(
@@ -574,6 +576,8 @@ def update_scores_on_lang_change(benchmark_name, languages):
def update_task_list(
benchmark_name, type_select, domain_select, lang_select, modality_select
):
+ if not len(lang_select):
+ return []
start_time = time.time()
tasks_to_keep = []
for task in mteb.get_benchmark(benchmark_name).tasks:
@@ -587,7 +591,7 @@ def update_task_list(
continue
tasks_to_keep.append(task.metadata.name)
elapsed = time.time() - start_time
- logger.info(f"update_task_list callback: {elapsed}s")
+ logger.debug(f"update_task_list callback: {elapsed}s")
return sorted(tasks_to_keep)
type_select.input(
@@ -679,7 +683,7 @@ def update_models(
if model_names == filtered_models:
# This indicates that the models should not be filtered
return None
- logger.info(f"update_models callback: {elapsed}s")
+ logger.debug(f"update_models callback: {elapsed}s")
return sorted(filtered_models)
scores.change(
@@ -776,14 +780,9 @@ def update_models(
@cachetools.cached(
cache={},
- key=lambda scores,
- search_query,
- tasks,
- models_to_keep,
- benchmark_name: hash(
+ key=lambda scores, tasks, models_to_keep, benchmark_name: hash(
(
id(scores),
- hash(search_query),
hash(tuple(tasks)),
id(models_to_keep),
hash(benchmark_name),
@@ -792,7 +791,6 @@ def update_models(
)
def update_tables(
scores,
- search_query: str,
tasks,
models_to_keep,
benchmark_name: str,
@@ -813,33 +811,33 @@ def update_tables(
filtered_scores.append(entry)
else:
filtered_scores = scores
- summary, per_task = create_tables(filtered_scores, search_query)
+ summary, per_task = create_tables(filtered_scores)
elapsed = time.time() - start_time
- logger.info(f"update_tables callback: {elapsed}s")
+ logger.debug(f"update_tables callback: {elapsed}s")
return summary, per_task
task_select.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
scores.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
models.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
- outputs=[summary_table, per_task_table],
- )
- searchbar.submit(
- update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
+ gr.Markdown(
+ """
+ > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated.
+ """
+ )
# Prerun on all benchmarks, so that results of callbacks get cached
for benchmark in benchmarks:
@@ -862,7 +860,7 @@ def update_tables(
)
# We have to call this both on the filtered and unfiltered task because the callbacks
# also gets called twice for some reason
- update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name)
+ update_tables(bench_scores, bench_tasks, filtered_models, benchmark.name)
filtered_tasks = update_task_list(
benchmark.name,
bench_types,
@@ -870,7 +868,7 @@ def update_tables(
bench_languages,
bench_modalities,
)
- update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name)
+ update_tables(bench_scores, filtered_tasks, filtered_models, benchmark.name)
return demo
diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py
new file mode 100644
index 0000000000..ebbbac9343
--- /dev/null
+++ b/mteb/leaderboard/benchmark_selector.py
@@ -0,0 +1,259 @@
+import gradio as gr
+
+"""
+Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks.
+
+Example:
+[
+ ("First Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ("Group of Benchmarks",
+ [
+ ("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ],
+ ),
+]
+"""
+BENCHMARK_ENTRIES = [
+ (
+ "Multilingual",
+ dict(
+ value="MTEB(Multilingual, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg",
+ ),
+ ),
+ (
+ "English",
+ dict(
+ value="MTEB(eng, v2)",
+ icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
+ ),
+ ),
+ (
+ "Image Benchmarks",
+ [
+ (
+ "Images, Multilingual",
+ dict(
+ value="MIEB(Multilingual)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg",
+ ),
+ ),
+ (
+ "Images, English",
+ dict(
+ value="MIEB(eng)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
+ ),
+ ),
+ (
+ "Images, Lite",
+ dict(
+ value="MIEB(lite)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Domain-Specific Benchmarks",
+ [
+ (
+ "Code",
+ dict(
+ value="MTEB(Code, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
+ ),
+ ),
+ (
+ "Legal",
+ dict(
+ value="MTEB(Law, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
+ ),
+ ),
+ (
+ "Medical",
+ dict(
+ value="MTEB(Medical, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
+ ),
+ ),
+ (
+ "Chemical",
+ dict(
+ value="ChemTEB",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Regional Benchmarks",
+ [
+ (
+ "European",
+ dict(
+ value="MTEB(Europe, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg",
+ ),
+ ),
+ (
+ "Indic",
+ dict(
+ value="MTEB(Indic, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg",
+ ),
+ ),
+ (
+ "Scandinavian",
+ dict(
+ value="MTEB(Scandinavian, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Language-specific Benchmarks",
+ [
+ (
+ "Chinese",
+ dict(
+ value="MTEB(cmn, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg",
+ ),
+ ),
+ (
+ "German",
+ dict(
+ value="MTEB(deu, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
+ ),
+ ),
+ (
+ "French",
+ dict(
+ value="MTEB(fra, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
+ ),
+ ),
+ (
+ "Japanese",
+ dict(
+ value="MTEB(jpn, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
+ ),
+ ),
+ (
+ "Korean",
+ dict(
+ value="MTEB(kor, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
+ ),
+ ),
+ (
+ "Polish",
+ dict(
+ value="MTEB(pol, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg",
+ ),
+ ),
+ (
+ "Russian",
+ dict(
+ value="MTEB(rus, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
+ ),
+ ),
+ (
+ "Farsi (BETA)",
+ dict(
+ value="MTEB(fas, beta)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Miscellaneous",
+ [
+ ("BEIR", dict(value="BEIR", icon=None)),
+ ("BEIR-NL", dict(value="BEIR-NL", icon=None)),
+ ("BRIGHT", dict(value="BRIGHT", icon=None)),
+ ("BRIGHT (long)", dict(value="BRIGHT (long)", icon=None)),
+            ("BuiltBench (eng)", dict(value="BuiltBench(eng)", icon=None)),
+ ("Code Information Retrieval", dict(value="CoIR", icon=None)),
+ ("Instruction Following", dict(value="FollowIR", icon=None)),
+ ("Long-context Retrieval", dict(value="LongEmbed", icon=None)),
+ ("MINERSBitextMining", dict(value="MINERSBitextMining", icon=None)),
+ ("NanoBEIR", dict(value="NanoBEIR", icon=None)),
+ ("Reasoning retrieval", dict(value="RAR-b", icon=None)),
+ ],
+ ),
+ (
+ "Legacy",
+ [
+ (
+ "English Legacy",
+ dict(
+ value="MTEB(eng, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg",
+ ),
+ ),
+ ],
+ ),
+]
+
+
+def _create_button(i, label, entry, state, label_to_value, **kwargs):
+ val = entry["value"]
+ label_to_value[label] = val
+ button = gr.Button(
+ label,
+ variant="secondary" if i != 0 else "primary",
+ icon=entry["icon"],
+ key=f"{i}_button_{val}",
+ elem_classes="text-white",
+ **kwargs,
+ )
+
+ def _update_variant(state, label) -> gr.Button:
+ if state == label_to_value[label]:
+ return gr.Button(variant="primary")
+ else:
+ return gr.Button(variant="secondary")
+
+ def _update_value(label) -> str:
+ return label_to_value[label]
+
+ state.change(_update_variant, inputs=[state, button], outputs=[button])
+ button.click(_update_value, outputs=[state], inputs=[button])
+ return button
+
+
+def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.Column]:
+ if not entries:
+ raise ValueError("No entries were specified, can't build selector.")
+ label_to_value = {}
+ state = None
+ with gr.Column() as column:
+ for i, (label, entry) in enumerate(entries):
+ if i == 0:
+ if isinstance(entry, dict):
+ state = gr.State(entry["value"])
+ else:
+ _label, _entry = entry[0]
+ state = gr.State(_entry["value"])
+ if isinstance(entry, dict):
+ button = _create_button(
+ i, label, entry, state, label_to_value, size="lg"
+ )
+ else:
+ gr.Markdown(f"### **{label}**")
+ for sub_label, sub_entry in entry:
+ button = _create_button(
+ i, sub_label, sub_entry, state, label_to_value, size="md"
+ )
+
+ return state, column
diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py
index 57a282327c..6a945346f2 100644
--- a/mteb/leaderboard/figures.py
+++ b/mteb/leaderboard/figures.py
@@ -147,6 +147,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
"model_text": False,
},
hover_name="Model",
+ color_continuous_scale=px.colors.sequential.Greens,
)
# Note: it's important that this comes before setting the size mode
fig = add_size_guide(fig)
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index b848406ba5..fbc01496e8 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -61,6 +61,23 @@ def get_column_types(df: pd.DataFrame) -> list[str]:
return types
+def get_column_widths(df: pd.DataFrame) -> list[str]:
+ # Please do not remove this function when refactoring.
+    # Column width calculation seemingly changes regularly with Gradio releases,
+ # and this piece of logic is good enough to quickly fix related issues.
+ widths = []
+ for column_name in df.columns:
+ column_word_lengths = [len(word) for word in column_name.split()]
+ if is_numeric_dtype(df[column_name]):
+ value_lengths = [len(f"{value:.2f}") for value in df[column_name]]
+ else:
+ value_lengths = [len(str(value)) for value in df[column_name]]
+ max_length = max(max(column_word_lengths), max(value_lengths))
+ n_pixels = 25 + (max_length * 10)
+ widths.append(f"{n_pixels}px")
+ return widths
+
+
def get_means_per_types(per_task: pd.DataFrame):
task_names_per_type = defaultdict(list)
for task_name in per_task.columns:
@@ -237,7 +254,6 @@ def apply_styling(
]
light_green_cmap = create_light_green_cmap()
numeric_data = joint_table.copy()
- numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan)
joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot)
joint_table[score_columns] = joint_table[score_columns].map(format_scores)
joint_table_style = joint_table.style.format(
@@ -278,22 +294,40 @@ def apply_styling(
per_task_style = per_task.style.format(
"{:.2f}", subset=task_score_columns, na_rep=""
).highlight_max(subset=task_score_columns, props="font-weight: bold")
- for col in task_score_columns:
- if col != "Model":
- mask = per_task[col].notna()
- per_task_style = per_task_style.background_gradient(
- cmap=light_green_cmap,
- subset=pd.IndexSlice[mask, col],
- gmap=per_task[col].loc[mask],
- )
+ # TODO: uncomment this when Gradio fixes it.
+ # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032
+ # It will be available in Gradio 5.25.3
+ # for col in task_score_columns:
+ # if col != "Model":
+ # mask = per_task[col].notna()
+ # per_task_style = per_task_style.background_gradient(
+ # cmap=light_green_cmap,
+ # subset=pd.IndexSlice[mask, col],
+ # gmap=per_task[col].loc[mask],
+ # )
+ column_widths = get_column_widths(joint_table_style.data)
+ column_widths[0] = "100px"
+ column_widths[1] = "250px"
return (
gr.DataFrame(
joint_table_style,
datatype=column_types,
interactive=False,
pinned_columns=3,
+ column_widths=column_widths,
+ wrap=True,
+ show_fullscreen_button=True,
+ show_copy_button=True,
+ show_search="filter",
+ ),
+ gr.DataFrame(
+ per_task_style,
+ interactive=False,
+ pinned_columns=1,
+ show_fullscreen_button=True,
+ show_copy_button=True,
+ show_search="filter",
),
- gr.DataFrame(per_task_style, interactive=False, pinned_columns=1),
)
diff --git a/pyproject.toml b/pyproject.toml
index b07e5ed703..eaaba260e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,8 +70,7 @@ speedtask = [
]
peft = ["peft>=0.11.0"]
leaderboard = [
- "gradio==5.16.0; python_version > '3.9'", # 3.10 is required for gradio
- "pydantic<2.11", # remove with gradio bump https://github.com/embeddings-benchmark/mteb/issues/2523
+ "gradio==5.17.1; python_version > '3.9'", # 3.10 is required for gradio
"gradio_rangeslider>=0.0.8",
"plotly>=5.24.0,<6.0.0",
"cachetools>=5.2.0",