diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index e3833b5ce3..225ecb44ca 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -20,6 +20,7 @@
from mteb.benchmarks.benchmarks import MTEB_multilingual
from mteb.custom_validators import MODALITIES
from mteb.languages import ISO_TO_LANGUAGE
+from mteb.leaderboard.benchmark_selector import BENCHMARK_ENTRIES, make_selector
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import create_tables
@@ -104,7 +105,7 @@ def update_description(
benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str]
) -> str:
benchmark = mteb.get_benchmark(benchmark_name)
- description = f"## {benchmark.name}\n{benchmark.description}\n"
+ description = f"{benchmark.description}\n"
n_languages = len(languages)
n_task_types = len(task_types)
n_tasks = len(benchmark.tasks)
@@ -156,7 +157,13 @@ def update_task_info(task_names: str) -> gr.DataFrame:
}
)
df = df.drop(columns="reference")
- return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1))
+ return gr.DataFrame(
+ df,
+ datatype=["markdown"] + ["str"] * (len(df.columns) - 1),
+ show_copy_button=True,
+ show_fullscreen_button=True,
+ show_search="filter",
+ )
# Model sizes in million parameters
@@ -235,13 +242,6 @@ def get_leaderboard_app() -> gr.Blocks:
summary_table, per_task_table = create_tables(
[entry for entry in default_scores if entry["model_name"] in filtered_models]
)
-
- benchmark_select = gr.Dropdown(
- [bench.name for bench in benchmarks],
- value=default_benchmark.name,
- label="Prebuilt Benchmarks",
- info="Select one of our expert-selected benchmarks from MTEB publications.",
- )
lang_select = gr.Dropdown(
ISO_TO_LANGUAGE,
value=sorted(default_results.languages),
@@ -284,116 +284,46 @@ def get_leaderboard_app() -> gr.Blocks:
"""
- with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo:
+ with gr.Blocks(
+ fill_width=True,
+ theme=gr.themes.Soft(
+ font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"],
+ ),
+ head=head,
+ ) as demo:
+ with gr.Sidebar(
+ position="left",
+ label="Benchmark Selection and Customization",
+ visible=True,
+ width="25%",
+ ):
+ gr.Markdown("## Select Benchmark")
+ benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
gr.Markdown(
"""
## Embedding Leaderboard
- This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️
-
- > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated.
+ This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️
"""
)
+ gr.Markdown(
+                lambda name: (
+                    f"<center> <h1> {name} </h1> </center>"
+                ),
+ inputs=benchmark_select,
+ )
- with gr.Row():
- with gr.Column(scale=5):
- gr.Markdown(
- "### Benchmarks\n"
- "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs."
- )
- with gr.Group():
- with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
- with gr.Column():
- benchmark_select.render()
- with gr.Accordion("Select Languages", open=False):
- lang_select.render()
- with gr.Accordion("Select Task Types", open=False):
- type_select.render()
- with gr.Accordion("Select Domains", open=False):
- domain_select.render()
- with gr.Accordion("Select Modalities", open=False):
- modality_select.render()
- with gr.Accordion("Add and remove tasks:", open=False):
- task_select.render()
- with gr.Column(scale=8):
- gr.Markdown(
- """
- ### Model Selection
- Select models to rank based on an assortment of criteria.
- """,
- )
- with gr.Group():
- with gr.Row():
- searchbar = gr.Textbox(
- label="Search Models",
- info="Press Enter to search.\nSearch models by name (RegEx sensitive. Separate queries with `|`)",
- interactive=True,
- )
- compatibility = gr.CheckboxGroup(
- [
- (
- "Should be sentence-transformers compatible",
- "Sentence Transformers",
- )
- ],
- value=[],
- label="Compatibility",
- interactive=True,
- )
- with gr.Row(elem_classes=""):
- with gr.Column():
- availability = gr.Radio(
- [
- ("Only Open", True),
- ("Only Proprietary", False),
- ("Both", None),
- ],
- value=None,
- label="Availability",
- interactive=True,
- )
- instructions = gr.Radio(
- [
- ("Only Instruction-tuned", True),
- ("Only non-instruction", False),
- ("Both", None),
- ],
- value=None,
- label="Instructions",
- interactive=True,
- )
- with gr.Column():
- zero_shot = gr.Radio(
- [
- (
- "Only Zero-shot",
- "only_zero_shot",
- ),
- ("Remove Unknown", "remove_unknown"),
- ("Allow All", "allow_all"),
- ],
- value="allow_all",
- label="Zero-shot",
- interactive=True,
- )
- model_size = RangeSlider(
- minimum=MIN_MODEL_SIZE,
- maximum=MAX_MODEL_SIZE,
- value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE),
- label="Model Size (#M Parameters)",
- )
scores = gr.State(default_scores)
models = gr.State(filtered_models)
with gr.Row():
- with gr.Column():
+ with gr.Column(scale=1):
description = gr.Markdown( # noqa: F841
update_description,
inputs=[benchmark_select, lang_select, type_select, domain_select],
)
- citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841
+ with gr.Accordion("Cite this benchmark:", open=False):
+ citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841
with gr.Accordion("Share this benchmark:", open=False):
gr.Markdown(produce_benchmark_link, inputs=[benchmark_select])
- with gr.Column():
+ with gr.Column(scale=2):
with gr.Tab("Performance per Model Size"):
plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841
gr.Markdown(
@@ -404,6 +334,76 @@ def get_leaderboard_app() -> gr.Blocks:
gr.Markdown(
"*We only display models that have been run on all task types in the benchmark*"
)
+
+ with gr.Accordion("Customize this Benchmark", open=False):
+ with gr.Column():
+ with gr.Row():
+ type_select.render()
+ with gr.Row():
+ domain_select.render()
+ with gr.Row():
+ modality_select.render()
+ with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+ lang_select.render()
+ with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+ task_select.render()
+
+ with gr.Accordion("Advanced Model Filters", open=False):
+ with gr.Group():
+ with gr.Row(elem_classes=""):
+ with gr.Column():
+ compatibility = gr.CheckboxGroup(
+ [
+ (
+ "Should be sentence-transformers compatible",
+ "Sentence Transformers",
+ )
+ ],
+ value=[],
+ label="Compatibility",
+ interactive=True,
+ )
+ availability = gr.Radio(
+ [
+ ("Only Open", True),
+ ("Only Proprietary", False),
+ ("Both", None),
+ ],
+ value=None,
+ label="Availability",
+ interactive=True,
+ )
+ instructions = gr.Radio(
+ [
+ ("Only Instruction-tuned", True),
+ ("Only non-instruction", False),
+ ("Both", None),
+ ],
+ value=None,
+ label="Instructions",
+ interactive=True,
+ )
+ with gr.Column():
+ zero_shot = gr.Radio(
+ [
+ (
+ "Only Zero-shot",
+ "only_zero_shot",
+ ),
+ ("Remove Unknown", "remove_unknown"),
+ ("Allow All", "allow_all"),
+ ],
+ value="allow_all",
+ label="Zero-shot",
+ interactive=True,
+ )
+ model_size = RangeSlider(
+ minimum=MIN_MODEL_SIZE,
+ maximum=MAX_MODEL_SIZE,
+ value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE),
+ label="Model Size (#M Parameters)",
+ )
+
with gr.Tab("Summary"):
summary_table.render()
download_summary = gr.DownloadButton("Download Table")
@@ -512,7 +512,7 @@ def on_benchmark_select(benchmark_name):
elapsed = time.time() - start_time
benchmark_results = all_benchmark_results[benchmark_name]
scores = benchmark_results.get_scores(format="long")
- logger.info(f"on_benchmark_select callback: {elapsed}s")
+ logger.debug(f"on_benchmark_select callback: {elapsed}s")
return (
languages,
domains,
@@ -543,10 +543,12 @@ def on_benchmark_select(benchmark_name):
)
def update_scores_on_lang_change(benchmark_name, languages):
start_time = time.time()
+ if not len(languages):
+ return []
benchmark_results = all_benchmark_results[benchmark_name]
scores = benchmark_results.get_scores(languages=languages, format="long")
elapsed = time.time() - start_time
- logger.info(f"update_scores callback: {elapsed}s")
+ logger.debug(f"update_scores callback: {elapsed}s")
return scores
lang_select.input(
@@ -574,6 +576,8 @@ def update_scores_on_lang_change(benchmark_name, languages):
def update_task_list(
benchmark_name, type_select, domain_select, lang_select, modality_select
):
+ if not len(lang_select):
+ return []
start_time = time.time()
tasks_to_keep = []
for task in mteb.get_benchmark(benchmark_name).tasks:
@@ -587,7 +591,7 @@ def update_task_list(
continue
tasks_to_keep.append(task.metadata.name)
elapsed = time.time() - start_time
- logger.info(f"update_task_list callback: {elapsed}s")
+ logger.debug(f"update_task_list callback: {elapsed}s")
return sorted(tasks_to_keep)
type_select.input(
@@ -679,7 +683,7 @@ def update_models(
if model_names == filtered_models:
# This indicates that the models should not be filtered
return None
- logger.info(f"update_models callback: {elapsed}s")
+ logger.debug(f"update_models callback: {elapsed}s")
return sorted(filtered_models)
scores.change(
@@ -776,14 +780,9 @@ def update_models(
@cachetools.cached(
cache={},
- key=lambda scores,
- search_query,
- tasks,
- models_to_keep,
- benchmark_name: hash(
+ key=lambda scores, tasks, models_to_keep, benchmark_name: hash(
(
id(scores),
- hash(search_query),
hash(tuple(tasks)),
id(models_to_keep),
hash(benchmark_name),
@@ -792,7 +791,6 @@ def update_models(
)
def update_tables(
scores,
- search_query: str,
tasks,
models_to_keep,
benchmark_name: str,
@@ -813,33 +811,33 @@ def update_tables(
filtered_scores.append(entry)
else:
filtered_scores = scores
- summary, per_task = create_tables(filtered_scores, search_query)
+ summary, per_task = create_tables(filtered_scores)
elapsed = time.time() - start_time
- logger.info(f"update_tables callback: {elapsed}s")
+ logger.debug(f"update_tables callback: {elapsed}s")
return summary, per_task
task_select.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
scores.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
models.change(
update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
- outputs=[summary_table, per_task_table],
- )
- searchbar.submit(
- update_tables,
- inputs=[scores, searchbar, task_select, models, benchmark_select],
+ inputs=[scores, task_select, models, benchmark_select],
outputs=[summary_table, per_task_table],
)
gr.Markdown(acknowledgment_md, elem_id="ack_markdown")
+ gr.Markdown(
+ """
+ > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated.
+ """
+ )
# Prerun on all benchmarks, so that results of callbacks get cached
for benchmark in benchmarks:
@@ -862,7 +860,7 @@ def update_tables(
)
# We have to call this both on the filtered and unfiltered task because the callbacks
# also gets called twice for some reason
- update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name)
+ update_tables(bench_scores, bench_tasks, filtered_models, benchmark.name)
filtered_tasks = update_task_list(
benchmark.name,
bench_types,
@@ -870,7 +868,7 @@ def update_tables(
bench_languages,
bench_modalities,
)
- update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name)
+ update_tables(bench_scores, filtered_tasks, filtered_models, benchmark.name)
return demo
diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py
new file mode 100644
index 0000000000..ebbbac9343
--- /dev/null
+++ b/mteb/leaderboard/benchmark_selector.py
@@ -0,0 +1,259 @@
+import gradio as gr
+
+"""
+Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks.
+
+Example:
+[
+ ("First Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ("Group of Benchmarks",
+ [
+ ("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")),
+ ],
+ ),
+]
+"""
+BENCHMARK_ENTRIES = [
+ (
+ "Multilingual",
+ dict(
+ value="MTEB(Multilingual, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg",
+ ),
+ ),
+ (
+ "English",
+ dict(
+ value="MTEB(eng, v2)",
+ icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg",
+ ),
+ ),
+ (
+ "Image Benchmarks",
+ [
+ (
+ "Images, Multilingual",
+ dict(
+ value="MIEB(Multilingual)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg",
+ ),
+ ),
+ (
+ "Images, English",
+ dict(
+ value="MIEB(eng)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg",
+ ),
+ ),
+ (
+ "Images, Lite",
+ dict(
+ value="MIEB(lite)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Domain-Specific Benchmarks",
+ [
+ (
+ "Code",
+ dict(
+ value="MTEB(Code, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg",
+ ),
+ ),
+ (
+ "Legal",
+ dict(
+ value="MTEB(Law, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg",
+ ),
+ ),
+ (
+ "Medical",
+ dict(
+ value="MTEB(Medical, v1)",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg",
+ ),
+ ),
+ (
+ "Chemical",
+ dict(
+ value="ChemTEB",
+ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Regional Benchmarks",
+ [
+ (
+ "European",
+ dict(
+ value="MTEB(Europe, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg",
+ ),
+ ),
+ (
+ "Indic",
+ dict(
+ value="MTEB(Indic, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg",
+ ),
+ ),
+ (
+ "Scandinavian",
+ dict(
+ value="MTEB(Scandinavian, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Language-specific Benchmarks",
+ [
+ (
+ "Chinese",
+ dict(
+ value="MTEB(cmn, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg",
+ ),
+ ),
+ (
+ "German",
+ dict(
+ value="MTEB(deu, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg",
+ ),
+ ),
+ (
+ "French",
+ dict(
+ value="MTEB(fra, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg",
+ ),
+ ),
+ (
+ "Japanese",
+ dict(
+ value="MTEB(jpn, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg",
+ ),
+ ),
+ (
+ "Korean",
+ dict(
+ value="MTEB(kor, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg",
+ ),
+ ),
+ (
+ "Polish",
+ dict(
+ value="MTEB(pol, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg",
+ ),
+ ),
+ (
+ "Russian",
+ dict(
+ value="MTEB(rus, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg",
+ ),
+ ),
+ (
+ "Farsi (BETA)",
+ dict(
+ value="MTEB(fas, beta)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg",
+ ),
+ ),
+ ],
+ ),
+ (
+ "Miscellaneous",
+ [
+ ("BEIR", dict(value="BEIR", icon=None)),
+ ("BEIR-NL", dict(value="BEIR-NL", icon=None)),
+ ("BRIGHT", dict(value="BRIGHT", icon=None)),
+ ("BRIGHT (long)", dict(value="BRIGHT (long)", icon=None)),
+            ("BuiltBench (eng)", dict(value="BuiltBench(eng)", icon=None)),
+ ("Code Information Retrieval", dict(value="CoIR", icon=None)),
+ ("Instruction Following", dict(value="FollowIR", icon=None)),
+ ("Long-context Retrieval", dict(value="LongEmbed", icon=None)),
+ ("MINERSBitextMining", dict(value="MINERSBitextMining", icon=None)),
+ ("NanoBEIR", dict(value="NanoBEIR", icon=None)),
+ ("Reasoning retrieval", dict(value="RAR-b", icon=None)),
+ ],
+ ),
+ (
+ "Legacy",
+ [
+ (
+ "English Legacy",
+ dict(
+ value="MTEB(eng, v1)",
+ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg",
+ ),
+ ),
+ ],
+ ),
+]
+
+
+def _create_button(i, label, entry, state, label_to_value, **kwargs):
+ val = entry["value"]
+ label_to_value[label] = val
+ button = gr.Button(
+ label,
+ variant="secondary" if i != 0 else "primary",
+ icon=entry["icon"],
+ key=f"{i}_button_{val}",
+ elem_classes="text-white",
+ **kwargs,
+ )
+
+ def _update_variant(state, label) -> gr.Button:
+ if state == label_to_value[label]:
+ return gr.Button(variant="primary")
+ else:
+ return gr.Button(variant="secondary")
+
+ def _update_value(label) -> str:
+ return label_to_value[label]
+
+ state.change(_update_variant, inputs=[state, button], outputs=[button])
+ button.click(_update_value, outputs=[state], inputs=[button])
+ return button
+
+
+def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.Column]:
+ if not entries:
+ raise ValueError("No entries were specified, can't build selector.")
+ label_to_value = {}
+ state = None
+ with gr.Column() as column:
+ for i, (label, entry) in enumerate(entries):
+ if i == 0:
+ if isinstance(entry, dict):
+ state = gr.State(entry["value"])
+ else:
+ _label, _entry = entry[0]
+ state = gr.State(_entry["value"])
+ if isinstance(entry, dict):
+ button = _create_button(
+ i, label, entry, state, label_to_value, size="lg"
+ )
+ else:
+ gr.Markdown(f"### **{label}**")
+ for sub_label, sub_entry in entry:
+ button = _create_button(
+ i, sub_label, sub_entry, state, label_to_value, size="md"
+ )
+
+ return state, column
diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py
index 57a282327c..6a945346f2 100644
--- a/mteb/leaderboard/figures.py
+++ b/mteb/leaderboard/figures.py
@@ -147,6 +147,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure:
"model_text": False,
},
hover_name="Model",
+ color_continuous_scale=px.colors.sequential.Greens,
)
# Note: it's important that this comes before setting the size mode
fig = add_size_guide(fig)
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index b848406ba5..fbc01496e8 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -61,6 +61,23 @@ def get_column_types(df: pd.DataFrame) -> list[str]:
return types
+def get_column_widths(df: pd.DataFrame) -> list[str]:
+ # Please do not remove this function when refactoring.
+    # Column width calculation seemingly changes regularly with Gradio releases,
+ # and this piece of logic is good enough to quickly fix related issues.
+ widths = []
+ for column_name in df.columns:
+ column_word_lengths = [len(word) for word in column_name.split()]
+ if is_numeric_dtype(df[column_name]):
+ value_lengths = [len(f"{value:.2f}") for value in df[column_name]]
+ else:
+ value_lengths = [len(str(value)) for value in df[column_name]]
+ max_length = max(max(column_word_lengths), max(value_lengths))
+ n_pixels = 25 + (max_length * 10)
+ widths.append(f"{n_pixels}px")
+ return widths
+
+
def get_means_per_types(per_task: pd.DataFrame):
task_names_per_type = defaultdict(list)
for task_name in per_task.columns:
@@ -237,7 +254,6 @@ def apply_styling(
]
light_green_cmap = create_light_green_cmap()
numeric_data = joint_table.copy()
- numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan)
joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot)
joint_table[score_columns] = joint_table[score_columns].map(format_scores)
joint_table_style = joint_table.style.format(
@@ -278,22 +294,40 @@ def apply_styling(
per_task_style = per_task.style.format(
"{:.2f}", subset=task_score_columns, na_rep=""
).highlight_max(subset=task_score_columns, props="font-weight: bold")
- for col in task_score_columns:
- if col != "Model":
- mask = per_task[col].notna()
- per_task_style = per_task_style.background_gradient(
- cmap=light_green_cmap,
- subset=pd.IndexSlice[mask, col],
- gmap=per_task[col].loc[mask],
- )
+ # TODO: uncomment this when Gradio fixes it.
+ # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032
+ # It will be available in Gradio 5.25.3
+ # for col in task_score_columns:
+ # if col != "Model":
+ # mask = per_task[col].notna()
+ # per_task_style = per_task_style.background_gradient(
+ # cmap=light_green_cmap,
+ # subset=pd.IndexSlice[mask, col],
+ # gmap=per_task[col].loc[mask],
+ # )
+ column_widths = get_column_widths(joint_table_style.data)
+ column_widths[0] = "100px"
+ column_widths[1] = "250px"
return (
gr.DataFrame(
joint_table_style,
datatype=column_types,
interactive=False,
pinned_columns=3,
+ column_widths=column_widths,
+ wrap=True,
+ show_fullscreen_button=True,
+ show_copy_button=True,
+ show_search="filter",
+ ),
+ gr.DataFrame(
+ per_task_style,
+ interactive=False,
+ pinned_columns=1,
+ show_fullscreen_button=True,
+ show_copy_button=True,
+ show_search="filter",
),
- gr.DataFrame(per_task_style, interactive=False, pinned_columns=1),
)
diff --git a/pyproject.toml b/pyproject.toml
index b07e5ed703..eaaba260e2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -70,8 +70,7 @@ speedtask = [
]
peft = ["peft>=0.11.0"]
leaderboard = [
- "gradio==5.16.0; python_version > '3.9'", # 3.10 is required for gradio
- "pydantic<2.11", # remove with gradio bump https://github.com/embeddings-benchmark/mteb/issues/2523
+ "gradio==5.17.1; python_version > '3.9'", # 3.10 is required for gradio
"gradio_rangeslider>=0.0.8",
"plotly>=5.24.0,<6.0.0",
"cachetools>=5.2.0",