diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e3833b5ce3..225ecb44ca 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -20,6 +20,7 @@ from mteb.benchmarks.benchmarks import MTEB_multilingual from mteb.custom_validators import MODALITIES from mteb.languages import ISO_TO_LANGUAGE +from mteb.leaderboard.benchmark_selector import BENCHMARK_ENTRIES, make_selector from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import create_tables @@ -104,7 +105,7 @@ def update_description( benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str] ) -> str: benchmark = mteb.get_benchmark(benchmark_name) - description = f"## {benchmark.name}\n{benchmark.description}\n" + description = f"{benchmark.description}\n" n_languages = len(languages) n_task_types = len(task_types) n_tasks = len(benchmark.tasks) @@ -156,7 +157,13 @@ def update_task_info(task_names: str) -> gr.DataFrame: } ) df = df.drop(columns="reference") - return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) + return gr.DataFrame( + df, + datatype=["markdown"] + ["str"] * (len(df.columns) - 1), + show_copy_button=True, + show_fullscreen_button=True, + show_search="filter", + ) # Model sizes in million parameters @@ -235,13 +242,6 @@ def get_leaderboard_app() -> gr.Blocks: summary_table, per_task_table = create_tables( [entry for entry in default_scores if entry["model_name"] in filtered_models] ) - - benchmark_select = gr.Dropdown( - [bench.name for bench in benchmarks], - value=default_benchmark.name, - label="Prebuilt Benchmarks", - info="Select one of our expert-selected benchmarks from MTEB publications.", - ) lang_select = gr.Dropdown( ISO_TO_LANGUAGE, value=sorted(default_results.languages), @@ -284,116 +284,46 @@ def get_leaderboard_app() -> gr.Blocks: """ - with gr.Blocks(fill_width=True, theme=gr.themes.Base(), head=head) as demo: + with gr.Blocks( + fill_width=True, + 
theme=gr.themes.Soft( + font=[gr.themes.GoogleFont("Roboto Mono"), "Arial", "sans-serif"], + ), + head=head, + ) as demo: + with gr.Sidebar( + position="left", + label="Benchmark Selection and Customization", + visible=True, + width="25%", + ): + gr.Markdown("## Select Benchmark") + benchmark_select, column = make_selector(BENCHMARK_ENTRIES) gr.Markdown( """ ## Embedding Leaderboard - This leaderboard compares 100+ text and image (soon) embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ - - > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. + This leaderboard compares 100+ text and image embedding models across 1000+ languages. We refer to the publication of each selectable benchmark for details on metrics, languages, tasks, and task types. 
Anyone is welcome [to add a model](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_model.md), [add benchmarks](https://github.com/embeddings-benchmark/mteb/blob/main/docs/adding_a_benchmark.md), [help us improve zero-shot annotations](https://github.com/embeddings-benchmark/mteb/blob/06489abca007261c7e6b11f36d4844c5ed5efdcb/mteb/models/bge_models.py#L91) or [propose other changes to the leaderboard](https://github.com/embeddings-benchmark/mteb/tree/main/mteb/leaderboard) 🤗 Also, check out [MTEB Arena](https://huggingface.co/spaces/mteb/arena) ⚔️ """ ) + gr.Markdown( + lambda name: f"

{name}


", + inputs=benchmark_select, + ) - with gr.Row(): - with gr.Column(scale=5): - gr.Markdown( - "### Benchmarks\n" - "Select one of the hand-curated benchmarks from our publications and modify them using one of the following filters to fit your needs." - ) - with gr.Group(): - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - with gr.Column(): - benchmark_select.render() - with gr.Accordion("Select Languages", open=False): - lang_select.render() - with gr.Accordion("Select Task Types", open=False): - type_select.render() - with gr.Accordion("Select Domains", open=False): - domain_select.render() - with gr.Accordion("Select Modalities", open=False): - modality_select.render() - with gr.Accordion("Add and remove tasks:", open=False): - task_select.render() - with gr.Column(scale=8): - gr.Markdown( - """ - ### Model Selection - Select models to rank based on an assortment of criteria. - """, - ) - with gr.Group(): - with gr.Row(): - searchbar = gr.Textbox( - label="Search Models", - info="Press Enter to search.\nSearch models by name (RegEx sensitive. 
Separate queries with `|`)", - interactive=True, - ) - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", - ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - with gr.Row(elem_classes=""): - with gr.Column(): - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", - interactive=True, - ) - with gr.Column(): - zero_shot = gr.Radio( - [ - ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", "remove_unknown"), - ("Allow All", "allow_all"), - ], - value="allow_all", - label="Zero-shot", - interactive=True, - ) - model_size = RangeSlider( - minimum=MIN_MODEL_SIZE, - maximum=MAX_MODEL_SIZE, - value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), - label="Model Size (#M Parameters)", - ) scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(): + with gr.Column(scale=1): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], ) - citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 + with gr.Accordion("Cite this benchmark:", open=False): + citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(): + with gr.Column(scale=2): with gr.Tab("Performance per Model Size"): plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( @@ -404,6 +334,76 @@ def get_leaderboard_app() -> gr.Blocks: gr.Markdown( "*We only display models that have been run on all task types in the benchmark*" ) + + with 
gr.Accordion("Customize this Benchmark", open=False): + with gr.Column(): + with gr.Row(): + type_select.render() + with gr.Row(): + domain_select.render() + with gr.Row(): + modality_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + lang_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + task_select.render() + + with gr.Accordion("Advanced Model Filters", open=False): + with gr.Group(): + with gr.Row(elem_classes=""): + with gr.Column(): + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + model_size = RangeSlider( + minimum=MIN_MODEL_SIZE, + maximum=MAX_MODEL_SIZE, + value=(MIN_MODEL_SIZE, MAX_MODEL_SIZE), + label="Model Size (#M Parameters)", + ) + with gr.Tab("Summary"): summary_table.render() download_summary = gr.DownloadButton("Download Table") @@ -512,7 +512,7 @@ def on_benchmark_select(benchmark_name): elapsed = time.time() - start_time benchmark_results = all_benchmark_results[benchmark_name] scores = benchmark_results.get_scores(format="long") - logger.info(f"on_benchmark_select callback: {elapsed}s") + logger.debug(f"on_benchmark_select callback: {elapsed}s") return ( languages, domains, @@ -543,10 +543,12 @@ def on_benchmark_select(benchmark_name): ) def 
update_scores_on_lang_change(benchmark_name, languages): start_time = time.time() + if not len(languages): + return [] benchmark_results = all_benchmark_results[benchmark_name] scores = benchmark_results.get_scores(languages=languages, format="long") elapsed = time.time() - start_time - logger.info(f"update_scores callback: {elapsed}s") + logger.debug(f"update_scores callback: {elapsed}s") return scores lang_select.input( @@ -574,6 +576,8 @@ def update_scores_on_lang_change(benchmark_name, languages): def update_task_list( benchmark_name, type_select, domain_select, lang_select, modality_select ): + if not len(lang_select): + return [] start_time = time.time() tasks_to_keep = [] for task in mteb.get_benchmark(benchmark_name).tasks: @@ -587,7 +591,7 @@ def update_task_list( continue tasks_to_keep.append(task.metadata.name) elapsed = time.time() - start_time - logger.info(f"update_task_list callback: {elapsed}s") + logger.debug(f"update_task_list callback: {elapsed}s") return sorted(tasks_to_keep) type_select.input( @@ -679,7 +683,7 @@ def update_models( if model_names == filtered_models: # This indicates that the models should not be filtered return None - logger.info(f"update_models callback: {elapsed}s") + logger.debug(f"update_models callback: {elapsed}s") return sorted(filtered_models) scores.change( @@ -776,14 +780,9 @@ def update_models( @cachetools.cached( cache={}, - key=lambda scores, - search_query, - tasks, - models_to_keep, - benchmark_name: hash( + key=lambda scores, tasks, models_to_keep, benchmark_name: hash( ( id(scores), - hash(search_query), hash(tuple(tasks)), id(models_to_keep), hash(benchmark_name), @@ -792,7 +791,6 @@ def update_models( ) def update_tables( scores, - search_query: str, tasks, models_to_keep, benchmark_name: str, @@ -813,33 +811,33 @@ def update_tables( filtered_scores.append(entry) else: filtered_scores = scores - summary, per_task = create_tables(filtered_scores, search_query) + summary, per_task = 
create_tables(filtered_scores) elapsed = time.time() - start_time - logger.info(f"update_tables callback: {elapsed}s") + logger.debug(f"update_tables callback: {elapsed}s") return summary, per_task task_select.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) scores.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) models.change( update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], - outputs=[summary_table, per_task_table], - ) - searchbar.submit( - update_tables, - inputs=[scores, searchbar, task_select, models, benchmark_select], + inputs=[scores, task_select, models, benchmark_select], outputs=[summary_table, per_task_table], ) gr.Markdown(acknowledgment_md, elem_id="ack_markdown") + gr.Markdown( + """ + > Looking for the previous MTEB leaderboard? We have made it available [here](https://huggingface.co/spaces/mteb/leaderboard_legacy) but it will no longer be updated. 
+ """ + ) # Prerun on all benchmarks, so that results of callbacks get cached for benchmark in benchmarks: @@ -862,7 +860,7 @@ def update_tables( ) # We have to call this both on the filtered and unfiltered task because the callbacks # also gets called twice for some reason - update_tables(bench_scores, "", bench_tasks, filtered_models, benchmark.name) + update_tables(bench_scores, bench_tasks, filtered_models, benchmark.name) filtered_tasks = update_task_list( benchmark.name, bench_types, @@ -870,7 +868,7 @@ def update_tables( bench_languages, bench_modalities, ) - update_tables(bench_scores, "", filtered_tasks, filtered_models, benchmark.name) + update_tables(bench_scores, filtered_tasks, filtered_models, benchmark.name) return demo diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py new file mode 100644 index 0000000000..ebbbac9343 --- /dev/null +++ b/mteb/leaderboard/benchmark_selector.py @@ -0,0 +1,259 @@ +import gradio as gr + +""" +Each entry is a tuple, where the first element is a label, and the second is either a single benchmark or a group of benchmarks. 
+ +Example: +[ + ("First Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ("Group of Benchmarks", + [ + ("Second Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ("Third Benchmark", dict(value="MTEB(something)", icon="icon_url")), + ], + ), +] +""" +BENCHMARK_ENTRIES = [ + ( + "Multilingual", + dict( + value="MTEB(Multilingual, v1)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-globe.svg", + ), + ), + ( + "English", + dict( + value="MTEB(eng, v2)", + icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", + ), + ), + ( + "Image Benchmarks", + [ + ( + "Images, Multilingual", + dict( + value="MIEB(Multilingual)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-pictures.svg", + ), + ), + ( + "Images, English", + dict( + value="MIEB(eng)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-picture.svg", + ), + ), + ( + "Images, Lite", + dict( + value="MIEB(lite)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-landscape.svg", + ), + ), + ], + ), + ( + "Domain-Specific Benchmarks", + [ + ( + "Code", + dict( + value="MTEB(Code, v1)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", + ), + ), + ( + "Legal", + dict( + value="MTEB(Law, v1)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", + ), + ), + ( + "Medical", + dict( + value="MTEB(Medical, v1)", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", + ), + ), + ( + "Chemical", + dict( + value="ChemTEB", + 
icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-purge.svg", + ), + ), + ], + ), + ( + "Regional Benchmarks", + [ + ( + "European", + dict( + value="MTEB(Europe, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/eu.svg", + ), + ), + ( + "Indic", + dict( + value="MTEB(Indic, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/in.svg", + ), + ), + ( + "Scandinavian", + dict( + value="MTEB(Scandinavian, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/dk.svg", + ), + ), + ], + ), + ( + "Language-specific Benchmarks", + [ + ( + "Chinese", + dict( + value="MTEB(cmn, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/cn.svg", + ), + ), + ( + "German", + dict( + value="MTEB(deu, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", + ), + ), + ( + "French", + dict( + value="MTEB(fra, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", + ), + ), + ( + "Japanese", + dict( + value="MTEB(jpn, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", + ), + ), + ( + "Korean", + dict( + value="MTEB(kor, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/kr.svg", + ), + ), + ( + "Polish", + dict( + value="MTEB(pol, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/pl.svg", + ), + ), + ( + "Russian", + dict( + value="MTEB(rus, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ru.svg", + ), + ), + ( + "Farsi (BETA)", + dict( + value="MTEB(fas, beta)", + 
icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/ir.svg", + ), + ), + ], + ), + ( + "Miscellaneous", + [ + ("BEIR", dict(value="BEIR", icon=None)), + ("BEIR-NL", dict(value="BEIR-NL", icon=None)), + ("BRIGHT", dict(value="BRIGHT", icon=None)), + ("BRIGHT (long)", dict(value="BRIGHT (long)", icon=None)), + ("BuiltBench (eng)", dict(value="BuiltBench (eng)", icon=None)), + ("Code Information Retrieval", dict(value="CoIR", icon=None)), + ("Instruction Following", dict(value="FollowIR", icon=None)), + ("Long-context Retrieval", dict(value="LongEmbed", icon=None)), + ("MINERSBitextMining", dict(value="MINERSBitextMining", icon=None)), + ("NanoBEIR", dict(value="NanoBEIR", icon=None)), + ("Reasoning retrieval", dict(value="RAR-b", icon=None)), + ], + ), + ( + "Legacy", + [ + ( + "English Legacy", + dict( + value="MTEB(eng, v1)", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/gb.svg", + ), + ), + ], + ), +] + + +def _create_button(i, label, entry, state, label_to_value, **kwargs): + val = entry["value"] + label_to_value[label] = val + button = gr.Button( + label, + variant="secondary" if i != 0 else "primary", + icon=entry["icon"], + key=f"{i}_button_{val}", + elem_classes="text-white", + **kwargs, + ) + + def _update_variant(state, label) -> gr.Button: + if state == label_to_value[label]: + return gr.Button(variant="primary") + else: + return gr.Button(variant="secondary") + + def _update_value(label) -> str: + return label_to_value[label] + + state.change(_update_variant, inputs=[state, button], outputs=[button]) + button.click(_update_value, outputs=[state], inputs=[button]) + return button + + +def make_selector(entries: list[tuple[str, dict | list]]) -> tuple[gr.State, gr.Column]: + if not entries: + raise ValueError("No entries were specified, can't build selector.") + label_to_value = {} + state = None + with gr.Column() as column: + for i, (label, entry) in 
enumerate(entries): + if i == 0: + if isinstance(entry, dict): + state = gr.State(entry["value"]) + else: + _label, _entry = entry[0] + state = gr.State(_entry["value"]) + if isinstance(entry, dict): + button = _create_button( + i, label, entry, state, label_to_value, size="lg" + ) + else: + gr.Markdown(f"### **{label}**") + for sub_label, sub_entry in entry: + button = _create_button( + i, sub_label, sub_entry, state, label_to_value, size="md" + ) + + return state, column diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py index 57a282327c..6a945346f2 100644 --- a/mteb/leaderboard/figures.py +++ b/mteb/leaderboard/figures.py @@ -147,6 +147,7 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: "model_text": False, }, hover_name="Model", + color_continuous_scale=px.colors.sequential.Greens, ) # Note: it's important that this comes before setting the size mode fig = add_size_guide(fig) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index b848406ba5..fbc01496e8 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -61,6 +61,23 @@ def get_column_types(df: pd.DataFrame) -> list[str]: return types +def get_column_widths(df: pd.DataFrame) -> list[str]: + # Please do not remove this function when refactoring. + # Column width calculation seemingly changes regularly with Gradio releases, + # and this piece of logic is good enough to quickly fix related issues. 
+ widths = [] + for column_name in df.columns: + column_word_lengths = [len(word) for word in column_name.split()] + if is_numeric_dtype(df[column_name]): + value_lengths = [len(f"{value:.2f}") for value in df[column_name]] + else: + value_lengths = [len(str(value)) for value in df[column_name]] + max_length = max(max(column_word_lengths), max(value_lengths)) + n_pixels = 25 + (max_length * 10) + widths.append(f"{n_pixels}px") + return widths + + def get_means_per_types(per_task: pd.DataFrame): task_names_per_type = defaultdict(list) for task_name in per_task.columns: @@ -237,7 +254,6 @@ def apply_styling( ] light_green_cmap = create_light_green_cmap() numeric_data = joint_table.copy() - numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan) joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format( @@ -278,22 +294,40 @@ def apply_styling( per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") - for col in task_score_columns: - if col != "Model": - mask = per_task[col].notna() - per_task_style = per_task_style.background_gradient( - cmap=light_green_cmap, - subset=pd.IndexSlice[mask, col], - gmap=per_task[col].loc[mask], - ) + # TODO: uncomment this when Gradio fixes it. 
+ # The fix is already merged and contained in this release: https://github.com/gradio-app/gradio/pull/11032 + # It will be available in Gradio 5.25.3 + # for col in task_score_columns: + # if col != "Model": + # mask = per_task[col].notna() + # per_task_style = per_task_style.background_gradient( + # cmap=light_green_cmap, + # subset=pd.IndexSlice[mask, col], + # gmap=per_task[col].loc[mask], + # ) + column_widths = get_column_widths(joint_table_style.data) + column_widths[0] = "100px" + column_widths[1] = "250px" return ( gr.DataFrame( joint_table_style, datatype=column_types, interactive=False, pinned_columns=3, + column_widths=column_widths, + wrap=True, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", + ), + gr.DataFrame( + per_task_style, + interactive=False, + pinned_columns=1, + show_fullscreen_button=True, + show_copy_button=True, + show_search="filter", ), - gr.DataFrame(per_task_style, interactive=False, pinned_columns=1), ) diff --git a/pyproject.toml b/pyproject.toml index b07e5ed703..eaaba260e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -70,8 +70,7 @@ speedtask = [ ] peft = ["peft>=0.11.0"] leaderboard = [ - "gradio==5.16.0; python_version > '3.9'", # 3.10 is required for gradio - "pydantic<2.11", # remove with gradio bump https://github.com/embeddings-benchmark/mteb/issues/2523 + "gradio==5.17.1; python_version > '3.9'", # 3.10 is required for gradio "gradio_rangeslider>=0.0.8", "plotly>=5.24.0,<6.0.0", "cachetools>=5.2.0",