From e17e9ce3755001de3cea3b9a855f518e4bbf894c Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 17:01:13 +0800 Subject: [PATCH 1/7] feat - Combine Plots and Tables into a Single Tab #3009 --- mteb/leaderboard/app.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 2dcb4d96be..03b8f04083 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -292,7 +292,7 @@ def get_leaderboard_app() -> gr.Blocks: scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(scale=1): + with gr.Column(): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], @@ -301,17 +301,6 @@ def get_leaderboard_app() -> gr.Blocks: citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Column(scale=2): - with gr.Tab("Performance per Model Size"): - plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 - gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" - ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 - gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" - ) with gr.Accordion("Customize this Benchmark", open=False): with gr.Column(): @@ -402,6 +391,18 @@ def get_leaderboard_app() -> gr.Blocks: open=False, ): gr.Markdown(FAQ) + + with gr.Tab("Performance per Model Size"): + plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all tasks in the benchmark*" + ) + with gr.Tab("Performance per Task Type (Radar Chart)"): + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + gr.Markdown( + "*We only display models that have been run on all task types in the benchmark*" + ) + with gr.Tab("Performance per task"): per_task_table.render() download_per_task = gr.DownloadButton("Download Table") From d7b1f3b1ae32f3b0a1d6825e9cf540724f9dc265 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:09:01 +0800 Subject: [PATCH 2/7] feat - Resize the plot to make it more readable --- mteb/leaderboard/app.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 03b8f04083..193fa4b514 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -392,16 +392,19 @@ def get_leaderboard_app() -> gr.Blocks: ): gr.Markdown(FAQ) - with gr.Tab("Performance per Model Size"): + with gr.Tab("Performance per Model Size") as plot_tab: plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all tasks in the benchmark*" ) - with gr.Tab("Performance per Task Type (Radar Chart)"): - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) + + with gr.Tab("Performance per Task Type (Radar Chart)") as radar_plot_tab: + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all task types in the 
benchmark*" ) + radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) with gr.Tab("Performance per task"): per_task_table.render() From 6d9c5b910f28660ef17b6dcaef35415a3e12faf2 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:09:45 +0800 Subject: [PATCH 3/7] feat - Remove the (radar chart) --- mteb/leaderboard/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 193fa4b514..f497447280 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -399,7 +399,7 @@ def get_leaderboard_app() -> gr.Blocks: ) plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) - with gr.Tab("Performance per Task Type (Radar Chart)") as radar_plot_tab: + with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display models that have been run on all task types in the benchmark*" From ee56b742db9daf855aceb40506280ff59781703a Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Mon, 18 Aug 2025 23:10:59 +0800 Subject: [PATCH 4/7] feat - Add a comment stating that it only shows the Top 5 models in the table. --- mteb/leaderboard/app.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index f497447280..53b0539f82 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -395,14 +395,14 @@ def get_leaderboard_app() -> gr.Blocks: with gr.Tab("Performance per Model Size") as plot_tab: plot = gr.Plot(performance_size_plot, inputs=[summary_table]) # noqa: F841 gr.Markdown( - "*We only display models that have been run on all tasks in the benchmark*" + "*We only display TOP 5 models that have been run on all tasks in the benchmark*" ) plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( - "*We only display models that have been run on all task types in the benchmark*" + "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) From b327a3d9af3c1e85ff38591244f7e57563479c45 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 19 Aug 2025 21:52:55 +0800 Subject: [PATCH 5/7] feat - adjust layout --- mteb/leaderboard/app.py | 168 ++++++++++++++++++++-------------------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 53b0539f82..2d0f0f2228 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -28,7 +28,6 @@ logger = logging.getLogger(__name__) - LANGUAGE: list[str] = list({l for t in mteb.get_tasks() for l in t.metadata.languages}) ALL_MODELS = {meta.name for meta in mteb.get_model_metas()} @@ -54,8 +53,9 @@ def produce_benchmark_link(benchmark_name: str, request: gr.Request) -> str: } ) base_url = request.request.base_url + md = "You can also share this benchmark using the following link:\n" url = f"{base_url}?{params}" - md = f"```\n{url}\n```" + md += f"```\n{url}\n```" return md @@ -73,7 +73,8 @@ def download_table(table: pd.DataFrame) -> str: def update_citation(benchmark_name: str) -> str: benchmark = mteb.get_benchmark(benchmark_name) if benchmark.citation is not None: - 
citation = f"```bibtex\n{benchmark.citation}\n```" + citation = "To cite this work, please use the following reference:\n" + citation += f"```bibtex\n{benchmark.citation}\n```" else: citation = "" return citation @@ -292,92 +293,93 @@ def get_leaderboard_app() -> gr.Blocks: scores = gr.State(default_scores) models = gr.State(filtered_models) with gr.Row(): - with gr.Column(): + with gr.Column(scale=1): description = gr.Markdown( # noqa: F841 update_description, inputs=[benchmark_select, lang_select, type_select, domain_select], ) - with gr.Accordion("Cite this benchmark:", open=False): + + with gr.Column(scale=1): + with gr.Accordion("Cite and share this benchmark:", open=False): citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 - with gr.Accordion("Share this benchmark:", open=False): gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Accordion("Customize this Benchmark", open=False): - with gr.Column(): - with gr.Row(): - type_select.render() - with gr.Row(): - domain_select.render() - with gr.Row(): - modality_select.render() - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - lang_select.render() - with gr.Row(elem_classes="overflow-y-scroll max-h-80"): - task_select.render() - - with gr.Accordion("Advanced Model Filters", open=False): - with gr.Group(): - with gr.Row(elem_classes=""): + with gr.Accordion("Customize this Benchmark", open=False, ): with gr.Column(): - compatibility = gr.CheckboxGroup( - [ - ( - "Should be sentence-transformers compatible", - "Sentence Transformers", + with gr.Row(): + type_select.render() + with gr.Row(): + domain_select.render() + with gr.Row(): + modality_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + lang_select.render() + with gr.Row(elem_classes="overflow-y-scroll max-h-80"): + task_select.render() + + with gr.Accordion("Advanced Model Filters", open=False): + with gr.Group(): + with gr.Row(elem_classes=""): + with gr.Column(): + compatibility = gr.CheckboxGroup( + [ + ( + "Should be sentence-transformers compatible", + "Sentence Transformers", + ) + ], + value=[], + label="Compatibility", + interactive=True, + ) + availability = gr.Radio( + [ + ("Only Open", True), + ("Only Proprietary", False), + ("Both", None), + ], + value=None, + label="Availability", + interactive=True, + ) + instructions = gr.Radio( + [ + ("Only Instruction-tuned", True), + ("Only non-instruction", False), + ("Both", None), + ], + value=None, + label="Instructions", + interactive=True, + ) + with gr.Column(): + zero_shot = gr.Radio( + [ + ( + "Only Zero-shot", + "only_zero_shot", + ), + ("Remove Unknown", "remove_unknown"), + ("Allow All", "allow_all"), + ], + value="allow_all", + label="Zero-shot", + interactive=True, + ) + + max_model_size = gr.Radio( + [ + ("<100M", 100), + ("<500M", 500), + ("<1B", 1000), + ("<5B", 5000), + ("<10B", 10000), + (">10B", MAX_MODEL_SIZE), + ], + value=MAX_MODEL_SIZE, + label="Model Parameters", + interactive=True, ) - ], - value=[], - label="Compatibility", - interactive=True, - ) - availability = gr.Radio( - [ - ("Only Open", True), - ("Only Proprietary", False), - ("Both", None), - ], - value=None, - label="Availability", - interactive=True, - ) - instructions = gr.Radio( - [ - ("Only Instruction-tuned", True), - ("Only non-instruction", False), - ("Both", None), - ], - value=None, - label="Instructions", - interactive=True, - ) - with gr.Column(): - zero_shot = gr.Radio( - [ - ( - "Only Zero-shot", - "only_zero_shot", - ), - ("Remove Unknown", 
"remove_unknown"), - ("Allow All", "allow_all"), - ], - value="allow_all", - label="Zero-shot", - interactive=True, - ) - - max_model_size = gr.Radio( - [ - ("<100M", 100), - ("<500M", 500), - ("<1B", 1000), - ("<5B", 5000), - ("<10B", 10000), - (">10B", MAX_MODEL_SIZE), - ], - value=MAX_MODEL_SIZE, - label="Model Parameters", - interactive=True, - ) with gr.Tab("Summary"): summary_table.render() @@ -387,8 +389,8 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Accordion( - "Frequently Asked Questions", - open=False, + "Frequently Asked Questions", + open=False, ): gr.Markdown(FAQ) @@ -400,7 +402,7 @@ def get_leaderboard_app() -> gr.Blocks: plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) with gr.Tab("Performance per Task Type") as radar_plot_tab: - radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 + radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) From 77a6f1345a0a9ef31249d442144d6adc7909cf97 Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 22 Aug 2025 16:27:05 +0200 Subject: [PATCH 6/7] Update mteb/leaderboard/app.py --- mteb/leaderboard/app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 2d0f0f2228..6c04c6a9c5 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -300,7 +300,7 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Column(scale=1): - with gr.Accordion("Cite and share this benchmark:", open=False): + with gr.Accordion("Cite and share this benchmark", open=False): citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) From ef5c57c1e3a4734b0cdd135179ecb1929a3c56fe Mon Sep 17 00:00:00 2001 From: Kenneth Enevoldsen Date: Fri, 22 Aug 2025 16:46:46 +0200 Subject: [PATCH 7/7] format --- mteb/leaderboard/app.py | 17 ++++++++++++----- mteb/leaderboard/benchmark_selector.py | 1 - 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 6c04c6a9c5..3c0921ab05 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -304,7 +304,10 @@ def get_leaderboard_app() -> gr.Blocks: citation = gr.Markdown(update_citation, inputs=[benchmark_select]) # noqa: F841 gr.Markdown(produce_benchmark_link, inputs=[benchmark_select]) - with gr.Accordion("Customize this Benchmark", open=False, ): + with gr.Accordion( + "Customize this Benchmark", + open=False, + ): with gr.Column(): with gr.Row(): type_select.render() @@ -389,8 +392,8 @@ def get_leaderboard_app() -> gr.Blocks: ) with gr.Accordion( - "Frequently Asked Questions", - open=False, + "Frequently Asked Questions", + open=False, ): gr.Markdown(FAQ) @@ -399,14 +402,18 @@ def get_leaderboard_app() -> gr.Blocks: gr.Markdown( "*We only display TOP 5 models that have been run on all tasks in the benchmark*" ) - plot_tab.select(performance_size_plot, inputs=[summary_table], outputs=[plot]) + plot_tab.select( + performance_size_plot, inputs=[summary_table], outputs=[plot] + ) with gr.Tab("Performance per Task Type") as radar_plot_tab: radar_plot = gr.Plot(radar_chart, inputs=[summary_table]) # noqa: F841 gr.Markdown( "*We only display TOP 5 models that have been run on all task types in the benchmark*" ) - radar_plot_tab.select(radar_chart, inputs=[summary_table], outputs=[radar_plot]) + radar_plot_tab.select( + 
radar_chart, inputs=[summary_table], outputs=[radar_plot] + ) with gr.Tab("Performance per task"): per_task_table.render() diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 4906e9126d..4ac31ccc28 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -7,7 +7,6 @@ import mteb from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual from mteb import Benchmark -from mteb.benchmarks.benchmarks import MTEB_multilingual DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name
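
Note for reviewers: patch 2 introduces (and patch 7 merely reformats) the `Tab.select` re-render hook for the plots that patch 1 moved out of the side column and into their own tabs, presumably so that a plot first drawn while its tab was hidden is laid out correctly once the tab is actually opened (the rationale given in patch 2: "Resize the plot to make it more readable"). The sketch below is a minimal, self-contained illustration of that pattern and is not part of the patch itself; `make_plot`, the sample data, and the component names are hypothetical stand-ins for `performance_size_plot` / `radar_chart` and `summary_table` in `mteb/leaderboard/app.py`.

```python
# Minimal sketch (illustrative only): re-render a plot when its tab is selected.
# Assumes gradio and plotly are installed; make_plot and the sample data are
# hypothetical stand-ins for performance_size_plot / radar_chart and summary_table.
import gradio as gr
import plotly.express as px


def make_plot(table: list[list[float]]):
    # Build a simple scatter of (model size, score) pairs from a 2-column table.
    sizes = [row[0] for row in table]
    scores = [row[1] for row in table]
    return px.scatter(
        x=sizes, y=scores, labels={"x": "Model size (M params)", "y": "Score"}
    )


with gr.Blocks() as demo:
    # Shared state standing in for the leaderboard's summary_table component.
    table_state = gr.State([[100, 0.61], [560, 0.66], [7000, 0.72]])

    with gr.Tab("Summary"):
        gr.Markdown("The summary table would render here.")

    with gr.Tab("Performance per Model Size") as plot_tab:
        # A callable value with `inputs=` is evaluated on load, as in the patch.
        plot = gr.Plot(make_plot, inputs=[table_state])
    # Re-render the plot whenever its tab is selected, so it reflects any state
    # changes (and gets a correct layout) after being hidden behind another tab.
    plot_tab.select(make_plot, inputs=[table_state], outputs=[plot])

if __name__ == "__main__":
    demo.launch()
```

The overall design mirrors the patch series: keeping the plots as sibling tabs of the summary and per-task tables replaces the two-column layout removed in patch 1, while the `select` handlers refresh each plot only when a user opens its tab.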