diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index 01a5784757..9c24c525ac 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -7,9 +7,7 @@
 from pydantic import AnyUrl, BeforeValidator, TypeAdapter
 
 from mteb.abstasks.AbsTask import AbsTask
-from mteb.load_results.benchmark_results import (
-    BenchmarkResults,
-)
+from mteb.load_results.benchmark_results import BenchmarkResults
 from mteb.load_results.load_results import load_results
 from mteb.overview import get_tasks
 
@@ -63,6 +61,7 @@ def load_results(
             base_results = load_results()
         return base_results.select_tasks(self.tasks)
 
+
 MTEB_MAIN_EN = Benchmark(
     name="MTEB(eng)",
     tasks=get_tasks(
diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 770445b286..7d49d009d1 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -7,7 +7,7 @@
 from gradio_rangeslider import RangeSlider
 
 import mteb
-from mteb.leaderboard.table import scores_to_table
+from mteb.leaderboard.table import scores_to_tables
 
 
 def load_results():
@@ -27,7 +27,7 @@
 
 benchmarks = mteb.get_benchmarks()
 
-default_benchmark = mteb.get_benchmark("MTEB(multilingual)")
+default_benchmark = mteb.get_benchmark("MTEB(Multilingual)")
 default_results = default_benchmark.load_results(base_results=all_results)
 
 benchmark_select = gr.Dropdown(
@@ -60,6 +60,7 @@
 task_select = gr.Dropdown(
     default_results.task_names,
     value=default_results.task_names,
+    allow_custom_value=True,
     multiselect=True,
     label="Task",
     info="Select specific tasks to include",
@@ -79,44 +80,46 @@
                 """,
             )
             with gr.Group():
-                availability = gr.Radio(
-                    [
-                        ("Only Open", True),
-                        ("Only Proprietary", False),
-                        ("Both", None),
-                    ],
-                    value=None,
-                    label="Availability",
-                    interactive=True,
-                )
-                compatibility = gr.CheckboxGroup(
-                    [
-                        (
-                            "Should be sentence-transformers compatible",
-                            "sbert_compatible",
+                with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
+                    with gr.Column():
+                        availability = gr.Radio(
+                            [
+                                ("Only Open", True),
+                                ("Only Proprietary", False),
+                                ("Both", None),
+                            ],
+                            value=None,
+                            label="Availability",
+                            interactive=True,
+                        )
+                        compatibility = gr.CheckboxGroup(
+                            [
+                                (
+                                    "Should be sentence-transformers compatible",
+                                    "sbert_compatible",
+                                )
+                            ],
+                            value=[],
+                            label="Compatibility",
+                            interactive=True,
+                        )
+                        instructions = gr.Radio(
+                            [
+                                ("Only Instruction-tuned", True),
+                                ("Only non-instruction", False),
+                                ("Both", None),
+                            ],
+                            value=None,
+                            label="Instructions",
+                            interactive=True,
+                        )
+                        model_size = RangeSlider(
+                            minimum=0,
+                            maximum=8000,
+                            value=(0, 8000),
+                            label="Model Size (#M Parameters)",
+                            interactive=True,
                         )
-                    ],
-                    value=[],
-                    label="Compatibility",
-                    interactive=True,
-                )
-                instructions = gr.Radio(
-                    [
-                        ("Only Instruction-tuned", True),
-                        ("Only non-instruction", False),
-                        ("Both", None),
-                    ],
-                    value=None,
-                    label="Instructions",
-                    interactive=True,
-                )
-                model_size = RangeSlider(
-                    minimum=0,
-                    maximum=8000,
-                    value=(0, 8000),
-                    label="Model Size (#M Parameters)",
-                    interactive=True,
-                )
         with gr.Column(scale=2):
             gr.Markdown(
                 """
@@ -126,7 +129,7 @@
                 """
             )
             with gr.Group():
-                with gr.Row(elem_classes="overflow-y-scroll h-80"):
+                with gr.Row(elem_classes="overflow-y-scroll max-h-80"):
                     with gr.Column():
                         benchmark_select.render()
                         with gr.Accordion("Select Languages", open=False):
@@ -135,13 +138,20 @@
                             lang_select.render()
                         type_select.render()
                         with gr.Accordion("Select Domains", open=False):
                             domain_select.render()
-                        # with gr.Accordion("Add and remove tasks:", open=False):
-                        task_select.render()
-    scores = gr.State(default_results.get_scores(format="long"))
-    dataframe = gr.DataFrame(
-        scores_to_table,
-        inputs=[scores],
-    )
+                        with gr.Accordion("Add and remove tasks:", open=False):
+                            task_select.render()
+    default_scores = default_results.get_scores(format="long")
+    scores = gr.State(default_scores)
+    summary, per_task = scores_to_tables(default_scores)
+    with gr.Tab("Summary"):
+        summary_table = gr.DataFrame(summary)
+    with gr.Tab("Performance per task"):
+        per_task_table = gr.DataFrame(per_task)
+
+    @gr.on(inputs=[scores], outputs=[summary_table, per_task_table])
+    def update_tables(scores):
+        summary, per_task = scores_to_tables(scores)
+        return summary, per_task
     @gr.on(
         inputs=[benchmark_select],
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index b2ec0384f8..570d5bc6dd 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -1,12 +1,19 @@
 from __future__ import annotations
 
+import gradio as gr
 import numpy as np
 import pandas as pd
 
 from mteb.overview import get_task
 
 
-def scores_to_table(scores_long: list[dict]):
+def format_scores(score: float) -> float:
+    return score * 100
+
+
+def scores_to_tables(scores_long: list[dict]):
+    if not scores_long:
+        return gr.DataFrame(), gr.DataFrame()
     data = pd.DataFrame.from_records(scores_long)
     data["task_type"] = data["task_name"].map(
         lambda task_name: get_task(task_name).metadata.type
@@ -27,19 +34,56 @@
     per_task = data.pivot(
         index=["model_name", "model_revision"], columns="task_name", values="score"
     )
+    to_remove = per_task.isna().any(axis="columns")
     overall_mean = (
         data.groupby(["model_name", "model_revision"])[["score"]]
         .agg(np.nanmean)
         .rename(columns={"score": "mean"})
     )
-    joint_table = overall_mean.join([typed_mean, mean_per_type, per_task]).reset_index()
+    per_task = per_task[~to_remove]
+    mean_per_type = mean_per_type[~to_remove]
+    overall_mean = overall_mean[~to_remove]
+    mean_rank = per_task.rank(ascending=False, numeric_only=True).mean(
+        axis=1, skipna=True
+    )
+    joint_table = overall_mean.join([typed_mean, mean_per_type])
+    joint_table.insert(0, "mean_rank", mean_rank)
+    joint_table = joint_table.reset_index()
     joint_table = joint_table.sort_values("mean", ascending=False)
+    joint_table["model_name"] = joint_table["model_name"].map(
+        lambda name: name.split("/")[-1]
+    )
     joint_table = joint_table.rename(
         columns={
             "model_name": "Model",
             "mean_by_task_type": "Mean by Task Type",
             "mean": "Mean",
+            "mean_rank": "Mean Rank",
         }
     )
     joint_table = joint_table.drop(columns=["model_revision"])
-    return joint_table
+    joint_table.insert(
+        0, "Rank", joint_table["Mean"].rank(ascending=False).map(int).map(str)
+    )
+    per_task = per_task.rename(
+        columns={
+            "model_name": "Model",
+        }
+    )
+    per_task = per_task.reset_index().drop(columns=["model_revision"])
+    numerics = joint_table.select_dtypes("number").columns
+    to_format = ["Mean", "Mean by Task Type", *mean_per_type.columns]
+    joint_table[to_format] = joint_table[to_format].map(format_scores)
+    joint_table = joint_table.style.highlight_max(
+        subset=to_format,
+        props="font-weight: bold",
+    ).format("{:.2f}", subset=numerics)
+    joint_table = joint_table.highlight_min(
+        subset=["Mean Rank"], props="font-weight: bold"
+    )
+    numerics = per_task.select_dtypes("number").columns
+    per_task[numerics] = per_task[numerics].map(format_scores)
+    per_task = per_task.style.highlight_max(
+        subset=numerics, props="font-weight: bold"
+    ).format("{:.2f}", subset=numerics)
+    return joint_table, per_task