diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 5bbbbce6aa..045bdf4ca6 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -9,6 +9,7 @@
 
 import mteb
 from mteb.caching import json_cache
+from mteb.leaderboard.figures import performance_size_plot
 from mteb.leaderboard.table import scores_to_tables
 
 
@@ -32,11 +33,22 @@ def update_citation(benchmark_name: str) -> str:
     return citation
 
 
-def update_description(benchmark_name: str) -> str:
+def update_description(
+    benchmark_name: str, languages: list[str], task_types: list[str], domains: list[str]
+) -> str:
     benchmark = mteb.get_benchmark(benchmark_name)
     description = f"## {benchmark.name}\n{benchmark.description}\n"
+    n_languages = len(languages)
+    n_task_types = len(task_types)
+    n_tasks = len(benchmark.tasks)
+    n_domains = len(domains)
+    description += f" - **Number of languages**: {n_languages}\n"
+    description += f" - **Number of datasets**: {n_tasks}\n"
+    description += f" - **Number of task types**: {n_task_types}\n"
+    description += f" - **Number of domains**: {n_domains}\n"
     if str(benchmark.reference) != "None":
         description += f"\n[Click for More Info]({benchmark.reference})"
+
     return description
 
 
@@ -194,14 +206,21 @@ def update_task_info(task_names: str) -> str:
                 interactive=True,
             )
     scores = gr.State(default_scores)
-    description = gr.Markdown(update_description, inputs=[benchmark_select])
+    with gr.Row():
+        with gr.Column():
+            description = gr.Markdown(
+                update_description,
+                inputs=[benchmark_select, lang_select, type_select, domain_select],
+            )
+            citation = gr.Markdown(update_citation, inputs=[benchmark_select])
+        with gr.Column():
+            plot = gr.Plot(performance_size_plot, inputs=[summary_table])
     with gr.Tab("Summary"):
         summary_table.render()
     with gr.Tab("Performance per task"):
         per_task_table.render()
     with gr.Tab("Task information"):
         task_info_table = gr.DataFrame(update_task_info, inputs=[task_select])
-    citation = gr.Markdown(update_citation, inputs=[benchmark_select])
 
     @gr.on(inputs=[scores, searchbar], outputs=[summary_table, per_task_table])
     def update_tables(scores, search_query: str):
diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py
new file mode 100644
index 0000000000..2810eb0a30
--- /dev/null
+++ b/mteb/leaderboard/figures.py
@@ -0,0 +1,82 @@
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+
+def parse_n_params(text: str) -> int:
+    if text.endswith("M"):
+        return float(text[:-1]) * 1e6
+    if text.endswith("B"):
+        return float(text[:-1]) * 1e9
+
+
+def parse_model_name(name: str) -> str:
+    name, _ = name.split("]")
+    return name[1:]
+
+
+models_to_annotate = [
+    "all-MiniLM-L6-v2",
+    "GritLM-7B",
+    "LaBSE",
+    "multilingual-e5-large-instruct",
+]
+
+
+def performance_size_plot(df: pd.DataFrame) -> go.Figure:
+    df = df.copy()
+    df["Number of Parameters"] = df["Number of Parameters"].map(parse_n_params)
+    df["Model"] = df["Model"].map(parse_model_name)
+    df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "")
+    df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int)
+    df["Max Tokens"] = df["Max Tokens"].map(int)
+    df["Log(Tokens)"] = np.log10(df["Max Tokens"])
+    min_score, max_score = df["Mean (Task)"].min(), df["Mean (Task)"].max()
+    fig = px.scatter(
+        df,
+        x="Number of Parameters",
+        y="Mean (Task)",
+        log_x=True,
+        template="plotly_white",
+        text="model_text",
+        size="Embedding Dimensions",
+        color="Log(Tokens)",
+        range_color=[2, 5],
+        range_x=[8 * 1e6, 11 * 1e9],
+        range_y=[min(0, min_score * 1.25), max_score * 1.25],
+        hover_data={
+            "Max Tokens": True,
+            "Embedding Dimensions": True,
+            "Number of Parameters": True,
+            "Mean (Task)": True,
+            "Rank (Borda)": True,
+            "Log(Tokens)": False,
+            "model_text": False,
+        },
+        hover_name="Model",
+    )
+    fig.update_layout(
+        coloraxis_colorbar=dict(
+            title="Max Tokens",
+            tickvals=[2, 3, 4, 5],
+            ticktext=[
+                "100",
+                "1K",
+                "10K",
+                "100K",
+            ],
+        ),
+        hoverlabel=dict(
+            bgcolor="white",
+            font_size=16,
+        ),
+    )
+    fig.update_traces(
+        textposition="top center",
+    )
+    fig.update_layout(
+        font=dict(size=16, color="black"),
+        margin=dict(b=20, t=10, l=20, r=10),
+    )
+    return fig
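
A minimal way to smoke-test the new figure in isolation (illustrative only, not part of the diff: the column names and value formats are inferred from performance_size_plot above, and the numbers are made up):

import pandas as pd

from mteb.leaderboard.figures import performance_size_plot

# Toy summary table: "Number of Parameters" uses the "23M"/"7B" strings parse_n_params
# expects, and "Model" uses the markdown links parse_model_name strips.
toy = pd.DataFrame(
    {
        "Model": [
            "[all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2)",
            "[GritLM-7B](https://huggingface.co/GritLM/GritLM-7B)",
        ],
        "Number of Parameters": ["23M", "7B"],
        "Embedding Dimensions": [384, 4096],
        "Max Tokens": [512, 4096],
        "Mean (Task)": [56.1, 66.8],  # scores on the 0-100 scale the summary table displays
        "Rank (Borda)": [2, 1],
    }
)
performance_size_plot(toy).show()
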
diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py
index ad4510969f..bc8103077c 100644
--- a/mteb/leaderboard/table.py
+++ b/mteb/leaderboard/table.py
@@ -117,11 +117,6 @@ def scores_to_tables(
     joint_table = joint_table.drop(columns=["model_revision"])
     model_metas = joint_table["model_name"].map(get_model_meta)
     joint_table["model_link"] = model_metas.map(lambda m: m.reference)
-    # joint_table.insert(
-    #     1,
-    #     "Rank (Mean)",
-    #     joint_table["mean"].rank(ascending=False, method="min").astype(int),
-    # )
     joint_table.insert(
         1,
         "Max Tokens",
@@ -163,36 +158,32 @@
         }
     )
     joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))
-    to_format = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
-    joint_table[to_format] = joint_table[to_format].map(format_scores)
-    joint_table = joint_table.style.highlight_max(
-        subset=to_format,
-        props="font-weight: bold",
-    )
-    joint_table = joint_table.format(
-        "{:.2f}", subset=joint_table.data.select_dtypes("number").columns
-    )
-    joint_table = joint_table.format("{:,}", subset=["Rank (Borda)"])
-    joint_table = joint_table.highlight_min(
-        subset=["Rank (Borda)"], props="font-weight: bold"
-    )
-    numerics = per_task.select_dtypes("number").columns
-    per_task[numerics] = per_task[numerics].map(format_scores)
-    per_task = per_task.style.highlight_max(
-        subset=numerics, props="font-weight: bold"
-    ).format("{:.2f}", subset=numerics)
-    column_widths = get_column_widths(joint_table.data)
+    column_widths = get_column_widths(joint_table)
     # overriding for model name
     column_widths[1] = "250px"
-    column_types = get_column_types(joint_table.data)
+    column_types = get_column_types(joint_table)
     # setting model name column to markdown
     column_types[1] = "markdown"
+    score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
+    joint_table[score_columns] *= 100
+    joint_table_style = (
+        joint_table.style.format(
+            {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
+        )
+        .highlight_min("Rank (Borda)", props="font-weight: bold")
+        .highlight_max(subset=score_columns, props="font-weight: bold")
+    )
+    task_score_columns = per_task.select_dtypes("number").columns
+    per_task[task_score_columns] *= 100
+    per_task_style = per_task.style.format(
+        "{:.2f}", subset=task_score_columns
+    ).highlight_max(subset=task_score_columns, props="font-weight: bold")
     return (
         gr.DataFrame(
-            joint_table,
-            column_widths=column_widths,
+            joint_table_style,
+            # column_widths=column_widths,
             datatype=column_types,
             wrap=True,
         ),
-        gr.DataFrame(per_task),
+        gr.DataFrame(per_task_style),
     )
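
For reference, a sketch of the Styler pattern the table code now relies on, applied to a toy frame (illustrative only; the column names mirror the summary table, the values are invented). The resulting Styler is what gets handed straight to gr.DataFrame above:

import pandas as pd

toy = pd.DataFrame(
    {
        "Rank (Borda)": [1, 2],
        "Model": ["[GritLM-7B](https://example.org)", "[LaBSE](https://example.org)"],
        "Mean (Task)": [0.668, 0.561],
        "Mean (TaskType)": [0.645, 0.542],
    }
)
score_columns = ["Mean (Task)", "Mean (TaskType)"]
toy[score_columns] *= 100  # show scores on a 0-100 scale
styled = (
    toy.style.format(
        {**{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}"}
    )
    .highlight_min("Rank (Borda)", props="font-weight: bold")  # bold the best rank
    .highlight_max(subset=score_columns, props="font-weight: bold")  # bold the best score per column
)
print(styled.to_html())
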
["GPUtil>=1.4.0", "psutil>=5.9.8"] peft = ["peft>=0.11.0"] -leaderboard = ["gradio>=4.44.0", "gradio_rangeslider>=0.0.6"] +leaderboard = ["gradio>=5.5.0", "gradio_rangeslider>=0.0.8"] flagembedding = ["FlagEmbedding"] jina = ["einops>=0.8.0"] flash_attention = ["flash-attn>=2.6.3"]