From 6228cd9748851e41495cee57f047779b4cfa816f Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Tue, 18 Mar 2025 14:55:17 +0530 Subject: [PATCH 1/6] Add Background Gradients in Summary and Task Table --- mteb/leaderboard/table.py | 67 ++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 19 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 3f11817273..350eb96339 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -196,32 +196,61 @@ def scores_to_tables( # setting model name column to markdown column_types[1] = "markdown" score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] - numeric_zero_shot = joint_table["Zero-shot"].copy().replace(-1, np.nan) + excluded_columns = [ + "Rank (Borda)", + "Model", + "Number of Parameters", + "Embedding Dimensions", + "Max Tokens", + ] + gradient_columns = { + col: "Greens" for col in joint_table.columns if col not in excluded_columns + } + numeric_data = joint_table.copy() + for col in score_columns + ["Zero-shot"]: + if col in numeric_data.columns: + numeric_data[col] = numeric_data[col].replace(-1, np.nan) joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) - joint_table_style = ( - joint_table.style.format( - { - **{column: "{:.2f}" for column in score_columns}, - "Rank (Borda)": "{:.0f}", - }, - na_rep="", - ) - .highlight_min("Rank (Borda)", props="font-weight: bold") - .highlight_max(subset=score_columns, props="font-weight: bold") - .background_gradient( - cmap="RdYlGn", - subset=["Zero-shot"], - vmin=50, - vmax=100, - gmap=numeric_zero_shot, - ) + joint_table_style = joint_table.style.format( + { + **{column: "{:.2f}" for column in score_columns}, + "Rank (Borda)": "{:.0f}", + }, + na_rep="⚠️ NA", ) + joint_table_style = joint_table_style.highlight_min( + "Rank (Borda)", props="font-weight: bold" + ).highlight_max(subset=score_columns, props="font-weight: bold") + + # Apply background gradients for each selected column + for col, cmap in gradient_columns.items(): + if col in joint_table.columns and numeric_data[col].notna().sum() > 0: + mask = numeric_data[col].notna() + if col != "Zero-shot": + gmap_values = numeric_data[col] * 100 + else: + gmap_values = numeric_data[col] + + joint_table_style = joint_table_style.background_gradient( + cmap=cmap, + subset=pd.IndexSlice[mask, col], + gmap=gmap_values.loc[mask], + ) task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 + per_task_numeric = per_task.copy() per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns, na_rep="" + "{:.2f}", subset=task_score_columns, na_rep="⚠️ NA" ).highlight_max(subset=task_score_columns, props="font-weight: bold") + for col in task_score_columns: + if col != "Model" and per_task_numeric[col].notna().sum() > 0: + mask = per_task_numeric[col].notna() + per_task_style = per_task_style.background_gradient( + cmap="Greens", + subset=pd.IndexSlice[mask, col], + gmap=per_task_numeric[col].loc[mask], + ) return ( gr.DataFrame( joint_table_style, From c38e70de9641dff3e3f16f086815ac63d29ac3c7 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Tue, 18 Mar 2025 18:47:19 +0530 Subject: [PATCH 2/6] Remove warnings and add light green cmap --- mteb/leaderboard/table.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 350eb96339..503841ee77 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -5,8 +5,10 @@ from collections import defaultdict import gradio as gr +import matplotlib.pyplot as plt import numpy as np import pandas as pd +from matplotlib.colors import LinearSegmentedColormap from pandas.api.types import is_numeric_dtype from mteb.models.overview import get_model_meta @@ -98,6 +100,17 @@ def format_zero_shot(zero_shot_percentage: int): return f"{zero_shot_percentage:.0f}%" +def create_light_green_cmap(): + cmap = plt.cm.get_cmap("Greens") + num_colors = 256 + half_colors = np.linspace(0, 0.5, num_colors) + half_cmap = [cmap(val) for val in half_colors] + light_green_cmap = LinearSegmentedColormap.from_list( + "LightGreens", half_cmap, N=256 + ) + return light_green_cmap + + def scores_to_tables( scores_long: list[dict], search_query: str | None = None ) -> tuple[gr.DataFrame, gr.DataFrame]: @@ -203,9 +216,10 @@ def scores_to_tables( "Embedding Dimensions", "Max Tokens", ] - gradient_columns = { - col: "Greens" for col in joint_table.columns if col not in excluded_columns - } + gradient_columns = [ + col for col in joint_table.columns if col not in excluded_columns + ] + light_green_cmap = create_light_green_cmap() numeric_data = joint_table.copy() for col in score_columns + ["Zero-shot"]: if col in numeric_data.columns: @@ -217,21 +231,22 @@ def scores_to_tables( **{column: "{:.2f}" for column in score_columns}, "Rank (Borda)": "{:.0f}", }, - na_rep="⚠️ NA", + na_rep="", ) joint_table_style = joint_table_style.highlight_min( "Rank (Borda)", props="font-weight: bold" ).highlight_max(subset=score_columns, props="font-weight: bold") # Apply background gradients for each selected column - for col, cmap in gradient_columns.items(): + for col in gradient_columns: if col in joint_table.columns and numeric_data[col].notna().sum() > 0: mask = numeric_data[col].notna() if col != "Zero-shot": gmap_values = numeric_data[col] * 100 + cmap = light_green_cmap else: gmap_values = numeric_data[col] - + cmap = "Greens" joint_table_style = joint_table_style.background_gradient( cmap=cmap, subset=pd.IndexSlice[mask, col], @@ -241,13 +256,13 @@ def scores_to_tables( per_task[task_score_columns] *= 100 per_task_numeric = per_task.copy() per_task_style = per_task.style.format( - "{:.2f}", subset=task_score_columns, na_rep="⚠️ NA" + "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") for col in task_score_columns: if col != "Model" and per_task_numeric[col].notna().sum() > 0: mask = per_task_numeric[col].notna() per_task_style = per_task_style.background_gradient( - cmap="Greens", + cmap=light_green_cmap, subset=pd.IndexSlice[mask, col], gmap=per_task_numeric[col].loc[mask], ) From 13b835a15cfc70cfea555b0a144cfb5f46a948c7 Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Mon, 24 Mar 2025 10:40:04 +0530 Subject: [PATCH 3/6] Address comments --- mteb/leaderboard/table.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 503841ee77..353cedae91 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -221,9 +221,7 @@ def scores_to_tables( ] light_green_cmap = create_light_green_cmap() numeric_data = joint_table.copy() - for col in score_columns + ["Zero-shot"]: - if col in numeric_data.columns: - numeric_data[col] = numeric_data[col].replace(-1, np.nan) + numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan) joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format( @@ -239,7 +237,7 @@ def scores_to_tables( # Apply background gradients for each selected column for col in gradient_columns: - if col in joint_table.columns and numeric_data[col].notna().sum() > 0: + if col in joint_table.columns: mask = numeric_data[col].notna() if col != "Zero-shot": gmap_values = numeric_data[col] * 100 @@ -254,17 +252,16 @@ def scores_to_tables( ) task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 - per_task_numeric = per_task.copy() per_task_style = per_task.style.format( "{:.2f}", subset=task_score_columns, na_rep="" ).highlight_max(subset=task_score_columns, props="font-weight: bold") for col in task_score_columns: - if col != "Model" and per_task_numeric[col].notna().sum() > 0: - mask = per_task_numeric[col].notna() + if col != "Model": + mask = per_task[col].notna() per_task_style = per_task_style.background_gradient( cmap=light_green_cmap, subset=pd.IndexSlice[mask, col], - gmap=per_task_numeric[col].loc[mask], + gmap=per_task[col].loc[mask], ) return ( gr.DataFrame( From 8fbd9fd7d69e1420454c867cc2874beae6bf318b Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Tue, 25 Mar 2025 00:03:25 +0530 Subject: [PATCH 4/6] Separate styling function --- mteb/leaderboard/table.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 353cedae91..3a12d18267 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -209,6 +209,16 @@ def scores_to_tables( # setting model name column to markdown column_types[1] = "markdown" score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] + + return apply_styling(joint_table, per_task, score_columns, column_types) + + +def apply_styling( + joint_table: pd.DataFrame, + per_task: pd.DataFrame, + score_columns: list[str], + column_types: list[str], +) -> tuple[gr.DataFrame, gr.DataFrame]: excluded_columns = [ "Rank (Borda)", "Model", From b9e7352edf3083213cf1739e2b94692c00f601ee Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Wed, 26 Mar 2025 22:42:54 +0530 Subject: [PATCH 5/6] address comments --- mteb/leaderboard/app.py | 6 +++--- mteb/leaderboard/table.py | 39 +++++++++++++++++++++++++++---------- scripts/make_leaderboard.py | 4 ++-- 3 files changed, 34 insertions(+), 15 deletions(-) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 143d925bcd..59b28771e9 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -21,7 +21,7 @@ from mteb.custom_validators import MODALITIES from mteb.languages import ISO_TO_LANGUAGE from mteb.leaderboard.figures import performance_size_plot, radar_chart -from mteb.leaderboard.table import scores_to_tables +from mteb.leaderboard.table import create_tables logging.getLogger("mteb.load_results.task_results").setLevel( logging.WARNING @@ -234,7 +234,7 @@ def filter_models( zero_shot_setting="allow_all", ) -summary_table, per_task_table = scores_to_tables( +summary_table, per_task_table = create_tables( [entry for entry in default_scores if entry["model_name"] in filtered_models] ) @@ -809,7 +809,7 @@ def update_tables( filtered_scores.append(entry) else: filtered_scores = scores - summary, per_task = scores_to_tables(filtered_scores, search_query) + summary, per_task = create_tables(filtered_scores, search_query) elapsed = time.time() - start_time logger.info(f"update_tables callback: {elapsed}s") return summary, per_task diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 3a12d18267..eaf7021590 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -111,9 +111,7 @@ def create_light_green_cmap(): return light_green_cmap -def scores_to_tables( - scores_long: list[dict], search_query: str | None = None -) -> tuple[gr.DataFrame, gr.DataFrame]: +def scores_to_tables(scores_long: list[dict], search_query: str | None = None): if not scores_long: no_results_frame = pd.DataFrame( {"No results": ["You can try relaxing your criteria"]} @@ -210,7 +208,7 @@ def scores_to_tables( column_types[1] = "markdown" score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns] - return apply_styling(joint_table, per_task, score_columns, column_types) + return joint_table, per_task, score_columns, column_types def apply_styling( @@ -252,14 +250,21 @@ def apply_styling( if col != "Zero-shot": gmap_values = numeric_data[col] * 100 cmap = light_green_cmap + joint_table_style = joint_table_style.background_gradient( + cmap=cmap, + subset=pd.IndexSlice[mask, col], + gmap=gmap_values.loc[mask], + ) else: gmap_values = numeric_data[col] - cmap = "Greens" - joint_table_style = joint_table_style.background_gradient( - cmap=cmap, - subset=pd.IndexSlice[mask, col], - gmap=gmap_values.loc[mask], - ) + cmap = "RdYlGn" + joint_table_style = joint_table_style.background_gradient( + cmap=cmap, + subset=pd.IndexSlice[mask, col], + vmin=50, + vmax=100, + gmap=gmap_values.loc[mask], + ) task_score_columns = per_task.select_dtypes("number").columns per_task[task_score_columns] *= 100 per_task_style = per_task.style.format( @@ -282,3 +287,17 @@ def apply_styling( ), gr.DataFrame(per_task_style, interactive=False, pinned_columns=1), ) + + +def create_tables( + scores_long: list[dict], search_query: str | None = None +) -> tuple[gr.DataFrame, gr.DataFrame]: + result = scores_to_tables(scores_long, search_query) + if len(result) == 2: + joint_table, per_task = result + return joint_table, per_task + joint_table, per_task, score_columns, column_types = result + summary_table, per_task_table = apply_styling( + joint_table, per_task, score_columns, column_types + ) + return summary_table, per_task_table diff --git a/scripts/make_leaderboard.py b/scripts/make_leaderboard.py index fed85e383f..4e322b3210 100644 --- a/scripts/make_leaderboard.py +++ b/scripts/make_leaderboard.py @@ -7,7 +7,7 @@ import pandas as pd import mteb -from mteb.leaderboard.table import scores_to_tables +from mteb.leaderboard.table import create_tables from mteb.load_results import load_results logging.basicConfig(level=logging.INFO) @@ -64,7 +64,7 @@ def load_leaderboard( scores_long = benchmark_results_filtered.get_scores(format="long") # Convert scores into leaderboard tables - summary_gr_df, per_task_gr_df = scores_to_tables(scores_long=scores_long) + summary_gr_df, per_task_gr_df = create_tables(scores_long=scores_long) # Convert Gradio DataFrames to Pandas summary_df = pd.DataFrame( From 721c466ba62d21615f2f19673965ca408ad03cad Mon Sep 17 00:00:00 2001 From: ayush1298 Date: Thu, 27 Mar 2025 18:31:25 +0530 Subject: [PATCH 6/6] added comments --- mteb/leaderboard/table.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index eaf7021590..3b209fac55 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -293,6 +293,7 @@ def create_tables( scores_long: list[dict], search_query: str | None = None ) -> tuple[gr.DataFrame, gr.DataFrame]: result = scores_to_tables(scores_long, search_query) + # dataframe with No Results is returned, so no need to apply styling if len(result) == 2: joint_table, per_task = result return joint_table, per_task