6 changes: 3 additions & 3 deletions mteb/leaderboard/app.py
@@ -21,7 +21,7 @@
from mteb.custom_validators import MODALITIES
from mteb.languages import ISO_TO_LANGUAGE
from mteb.leaderboard.figures import performance_size_plot, radar_chart
from mteb.leaderboard.table import scores_to_tables
from mteb.leaderboard.table import create_tables

logging.getLogger("mteb.load_results.task_results").setLevel(
logging.WARNING
@@ -234,7 +234,7 @@ def filter_models(
zero_shot_setting="allow_all",
)

summary_table, per_task_table = scores_to_tables(
summary_table, per_task_table = create_tables(
[entry for entry in default_scores if entry["model_name"] in filtered_models]
)

@@ -809,7 +809,7 @@ def update_tables(
filtered_scores.append(entry)
else:
filtered_scores = scores
summary, per_task = scores_to_tables(filtered_scores, search_query)
summary, per_task = create_tables(filtered_scores, search_query)
elapsed = time.time() - start_time
logger.info(f"update_tables callback: {elapsed}s")
return summary, per_task
113 changes: 92 additions & 21 deletions mteb/leaderboard/table.py
@@ -5,8 +5,10 @@
from collections import defaultdict

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
from pandas.api.types import is_numeric_dtype

from mteb.models.overview import get_model_meta
@@ -98,9 +100,18 @@ def format_zero_shot(zero_shot_percentage: int):
return f"{zero_shot_percentage:.0f}%"


def scores_to_tables(
scores_long: list[dict], search_query: str | None = None
) -> tuple[gr.DataFrame, gr.DataFrame]:
def create_light_green_cmap():
cmap = plt.cm.get_cmap("Greens")
num_colors = 256
half_colors = np.linspace(0, 0.5, num_colors)
half_cmap = [cmap(val) for val in half_colors]
light_green_cmap = LinearSegmentedColormap.from_list(
"LightGreens", half_cmap, N=256
)
return light_green_cmap


def scores_to_tables(scores_long: list[dict], search_query: str | None = None):
if not scores_long:
no_results_frame = pd.DataFrame(
{"No results": ["You can try relaxing your criteria"]}
@@ -196,32 +207,77 @@ def scores_to_tables(
# setting model name column to markdown
column_types[1] = "markdown"
score_columns = ["Mean (Task)", "Mean (TaskType)", *mean_per_type.columns]
numeric_zero_shot = joint_table["Zero-shot"].copy().replace(-1, np.nan)

return joint_table, per_task, score_columns, column_types


def apply_styling(
joint_table: pd.DataFrame,
per_task: pd.DataFrame,
score_columns: list[str],
column_types: list[str],
) -> tuple[gr.DataFrame, gr.DataFrame]:
excluded_columns = [
"Rank (Borda)",
"Model",
"Number of Parameters",
"Embedding Dimensions",
"Max Tokens",
]
gradient_columns = [
col for col in joint_table.columns if col not in excluded_columns
]
light_green_cmap = create_light_green_cmap()
numeric_data = joint_table.copy()
numeric_data["Zero-shot"] = numeric_data["Zero-shot"].replace(-1, np.nan)
joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot)
joint_table[score_columns] = joint_table[score_columns].map(format_scores)
joint_table_style = (
joint_table.style.format(
{
**{column: "{:.2f}" for column in score_columns},
"Rank (Borda)": "{:.0f}",
},
na_rep="",
)
.highlight_min("Rank (Borda)", props="font-weight: bold")
.highlight_max(subset=score_columns, props="font-weight: bold")
.background_gradient(
cmap="RdYlGn",
subset=["Zero-shot"],
vmin=50,
vmax=100,
gmap=numeric_zero_shot,
)
joint_table_style = joint_table.style.format(
{
**{column: "{:.2f}" for column in score_columns},
"Rank (Borda)": "{:.0f}",
},
na_rep="",
)
joint_table_style = joint_table_style.highlight_min(
"Rank (Borda)", props="font-weight: bold"
).highlight_max(subset=score_columns, props="font-weight: bold")

# Apply background gradients for each selected column
for col in gradient_columns:
if col in joint_table.columns:
mask = numeric_data[col].notna()
if col != "Zero-shot":
gmap_values = numeric_data[col] * 100
cmap = light_green_cmap
joint_table_style = joint_table_style.background_gradient(
cmap=cmap,
subset=pd.IndexSlice[mask, col],
gmap=gmap_values.loc[mask],
)
else:
gmap_values = numeric_data[col]
cmap = "RdYlGn"
joint_table_style = joint_table_style.background_gradient(
cmap=cmap,
subset=pd.IndexSlice[mask, col],
vmin=50,
vmax=100,
gmap=gmap_values.loc[mask],
)
task_score_columns = per_task.select_dtypes("number").columns
per_task[task_score_columns] *= 100
per_task_style = per_task.style.format(
"{:.2f}", subset=task_score_columns, na_rep=""
).highlight_max(subset=task_score_columns, props="font-weight: bold")
for col in task_score_columns:
if col != "Model":
mask = per_task[col].notna()
per_task_style = per_task_style.background_gradient(
cmap=light_green_cmap,
subset=pd.IndexSlice[mask, col],
gmap=per_task[col].loc[mask],
)
return (
gr.DataFrame(
joint_table_style,
@@ -231,3 +287,18 @@ def scores_to_tables(
),
gr.DataFrame(per_task_style, interactive=False, pinned_columns=1),
)


def create_tables(
scores_long: list[dict], search_query: str | None = None
) -> tuple[gr.DataFrame, gr.DataFrame]:
result = scores_to_tables(scores_long, search_query)
# dataframe with No Results is returned, so no need to apply styling
if len(result) == 2:
joint_table, per_task = result
return joint_table, per_task
joint_table, per_task, score_columns, column_types = result
summary_table, per_task_table = apply_styling(
joint_table, per_task, score_columns, column_types
)
return summary_table, per_task_table
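
The core technique this file introduces is per-column cell shading with pandas `Styler.background_gradient`, driven by a truncated "Greens" colormap and a not-NaN mask so missing scores stay unstyled. The sketch below reproduces that pattern on a small synthetic DataFrame; the column names and values are illustrative only (not leaderboard data), and it assumes pandas >= 1.3 for the `gmap` argument and matplotlib >= 3.6 for the `colormaps` registry.

import numpy as np
import pandas as pd
from matplotlib import colormaps
from matplotlib.colors import LinearSegmentedColormap


def light_greens() -> LinearSegmentedColormap:
    # Keep only the lower (lighter) half of "Greens", as create_light_green_cmap does.
    greens = colormaps["Greens"]
    colors = [greens(v) for v in np.linspace(0.0, 0.5, 256)]
    return LinearSegmentedColormap.from_list("LightGreens", colors, N=256)


# Synthetic stand-in for the summary table: scores in 0..1, -1 marks unknown zero-shot.
df = pd.DataFrame(
    {
        "Model": ["model-a", "model-b", "model-c"],
        "Mean (Task)": [0.61, np.nan, 0.74],
        "Zero-shot": [100.0, 55.0, -1.0],
    }
)

numeric = df.copy()
numeric["Zero-shot"] = numeric["Zero-shot"].replace(-1, np.nan)

styler = df.style
for col in ["Mean (Task)", "Zero-shot"]:
    mask = numeric[col].notna()  # leave NaN cells unstyled
    if col == "Zero-shot":
        # Red-to-green scale anchored at 50-100%, as in apply_styling.
        styler = styler.background_gradient(
            cmap="RdYlGn",
            subset=pd.IndexSlice[mask, col],
            vmin=50,
            vmax=100,
            gmap=numeric[col].loc[mask],
        )
    else:
        # Light green scale; scores are rescaled to 0-100 before shading.
        styler = styler.background_gradient(
            cmap=light_greens(),
            subset=pd.IndexSlice[mask, col],
            gmap=(numeric[col] * 100).loc[mask],
        )

styler.to_html("styled_preview.html")  # quick visual check in a browser

Rendering to HTML is only a convenient way to eyeball the shading outside Gradio; in the leaderboard the Styler is handed to gr.DataFrame, as apply_styling does above.
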
4 changes: 2 additions & 2 deletions scripts/make_leaderboard.py
@@ -7,7 +7,7 @@
import pandas as pd

import mteb
from mteb.leaderboard.table import scores_to_tables
from mteb.leaderboard.table import create_tables
from mteb.load_results import load_results

logging.basicConfig(level=logging.INFO)
@@ -64,7 +64,7 @@ def load_leaderboard(
scores_long = benchmark_results_filtered.get_scores(format="long")

# Convert scores into leaderboard tables
summary_gr_df, per_task_gr_df = scores_to_tables(scores_long=scores_long)
summary_gr_df, per_task_gr_df = create_tables(scores_long=scores_long)

# Convert Gradio DataFrames to Pandas
summary_df = pd.DataFrame(