# Refactor split create_tables into static Benchmark methods #3126
New file added by this PR (256 lines); it builds the leaderboard summary and per-task tables directly from a `BenchmarkResults` object:

```python
from __future__ import annotations

import math
import re
from collections import defaultdict

import numpy as np
import pandas as pd

from mteb.load_results.benchmark_results import BenchmarkResults
from mteb.overview import get_task, get_tasks


def _borda_count(scores: pd.Series) -> pd.Series:
    n = len(scores)
    ranks = scores.rank(method="average", ascending=False)
    counts = n - ranks
    return counts


def _get_borda_rank(score_table: pd.DataFrame) -> pd.Series:
    borda_counts = score_table.apply(_borda_count, axis="index")
    mean_borda = borda_counts.sum(axis=1)
    return mean_borda.rank(method="min", ascending=False).astype(int)


def _split_on_capital(s: str) -> str:
    """Splits on capital letters and joins with spaces"""
    return " ".join(re.findall(r"[A-Z]?[a-z]+|[A-Z]+(?=[A-Z]|$)", s))


def _format_n_parameters(n_parameters) -> str:
    if (n_parameters is None) or (not int(n_parameters)):
        return "Unknown"
    n_thousand = int(n_parameters // 1e3)
    if n_thousand < 1:
        return str(int(n_parameters))
    n_zeros = math.log10(n_thousand)
    if n_zeros >= 6:
        return str(n_thousand // (10**6)) + "B"
    if n_zeros >= 3:
        return str(n_thousand // (10**3)) + "M"
    return str(n_thousand) + "K"


def _format_max_tokens(max_tokens: float | None) -> str:
    if max_tokens is None:
        return "Unknown"
    if max_tokens == np.inf:
        return "Infinite"
    return str(int(max_tokens))


def _failsafe_get_model_meta(model_name):
    try:
        from mteb.models.overview import get_model_meta

        return get_model_meta(model_name)
    except Exception:
        return None


def _get_means_per_types(per_task: pd.DataFrame):
    task_names_per_type = defaultdict(list)
    for task_name in per_task.columns:
        task_type = get_task(task_name).metadata.type
        task_names_per_type[task_type].append(task_name)
    records = []
    for task_type, tasks in task_names_per_type.items():
        for model_name, scores in per_task.iterrows():
            records.append(
                dict(
                    model_name=model_name,
                    task_type=task_type,
                    score=scores[tasks].mean(skipna=False),
                )
            )
    return pd.DataFrame.from_records(records)


def _create_summary_table_from_benchmark_results(
    benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
    """Create summary table from BenchmarkResults.

    Returns a DataFrame with one row per model containing summary statistics
    and task type averages.

    Args:
        benchmark_results: BenchmarkResults object containing model results

    Returns:
        DataFrame with model summaries, ready for styling in the leaderboard
    """
    data = benchmark_results.to_dataframe(format="long")

    if data.empty:
        no_results_frame = pd.DataFrame(
            {"No results": ["You can try relaxing your criteria"]}
        )
        return no_results_frame

    # Convert to DataFrame and pivot
    per_task = data.pivot(index="model_name", columns="task_name", values="score")

    # Remove models with no scores
    to_remove = per_task.isna().all(axis="columns")
    if to_remove.all():
        no_results_frame = pd.DataFrame(
            {"No results": ["You can try relaxing your criteria"]}
        )
        return no_results_frame

    models_to_remove = list(per_task[to_remove].index)
    per_task = per_task.drop(models_to_remove, axis=0)

    # Calculate means by task type
    mean_per_type = _get_means_per_types(per_task)
    mean_per_type = mean_per_type.pivot(
        index="model_name", columns="task_type", values="score"
    )
    mean_per_type.columns = [
        _split_on_capital(column) for column in mean_per_type.columns
    ]

    # Calculate overall means
    typed_mean = mean_per_type.mean(skipna=False, axis=1)
    overall_mean = per_task.mean(skipna=False, axis=1)

    # Build joint table
    joint_table = mean_per_type.copy()
    joint_table = joint_table.drop(models_to_remove, axis=0)
    joint_table.insert(0, "mean", overall_mean)
    joint_table.insert(1, "mean_by_task_type", typed_mean)
    joint_table["borda_rank"] = _get_borda_rank(per_task)
    joint_table = joint_table.sort_values("borda_rank", ascending=True)
    joint_table = joint_table.reset_index()

    # Add model metadata
    model_metas = joint_table["model_name"].map(_failsafe_get_model_meta)
    joint_table = joint_table[model_metas.notna()]
    joint_table["model_link"] = model_metas.map(lambda m: m.reference)

    # Insert model metadata columns
    joint_table.insert(
        1,
        "Max Tokens",
        model_metas.map(lambda m: _format_max_tokens(m.max_tokens)),
    )
    joint_table.insert(
        1,
        "Embedding Dimensions",
        model_metas.map(lambda m: str(int(m.embed_dim)) if m.embed_dim else "Unknown"),
    )
    joint_table.insert(
        1,
        "Number of Parameters",
        model_metas.map(lambda m: _format_n_parameters(m.n_parameters)),
    )
    joint_table.insert(
        1,
        "Memory Usage (MB)",
        model_metas.map(
            lambda m: str(int(m.memory_usage_mb)) if m.memory_usage_mb else "Unknown"
        ),
    )

    # Add zero-shot percentage
    tasks = get_tasks(tasks=list(data["task_name"].unique()))
    joint_table.insert(
        1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks))
    )
    joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1)

    # Clean up model names (remove HF organization)
    joint_table["model_name"] = joint_table["model_name"].map(
        lambda name: name.split("/")[-1]
    )

    # Add markdown links to model names
    name_w_link = (
        "[" + joint_table["model_name"] + "](" + joint_table["model_link"] + ")"
    )
    joint_table["model_name"] = joint_table["model_name"].mask(
        joint_table["model_link"].notna(), name_w_link
    )
    joint_table = joint_table.drop(columns=["model_link"])

    # Rename columns
    joint_table = joint_table.rename(
        columns={
            "model_name": "Model",
            "mean_by_task_type": "Mean (TaskType)",
            "mean": "Mean (Task)",
        }
    )

    # Move borda rank to front
    joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank"))

    return joint_table


def _create_per_task_table_from_benchmark_results(
    benchmark_results: BenchmarkResults,
) -> pd.DataFrame:
    """Create per-task table from BenchmarkResults.

    Returns a DataFrame with one row per model and one column per task.

    Args:
        benchmark_results: BenchmarkResults object containing model results

    Returns:
        DataFrame with per-task scores, ready for styling in the leaderboard
    """
    # Get scores in long format
    data = benchmark_results.to_dataframe(format="long")

    if data.empty:
        no_results_frame = pd.DataFrame(
            {"No results": ["You can try relaxing your criteria"]}
        )
        return no_results_frame

    # Convert to DataFrame and pivot
    per_task = data.pivot(index="model_name", columns="task_name", values="score")

    # Remove models with no scores
    to_remove = per_task.isna().all(axis="columns")
    if to_remove.all():
        no_results_frame = pd.DataFrame(
            {"No results": ["You can try relaxing your criteria"]}
        )
        return no_results_frame

    models_to_remove = list(per_task[to_remove].index)
    per_task = per_task.drop(models_to_remove, axis=0)

    # Add borda rank and sort
    per_task["borda_rank"] = _get_borda_rank(per_task)
    per_task = per_task.sort_values("borda_rank", ascending=True)
    per_task = per_task.drop(columns=["borda_rank"])
    per_task = per_task.reset_index()

    # Clean up model names (remove HF organization)
    per_task["model_name"] = per_task["model_name"].map(
        lambda name: name.split("/")[-1]
    )
    per_task = per_task.rename(
        columns={
            "model_name": "Model",
        }
    )

    return per_task
```
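As a quick orientation for reviewers, a minimal usage sketch of the two helpers above. The `mteb.load_results()` call is an assumption (any other way of obtaining a `BenchmarkResults` instance works the same) and is not part of this diff; the helpers are called directly here purely for illustration.

```python
import mteb

# Assumption: mteb.load_results() returns a BenchmarkResults instance; swap in
# whatever loader you normally use to obtain one.
benchmark_results = mteb.load_results()

summary_df = _create_summary_table_from_benchmark_results(benchmark_results)
per_task_df = _create_per_task_table_from_benchmark_results(benchmark_results)

# The summary frame is already sorted by Borda rank and carries the display columns.
print(summary_df[["Rank (Borda)", "Model", "Mean (Task)", "Mean (TaskType)"]].head())
print(per_task_df.head())
```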
KennethEnevoldsen marked this conversation as resolved.
Changes to the leaderboard app module (three hunks):

```diff
@@ -23,7 +23,10 @@
     make_selector,
 )
 from mteb.leaderboard.figures import performance_size_plot, radar_chart
-from mteb.leaderboard.table import create_tables
+from mteb.leaderboard.table import (
+    apply_per_task_styling_from_benchmark,
+    apply_summary_styling_from_benchmark,
+)
 from mteb.leaderboard.text_segments import ACKNOWLEDGEMENT, FAQ

 logger = logging.getLogger(__name__)
@@ -218,10 +221,21 @@ def get_leaderboard_app() -> gr.Blocks:
         max_model_size=MAX_MODEL_SIZE,
         zero_shot_setting="allow_all",
     )
-    summary_table, per_task_table = create_tables(
-        [entry for entry in default_scores if entry["model_name"] in filtered_models]
+    default_filtered_scores = [
+        entry for entry in default_scores if entry["model_name"] in filtered_models
+    ]
+
+    # Filter BenchmarkResults based on default filtered models (as required by Kenneth)
+    filtered_model_names = [entry["model_name"] for entry in default_filtered_scores]
+    filtered_benchmark_results = default_results.select_models(filtered_model_names)
+
+    summary_table = apply_summary_styling_from_benchmark(
+        default_benchmark, filtered_benchmark_results
     )
+    per_task_table = apply_per_task_styling_from_benchmark(
+        default_benchmark, filtered_benchmark_results
+    )

     lang_select = gr.Dropdown(
         LANGUAGE,
         value=sorted(default_results.languages),
@@ -751,19 +765,43 @@ def update_tables(
     tasks = set(tasks)
     benchmark = mteb.get_benchmark(benchmark_name)
     benchmark_tasks = {task.metadata.name for task in benchmark.tasks}
-    if (benchmark_tasks != tasks) or (models_to_keep is not None):
-        filtered_scores = []
-        for entry in scores:
-            if entry["task_name"] not in tasks:
-                continue
-            if (models_to_keep is not None) and (
-                entry["model_name"] not in models_to_keep
-            ):
-                continue
-            filtered_scores.append(entry)
-    else:
-        filtered_scores = scores
-    summary, per_task = create_tables(filtered_scores)
+
+    # Extract filtered model and task names from scores (respects UI filters)
+    filtered_model_names = set()
+    filtered_task_names = set()
+
+    for entry in scores:
+        if entry["task_name"] not in tasks:
+            continue
+        if (models_to_keep is not None) and (
+            entry["model_name"] not in models_to_keep
+        ):
+            continue
+        filtered_model_names.add(entry["model_name"])
+        filtered_task_names.add(entry["task_name"])
+
+    # Create filtered BenchmarkResults as required by Kenneth
+    benchmark_results = all_benchmark_results[benchmark_name]
+    filtered_benchmark_results = benchmark_results
+
+    # Apply task filtering if needed
+    if filtered_task_names != benchmark_tasks:
+        filtered_benchmark_results = filtered_benchmark_results.filter_tasks(
+            task_names=list(filtered_task_names)
+        )
+
+    # Apply model filtering if needed
+    if filtered_model_names:
+        filtered_benchmark_results = filtered_benchmark_results.select_models(
+            list(filtered_model_names)
+        )
+
+    summary = apply_summary_styling_from_benchmark(
+        benchmark, filtered_benchmark_results
+    )
+    per_task = apply_per_task_styling_from_benchmark(
+        benchmark, filtered_benchmark_results
+    )
     elapsed = time.time() - start_time
     logger.debug(f"update_tables callback: {elapsed}s")
     return summary, per_task
```

Review thread on the `# Create filtered BenchmarkResults as required by Kenneth` line:

- **Contributor:** Suggested change
- **Contributor:** not sure I understand?
- **Author (Collaborator):** Maybe one possible reason is that the original `create_tables` function returned both `summary_table` and `per_task` in a single pass.
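To make the new data flow concrete outside the Gradio callback, here is a rough sketch under stated assumptions: `mteb.load_results()` as the results loader plus the benchmark and model names are illustrative only, while `get_benchmark`, `filter_tasks`, `select_models`, and the two styling functions are the ones used in the diff above.

```python
import mteb

from mteb.leaderboard.table import (
    apply_per_task_styling_from_benchmark,
    apply_summary_styling_from_benchmark,
)

# Illustrative benchmark name; any benchmark known to mteb.get_benchmark works.
benchmark = mteb.get_benchmark("MTEB(eng, v2)")
benchmark_tasks = [task.metadata.name for task in benchmark.tasks]

# Assumption: mteb.load_results() returns a BenchmarkResults object.
benchmark_results = mteb.load_results()

# Narrow the results the same way update_tables now does: first by task, then by model.
filtered = benchmark_results.filter_tasks(task_names=benchmark_tasks)
filtered = filtered.select_models(["intfloat/multilingual-e5-small"])  # illustrative model

summary = apply_summary_styling_from_benchmark(benchmark, filtered)
per_task = apply_per_task_styling_from_benchmark(benchmark, filtered)
```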
I noticed a strange issue after moving the code into `_create_table.py`. Placing the import at the top of the module causes a circular import, even though the same import works fine in `benchmark.py`. For now, I applied lazy loading inside the function to avoid the problem. Do you have a better solution?
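For reference, this is the shape of the lazy-loading workaround as it appears in the new file: the import is deferred until call time and wrapped in a try/except, so a failure degrades to `None` instead of breaking the leaderboard.

```python
def _failsafe_get_model_meta(model_name):
    try:
        # Deferred ("lazy") import: resolved only when the function is first
        # called, by which point mteb.models.overview is fully initialized,
        # so the module-level circular import is avoided.
        from mteb.models.overview import get_model_meta

        return get_model_meta(model_name)
    except Exception:
        return None
```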
Hi @KennethEnevoldsen, could you please take a look when you have time?