From 6c73b58efa304d146841afcc89bb78b645634621 Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 1 Oct 2025 17:37:23 +0800 Subject: [PATCH 1/4] Refactor: Move zero-shot percentage calculation to the end of summary table creation which only apply to RTEB table. --- mteb/benchmarks/_create_table.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py index a517a36d62..daee842f6d 100644 --- a/mteb/benchmarks/_create_table.py +++ b/mteb/benchmarks/_create_table.py @@ -344,13 +344,6 @@ def _create_summary_table_mean_public_private( ), ) - # Add zero-shot percentage - tasks = get_tasks(tasks=list(data["task_name"].unique())) - joint_table.insert( - 1, "Zero-shot", model_metas.map(lambda m: m.zero_shot_percentage(tasks)) - ) - joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) - # Clean up model names (remove HF organization) joint_table["model_name"] = joint_table["model_name"].map( lambda name: name.split("/")[-1] @@ -379,6 +372,11 @@ def _create_summary_table_mean_public_private( # Move borda rank to front joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) + # Add zero-shot percentage at the end + tasks = get_tasks(tasks=list(data["task_name"].unique())) + joint_table["Zero-shot"] = model_metas.map(lambda m: m.zero_shot_percentage(tasks)) + joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) + return joint_table From 8fea520cc6fe628de6f4547b1d4880bd4164425c Mon Sep 17 00:00:00 2001 From: ethan Date: Wed, 1 Oct 2025 21:26:13 +0800 Subject: [PATCH 2/4] Update RTEB benchmark name from "RTEB(beta)" to "RTEB" for consistency in display. --- mteb/benchmarks/benchmarks/rteb_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index 7060456ba0..a652513603 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -11,7 +11,7 @@ }""" RTEB_MAIN = RtebBenchmark( - name="RTEB(beta)", + name="RTEB", display_name="RTEB Multilingual", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", tasks=get_tasks( From 756d836b123b93454576b59c2a3bdd640b46e646 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Sat, 4 Oct 2025 22:14:38 +0800 Subject: [PATCH 3/4] feat - RTEB(beta) --- mteb/benchmarks/benchmarks/rteb_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index a652513603..7060456ba0 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -11,7 +11,7 @@ }""" RTEB_MAIN = RtebBenchmark( - name="RTEB", + name="RTEB(beta)", display_name="RTEB Multilingual", icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", tasks=get_tasks( From 0e9dcf12d028dfb06dcfa396ae043aa346bf2b45 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Sat, 4 Oct 2025 22:47:22 +0800 Subject: [PATCH 4/4] feat - remove Zero-shot --- mteb/benchmarks/_create_table.py | 5 ----- mteb/leaderboard/app.py | 14 ++++++++++++++ mteb/leaderboard/table.py | 3 ++- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/mteb/benchmarks/_create_table.py b/mteb/benchmarks/_create_table.py index daee842f6d..3e6a503652 100644 --- a/mteb/benchmarks/_create_table.py +++ b/mteb/benchmarks/_create_table.py @@ -372,11 +372,6 @@ def _create_summary_table_mean_public_private( # Move borda rank to front joint_table.insert(0, "Rank (Borda)", joint_table.pop("borda_rank")) - # Add zero-shot percentage at the end - tasks = get_tasks(tasks=list(data["task_name"].unique())) - joint_table["Zero-shot"] = model_metas.map(lambda m: m.zero_shot_percentage(tasks)) - joint_table["Zero-shot"] = joint_table["Zero-shot"].fillna(-1) - return joint_table diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e162b01aeb..98e956c5e7 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -16,6 +16,7 @@ import mteb from mteb.abstasks.TaskMetadata import TASK_DOMAIN, TASK_TYPE +from mteb.benchmarks.benchmark import RtebBenchmark from mteb.custom_validators import MODALITIES from mteb.leaderboard.benchmark_selector import ( DEFAULT_BENCHMARK_NAME, @@ -196,6 +197,14 @@ def filter_models( return list(models_to_keep) +def should_show_zero_shot_filter(benchmark_name: str) -> bool: + benchmark = mteb.get_benchmark(benchmark_name) + + if isinstance(benchmark, RtebBenchmark): + return False + return True + + def get_leaderboard_app() -> gr.Blocks: logger.info("Loading all benchmark results") all_results = load_results() @@ -479,6 +488,8 @@ def on_benchmark_select(benchmark_name): benchmark_results = all_benchmark_results[benchmark_name] scores = benchmark_results.get_scores(format="long") logger.debug(f"on_benchmark_select callback: {elapsed}s") + show_zero_shot = should_show_zero_shot_filter(benchmark_name) + return ( languages, domains, @@ -486,6 +497,7 @@ def on_benchmark_select(benchmark_name): modalities, sorted([task.metadata.name for task in benchmark.tasks]), scores, + gr.update(visible=show_zero_shot), ) benchmark_select.change( @@ -498,6 +510,7 @@ def on_benchmark_select(benchmark_name): modality_select, task_select, scores, + zero_shot, ], ) @@ -839,6 +852,7 @@ def update_tables( bench_modalities, bench_tasks, bench_scores, + zero_shot, ) = on_benchmark_select(benchmark.name) filtered_models = update_models( bench_scores, diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index 732e10d803..3d085de7e8 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -138,7 +138,8 @@ def _apply_summary_table_styling(joint_table: pd.DataFrame) -> gr.DataFrame: numeric_data = joint_table.copy() # Format data for display - joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) + if "Zero-shot" in joint_table.columns: + joint_table["Zero-shot"] = joint_table["Zero-shot"].apply(format_zero_shot) joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format(