diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py
index f4b34233db..a240dd3b0f 100644
--- a/mteb/benchmarks/benchmarks/__init__.py
+++ b/mteb/benchmarks/benchmarks/__init__.py
@@ -46,6 +46,16 @@
     MTEB_multilingual_v2,
     RAR_b,
 )
+from mteb.benchmarks.benchmarks.rteb_benchmarks import (
+    RTEB_CODE,
+    RTEB_ENGLISH,
+    RTEB_FINANCE,
+    RTEB_FRENCH,
+    RTEB_GERMAN,
+    RTEB_HEALTHCARE,
+    RTEB_LEGAL,
+    RTEB_MAIN,
+)
 
 __all__ = [
     "Benchmark",
@@ -92,4 +102,12 @@
     "R2MED",
     "VN_MTEB",
     "JINA_VDR",
+    "RTEB_MAIN",
+    "RTEB_FINANCE",
+    "RTEB_LEGAL",
+    "RTEB_CODE",
+    "RTEB_HEALTHCARE",
+    "RTEB_ENGLISH",
+    "RTEB_FRENCH",
+    "RTEB_GERMAN",
 ]
diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py
index 3c0921ab05..0ec8b91fde 100644
--- a/mteb/leaderboard/app.py
+++ b/mteb/leaderboard/app.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import argparse
 import itertools
 import json
 import logging
@@ -20,6 +21,7 @@
 from mteb.leaderboard.benchmark_selector import (
     BENCHMARK_ENTRIES,
     DEFAULT_BENCHMARK_NAME,
+    RTEB_BENCHMARK_ENTRIES,
     make_selector,
 )
 from mteb.leaderboard.figures import performance_size_plot, radar_chart
@@ -190,7 +192,31 @@ def filter_models(
     return list(models_to_keep)
 
 
+def get_startup_arguments() -> argparse.Namespace:
+    """Parse the command-line flags understood by the leaderboard app.
+
+    Arguments that the parser does not recognize are ignored instead of
+    aborting, since the hosting process (e.g. a test runner) may have put
+    its own options in ``sys.argv``.
+
+    Returns:
+        Namespace whose ``show_rteb`` is True when ``--show_rteb`` was given.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--show_rteb",
+        action="store_true",
+        help="If set, display RTEB results; otherwise show default results.",
+    )
+    # parse_args() would exit the process on arguments it does not recognize.
+    known_args, _ = parser.parse_known_args()
+    return known_args
+
+
 def get_leaderboard_app() -> gr.Blocks:
+    args = get_startup_arguments()
+    show_rteb = args.show_rteb
+
     logger.info("Loading all benchmark results")
     all_results = load_results()
 
@@ -277,7 +303,12 @@ def get_leaderboard_app() -> gr.Blocks:
             visible=True,
             width="18%",
         ):
-            benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
+            if show_rteb:
+                benchmark_select, column = make_selector(
+                    BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES
+                )
+            else:
+                benchmark_select, column = make_selector(BENCHMARK_ENTRIES)
         gr.Markdown(
             """
         ## Embedding Leaderboard
diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py
index def6ab18f5..8bb10235d7 100644
--- a/mteb/leaderboard/benchmark_selector.py
+++ b/mteb/leaderboard/benchmark_selector.py
@@ -7,6 +7,16 @@
 import mteb
 from mteb import Benchmark
 from mteb.benchmarks.benchmarks import MTEB_multilingual_v2
+from mteb.benchmarks.benchmarks.rteb_benchmarks import (
+    RTEB_CODE,
+    RTEB_ENGLISH,
+    RTEB_FINANCE,
+    RTEB_FRENCH,
+    RTEB_GERMAN,
+    RTEB_HEALTHCARE,
+    RTEB_LEGAL,
+    RTEB_MAIN,
+)
 
 DEFAULT_BENCHMARK_NAME = MTEB_multilingual_v2.name
 
@@ -92,6 +102,31 @@ class MenuEntry:
     ),
 ]
 
+# Sidebar menu for the RTEB retrieval benchmarks. Kept separate from
+# BENCHMARK_ENTRIES so the leaderboard app can append it only on request.
+RTEB_BENCHMARK_ENTRIES = [
+    MenuEntry(
+        name="RTEB (Retrieval)",
+        description=None,
+        open=False,
+        benchmarks=[
+            RTEB_MAIN,
+            MenuEntry(
+                "Domain-Specific",
+                description=None,
+                open=False,
+                benchmarks=[RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE],
+            ),
+            MenuEntry(
+                "Language-specific",
+                description=None,
+                open=False,
+                benchmarks=[RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN],
+            ),
+        ],
+    )
+]
+
 
 def _create_button(
     i: int,