From 2759dca9e7e2f40b618d8f9bd7dce386b96f7ea7 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 2 Sep 2025 17:40:57 +0800 Subject: [PATCH 1/5] feat - remove special filtering, keep zero-shot, keep borda rank --- mteb/benchmarks/benchmark.py | 2 ++ mteb/benchmarks/benchmarks/rteb_benchmarks.py | 9 ++++++++ mteb/benchmarks/get_rteb_benchmark.py | 0 mteb/leaderboard/app.py | 23 ++++++++++++++++++- 4 files changed, 33 insertions(+), 1 deletion(-) create mode 100644 mteb/benchmarks/get_rteb_benchmark.py diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 37b654ac92..5135f8d1ae 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -72,3 +72,5 @@ def load_results( results = base_results.select_tasks(self.tasks) self.results_cache[base_results] = results return results + + diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index 508009fbc1..93e625098c 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -36,6 +36,7 @@ description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 15 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_ENGLISH = Benchmark( @@ -64,6 +65,7 @@ description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_FRENCH = Benchmark( @@ -79,6 +81,7 @@ description="RTEB French subset containing retrieval tasks in French across multiple domains.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_GERMAN = Benchmark( @@ -93,6 +96,7 @@ description="RTEB German subset containing retrieval tasks in German, focusing on legal domain.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_JAPANESE = Benchmark( @@ -107,6 +111,7 @@ description="RTEB Japanese subset containing retrieval tasks in Japanese across multiple domains.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_FINANCE = Benchmark( @@ -123,6 +128,7 @@ description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_LEGAL = Benchmark( @@ -140,6 +146,7 @@ description="RTEB Legal subset containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and legal Q&A.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_CODE = Benchmark( @@ -159,6 +166,7 @@ description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], + ) RTEB_HEALTHCARE = Benchmark( @@ -174,4 +182,5 @@ description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, and cross-lingual medical retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], + ) diff --git a/mteb/benchmarks/get_rteb_benchmark.py b/mteb/benchmarks/get_rteb_benchmark.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 3c0921ab05..e40f83326f 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -1,5 +1,6 @@ from __future__ import annotations +import argparse import itertools import json import logging @@ -21,6 +22,7 @@ BENCHMARK_ENTRIES, DEFAULT_BENCHMARK_NAME, make_selector, + RTEB_BENCHMARK_ENTRIES, ) from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import create_tables @@ -190,7 +192,23 @@ def filter_models( return list(models_to_keep) +def get_startup_arguments(): + parser = argparse.ArgumentParser() + + # Add a Boolean flag parameter + parser.add_argument( + "--show_rteb", + action="store_true", + help="If set, display RTEB results; otherwise show default results." + ) + + return parser.parse_args() + def get_leaderboard_app() -> gr.Blocks: + + args = get_startup_arguments() + show_rteb = args.show_rteb + logger.info("Loading all benchmark results") all_results = load_results() @@ -277,7 +295,10 @@ def get_leaderboard_app() -> gr.Blocks: visible=True, width="18%", ): - benchmark_select, column = make_selector(BENCHMARK_ENTRIES) + if show_rteb: + benchmark_select, column = make_selector(RTEB_BENCHMARK_ENTRIES) + else: + benchmark_select, column = make_selector(BENCHMARK_ENTRIES) gr.Markdown( """ ## Embedding Leaderboard From 2f7c75ad165483d945da4ff7b3e4db9521486141 Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Tue, 2 Sep 2025 18:01:18 +0800 Subject: [PATCH 2/5] feat - remove get_rteb_benchmark.py --- mteb/benchmarks/benchmarks/__init__.py | 18 +++ mteb/leaderboard/benchmark_selector.py | 214 ++++++++++++++++++------- 2 files changed, 171 insertions(+), 61 deletions(-) diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index f4b34233db..a600bee39d 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -45,7 +45,17 @@ MTEB_multilingual_v1, MTEB_multilingual_v2, RAR_b, + ) +from mteb.benchmarks.benchmarks.rteb_benchmarks import ( + RTEB_MAIN, + RTEB_FINANCE, + RTEB_LEGAL, + RTEB_CODE, + RTEB_HEALTHCARE, + RTEB_ENGLISH, + RTEB_FRENCH, + RTEB_GERMAN, ) __all__ = [ "Benchmark", @@ -92,4 +102,12 @@ "R2MED", "VN_MTEB", "JINA_VDR", + "RTEB_MAIN", + "RTEB_FINANCE", + "RTEB_LEGAL", + "RTEB_CODE", + "RTEB_HEALTHCARE", + "RTEB_ENGLISH", + "RTEB_FRENCH", + "RTEB_GERMAN", ] diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index def6ab18f5..788339ac4b 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -7,6 +7,8 @@ import mteb from mteb import Benchmark from mteb.benchmarks.benchmarks import MTEB_multilingual_v2 +from mteb.benchmarks.benchmarks.rteb_benchmarks import RTEB_MAIN, RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE, \ + RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN DEFAULT_BENCHMARK_NAME = MTEB_multilingual_v2.name @@ -25,74 +27,164 @@ class MenuEntry: description="", open=False, benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"]) - + [ - MenuEntry( - "Image", - mteb.get_benchmarks( - [ - "MIEB(Multilingual)", - "MIEB(eng)", - "MIEB(lite)", - "MIEB(Img)", - "VisualDocumentRetrieval", - "JinaVDR", - ] - ), - ), - MenuEntry( - "Domain-Specific", - mteb.get_benchmarks( - [ - "MTEB(Code, v1)", - "MTEB(Law, v1)", - "MTEB(Medical, v1)", - "ChemTEB", - ] - ), - ), + + [ + MenuEntry( + "Image", + mteb.get_benchmarks( + [ + "MIEB(Multilingual)", + "MIEB(eng)", + "MIEB(lite)", + "MIEB(Img)", + "VisualDocumentRetrieval", + "JinaVDR", + ] + ), + ), + MenuEntry( + "Domain-Specific", + mteb.get_benchmarks( + [ + "MTEB(Code, v1)", + "MTEB(Law, v1)", + "MTEB(Medical, v1)", + "ChemTEB", + ] + ), + ), + MenuEntry( + "Language-specific", + mteb.get_benchmarks( + [ + "MTEB(Europe, v1)", + "MTEB(Indic, v1)", + "MTEB(Scandinavian, v1)", + "MTEB(cmn, v1)", + "MTEB(deu, v1)", + "MTEB(fra, v1)", + "MTEB(jpn, v1)", + "MTEB(kor, v1)", + "MTEB(pol, v1)", + "MTEB(rus, v1)", + "MTEB(fas, v1)", + "VN-MTEB (vie, v1)", + ] + ) + + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], + ), + MenuEntry( + "Miscellaneous", # All of these are retrieval benchmarks + mteb.get_benchmarks( + [ + "BEIR", + "BEIR-NL", + "NanoBEIR", + "BRIGHT", + "BRIGHT (long)", + "BuiltBench(eng)", + "CoIR", + "FollowIR", + "LongEmbed", + "MINERSBitextMining", + "RAR-b", + ] + ), + ), + ], + ), +] + +RTEB_BENCHMARK_ENTRIES = [ + MenuEntry( + name="Select Benchmark", + description="", + open=False, + benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"]) + + [ + MenuEntry( + "Image", + mteb.get_benchmarks( + [ + "MIEB(Multilingual)", + "MIEB(eng)", + "MIEB(lite)", + "MIEB(Img)", + "VisualDocumentRetrieval", + "JinaVDR", + ] + ), + ), + MenuEntry( + "Domain-Specific", + mteb.get_benchmarks( + [ + "MTEB(Code, v1)", + "MTEB(Law, v1)", + "MTEB(Medical, v1)", + "ChemTEB", + ] + ), + ), + MenuEntry( + "Language-specific", + mteb.get_benchmarks( + [ + "MTEB(Europe, v1)", + "MTEB(Indic, v1)", + "MTEB(Scandinavian, v1)", + "MTEB(cmn, v1)", + "MTEB(deu, v1)", + "MTEB(fra, v1)", + "MTEB(jpn, v1)", + "MTEB(kor, v1)", + "MTEB(pol, v1)", + "MTEB(rus, v1)", + "MTEB(fas, v1)", + "VN-MTEB (vie, v1)", + ] + ) + + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], + ), + MenuEntry( + "Miscellaneous", # All of these are retrieval benchmarks + mteb.get_benchmarks( + [ + "BEIR", + "BEIR-NL", + "NanoBEIR", + "BRIGHT", + "BRIGHT (long)", + "BuiltBench(eng)", + "CoIR", + "FollowIR", + "LongEmbed", + "MINERSBitextMining", + "RAR-b", + ] + ), + ), + ], + ), + MenuEntry( + name="RTEB (Retrieval)", + description=None, + open=False, + benchmarks=[ + RTEB_MAIN, MenuEntry( - "Language-specific", - mteb.get_benchmarks( - [ - "MTEB(Europe, v1)", - "MTEB(Indic, v1)", - "MTEB(Scandinavian, v1)", - "MTEB(cmn, v1)", - "MTEB(deu, v1)", - "MTEB(fra, v1)", - "MTEB(jpn, v1)", - "MTEB(kor, v1)", - "MTEB(pol, v1)", - "MTEB(rus, v1)", - "MTEB(fas, v1)", - "VN-MTEB (vie, v1)", - ] - ) - + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], + "Domain-Specific", description=None, + open=False, benchmarks=[RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE] ), MenuEntry( - "Miscellaneous", # All of these are retrieval benchmarks - mteb.get_benchmarks( - [ - "BEIR", - "BEIR-NL", - "NanoBEIR", - "BRIGHT", - "BRIGHT (long)", - "BuiltBench(eng)", - "CoIR", - "FollowIR", - "LongEmbed", - "MINERSBitextMining", - "RAR-b", - ] - ), + "Language-specific", description=None, + open=False, benchmarks=[RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN] ), - ], - ), + ] + ) ] + def _create_button( i: int, benchmark: Benchmark, From 9903771426e8c727fd1cdb42c1471e56c9aa470d Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Thu, 4 Sep 2025 00:03:39 +0800 Subject: [PATCH 3/5] feat - delete get_rteb_benchmark.py;RTEB_BENCHMARK_ENTRIES changes --- mteb/benchmarks/get_rteb_benchmark.py | 0 mteb/leaderboard/app.py | 2 +- mteb/leaderboard/benchmark_selector.py | 70 -------------------------- 3 files changed, 1 insertion(+), 71 deletions(-) delete mode 100644 mteb/benchmarks/get_rteb_benchmark.py diff --git a/mteb/benchmarks/get_rteb_benchmark.py b/mteb/benchmarks/get_rteb_benchmark.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index e40f83326f..5f4e8f652a 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -296,7 +296,7 @@ def get_leaderboard_app() -> gr.Blocks: width="18%", ): if show_rteb: - benchmark_select, column = make_selector(RTEB_BENCHMARK_ENTRIES) + benchmark_select, column = make_selector(BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES) else: benchmark_select, column = make_selector(BENCHMARK_ENTRIES) gr.Markdown( diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 788339ac4b..24adbbd0f8 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -95,76 +95,6 @@ class MenuEntry: ] RTEB_BENCHMARK_ENTRIES = [ - MenuEntry( - name="Select Benchmark", - description="", - open=False, - benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"]) - + [ - MenuEntry( - "Image", - mteb.get_benchmarks( - [ - "MIEB(Multilingual)", - "MIEB(eng)", - "MIEB(lite)", - "MIEB(Img)", - "VisualDocumentRetrieval", - "JinaVDR", - ] - ), - ), - MenuEntry( - "Domain-Specific", - mteb.get_benchmarks( - [ - "MTEB(Code, v1)", - "MTEB(Law, v1)", - "MTEB(Medical, v1)", - "ChemTEB", - ] - ), - ), - MenuEntry( - "Language-specific", - mteb.get_benchmarks( - [ - "MTEB(Europe, v1)", - "MTEB(Indic, v1)", - "MTEB(Scandinavian, v1)", - "MTEB(cmn, v1)", - "MTEB(deu, v1)", - "MTEB(fra, v1)", - "MTEB(jpn, v1)", - "MTEB(kor, v1)", - "MTEB(pol, v1)", - "MTEB(rus, v1)", - "MTEB(fas, v1)", - "VN-MTEB (vie, v1)", - ] - ) - + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], - ), - MenuEntry( - "Miscellaneous", # All of these are retrieval benchmarks - mteb.get_benchmarks( - [ - "BEIR", - "BEIR-NL", - "NanoBEIR", - "BRIGHT", - "BRIGHT (long)", - "BuiltBench(eng)", - "CoIR", - "FollowIR", - "LongEmbed", - "MINERSBitextMining", - "RAR-b", - ] - ), - ), - ], - ), MenuEntry( name="RTEB (Retrieval)", description=None, From fb896d0214dea79b8e8363082a2d9f1819a0b74f Mon Sep 17 00:00:00 2001 From: q275343119 <275343119@qq.com> Date: Thu, 4 Sep 2025 17:54:06 +0800 Subject: [PATCH 4/5] feat -format --- mteb/benchmarks/benchmark.py | 2 - mteb/benchmarks/benchmarks/__init__.py | 12 +- mteb/benchmarks/benchmarks/rteb_benchmarks.py | 9 - mteb/leaderboard/app.py | 10 +- mteb/leaderboard/benchmark_selector.py | 155 ++++++++++-------- mteb/load_results/benchmark_results.py | 2 +- 6 files changed, 96 insertions(+), 94 deletions(-) diff --git a/mteb/benchmarks/benchmark.py b/mteb/benchmarks/benchmark.py index 5135f8d1ae..37b654ac92 100644 --- a/mteb/benchmarks/benchmark.py +++ b/mteb/benchmarks/benchmark.py @@ -72,5 +72,3 @@ def load_results( results = base_results.select_tasks(self.tasks) self.results_cache[base_results] = results return results - - diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index a600bee39d..a240dd3b0f 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -45,17 +45,17 @@ MTEB_multilingual_v1, MTEB_multilingual_v2, RAR_b, - ) from mteb.benchmarks.benchmarks.rteb_benchmarks import ( - RTEB_MAIN, - RTEB_FINANCE, - RTEB_LEGAL, RTEB_CODE, - RTEB_HEALTHCARE, RTEB_ENGLISH, + RTEB_FINANCE, RTEB_FRENCH, - RTEB_GERMAN, ) + RTEB_GERMAN, + RTEB_HEALTHCARE, + RTEB_LEGAL, + RTEB_MAIN, +) __all__ = [ "Benchmark", diff --git a/mteb/benchmarks/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py index 93e625098c..508009fbc1 100644 --- a/mteb/benchmarks/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/benchmarks/rteb_benchmarks.py @@ -36,7 +36,6 @@ description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 15 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_ENGLISH = Benchmark( @@ -65,7 +64,6 @@ description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_FRENCH = Benchmark( @@ -81,7 +79,6 @@ description="RTEB French subset containing retrieval tasks in French across multiple domains.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_GERMAN = Benchmark( @@ -96,7 +93,6 @@ description="RTEB German subset containing retrieval tasks in German, focusing on legal domain.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_JAPANESE = Benchmark( @@ -111,7 +107,6 @@ description="RTEB Japanese subset containing retrieval tasks in Japanese across multiple domains.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_FINANCE = Benchmark( @@ -128,7 +123,6 @@ description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_LEGAL = Benchmark( @@ -146,7 +140,6 @@ description="RTEB Legal subset containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and legal Q&A.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_CODE = Benchmark( @@ -166,7 +159,6 @@ description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], - ) RTEB_HEALTHCARE = Benchmark( @@ -182,5 +174,4 @@ description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, and cross-lingual medical retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], - ) diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 5f4e8f652a..0ec8b91fde 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -21,8 +21,8 @@ from mteb.leaderboard.benchmark_selector import ( BENCHMARK_ENTRIES, DEFAULT_BENCHMARK_NAME, - make_selector, RTEB_BENCHMARK_ENTRIES, + make_selector, ) from mteb.leaderboard.figures import performance_size_plot, radar_chart from mteb.leaderboard.table import create_tables @@ -199,13 +199,13 @@ def get_startup_arguments(): parser.add_argument( "--show_rteb", action="store_true", - help="If set, display RTEB results; otherwise show default results." + help="If set, display RTEB results; otherwise show default results.", ) return parser.parse_args() -def get_leaderboard_app() -> gr.Blocks: +def get_leaderboard_app() -> gr.Blocks: args = get_startup_arguments() show_rteb = args.show_rteb @@ -296,7 +296,9 @@ def get_leaderboard_app() -> gr.Blocks: width="18%", ): if show_rteb: - benchmark_select, column = make_selector(BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES) + benchmark_select, column = make_selector( + BENCHMARK_ENTRIES + RTEB_BENCHMARK_ENTRIES + ) else: benchmark_select, column = make_selector(BENCHMARK_ENTRIES) gr.Markdown( diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 24adbbd0f8..8bb10235d7 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -7,8 +7,16 @@ import mteb from mteb import Benchmark from mteb.benchmarks.benchmarks import MTEB_multilingual_v2 -from mteb.benchmarks.benchmarks.rteb_benchmarks import RTEB_MAIN, RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE, \ - RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN +from mteb.benchmarks.benchmarks.rteb_benchmarks import ( + RTEB_CODE, + RTEB_ENGLISH, + RTEB_FINANCE, + RTEB_FRENCH, + RTEB_GERMAN, + RTEB_HEALTHCARE, + RTEB_LEGAL, + RTEB_MAIN, +) DEFAULT_BENCHMARK_NAME = MTEB_multilingual_v2.name @@ -27,70 +35,70 @@ class MenuEntry: description="", open=False, benchmarks=mteb.get_benchmarks(["MTEB(Multilingual, v2)", "MTEB(eng, v2)"]) - + [ - MenuEntry( - "Image", - mteb.get_benchmarks( - [ - "MIEB(Multilingual)", - "MIEB(eng)", - "MIEB(lite)", - "MIEB(Img)", - "VisualDocumentRetrieval", - "JinaVDR", - ] - ), - ), - MenuEntry( - "Domain-Specific", - mteb.get_benchmarks( - [ - "MTEB(Code, v1)", - "MTEB(Law, v1)", - "MTEB(Medical, v1)", - "ChemTEB", - ] - ), - ), - MenuEntry( - "Language-specific", - mteb.get_benchmarks( - [ - "MTEB(Europe, v1)", - "MTEB(Indic, v1)", - "MTEB(Scandinavian, v1)", - "MTEB(cmn, v1)", - "MTEB(deu, v1)", - "MTEB(fra, v1)", - "MTEB(jpn, v1)", - "MTEB(kor, v1)", - "MTEB(pol, v1)", - "MTEB(rus, v1)", - "MTEB(fas, v1)", - "VN-MTEB (vie, v1)", - ] - ) - + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], - ), - MenuEntry( - "Miscellaneous", # All of these are retrieval benchmarks - mteb.get_benchmarks( - [ - "BEIR", - "BEIR-NL", - "NanoBEIR", - "BRIGHT", - "BRIGHT (long)", - "BuiltBench(eng)", - "CoIR", - "FollowIR", - "LongEmbed", - "MINERSBitextMining", - "RAR-b", - ] - ), - ), - ], + + [ + MenuEntry( + "Image", + mteb.get_benchmarks( + [ + "MIEB(Multilingual)", + "MIEB(eng)", + "MIEB(lite)", + "MIEB(Img)", + "VisualDocumentRetrieval", + "JinaVDR", + ] + ), + ), + MenuEntry( + "Domain-Specific", + mteb.get_benchmarks( + [ + "MTEB(Code, v1)", + "MTEB(Law, v1)", + "MTEB(Medical, v1)", + "ChemTEB", + ] + ), + ), + MenuEntry( + "Language-specific", + mteb.get_benchmarks( + [ + "MTEB(Europe, v1)", + "MTEB(Indic, v1)", + "MTEB(Scandinavian, v1)", + "MTEB(cmn, v1)", + "MTEB(deu, v1)", + "MTEB(fra, v1)", + "MTEB(jpn, v1)", + "MTEB(kor, v1)", + "MTEB(pol, v1)", + "MTEB(rus, v1)", + "MTEB(fas, v1)", + "VN-MTEB (vie, v1)", + ] + ) + + [MenuEntry("Other", mteb.get_benchmarks(["MTEB(eng, v1)"]))], + ), + MenuEntry( + "Miscellaneous", # All of these are retrieval benchmarks + mteb.get_benchmarks( + [ + "BEIR", + "BEIR-NL", + "NanoBEIR", + "BRIGHT", + "BRIGHT (long)", + "BuiltBench(eng)", + "CoIR", + "FollowIR", + "LongEmbed", + "MINERSBitextMining", + "RAR-b", + ] + ), + ), + ], ), ] @@ -102,19 +110,22 @@ class MenuEntry: benchmarks=[ RTEB_MAIN, MenuEntry( - "Domain-Specific", description=None, - open=False, benchmarks=[RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE] + "Domain-Specific", + description=None, + open=False, + benchmarks=[RTEB_FINANCE, RTEB_LEGAL, RTEB_CODE, RTEB_HEALTHCARE], ), MenuEntry( - "Language-specific", description=None, - open=False, benchmarks=[RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN] + "Language-specific", + description=None, + open=False, + benchmarks=[RTEB_ENGLISH, RTEB_FRENCH, RTEB_GERMAN], ), - ] + ], ) ] - def _create_button( i: int, benchmark: Benchmark, diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index 4c83d3b156..fdc23d616e 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -730,7 +730,7 @@ def to_disk(self, path: Path | str) -> None: @classmethod def from_validated(cls, **data) -> BenchmarkResults: model_results = [] - for model_res in data["model_results"]: + for model_res in data["model_results"][:1]: model_results.append(ModelResult.from_validated(**model_res)) return cls.model_construct(model_results=model_results) From caf4e84c3e390ba9ea9fbc27d2e15716f8d232da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=AC=91=E5=B0=BF=E4=BC=8A=E4=BA=BA?= <44760272+q275343119@users.noreply.github.com> Date: Thu, 4 Sep 2025 19:50:25 +0800 Subject: [PATCH 5/5] Update mteb/load_results/benchmark_results.py Co-authored-by: Roman Solomatin --- mteb/load_results/benchmark_results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/load_results/benchmark_results.py b/mteb/load_results/benchmark_results.py index fdc23d616e..4c83d3b156 100644 --- a/mteb/load_results/benchmark_results.py +++ b/mteb/load_results/benchmark_results.py @@ -730,7 +730,7 @@ def to_disk(self, path: Path | str) -> None: @classmethod def from_validated(cls, **data) -> BenchmarkResults: model_results = [] - for model_res in data["model_results"][:1]: + for model_res in data["model_results"]: model_results.append(ModelResult.from_validated(**model_res)) return cls.model_construct(model_results=model_results)