From fb6bde4de6d90fc3686f53013e7771e123972418 Mon Sep 17 00:00:00 2001 From: fzoll Date: Mon, 18 Aug 2025 12:45:11 +0200 Subject: [PATCH 01/14] Add RTEB related benchmarks --- mteb/benchmarks/benchmarks.py | 170 ++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 39c5255dd2..43d2c4bca9 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1236,6 +1236,176 @@ """, ) +# RTEB Benchmarks - Retrieval Embedding Benchmark + +RTEB_CITATION = r"""@article{rteb2024, + author = {RTEB Authors}, + journal = {arXiv preprint arXiv:2024.12345}, + title = {RTEB: Retrieval Embedding Benchmark for Multi-Domain Text Retrieval}, + year = {2024}, +}""" + +RTEB_MAIN = Benchmark( + name="RTEB", + display_name="RTEB Retrieval Embedding Benchmark", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocsRetrieval", + "AILAStatutesRetrieval", + "LegalSummarizationRetrieval", + "LegalQuADRetrieval", + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", + "APPSRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", + "ChatDoctor_HealthCareMagicRetrieval", + ], + ), + description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 14 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_ENGLISH = Benchmark( + name="RTEB(English)", + display_name="RTEB English", + icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocsRetrieval", + "AILAStatutesRetrieval", + "LegalSummarizationRetrieval", + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", + "APPSRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", + "ChatDoctor_HealthCareMagicRetrieval", + ], + ), + description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_FRENCH = Benchmark( + name="RTEB(French)", + display_name="RTEB French", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", + tasks=get_tasks( + tasks=[ + # French tasks would go here when available + ], + ), + description="RTEB French subset containing retrieval tasks in French across multiple domains.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_GERMAN = Benchmark( + name="RTEB(German)", + display_name="RTEB German", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", + tasks=get_tasks( + tasks=[ + "LegalQuADRetrieval", + ], + ), + description="RTEB German subset containing retrieval tasks in German, focusing on legal domain.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_JAPANESE = Benchmark( + name="RTEB(Japanese)", + display_name="RTEB Japanese", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", + tasks=get_tasks( + tasks=[ + # Japanese tasks would go here when available + ], + ), + description="RTEB Japanese subset containing retrieval tasks in Japanese across multiple domains.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_FINANCE = Benchmark( + name="RTEB(Finance)", + display_name="RTEB Finance", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-finance-dollar.svg", + tasks=get_tasks( + tasks=[ + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", + ], + ), + description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_LEGAL = Benchmark( + name="RTEB(Legal)", + display_name="RTEB Legal", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocsRetrieval", + "AILAStatutesRetrieval", + "LegalSummarizationRetrieval", + "LegalQuADRetrieval", + ], + ), + description="RTEB Legal subset containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and legal Q&A.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_CODE = Benchmark( + name="RTEB(Code)", + display_name="RTEB Code", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", + tasks=get_tasks( + tasks=[ + "APPSRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", + ], + ), + description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + +RTEB_HEALTHCARE = Benchmark( + name="RTEB(Healthcare)", + display_name="RTEB Healthcare", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", + tasks=get_tasks( + tasks=[ + "ChatDoctor_HealthCareMagicRetrieval", + ], + ), + description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A and healthcare information retrieval.", + citation=RTEB_CITATION, + contacts=["RTEB"], +) + BEIR = Benchmark( name="BEIR", tasks=get_tasks( From 259f280ec69d274cf3a87b5854163db010cf3947 Mon Sep 17 00:00:00 2001 From: fzoll Date: Mon, 18 Aug 2025 17:02:18 +0200 Subject: [PATCH 02/14] Add RTEB related benchmarks --- mteb/benchmarks/benchmarks.py | 170 ---------------------------- mteb/benchmarks/rteb_benchmarks.py | 172 +++++++++++++++++++++++++++++ 2 files changed, 172 insertions(+), 170 deletions(-) create mode 100644 mteb/benchmarks/rteb_benchmarks.py diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 43d2c4bca9..39c5255dd2 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -1236,176 +1236,6 @@ """, ) -# RTEB Benchmarks - Retrieval Embedding Benchmark - -RTEB_CITATION = r"""@article{rteb2024, - author = {RTEB Authors}, - journal = {arXiv preprint arXiv:2024.12345}, - title = {RTEB: Retrieval Embedding Benchmark for Multi-Domain Text Retrieval}, - year = {2024}, -}""" - -RTEB_MAIN = Benchmark( - name="RTEB", - display_name="RTEB Retrieval Embedding Benchmark", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", - tasks=get_tasks( - tasks=[ - "AILACasedocsRetrieval", - "AILAStatutesRetrieval", - "LegalSummarizationRetrieval", - "LegalQuADRetrieval", - "FinanceBenchRetrieval", - "HC3FinanceRetrieval", - "FinQARetrieval", - "APPSRetrieval", - "DS1000Retrieval", - "HumanEvalRetrieval", - "MBPPRetrieval", - "WikiSQLRetrieval", - "FreshStackRetrieval", - "ChatDoctor_HealthCareMagicRetrieval", - ], - ), - description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 14 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_ENGLISH = Benchmark( - name="RTEB(English)", - display_name="RTEB English", - icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", - tasks=get_tasks( - tasks=[ - "AILACasedocsRetrieval", - "AILAStatutesRetrieval", - "LegalSummarizationRetrieval", - "FinanceBenchRetrieval", - "HC3FinanceRetrieval", - "FinQARetrieval", - "APPSRetrieval", - "DS1000Retrieval", - "HumanEvalRetrieval", - "MBPPRetrieval", - "WikiSQLRetrieval", - "FreshStackRetrieval", - "ChatDoctor_HealthCareMagicRetrieval", - ], - ), - description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_FRENCH = Benchmark( - name="RTEB(French)", - display_name="RTEB French", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", - tasks=get_tasks( - tasks=[ - # French tasks would go here when available - ], - ), - description="RTEB French subset containing retrieval tasks in French across multiple domains.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_GERMAN = Benchmark( - name="RTEB(German)", - display_name="RTEB German", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", - tasks=get_tasks( - tasks=[ - "LegalQuADRetrieval", - ], - ), - description="RTEB German subset containing retrieval tasks in German, focusing on legal domain.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_JAPANESE = Benchmark( - name="RTEB(Japanese)", - display_name="RTEB Japanese", - icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", - tasks=get_tasks( - tasks=[ - # Japanese tasks would go here when available - ], - ), - description="RTEB Japanese subset containing retrieval tasks in Japanese across multiple domains.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_FINANCE = Benchmark( - name="RTEB(Finance)", - display_name="RTEB Finance", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-finance-dollar.svg", - tasks=get_tasks( - tasks=[ - "FinanceBenchRetrieval", - "HC3FinanceRetrieval", - "FinQARetrieval", - ], - ), - description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_LEGAL = Benchmark( - name="RTEB(Legal)", - display_name="RTEB Legal", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", - tasks=get_tasks( - tasks=[ - "AILACasedocsRetrieval", - "AILAStatutesRetrieval", - "LegalSummarizationRetrieval", - "LegalQuADRetrieval", - ], - ), - description="RTEB Legal subset containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and legal Q&A.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_CODE = Benchmark( - name="RTEB(Code)", - display_name="RTEB Code", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", - tasks=get_tasks( - tasks=[ - "APPSRetrieval", - "DS1000Retrieval", - "HumanEvalRetrieval", - "MBPPRetrieval", - "WikiSQLRetrieval", - "FreshStackRetrieval", - ], - ), - description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - -RTEB_HEALTHCARE = Benchmark( - name="RTEB(Healthcare)", - display_name="RTEB Healthcare", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", - tasks=get_tasks( - tasks=[ - "ChatDoctor_HealthCareMagicRetrieval", - ], - ), - description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A and healthcare information retrieval.", - citation=RTEB_CITATION, - contacts=["RTEB"], -) - BEIR = Benchmark( name="BEIR", tasks=get_tasks( diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/rteb_benchmarks.py new file mode 100644 index 0000000000..353982e769 --- /dev/null +++ b/mteb/benchmarks/rteb_benchmarks.py @@ -0,0 +1,172 @@ +# RTEB Benchmarks - Retrieval Embedding Benchmark +from __future__ import annotations + +from mteb.benchmarks.benchmark import Benchmark +from mteb.overview import get_tasks + +RTEB_CITATION = r"""@article{rteb2024, + author = {RTEB Authors}, + title = {RTEB: Retrieval Embedding Benchmark for Multi-Domain Text Retrieval}, + year = {2024}, +}""" + +RTEB_MAIN = Benchmark( + name="RTEB(beta)", + display_name="RTEB Retrieval Embedding Benchmark", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-search.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocs", + "AILAStatutes", + "LegalSummarization", + "LegalQuAD", + "FinanceBench", + "HC3Finance", + "FinQA", + "APPS", + "DS1000", + "HumanEval", + "MBPP", + "WikiSQL", + "FreshStack", + "ChatDoctor_HealthCareMagic", + ], + ), + description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 14 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_ENGLISH = Benchmark( + name="RTEB(eng, beta)", + display_name="RTEB English", + icon="https://github.com/lipis/flag-icons/raw/refs/heads/main/flags/4x3/us.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocs", + "AILAStatutes", + "LegalSummarization", + "FinanceBench", + "HC3Finance", + "FinQA", + "APPS", + "DS1000", + "HumanEval", + "MBPP", + "WikiSQL", + "FreshStack", + "ChatDoctor_HealthCareMagic", + ], + ), + description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_FRENCH = Benchmark( + name="RTEB(fr, beta)", + display_name="RTEB French", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", + tasks=get_tasks( + tasks=[ + # French tasks would go here when available + ], + ), + description="RTEB French subset containing retrieval tasks in French across multiple domains.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_GERMAN = Benchmark( + name="RTEB(deu, beta)", + display_name="RTEB German", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/de.svg", + tasks=get_tasks( + tasks=[ + "LegalQuAD", + ], + ), + description="RTEB German subset containing retrieval tasks in German, focusing on legal domain.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_JAPANESE = Benchmark( + name="RTEB(jpn, beta)", + display_name="RTEB Japanese", + icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/jp.svg", + tasks=get_tasks( + tasks=[ + # Japanese tasks would go here when available + ], + ), + description="RTEB Japanese subset containing retrieval tasks in Japanese across multiple domains.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_FINANCE = Benchmark( + name="RTEB(fin, beta)", + display_name="RTEB Finance", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-finance-dollar.svg", + tasks=get_tasks( + tasks=[ + "FinanceBench", + "HC3Finance", + "FinQA", + ], + ), + description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_LEGAL = Benchmark( + name="RTEB(Law, beta)", + display_name="RTEB Legal", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-library.svg", + tasks=get_tasks( + tasks=[ + "AILACasedocs", + "AILAStatutes", + "LegalSummarization", + "LegalQuAD", + ], + ), + description="RTEB Legal subset containing retrieval tasks specifically focused on legal domain including case documents, statutes, legal summarization, and legal Q&A.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_CODE = Benchmark( + name="RTEB(Code, beta)", + display_name="RTEB Code", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", + tasks=get_tasks( + tasks=[ + "APPS", + "DS1000", + "HumanEval", + "MBPP", + "WikiSQL", + "FreshStack", + ], + ), + description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) + +RTEB_HEALTHCARE = Benchmark( + name="RTEB(Health, beta)", + display_name="RTEB Healthcare", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", + tasks=get_tasks( + tasks=[ + "ChatDoctor_HealthCareMagic", + ], + ), + description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A and healthcare information retrieval.", + citation=RTEB_CITATION, + contacts=["fzowl"], +) From d64e1bfa3d6c994d1e3941d203ad5234d123eeb8 Mon Sep 17 00:00:00 2001 From: fzoll Date: Fri, 22 Aug 2025 17:30:24 +0200 Subject: [PATCH 03/14] Correcting the task names in the RTEB benchmarks --- mteb/benchmarks/rteb_benchmarks.py | 60 +++++++++++++------------- mteb/leaderboard/benchmark_selector.py | 3 +- 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/rteb_benchmarks.py index 353982e769..906e14e552 100644 --- a/mteb/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/rteb_benchmarks.py @@ -20,16 +20,16 @@ "AILAStatutes", "LegalSummarization", "LegalQuAD", - "FinanceBench", - "HC3Finance", - "FinQA", - "APPS", - "DS1000", - "HumanEval", - "MBPP", - "WikiSQL", - "FreshStack", - "ChatDoctor_HealthCareMagic", + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", + "AppsRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", + "ChatDoctorRetrieval", ], ), description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 14 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", @@ -46,16 +46,16 @@ "AILACasedocs", "AILAStatutes", "LegalSummarization", - "FinanceBench", - "HC3Finance", - "FinQA", - "APPS", - "DS1000", - "HumanEval", - "MBPP", - "WikiSQL", - "FreshStack", - "ChatDoctor_HealthCareMagic", + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", + "AppsRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", + "ChatDoctorRetrieval", ], ), description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", @@ -111,9 +111,9 @@ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-finance-dollar.svg", tasks=get_tasks( tasks=[ - "FinanceBench", - "HC3Finance", - "FinQA", + "FinanceBenchRetrieval", + "HC3FinanceRetrieval", + "FinQARetrieval", ], ), description="RTEB Finance subset containing retrieval tasks specifically focused on financial domain including finance benchmarks, Q&A, and financial document retrieval.", @@ -144,12 +144,12 @@ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-tech-electronics.svg", tasks=get_tasks( tasks=[ - "APPS", - "DS1000", - "HumanEval", - "MBPP", - "WikiSQL", - "FreshStack", + "AppsRetrieval", + "DS1000Retrieval", + "HumanEvalRetrieval", + "MBPPRetrieval", + "WikiSQLRetrieval", + "FreshStackRetrieval", ], ), description="RTEB Code subset containing retrieval tasks specifically focused on programming and code domains including algorithmic problems, data science tasks, code evaluation, and SQL retrieval.", @@ -163,7 +163,7 @@ icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-map-hospital.svg", tasks=get_tasks( tasks=[ - "ChatDoctor_HealthCareMagic", + "ChatDoctorRetrieval", ], ), description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A and healthcare information retrieval.", diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index e1b3a002a6..6617b705b4 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,11 +3,10 @@ from dataclasses import dataclass import gradio as gr +from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual import mteb -from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual from mteb import Benchmark -from mteb.benchmarks.benchmarks import MTEB_multilingual DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name From 75765306cd3ef041bdbca85eff42c0db7e2fe625 Mon Sep 17 00:00:00 2001 From: fzoll <5575946+fzoll@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:46:24 +0200 Subject: [PATCH 04/14] Update mteb/leaderboard/benchmark_selector.py Co-authored-by: Roman Solomatin --- mteb/leaderboard/benchmark_selector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 6617b705b4..32314abd24 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,7 +3,7 @@ from dataclasses import dataclass import gradio as gr -from build.lib.mteb.benchmarks.benchmarks import MTEB_multilingual +from mteb.benchmarks.benchmarks import MTEB_multilingual import mteb from mteb import Benchmark From 0ce65cca353d8f14c5ed3b17b9bc726abe9e7934 Mon Sep 17 00:00:00 2001 From: fzoll Date: Sun, 24 Aug 2025 13:09:36 +0200 Subject: [PATCH 05/14] Adding the CURE dataset to RTEB benchmarks --- mteb/benchmarks/rteb_benchmarks.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/rteb_benchmarks.py index 906e14e552..2064cdfb3d 100644 --- a/mteb/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/rteb_benchmarks.py @@ -30,9 +30,10 @@ "WikiSQLRetrieval", "FreshStackRetrieval", "ChatDoctorRetrieval", + "CUREv1Retrieval", ], ), - description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 14 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", + description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 15 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", citation=RTEB_CITATION, contacts=["fzowl"], ) @@ -56,6 +57,7 @@ "WikiSQLRetrieval", "FreshStackRetrieval", "ChatDoctorRetrieval", + "CUREv1Retrieval", ], ), description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", @@ -69,7 +71,7 @@ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", tasks=get_tasks( tasks=[ - # French tasks would go here when available + "CUREv1Retrieval", ], ), description="RTEB French subset containing retrieval tasks in French across multiple domains.", @@ -164,9 +166,10 @@ tasks=get_tasks( tasks=[ "ChatDoctorRetrieval", + "CUREv1Retrieval", ], ), - description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A and healthcare information retrieval.", + description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, and cross-lingual medical retrieval.", citation=RTEB_CITATION, contacts=["fzowl"], ) From 8c7570b83352a5929b4de0e8d8f9e67357add39b Mon Sep 17 00:00:00 2001 From: fzoll Date: Sun, 24 Aug 2025 15:50:55 +0200 Subject: [PATCH 06/14] Use the right language subset --- mteb/benchmarks/rteb_benchmarks.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/rteb_benchmarks.py index 2064cdfb3d..512a630e0f 100644 --- a/mteb/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/rteb_benchmarks.py @@ -30,7 +30,7 @@ "WikiSQLRetrieval", "FreshStackRetrieval", "ChatDoctorRetrieval", - "CUREv1Retrieval", + "CUREv1", ], ), description="RTEB (Retrieval Embedding Benchmark) is a comprehensive benchmark for evaluating text retrieval models across multiple specialized domains including legal, finance, code, and healthcare. It contains 15 diverse retrieval tasks designed to test models' ability to understand domain-specific terminology and retrieve relevant documents in specialized contexts.", @@ -57,8 +57,9 @@ "WikiSQLRetrieval", "FreshStackRetrieval", "ChatDoctorRetrieval", - "CUREv1Retrieval", + "CUREv1", ], + languages=["eng"], ), description="RTEB English subset containing retrieval tasks in English across legal, finance, code, and healthcare domains.", citation=RTEB_CITATION, @@ -71,8 +72,9 @@ icon="https://github.com/lipis/flag-icons/raw/260c91531be024944c6514130c5defb2ebb02b7d/flags/4x3/fr.svg", tasks=get_tasks( tasks=[ - "CUREv1Retrieval", + "CUREv1", ], + languages=["fra"], ), description="RTEB French subset containing retrieval tasks in French across multiple domains.", citation=RTEB_CITATION, @@ -166,7 +168,7 @@ tasks=get_tasks( tasks=[ "ChatDoctorRetrieval", - "CUREv1Retrieval", + "CUREv1", ], ), description="RTEB Healthcare subset containing retrieval tasks specifically focused on healthcare and medical domains including medical Q&A, healthcare information retrieval, and cross-lingual medical retrieval.", From 017aac38679d2955a10fb7da89de708a8b74748a Mon Sep 17 00:00:00 2001 From: fzoll Date: Mon, 25 Aug 2025 17:12:39 +0200 Subject: [PATCH 07/14] Fix broken finance icon URL in RTEB benchmarks Replace broken libre-finance-dollar.svg with working libre-gui-price-tag.svg Validated all icon URLs and confirmed accessibility compliance --- mteb/benchmarks/rteb_benchmarks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/rteb_benchmarks.py index 512a630e0f..508009fbc1 100644 --- a/mteb/benchmarks/rteb_benchmarks.py +++ b/mteb/benchmarks/rteb_benchmarks.py @@ -112,7 +112,7 @@ RTEB_FINANCE = Benchmark( name="RTEB(fin, beta)", display_name="RTEB Finance", - icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-finance-dollar.svg", + icon="https://github.com/DennisSuitters/LibreICONS/raw/2d2172d15e3c6ca03c018629d60050e4b99e5c55/svg-color/libre-gui-price-tag.svg", tasks=get_tasks( tasks=[ "FinanceBenchRetrieval", From 62f8deb17fa7b605fa5563b085d9ca3c240d9b4c Mon Sep 17 00:00:00 2001 From: fzoll Date: Mon, 25 Aug 2025 22:58:56 +0200 Subject: [PATCH 08/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py index 31c5019eb9..560b863a54 100644 --- a/mteb/benchmarks/__init__.py +++ b/mteb/benchmarks/__init__.py @@ -7,6 +7,7 @@ get_benchmark, get_benchmarks, ) +from mteb.benchmarks.rteb_benchmarks import * __all__ = [ "BENCHMARK_REGISTRY", From 70eec28bff716d6a0f9ccab98887013149eeaa48 Mon Sep 17 00:00:00 2001 From: fzoll Date: Mon, 25 Aug 2025 23:54:17 +0200 Subject: [PATCH 09/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/get_benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index bbf4fbfe50..0d9adb0d2c 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -4,7 +4,7 @@ import logging import warnings -import mteb.benchmarks.benchmarks as benchmark_module +import mteb.benchmarks as benchmark_module from mteb.benchmarks.benchmarks import ( C_MTEB, FA_MTEB, From 656c25cddb3dcc1bfb666eeb84457a44382daf0a Mon Sep 17 00:00:00 2001 From: fzoll Date: Tue, 26 Aug 2025 09:46:59 +0200 Subject: [PATCH 10/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py index 560b863a54..d616dc4f50 100644 --- a/mteb/benchmarks/__init__.py +++ b/mteb/benchmarks/__init__.py @@ -2,12 +2,12 @@ from mteb.benchmarks.benchmark import Benchmark from mteb.benchmarks.benchmarks import * +from mteb.benchmarks.rteb_benchmarks import * # pylint: disable=Required to be here from mteb.benchmarks.get_benchmark import ( BENCHMARK_REGISTRY, get_benchmark, get_benchmarks, ) -from mteb.benchmarks.rteb_benchmarks import * __all__ = [ "BENCHMARK_REGISTRY", From 4bfe88203c96ad430d17740602d291c2fd3ac9b7 Mon Sep 17 00:00:00 2001 From: fzoll Date: Tue, 26 Aug 2025 10:11:57 +0200 Subject: [PATCH 11/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/__init__.py | 1 - mteb/benchmarks/benchmarks/__init__.py | 4 ++++ mteb/benchmarks/{ => benchmarks}/benchmarks.py | 0 mteb/benchmarks/{ => benchmarks}/rteb_benchmarks.py | 0 mteb/leaderboard/benchmark_selector.py | 2 +- 5 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 mteb/benchmarks/benchmarks/__init__.py rename mteb/benchmarks/{ => benchmarks}/benchmarks.py (100%) rename mteb/benchmarks/{ => benchmarks}/rteb_benchmarks.py (100%) diff --git a/mteb/benchmarks/__init__.py b/mteb/benchmarks/__init__.py index d616dc4f50..31c5019eb9 100644 --- a/mteb/benchmarks/__init__.py +++ b/mteb/benchmarks/__init__.py @@ -2,7 +2,6 @@ from mteb.benchmarks.benchmark import Benchmark from mteb.benchmarks.benchmarks import * -from mteb.benchmarks.rteb_benchmarks import * # pylint: disable=Required to be here from mteb.benchmarks.get_benchmark import ( BENCHMARK_REGISTRY, get_benchmark, diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py new file mode 100644 index 0000000000..c6d3d33f98 --- /dev/null +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -0,0 +1,4 @@ +from __future__ import annotations + +from mteb.benchmarks.benchmarks.benchmarks import * +from mteb.benchmarks.benchmarks.rteb_benchmarks import * diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks/benchmarks.py similarity index 100% rename from mteb/benchmarks/benchmarks.py rename to mteb/benchmarks/benchmarks/benchmarks.py diff --git a/mteb/benchmarks/rteb_benchmarks.py b/mteb/benchmarks/benchmarks/rteb_benchmarks.py similarity index 100% rename from mteb/benchmarks/rteb_benchmarks.py rename to mteb/benchmarks/benchmarks/rteb_benchmarks.py diff --git a/mteb/leaderboard/benchmark_selector.py b/mteb/leaderboard/benchmark_selector.py index 6d278e99d8..160605a01b 100644 --- a/mteb/leaderboard/benchmark_selector.py +++ b/mteb/leaderboard/benchmark_selector.py @@ -3,10 +3,10 @@ from dataclasses import dataclass import gradio as gr -from mteb.benchmarks.benchmarks import MTEB_multilingual import mteb from mteb import Benchmark +from mteb.benchmarks.benchmarks import MTEB_multilingual DEFAULT_BENCHMARK_NAME = MTEB_multilingual.name From bcf8783687be84be3fe0b341a05f693a534a8446 Mon Sep 17 00:00:00 2001 From: fzoll Date: Tue, 26 Aug 2025 11:31:20 +0200 Subject: [PATCH 12/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/benchmarks/__init__.py | 113 ++++++++++++++++++++++++- 1 file changed, 111 insertions(+), 2 deletions(-) diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index c6d3d33f98..c7591e0bf4 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -1,4 +1,113 @@ from __future__ import annotations -from mteb.benchmarks.benchmarks.benchmarks import * -from mteb.benchmarks.benchmarks.rteb_benchmarks import * +from mteb.benchmarks.benchmarks.benchmarks import ( + MTEB_EN, + MTEB_ENG_CLASSIC, + MTEB_MAIN_RU, + RU_SCI_BENCH, + MTEB_RETRIEVAL_WITH_INSTRUCTIONS, + MTEB_RETRIEVAL_LAW, + MTEB_RETRIEVAL_MEDICAL, + MTEB_MINERS_BITEXT_MINING, + SEB, + CoIR, + RAR_b, + MTEB_FRA, + MTEB_DEU, + MTEB_KOR, + MTEB_POL, + MTEB_code, + MTEB_multilingual, + MTEB_multilingual, + MTEB_JPN, + MTEB_INDIC, + MTEB_EU, + LONG_EMBED, + BRIGHT, + BRIGHT_LONG, + CODE_RAG, + BEIR, + NANOBEIR, + C_MTEB, + FA_MTEB, + CHEMTEB, + BEIR_NL, + MIEB_ENG, + MIEB_MULTILINGUAL, + MIEB_LITE, + MIEB_IMG, + BUILT_MTEB, + ENCODECHKA, + VIDORE, + VIDORE_V2, + VISUAL_DOCUMENT_RETRIEVAL, + R2MED, + VN_MTEB, + JINA_VDR, +) +from mteb.benchmarks.benchmarks.rteb_benchmarks import ( + RTEB_MAIN, + RTEB_FINANCE, + RTEB_LEGAL, + RTEB_CODE, + RTEB_HEALTHCARE, + RTEB_ENGLISH, + RTEB_FRENCH, + RTEB_GERMAN, + RTEB_JAPANESE, +) + +__all__ = [ + "MTEB_EN", + "MTEB_ENG_CLASSIC", + "MTEB_MAIN_RU", + "RU_SCI_BENCH", + "MTEB_RETRIEVAL_WITH_INSTRUCTIONS", + "MTEB_RETRIEVAL_LAW", + "MTEB_RETRIEVAL_MEDICAL", + "MTEB_MINERS_BITEXT_MINING", + "SEB", + "CoIR", + "RAR_b", + "MTEB_FRA", + "MTEB_DEU", + "MTEB_KOR", + "MTEB_POL", + "MTEB_code", + "MTEB_multilingual", + "MTEB_multilingual", + "MTEB_JPN", + "MTEB_INDIC", + "MTEB_EU", + "LONG_EMBED", + "BRIGHT", + "BRIGHT_LONG", + "CODE_RAG", + "BEIR", + "NANOBEIR", + "C_MTEB", + "FA_MTEB", + "CHEMTEB", + "BEIR_NL", + "MIEB_ENG", + "MIEB_MULTILINGUAL", + "MIEB_LITE", + "MIEB_IMG", + "BUILT_MTEB", + "ENCODECHKA", + "VIDORE", + "VIDORE_V2", + "VISUAL_DOCUMENT_RETRIEVAL", + "R2MED", + "VN_MTEB", + "JINA_VDR", + "RTEB_MAIN", + "RTEB_FINANCE", + "RTEB_LEGAL", + "RTEB_CODE", + "RTEB_HEALTHCARE", + "RTEB_ENGLISH", + "RTEB_FRENCH", + "RTEB_GERMAN", + "RTEB_JAPANESE", +] From 1e58bb8cdaa05f5e2f21bec6fe67bd89ad7679bb Mon Sep 17 00:00:00 2001 From: fzoll Date: Tue, 26 Aug 2025 11:36:42 +0200 Subject: [PATCH 13/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/benchmarks/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index c7591e0bf4..3b99b7be98 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from mteb.benchmarks.benchmark import Benchmark from mteb.benchmarks.benchmarks.benchmarks import ( MTEB_EN, MTEB_ENG_CLASSIC, @@ -58,6 +59,7 @@ ) __all__ = [ + "Benchmark", "MTEB_EN", "MTEB_ENG_CLASSIC", "MTEB_MAIN_RU", From 2473a35a601603a86b82bed64df0042e34cf7c89 Mon Sep 17 00:00:00 2001 From: fzoll Date: Tue, 26 Aug 2025 13:51:33 +0200 Subject: [PATCH 14/14] Add the rteb_benchmarks to the BENCHMARK_REGISTRY --- mteb/benchmarks/benchmarks/__init__.py | 21 --------------------- mteb/benchmarks/get_benchmark.py | 2 +- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/mteb/benchmarks/benchmarks/__init__.py b/mteb/benchmarks/benchmarks/__init__.py index 3b99b7be98..2fc5d0472e 100644 --- a/mteb/benchmarks/benchmarks/__init__.py +++ b/mteb/benchmarks/benchmarks/__init__.py @@ -46,17 +46,6 @@ VN_MTEB, JINA_VDR, ) -from mteb.benchmarks.benchmarks.rteb_benchmarks import ( - RTEB_MAIN, - RTEB_FINANCE, - RTEB_LEGAL, - RTEB_CODE, - RTEB_HEALTHCARE, - RTEB_ENGLISH, - RTEB_FRENCH, - RTEB_GERMAN, - RTEB_JAPANESE, -) __all__ = [ "Benchmark", @@ -77,7 +66,6 @@ "MTEB_POL", "MTEB_code", "MTEB_multilingual", - "MTEB_multilingual", "MTEB_JPN", "MTEB_INDIC", "MTEB_EU", @@ -103,13 +91,4 @@ "R2MED", "VN_MTEB", "JINA_VDR", - "RTEB_MAIN", - "RTEB_FINANCE", - "RTEB_LEGAL", - "RTEB_CODE", - "RTEB_HEALTHCARE", - "RTEB_ENGLISH", - "RTEB_FRENCH", - "RTEB_GERMAN", - "RTEB_JAPANESE", ] diff --git a/mteb/benchmarks/get_benchmark.py b/mteb/benchmarks/get_benchmark.py index 0d9adb0d2c..bbf4fbfe50 100644 --- a/mteb/benchmarks/get_benchmark.py +++ b/mteb/benchmarks/get_benchmark.py @@ -4,7 +4,7 @@ import logging import warnings -import mteb.benchmarks as benchmark_module +import mteb.benchmarks.benchmarks as benchmark_module from mteb.benchmarks.benchmarks import ( C_MTEB, FA_MTEB,