diff --git a/docs/create_tasks_table.py b/docs/create_tasks_table.py index a6111523a9..13e9830276 100644 --- a/docs/create_tasks_table.py +++ b/docs/create_tasks_table.py @@ -68,7 +68,7 @@ def create_tasks_table(tasks: list[mteb.AbsTask]) -> str: return table -def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str: +def create_task_lang_table(tasks: list[mteb.AbsTask], sort_by_sum=False) -> str: table_dict = {} ## Group by language. If it is a multilingual dataset, 1 is added to all languages present. for task in tasks: @@ -82,22 +82,27 @@ def create_task_lang_table(tasks: list[mteb.AbsTask]) -> str: ## Wrangle for polars pl_table_dict = [] for lang, d in table_dict.items(): - d.update({"lang": lang}) + d.update({"0-lang": lang}) # for sorting columns pl_table_dict.append(d) - df = pl.DataFrame(pl_table_dict).sort(by="lang") + df = pl.DataFrame(pl_table_dict).sort(by="0-lang") + df = df.with_columns(sum=pl.sum_horizontal(get_args(TASK_TYPE))) + df = df.select(sorted(df.columns)) + if sort_by_sum: + df = df.sort(by="sum", descending=True) + total = df.sum() task_names_md = " | ".join(sorted(get_args(TASK_TYPE))) - horizontal_line_md = "---|---" * len(sorted(get_args(TASK_TYPE))) + horizontal_line_md = "---|---" * (len(sorted(get_args(TASK_TYPE))) + 1) table = f""" -| Language | {task_names_md} | +| Language | {task_names_md} | Sum | |{horizontal_line_md}| """ for row in df.iter_rows(): - table += f"| {row[-1]} " - for num in row[:-1]: + table += f"| {row[0]} " + for num in row[1:]: table += f"| {num} " table += "|\n" diff --git a/mteb/__init__.py b/mteb/__init__.py index 1ef561a5f1..6de017b1f1 100644 --- a/mteb/__init__.py +++ b/mteb/__init__.py @@ -6,6 +6,7 @@ MTEB_ENG_CLASSIC, MTEB_MAIN_RU, MTEB_RETRIEVAL_LAW, + MTEB_RETRIEVAL_MEDICAL, MTEB_RETRIEVAL_WITH_INSTRUCTIONS, CoIR, ) @@ -24,6 +25,7 @@ "MTEB_ENG_CLASSIC", "MTEB_MAIN_RU", "MTEB_RETRIEVAL_LAW", + "MTEB_RETRIEVAL_MEDICAL", "MTEB_RETRIEVAL_WITH_INSTRUCTIONS", "CoIR", "TASKS_REGISTRY", diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index 743a5bde12..9aaefda3cb 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -106,7 +106,7 @@ def load_results( "StackExchangeClustering.v2", "StackExchangeClusteringP2P.v2", "TRECCOVID", - "Touche2020", + "Touche2020Retrieval.v3", "ToxicConversationsClassification", "TweetSentimentExtractionClassification", "TwentyNewsgroupsClustering.v2", @@ -186,7 +186,7 @@ def load_results( "StackOverflowDupQuestions", "SummEval", "TRECCOVID", - "Touche2020Retrieval.v3", + "Touche2020", "ToxicConversationsClassification", "TweetSentimentExtractionClassification", "TwentyNewsgroupsClustering", @@ -308,6 +308,29 @@ def load_results( citation=None, ) +MTEB_RETRIEVAL_MEDICAL = Benchmark( + name="MTEB(Medical)", + tasks=get_tasks( + tasks=[ + "CUREv1", + "NFCorpus", + "TRECCOVID", + "TRECCOVID-PL", + "SciFact", + "SciFact-PL", + "MedicalQARetrieval", + "PublicHealthQA", + "MedrxivClusteringP2P.v2", + "MedrxivClusteringS2S.v2", + "CmedqaRetrieval", + "CMedQAv2-reranking", + ], + ), + description="A curated set of MTEB tasks designed to evaluate systems in the context of medical information retrieval.", + reference="", + citation=None, +) + MTEB_MINERS_BITEXT_MINING = Benchmark( name="MINERSBitextMining", tasks=get_tasks( @@ -702,6 +725,7 @@ def load_results( "SpartQA", "TempReasonL1", "TRECCOVID", + "CUREv1", "WinoGrande", "BelebeleRetrieval", "MLQARetrieval", diff --git a/mteb/descriptive_stats/Retrieval/CUREv1.json b/mteb/descriptive_stats/Retrieval/CUREv1.json new file mode 100644 index 0000000000..682b3752fb --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/CUREv1.json @@ -0,0 +1,1256 @@ +{ + "all": { + "number_of_characters": 376986167, + "num_samples": 739800, + "num_queries": 6000, + "num_documents": 733800, + "num_relevant_docs": 242148, + "min_document_length": 13, + "average_document_length": 0.7376887435268465, + "max_document_length": 357, + "unique_documents": 733800, + "min_query_length": 39, + "average_query_length": 62740.8085, + "max_query_length": 10344, + "unique_queries": 6000, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.358, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 124581, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 125639484, + "num_samples": 246600, + "num_queries": 2000, + "num_documents": 244600, + "num_relevant_docs": 80716, + "min_document_length": 13, + "average_document_length": 0.645408830744072, + "max_document_length": 232, + "unique_documents": 244600, + "min_query_length": 39, + "average_query_length": 62740.8085, + "max_query_length": 10344, + "unique_queries": 2000, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.358, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 41527, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 125664632, + "num_samples": 246600, + "num_queries": 2000, + "num_documents": 244600, + "num_relevant_docs": 80716, + "min_document_length": 16, + "average_document_length": 0.748221586263287, + "max_document_length": 288, + "unique_documents": 244600, + "min_query_length": 39, + "average_query_length": 62740.8085, + "max_query_length": 10344, + "unique_queries": 2000, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.358, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 41527, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 125682051, + "num_samples": 246600, + "num_queries": 2000, + "num_documents": 244600, + "num_relevant_docs": 80716, + "min_document_length": 20, + "average_document_length": 0.8194358135731807, + "max_document_length": 357, + "unique_documents": 244600, + "min_query_length": 39, + "average_query_length": 62740.8085, + "max_query_length": 10344, + "unique_queries": 2000, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.358, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 41527, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "dentistry_and_oral_health": { + "number_of_characters": 42547753, + "num_samples": 88656, + "num_queries": 600, + "num_documents": 88056, + "num_relevant_docs": 23898, + "min_document_length": 21, + "average_document_length": 0.5983351503588625, + "max_document_length": 187, + "unique_documents": 88056, + "min_query_length": 39, + "average_query_length": 70825.11, + "max_query_length": 4539, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 39.83, + "max_relevant_docs_per_query": 269, + "unique_relevant_docs": 12189, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 14180200, + "num_samples": 29552, + "num_queries": 200, + "num_documents": 29352, + "num_relevant_docs": 7966, + "min_document_length": 21, + "average_document_length": 0.5171027527936768, + "max_document_length": 147, + "unique_documents": 29352, + "min_query_length": 39, + "average_query_length": 70825.11, + "max_query_length": 4539, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 39.83, + "max_relevant_docs_per_query": 269, + "unique_relevant_docs": 4063, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 14183105, + "num_samples": 29552, + "num_queries": 200, + "num_documents": 29352, + "num_relevant_docs": 7966, + "min_document_length": 27, + "average_document_length": 0.6160738620877624, + "max_document_length": 160, + "unique_documents": 29352, + "min_query_length": 39, + "average_query_length": 70825.11, + "max_query_length": 4539, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 39.83, + "max_relevant_docs_per_query": 269, + "unique_relevant_docs": 4063, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 14184448, + "num_samples": 29552, + "num_queries": 200, + "num_documents": 29352, + "num_relevant_docs": 7966, + "min_document_length": 31, + "average_document_length": 0.6618288361951485, + "max_document_length": 187, + "unique_documents": 29352, + "min_query_length": 39, + "average_query_length": 70825.11, + "max_query_length": 4539, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 39.83, + "max_relevant_docs_per_query": 269, + "unique_relevant_docs": 4063, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "dermatology": { + "number_of_characters": 22943198, + "num_samples": 47661, + "num_queries": 600, + "num_documents": 47061, + "num_relevant_docs": 8076, + "min_document_length": 16, + "average_document_length": 0.8371687809438814, + "max_document_length": 127, + "unique_documents": 47061, + "min_query_length": 52, + "average_query_length": 38173.0, + "max_query_length": 5440, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 13.46, + "max_relevant_docs_per_query": 111, + "unique_relevant_docs": 3270, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 7646329, + "num_samples": 15887, + "num_queries": 200, + "num_documents": 15687, + "num_relevant_docs": 2692, + "min_document_length": 16, + "average_document_length": 0.7476891693759163, + "max_document_length": 106, + "unique_documents": 15687, + "min_query_length": 52, + "average_query_length": 38173.0, + "max_query_length": 5440, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 13.46, + "max_relevant_docs_per_query": 111, + "unique_relevant_docs": 1090, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 7647866, + "num_samples": 15887, + "num_queries": 200, + "num_documents": 15687, + "num_relevant_docs": 2692, + "min_document_length": 20, + "average_document_length": 0.8456683878370626, + "max_document_length": 126, + "unique_documents": 15687, + "min_query_length": 52, + "average_query_length": 38173.0, + "max_query_length": 5440, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 13.46, + "max_relevant_docs_per_query": 111, + "unique_relevant_docs": 1090, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 7649003, + "num_samples": 15887, + "num_queries": 200, + "num_documents": 15687, + "num_relevant_docs": 2692, + "min_document_length": 25, + "average_document_length": 0.9181487856186651, + "max_document_length": 127, + "unique_documents": 15687, + "min_query_length": 52, + "average_query_length": 38173.0, + "max_query_length": 5440, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 13.46, + "max_relevant_docs_per_query": 111, + "unique_relevant_docs": 1090, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "gastroenterology": { + "number_of_characters": 33714725, + "num_samples": 69804, + "num_queries": 600, + "num_documents": 69204, + "num_relevant_docs": 31995, + "min_document_length": 26, + "average_document_length": 0.7955031501069303, + "max_document_length": 216, + "unique_documents": 69204, + "min_query_length": 51, + "average_query_length": 56099.455, + "max_query_length": 5027, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 53.325, + "max_relevant_docs_per_query": 892, + "unique_relevant_docs": 15657, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 11236098, + "num_samples": 23268, + "num_queries": 200, + "num_documents": 23068, + "num_relevant_docs": 10665, + "min_document_length": 26, + "average_document_length": 0.7025749956649905, + "max_document_length": 174, + "unique_documents": 23068, + "min_query_length": 51, + "average_query_length": 56099.455, + "max_query_length": 5027, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 53.325, + "max_relevant_docs_per_query": 892, + "unique_relevant_docs": 5219, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 11238409, + "num_samples": 23268, + "num_queries": 200, + "num_documents": 23068, + "num_relevant_docs": 10665, + "min_document_length": 26, + "average_document_length": 0.8027570660655453, + "max_document_length": 214, + "unique_documents": 23068, + "min_query_length": 51, + "average_query_length": 56099.455, + "max_query_length": 5027, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 53.325, + "max_relevant_docs_per_query": 892, + "unique_relevant_docs": 5219, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 11240218, + "num_samples": 23268, + "num_queries": 200, + "num_documents": 23068, + "num_relevant_docs": 10665, + "min_document_length": 31, + "average_document_length": 0.8811773885902549, + "max_document_length": 216, + "unique_documents": 23068, + "min_query_length": 51, + "average_query_length": 56099.455, + "max_query_length": 5027, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 53.325, + "max_relevant_docs_per_query": 892, + "unique_relevant_docs": 5219, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "genetics": { + "number_of_characters": 46802844, + "num_samples": 81588, + "num_queries": 600, + "num_documents": 80988, + "num_relevant_docs": 40815, + "min_document_length": 16, + "average_document_length": 0.6085716402429989, + "max_document_length": 227, + "unique_documents": 80988, + "min_query_length": 44, + "average_query_length": 77922.595, + "max_query_length": 6394, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.025, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 20163, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 15598969, + "num_samples": 27196, + "num_queries": 200, + "num_documents": 26996, + "num_relevant_docs": 13605, + "min_document_length": 16, + "average_document_length": 0.535264483627204, + "max_document_length": 178, + "unique_documents": 26996, + "min_query_length": 44, + "average_query_length": 77922.595, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.025, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 6721, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 15601118, + "num_samples": 27196, + "num_queries": 200, + "num_documents": 26996, + "num_relevant_docs": 13605, + "min_document_length": 18, + "average_document_length": 0.6148688694621426, + "max_document_length": 205, + "unique_documents": 26996, + "min_query_length": 44, + "average_query_length": 77922.595, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.025, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 6721, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 15602757, + "num_samples": 27196, + "num_queries": 200, + "num_documents": 26996, + "num_relevant_docs": 13605, + "min_document_length": 25, + "average_document_length": 0.6755815676396503, + "max_document_length": 227, + "unique_documents": 26996, + "min_query_length": 44, + "average_query_length": 77922.595, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.025, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 6721, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "neuroscience_and_neurology": { + "number_of_characters": 47565061, + "num_samples": 91764, + "num_queries": 600, + "num_documents": 91164, + "num_relevant_docs": 25227, + "min_document_length": 28, + "average_document_length": 0.6213417577113773, + "max_document_length": 357, + "unique_documents": 91164, + "min_query_length": 45, + "average_query_length": 79180.695, + "max_query_length": 6394, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 42.045, + "max_relevant_docs_per_query": 251, + "unique_relevant_docs": 15252, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 15852189, + "num_samples": 30588, + "num_queries": 200, + "num_documents": 30388, + "num_relevant_docs": 8409, + "min_document_length": 28, + "average_document_length": 0.528169014084507, + "max_document_length": 196, + "unique_documents": 30388, + "min_query_length": 45, + "average_query_length": 79180.695, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 42.045, + "max_relevant_docs_per_query": 251, + "unique_relevant_docs": 5084, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 15854555, + "num_samples": 30588, + "num_queries": 200, + "num_documents": 30388, + "num_relevant_docs": 8409, + "min_document_length": 33, + "average_document_length": 0.6060286955377122, + "max_document_length": 223, + "unique_documents": 30388, + "min_query_length": 45, + "average_query_length": 79180.695, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 42.045, + "max_relevant_docs_per_query": 251, + "unique_relevant_docs": 5084, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 15858317, + "num_samples": 30588, + "num_queries": 200, + "num_documents": 30388, + "num_relevant_docs": 8409, + "min_document_length": 38, + "average_document_length": 0.7298275635119126, + "max_document_length": 357, + "unique_documents": 30388, + "min_query_length": 45, + "average_query_length": 79180.695, + "max_query_length": 6394, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 42.045, + "max_relevant_docs_per_query": 251, + "unique_relevant_docs": 5084, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "orthopedic_surgery": { + "number_of_characters": 37106615, + "num_samples": 75330, + "num_queries": 600, + "num_documents": 74730, + "num_relevant_docs": 9006, + "min_document_length": 20, + "average_document_length": 0.7896293322628128, + "max_document_length": 239, + "unique_documents": 74730, + "min_query_length": 52, + "average_query_length": 61746.01, + "max_query_length": 10344, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 15.01, + "max_relevant_docs_per_query": 82, + "unique_relevant_docs": 5085, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 12366489, + "num_samples": 25110, + "num_queries": 200, + "num_documents": 24910, + "num_relevant_docs": 3002, + "min_document_length": 25, + "average_document_length": 0.6939783219590526, + "max_document_length": 219, + "unique_documents": 24910, + "min_query_length": 52, + "average_query_length": 61746.01, + "max_query_length": 10344, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 15.01, + "max_relevant_docs_per_query": 82, + "unique_relevant_docs": 1695, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 12369347, + "num_samples": 25110, + "num_queries": 200, + "num_documents": 24910, + "num_relevant_docs": 3002, + "min_document_length": 21, + "average_document_length": 0.8087113608992372, + "max_document_length": 228, + "unique_documents": 24910, + "min_query_length": 52, + "average_query_length": 61746.01, + "max_query_length": 10344, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 15.01, + "max_relevant_docs_per_query": 82, + "unique_relevant_docs": 1695, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 12370779, + "num_samples": 25110, + "num_queries": 200, + "num_documents": 24910, + "num_relevant_docs": 3002, + "min_document_length": 20, + "average_document_length": 0.8661983139301486, + "max_document_length": 239, + "unique_documents": 24910, + "min_query_length": 52, + "average_query_length": 61746.01, + "max_query_length": 10344, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 15.01, + "max_relevant_docs_per_query": 82, + "unique_relevant_docs": 1695, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "otorhinolaryngology": { + "number_of_characters": 33469218, + "num_samples": 73176, + "num_queries": 600, + "num_documents": 72576, + "num_relevant_docs": 22026, + "min_document_length": 13, + "average_document_length": 0.7396660052910053, + "max_document_length": 214, + "unique_documents": 72576, + "min_query_length": 44, + "average_query_length": 55692.56, + "max_query_length": 3594, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 36.71, + "max_relevant_docs_per_query": 254, + "unique_relevant_docs": 8856, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 11154337, + "num_samples": 24392, + "num_queries": 200, + "num_documents": 24192, + "num_relevant_docs": 7342, + "min_document_length": 13, + "average_document_length": 0.6541418650793651, + "max_document_length": 189, + "unique_documents": 24192, + "min_query_length": 44, + "average_query_length": 55692.56, + "max_query_length": 3594, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 36.71, + "max_relevant_docs_per_query": 254, + "unique_relevant_docs": 2952, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 11156678, + "num_samples": 24392, + "num_queries": 200, + "num_documents": 24192, + "num_relevant_docs": 7342, + "min_document_length": 16, + "average_document_length": 0.7509093915343915, + "max_document_length": 198, + "unique_documents": 24192, + "min_query_length": 44, + "average_query_length": 55692.56, + "max_query_length": 3594, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 36.71, + "max_relevant_docs_per_query": 254, + "unique_relevant_docs": 2952, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 11158203, + "num_samples": 24392, + "num_queries": 200, + "num_documents": 24192, + "num_relevant_docs": 7342, + "min_document_length": 20, + "average_document_length": 0.8139467592592593, + "max_document_length": 214, + "unique_documents": 24192, + "min_query_length": 44, + "average_query_length": 55692.56, + "max_query_length": 3594, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 36.71, + "max_relevant_docs_per_query": 254, + "unique_relevant_docs": 2952, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "plastic_surgery": { + "number_of_characters": 36780121, + "num_samples": 80415, + "num_queries": 600, + "num_documents": 79815, + "num_relevant_docs": 15255, + "min_document_length": 22, + "average_document_length": 0.7145774603771221, + "max_document_length": 245, + "unique_documents": 79815, + "min_query_length": 44, + "average_query_length": 61205.145, + "max_query_length": 4996, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 25.425, + "max_relevant_docs_per_query": 197, + "unique_relevant_docs": 9324, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 12257549, + "num_samples": 26805, + "num_queries": 200, + "num_documents": 26605, + "num_relevant_docs": 5085, + "min_document_length": 22, + "average_document_length": 0.6209359143018229, + "max_document_length": 177, + "unique_documents": 26605, + "min_query_length": 44, + "average_query_length": 61205.145, + "max_query_length": 4996, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 25.425, + "max_relevant_docs_per_query": 197, + "unique_relevant_docs": 3108, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 12260329, + "num_samples": 26805, + "num_queries": 200, + "num_documents": 26605, + "num_relevant_docs": 5085, + "min_document_length": 25, + "average_document_length": 0.7254275512121782, + "max_document_length": 225, + "unique_documents": 26605, + "min_query_length": 44, + "average_query_length": 61205.145, + "max_query_length": 4996, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 25.425, + "max_relevant_docs_per_query": 197, + "unique_relevant_docs": 3108, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 12262243, + "num_samples": 26805, + "num_queries": 200, + "num_documents": 26605, + "num_relevant_docs": 5085, + "min_document_length": 29, + "average_document_length": 0.7973689156173651, + "max_document_length": 245, + "unique_documents": 26605, + "min_query_length": 44, + "average_query_length": 61205.145, + "max_query_length": 4996, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 25.425, + "max_relevant_docs_per_query": 197, + "unique_relevant_docs": 3108, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "psychiatry_and_psychology": { + "number_of_characters": 59876950, + "num_samples": 107868, + "num_queries": 600, + "num_documents": 107268, + "num_relevant_docs": 41259, + "min_document_length": 29, + "average_document_length": 0.5822892195249282, + "max_document_length": 248, + "unique_documents": 107268, + "min_query_length": 44, + "average_query_length": 99690.815, + "max_query_length": 5370, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.765, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 23991, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 19956676, + "num_samples": 35956, + "num_queries": 200, + "num_documents": 35756, + "num_relevant_docs": 13753, + "min_document_length": 29, + "average_document_length": 0.5177592571876048, + "max_document_length": 226, + "unique_documents": 35756, + "min_query_length": 44, + "average_query_length": 99690.815, + "max_query_length": 5370, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.765, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 7997, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 19959277, + "num_samples": 35956, + "num_queries": 200, + "num_documents": 35756, + "num_relevant_docs": 13753, + "min_document_length": 34, + "average_document_length": 0.5905022933214006, + "max_document_length": 248, + "unique_documents": 35756, + "min_query_length": 44, + "average_query_length": 99690.815, + "max_query_length": 5370, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.765, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 7997, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 19960997, + "num_samples": 35956, + "num_queries": 200, + "num_documents": 35756, + "num_relevant_docs": 13753, + "min_document_length": 35, + "average_document_length": 0.6386061080657792, + "max_document_length": 248, + "unique_documents": 35756, + "min_query_length": 44, + "average_query_length": 99690.815, + "max_query_length": 5370, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 68.765, + "max_relevant_docs_per_query": 1070, + "unique_relevant_docs": 7997, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + }, + "pulmonology": { + "number_of_characters": 47108443, + "num_samples": 97551, + "num_queries": 600, + "num_documents": 96951, + "num_relevant_docs": 24591, + "min_document_length": 25, + "average_document_length": 0.5782508689956782, + "max_document_length": 289, + "unique_documents": 96951, + "min_query_length": 39, + "average_query_length": 78420.635, + "max_query_length": 4772, + "unique_queries": 600, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.985, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 13683, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null, + "hf_subset_descriptive_stats": { + "en": { + "number_of_characters": 15700235, + "num_samples": 32517, + "num_queries": 200, + "num_documents": 32317, + "num_relevant_docs": 8197, + "min_document_length": 25, + "average_document_length": 0.4984373549525018, + "max_document_length": 232, + "unique_documents": 32317, + "min_query_length": 39, + "average_query_length": 78420.635, + "max_query_length": 4772, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.985, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 4561, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "es": { + "number_of_characters": 15703535, + "num_samples": 32517, + "num_queries": 200, + "num_documents": 32317, + "num_relevant_docs": 8197, + "min_document_length": 29, + "average_document_length": 0.6005507936999103, + "max_document_length": 288, + "unique_documents": 32317, + "min_query_length": 39, + "average_query_length": 78420.635, + "max_query_length": 4772, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.985, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 4561, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + }, + "fr": { + "number_of_characters": 15704673, + "num_samples": 32517, + "num_queries": 200, + "num_documents": 32317, + "num_relevant_docs": 8197, + "min_document_length": 29, + "average_document_length": 0.6357644583346227, + "max_document_length": 289, + "unique_documents": 32317, + "min_query_length": 39, + "average_query_length": 78420.635, + "max_query_length": 4772, + "unique_queries": 200, + "none_queries": 0, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 40.985, + "max_relevant_docs_per_query": 1364, + "unique_relevant_docs": 4561, + "num_instructions": null, + "min_instruction_length": null, + "average_instruction_length": null, + "max_instruction_length": null, + "unique_instructions": null, + "min_top_ranked_per_query": null, + "average_top_ranked_per_query": null, + "max_top_ranked_per_query": null + } + } + } +} \ No newline at end of file diff --git a/mteb/leaderboard/app.py b/mteb/leaderboard/app.py index 9b89d5dd4c..8a5eb961c1 100644 --- a/mteb/leaderboard/app.py +++ b/mteb/leaderboard/app.py @@ -60,21 +60,25 @@ def format_list(props: list[str]): return ", ".join(props) -def update_task_info(task_names: str) -> str: +def update_task_info(task_names: str) -> gr.DataFrame: tasks = mteb.get_tasks(tasks=task_names) - df = tasks.to_dataframe() + df = tasks.to_dataframe( + properties=["name", "type", "languages", "domains", "reference", "main_score"] + ) df["languages"] = df["languages"].map(format_list) df["domains"] = df["domains"].map(format_list) + df["name"] = "[" + df["name"] + "](" + df["reference"] + ")" df = df.rename( columns={ "name": "Task Name", "type": "Task Type", "languages": "Languages", "domains": "Domains", - "license": "License", + "main_score": "Metric", } ) - return df + df = df.drop(columns="reference") + return gr.DataFrame(df, datatype=["markdown"] + ["str"] * (len(df.columns) - 1)) all_results = load_results().filter_models() @@ -215,6 +219,9 @@ def update_task_info(task_names: str) -> str: citation = gr.Markdown(update_citation, inputs=[benchmark_select]) with gr.Column(): plot = gr.Plot(performance_size_plot, inputs=[summary_table]) + gr.Markdown( + "*We only display models that have been run on all tasks in the benchmark*" + ) with gr.Tab("Summary"): summary_table.render() with gr.Tab("Performance per task"): diff --git a/mteb/leaderboard/figures.py b/mteb/leaderboard/figures.py index 7a354f7c82..373bcd00c6 100644 --- a/mteb/leaderboard/figures.py +++ b/mteb/leaderboard/figures.py @@ -14,6 +14,10 @@ def parse_n_params(text: str) -> int: def parse_model_name(name: str) -> str: + if name is None: + return "" + if "]" not in name: + return name name, _ = name.split("]") return name[1:] @@ -38,8 +42,8 @@ def performance_size_plot(df: pd.DataFrame) -> go.Figure: df["Number of Parameters"] = df["Number of Parameters"].map(parse_n_params) df["Model"] = df["Model"].map(parse_model_name) df["model_text"] = df["Model"].where(df["Model"].isin(models_to_annotate), "") - df["Embedding Dimensions"] = df["Embedding Dimensions"].map(int) - df["Max Tokens"] = df["Max Tokens"].map(int) + df["Embedding Dimensions"] = df["Embedding Dimensions"].map(parse_float) + df["Max Tokens"] = df["Max Tokens"].map(parse_float) df["Log(Tokens)"] = np.log10(df["Max Tokens"]) df["Mean (Task)"] = df["Mean (Task)"].map(parse_float) df = df.dropna(subset=["Mean (Task)", "Number of Parameters"]) diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index d9b830d236..c965a7f682 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -200,7 +200,7 @@ def scores_to_tables( joint_table_style, # column_widths=column_widths, datatype=column_types, - wrap=True, + # wrap=True, ), gr.DataFrame(per_task_style), ) diff --git a/mteb/load_results/task_results.py b/mteb/load_results/task_results.py index 202ed9b5f5..8f587fd72b 100644 --- a/mteb/load_results/task_results.py +++ b/mteb/load_results/task_results.py @@ -296,10 +296,12 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: pre_1_11_load = ( ( "mteb_version" in data + and data["mteb_version"] is not None and Version(data["mteb_version"]) < Version("1.11.0") ) or "mteb_version" not in data ) # assume it is before 1.11.0 if the version is not present + try: obj = cls.model_validate(data) except Exception as e: @@ -310,9 +312,11 @@ def from_disk(cls, path: Path, load_historic_data: bool = True) -> TaskResult: ) obj = cls._convert_from_before_v1_11_0(data) - pre_v_12_48 = "mteb_version" in data and Version( - data["mteb_version"] - ) < Version("1.12.48") + pre_v_12_48 = ( + "mteb_version" in data + and data["mteb_version"] is not None + and Version(data["mteb_version"]) < Version("1.12.48") + ) if pre_v_12_48: cls._fix_pair_classification_scores(obj) diff --git a/mteb/models/sentence_transformer_wrapper.py b/mteb/models/sentence_transformer_wrapper.py index 5cc824fa82..13d39e4031 100644 --- a/mteb/models/sentence_transformer_wrapper.py +++ b/mteb/models/sentence_transformer_wrapper.py @@ -53,6 +53,9 @@ def __init__( self.model.prompts = model_prompts self.model_prompts = self.validate_task_to_prompt_name(model_prompts) + if isinstance(self.model, CrossEncoder): + self.predict = self._predict + def encode( self, sentences: Sequence[str], @@ -106,7 +109,7 @@ def encode( embeddings = embeddings.cpu().detach().float().numpy() return embeddings - def predict( + def _predict( self, sentences: Sequence[str], **kwargs: Any, diff --git a/mteb/tasks/Reranking/zho/CMTEBReranking.py b/mteb/tasks/Reranking/zho/CMTEBReranking.py index ee830f7e16..d6ff57a2a9 100644 --- a/mteb/tasks/Reranking/zho/CMTEBReranking.py +++ b/mteb/tasks/Reranking/zho/CMTEBReranking.py @@ -128,7 +128,7 @@ class CMedQAv2(AbsTaskReranking): main_score="map_at_1000", date=None, form=None, - domains=None, + domains=["Medical", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index f8a47b08a9..ca41d4354f 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -105,6 +105,7 @@ from .multilingual.BelebeleRetrieval import * from .multilingual.CrossLingualSemanticDiscriminationWMT19 import * from .multilingual.CrossLingualSemanticDiscriminationWMT21 import * +from .multilingual.CUREv1Retrieval import * from .multilingual.IndicQARetrieval import * from .multilingual.MintakaRetrieval import * from .multilingual.MIRACLRetrieval import * diff --git a/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py b/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py index 7c40b6707b..31f4eb60b1 100644 --- a/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py +++ b/mteb/tasks/Retrieval/eng/NFCorpusRetrieval.py @@ -21,7 +21,7 @@ class NFCorpus(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Medical", "Academic", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py index 05e9a6e541..1dc47d8b66 100644 --- a/mteb/tasks/Retrieval/eng/SciFactRetrieval.py +++ b/mteb/tasks/Retrieval/eng/SciFactRetrieval.py @@ -21,7 +21,7 @@ class SciFact(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Academic", "Medical", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py b/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py index 6c7b7f01d1..00c96c0d04 100644 --- a/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py +++ b/mteb/tasks/Retrieval/eng/TRECCOVIDRetrieval.py @@ -21,7 +21,7 @@ class TRECCOVID(AbsTaskRetrieval): eval_langs=["eng-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Medical", "Academic", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py b/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py new file mode 100644 index 0000000000..6e97786a77 --- /dev/null +++ b/mteb/tasks/Retrieval/multilingual/CUREv1Retrieval.py @@ -0,0 +1,151 @@ +from __future__ import annotations + +from enum import Enum + +from datasets import DatasetDict, load_dataset + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from ....abstasks.MultilingualTask import MultilingualTask + +_LANGUAGES = { + "en": ["eng-Latn", "eng-Latn"], + "es": ["spa-Latn", "eng-Latn"], + "fr": ["fra-Latn", "eng-Latn"], +} + + +class CUREv1Splits(str, Enum): + all = "All" + dentistry_and_oral_health = "Dentistry and Oral Health" + dermatology = "Dermatology" + gastroenterology = "Gastroenterology" + genetics = "Genetics" + neuroscience_and_neurology = "Neuroscience and Neurology" + orthopedic_surgery = "Orthopedic Surgery" + otorhinolaryngology = "Otorhinolaryngology" + plastic_surgery = "Plastic Surgery" + psychiatry_and_psychology = "Psychiatry and Psychology" + pulmonology = "Pulmonology" + + @classmethod + def names(cls) -> list[str]: + return sorted(cls._member_names_) + + +class CUREv1Retrieval(MultilingualTask, AbsTaskRetrieval): + metadata = TaskMetadata( + dataset={ + "path": "clinia/CUREv1", + "revision": "3bcf51c91e04d04a8a3329dfbe988b964c5cbe83", + }, + name="CUREv1", + description="Collection of query-passage pairs curated by medical professionals, across 10 disciplines and 3 cross-lingual settings.", + type="Retrieval", + modalities=["text"], + category="s2p", + reference="https://huggingface.co/datasets/clinia/CUREv1", + eval_splits=CUREv1Splits.names(), + eval_langs=_LANGUAGES, + main_score="ndcg_at_10", + date=("2024-01-01", "2024-10-31"), + domains=["Medical", "Academic", "Written"], + task_subtypes=[], + license="cc-by-nc-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="created", + bibtex_citation="", + prompt={ + "query": "Given a question by a medical professional, retrieve relevant passages that best answer the question", + }, + ) + + def _load_corpus(self, split: str, cache_dir: str | None = None): + ds = load_dataset( + path=self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + name="corpus", + split=split, + cache_dir=cache_dir, + ) + + corpus = { + doc["_id"]: {"title": doc["title"], "text": doc["text"]} for doc in ds + } + + return corpus + + def _load_qrels(self, split: str, cache_dir: str | None = None): + ds = load_dataset( + path=self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + name="qrels", + split=split, + cache_dir=cache_dir, + ) + + qrels = {} + + for qrel in ds: + query_id = qrel["query-id"] + doc_id = qrel["corpus-id"] + score = int(qrel["score"]) + if query_id not in qrels: + qrels[query_id] = {} + qrels[query_id][doc_id] = score + + return qrels + + def _load_queries(self, split: str, language: str, cache_dir: str | None = None): + ds = load_dataset( + path=self.metadata_dict["dataset"]["path"], + revision=self.metadata_dict["dataset"]["revision"], + name=f"queries-{language}", + split=split, + cache_dir=cache_dir, + ) + + queries = {query["_id"]: query["text"] for query in ds} + + return queries + + def load_data(self, **kwargs): + if self.data_loaded: + return + + eval_splits = kwargs.get("eval_splits", self.metadata.eval_splits) + languages = kwargs.get("eval_langs", self.metadata.eval_langs) + cache_dir = kwargs.get("cache_dir", None) + + # Iterate over splits and languages + corpus = { + language: {split: None for split in eval_splits} for language in languages + } + queries = { + language: {split: None for split in eval_splits} for language in languages + } + relevant_docs = { + language: {split: None for split in eval_splits} for language in languages + } + for split in eval_splits: + # Since this is a cross-lingual dataset, the corpus and the relevant documents do not depend on the language + split_corpus = self._load_corpus(split=split, cache_dir=cache_dir) + split_qrels = self._load_qrels(split=split, cache_dir=cache_dir) + + # Queries depend on the language + for language in languages: + corpus[language][split] = split_corpus + relevant_docs[language][split] = split_qrels + + queries[language][split] = self._load_queries( + split=split, language=language, cache_dir=cache_dir + ) + + # Convert into DatasetDict + self.corpus = DatasetDict(corpus) + self.queries = DatasetDict(queries) + self.relevant_docs = DatasetDict(relevant_docs) + + self.data_loaded = True diff --git a/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py b/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py index 2588b1c288..92d61b42bd 100644 --- a/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/SciFactPLRetrieval.py @@ -22,7 +22,7 @@ class SciFactPL(AbsTaskRetrieval): eval_langs=["pol-Latn"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Academic", "Medical", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py b/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py index 4ba6a9ac00..f9f331191a 100644 --- a/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py +++ b/mteb/tasks/Retrieval/pol/TRECCOVIDPLRetrieval.py @@ -25,7 +25,7 @@ class TRECCOVIDPL(AbsTaskRetrieval): "2019-12-01", "2022-12-31", ), # approximate date of covid pandemic start and end (best guess) - domains=["Academic", "Non-fiction", "Written"], + domains=["Academic", "Medical", "Non-fiction", "Written"], task_subtypes=["Article retrieval"], license="not specified", annotations_creators="derived", diff --git a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py index 08674ec8c8..ad26652ccd 100644 --- a/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py +++ b/mteb/tasks/Retrieval/zho/CMTEBRetrieval.py @@ -236,7 +236,7 @@ class CmedqaRetrieval(AbsTaskRetrieval): eval_langs=["cmn-Hans"], main_score="ndcg_at_10", date=None, - domains=None, + domains=["Medical", "Written"], task_subtypes=None, license=None, annotations_creators=None, diff --git a/pyproject.toml b/pyproject.toml index c5bd396536..1ce9e09356 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.19.5" +version = "1.19.4" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [