# Patch summary. This text sits ahead of the first "diff --git" header, where
# git apply / patch treat it as leading commentary rather than patch content.
#
# 1. mteb/benchmarks/benchmarks.py: appends an `R2MED` Benchmark built from
#    eight R2MED*Retrieval task names.
# 2. mteb/tasks/Retrieval/__init__.py: star-imports the new
#    `.eng.R2MEDRetrieval` module.
# 3. mteb/tasks/Retrieval/eng/R2MEDRetrieval.py (new file): adds
#    `load_r2med_data`, which loads the "corpus", "query" and "qrels" configs
#    of a dataset and returns corpus / queries / relevant_docs keyed by the
#    first eval split, plus eight AbsTaskRetrieval subclasses whose metadata
#    differs only in name, dataset path/revision, reference URL and
#    description.
#
# NOTE(review): `load_r2med_data` reads only `eval_splits[0]`; any further
#   splits passed in are ignored.
# NOTE(review): the eight `load_data` methods are textually identical; a shared
#   base class would remove the duplication.
# NOTE(review): line breaks and indentation were reconstructed. Whitespace
#   inside the multi-line string literals (benchmark description, BibTeX
#   entries) is a best guess -- confirm against the original commit.
diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py
index 2c2a0ed6b2..f2aef081ca 100644
--- a/mteb/benchmarks/benchmarks.py
+++ b/mteb/benchmarks/benchmarks.py
@@ -1999,3 +1999,33 @@
 }
 """,
 )
+
+R2MED = Benchmark(
+    name="R2MED",
+    display_name="Reasoning-driven medical retrieval",
+    tasks=get_tasks(
+        tasks=[
+            "R2MEDBiologyRetrieval",
+            "R2MEDBioinformaticsRetrieval",
+            "R2MEDMedicalSciencesRetrieval",
+            "R2MEDMedXpertQAExamRetrieval",
+            "R2MEDMedQADiagRetrieval",
+            "R2MEDPMCTreatmentRetrieval",
+            "R2MEDPMCClinicalRetrieval",
+            "R2MEDIIYiClinicalRetrieval",
+        ]
+    ),
+    description="""R2MED: First Reasoning-Driven Medical Retrieval Benchmark.
+    R2MED is a high-quality, high-resolution information retrieval (IR) dataset designed for medical scenarios.
+    It contains 876 queries with three retrieval tasks, five medical scenarios, and twelve body systems.
+    """,
+    reference="https://r2med.github.io/",
+    citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+)
diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py
index b2abdd19e5..39932c47ec 100644
--- a/mteb/tasks/Retrieval/__init__.py
+++ b/mteb/tasks/Retrieval/__init__.py
@@ -87,6 +87,7 @@
 from .eng.PiqaRetrieval import *
 from .eng.QuailRetrieval import *
 from .eng.QuoraRetrieval import *
+from .eng.R2MEDRetrieval import *
 from .eng.RARbCodeRetrieval import *
 from .eng.RARbMathRetrieval import *
 from .eng.SCIDOCSRetrieval import *
diff --git a/mteb/tasks/Retrieval/eng/R2MEDRetrieval.py b/mteb/tasks/Retrieval/eng/R2MEDRetrieval.py
new file mode 100644
index 0000000000..9cac3cb0c0
--- /dev/null
+++ b/mteb/tasks/Retrieval/eng/R2MEDRetrieval.py
@@ -0,0 +1,393 @@
+from __future__ import annotations
+
+from collections import defaultdict
+
+import datasets
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+def load_r2med_data(
+    path: str,
+    eval_splits: list,
+    cache_dir: str,
+    revision: str,
+):
+    eval_split = eval_splits[0]
+    corpus = {eval_split: None}
+    queries = {eval_split: None}
+    relevant_docs = {eval_split: None}
+    domain_corpus = datasets.load_dataset(
+        path, name="corpus", split="corpus", cache_dir=cache_dir, revision=revision
+    )
+    domain_queries = datasets.load_dataset(
+        path, name="query", split="query", cache_dir=cache_dir, revision=revision
+    )
+    domain_qrels = datasets.load_dataset(
+        path, name="qrels", split="qrels", cache_dir=cache_dir, revision=revision
+    )
+    corpus[eval_split] = {e["id"]: {"text": e["text"]} for e in domain_corpus}
+    queries[eval_split] = {e["id"]: e["text"] for e in domain_queries}
+    relevant_docs[eval_split] = defaultdict(dict)
+    for e in domain_qrels:
+        qid = e["q_id"]
+        pid = e["p_id"]
+        relevant_docs[eval_split][qid][pid] = int(e["score"])
+
+    corpus = datasets.DatasetDict(corpus)
+    queries = datasets.DatasetDict(queries)
+    relevant_docs = datasets.DatasetDict(relevant_docs)
+    return corpus, queries, relevant_docs
+
+
+class R2MEDBiologyRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDBiologyRetrieval",
+        dataset={
+            "path": "R2MED/Biology",
+            "revision": "8b9fec2db9eda4b5742d03732213fbaee8169556",
+        },
+        reference="https://huggingface.co/R2MED/Biology",
+        description="Biology retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDBioinformaticsRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDBioinformaticsRetrieval",
+        dataset={
+            "path": "R2MED/Bioinformatics",
+            "revision": "6021fce366892cbfd7837fa85a4128ea93315e18",
+        },
+        reference="https://huggingface.co/R2MED/Bioinformatics",
+        description="Bioinformatics retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDMedicalSciencesRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDMedicalSciencesRetrieval",
+        dataset={
+            "path": "R2MED/Medical-Sciences",
+            "revision": "1b48911514c80bf9182222d99752ad75e23b4b47",
+        },
+        reference="https://huggingface.co/R2MED/Medical-Sciences",
+        description="Medical-Sciences retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDMedXpertQAExamRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDMedXpertQAExamRetrieval",
+        dataset={
+            "path": "R2MED/MedXpertQA-Exam",
+            "revision": "b457ea43db9ae5db74c3a3e5be0a213d0f85ac3a",
+        },
+        reference="https://huggingface.co/R2MED/MedXpertQA-Exam",
+        description="MedXpertQA-Exam retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDMedQADiagRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDMedQADiagRetrieval",
+        dataset={
+            "path": "R2MED/MedQA-Diag",
+            "revision": "78b585990279cc01a493f876c1b0cf09557fba57",
+        },
+        reference="https://huggingface.co/R2MED/MedQA-Diag",
+        description="MedQA-Diag retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDPMCTreatmentRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDPMCTreatmentRetrieval",
+        dataset={
+            "path": "R2MED/PMC-Treatment",
+            "revision": "53c489a44a3664ba352c07550b72b4525a5968d5",
+        },
+        reference="https://huggingface.co/R2MED/PMC-Treatment",
+        description="PMC-Treatment retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDPMCClinicalRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDPMCClinicalRetrieval",
+        dataset={
+            "path": "R2MED/PMC-Clinical",
+            "revision": "812829522f7eaa407ef82b96717be85788a50f7e",
+        },
+        reference="https://huggingface.co/R2MED/PMC-Clinical",
+        description="PMC-Clinical retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True
+
+
+class R2MEDIIYiClinicalRetrieval(AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="R2MEDIIYiClinicalRetrieval",
+        dataset={
+            "path": "R2MED/IIYi-Clinical",
+            "revision": "974abbc9bc281c3169180a6aa5d7586cfd2f5877",
+        },
+        reference="https://huggingface.co/R2MED/IIYi-Clinical",
+        description="IIYi-Clinical retrieval dataset.",
+        type="Retrieval",
+        category="s2p",
+        eval_splits=["test"],
+        eval_langs=["eng-Latn"],
+        main_score="ndcg_at_10",
+        domains=["Medical"],
+        task_subtypes=["Article retrieval"],
+        license="cc-by-4.0",
+        annotations_creators="LM-generated and reviewed",
+        dialect=[],
+        sample_creation="found",
+        modalities=["text"],
+        bibtex_citation=r"""
+@article{li2025r2med,
+  author = {Li, Lei and Zhou, Xiao and Liu, Zheng},
+  journal = {arXiv preprint arXiv:2505.14558},
+  title = {R2MED: A Benchmark for Reasoning-Driven Medical Retrieval},
+  year = {2025},
+}
+""",
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = load_r2med_data(
+            path=self.metadata.dataset["path"],
+            eval_splits=self.metadata.eval_splits,
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata.dataset["revision"],
+        )
+        self.data_loaded = True