Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
137 changes: 77 additions & 60 deletions mteb/tasks/Retrieval/multilingual/MIRACLRetrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,39 @@
"zh": ["zho-Hans"],
}

# Metadata fields shared by the MIRACL task definitions below; each
# ``TaskMetadata(...)`` call unpacks this mapping via ``**_common_metadata``
# and only spells out the fields that differ per task (name, description,
# dataset, prompt, ...).
_common_metadata = {
    "reference": "http://miracl.ai",
    "type": "Retrieval",
    "category": "s2p",
    "modalities": ["text"],
    "eval_splits": [_EVAL_SPLIT],
    "eval_langs": _LANGUAGES,
    "main_score": "ndcg_at_10",
    "date": ("2022-06-01", "2023-01-30"),
    "domains": ["Encyclopaedic", "Written"],
    "task_subtypes": [],
    "license": "cc-by-sa-4.0",
    "annotations_creators": "expert-annotated",
    "dialect": [],
    "sample_creation": "created",
    # Raw string so the BibTeX backslash escapes (``\_``) are kept verbatim.
    "bibtex_citation": r"""
@article{10.1162/tacl_a_00595,
abstract = {{MIRACL is a multilingual dataset for ad hoc retrieval across 18 languages that collectively encompass over three billion native speakers around the world. This resource is designed to support monolingual retrieval tasks, where the queries and the corpora are in the same language. In total, we have gathered over 726k high-quality relevance judgments for 78k queries over Wikipedia in these languages, where all annotations have been performed by native speakers hired by our team. MIRACL covers languages that are both typologically close as well as distant from 10 language families and 13 sub-families, associated with varying amounts of publicly available resources. Extensive automatic heuristic verification and manual assessments were performed during the annotation process to control data quality. In total, MIRACL represents an investment of around five person-years of human annotator effort. Our goal is to spur research on improving retrieval across a continuum of languages, thus enhancing information access capabilities for diverse populations around the world, particularly those that have traditionally been underserved. MIRACL is available at http://miracl.ai/.}},
author = {Zhang, Xinyu and Thakur, Nandan and Ogundepo, Odunayo and Kamalloo, Ehsan and Alfonso-Hermelo, David and Li, Xiaoguang and Liu, Qun and Rezagholizadeh, Mehdi and Lin, Jimmy},
doi = {10.1162/tacl_a_00595},
eprint = {https://direct.mit.edu/tacl/article-pdf/doi/10.1162/tacl\_a\_00595/2157340/tacl\_a\_00595.pdf},
issn = {2307-387X},
journal = {Transactions of the Association for Computational Linguistics},
month = {09},
pages = {1114-1131},
title = {{MIRACL: A Multilingual Retrieval Dataset Covering 18 Diverse Languages}},
url = {https://doi.org/10.1162/tacl\_a\_00595},
volume = {11},
year = {2023},
}
""",
}


def _load_miracl_data(
path: str,
Expand Down Expand Up @@ -106,44 +139,15 @@ class MIRACLRetrieval(MultilingualTask, AbsTaskRetrieval):
# Task metadata for the full MIRACL retrieval benchmark.
# Only the task-specific fields are listed here. The remaining fields
# (reference, type, category, modalities, eval_splits, eval_langs, main_score,
# date, domains, task_subtypes, license, annotations_creators, dialect,
# sample_creation, bibtex_citation) come from ``_common_metadata``. They must
# NOT also be passed explicitly: a keyword given both directly and through
# ``**_common_metadata`` raises ``TypeError`` when the call is evaluated.
metadata = TaskMetadata(
    name="MIRACLRetrieval",
    description="MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages.",
    dataset={
        "path": "miracl/mmteb-miracl",
        "revision": "main",
        "trust_remote_code": True,
    },
    # Task-specific query instruction.
    prompt={
        "query": "Given a question, retrieve Wikipedia passages that answer the question"
    },
    **_common_metadata,
)

def load_data(self, **kwargs):
Expand Down Expand Up @@ -300,45 +304,58 @@ def _load_miracl_data_hard_negatives(


class MIRACLRetrievalHardNegatives(MultilingualTask, AbsTaskRetrieval):
    """MIRACL retrieval task evaluated on the hard-negatives dataset.

    Superseded by ``MIRACLRetrievalHardNegatives.v2`` (see ``superseded_by``).
    """

    # In this version the prompt used for instruction models differs from the
    # one of the original task, hence the pointer to the v2 task.
    superseded_by = "MIRACLRetrievalHardNegatives.v2"

    # Only task-specific fields are listed; everything else (reference, type,
    # category, eval splits/langs, main score, citation, ...) comes from
    # ``_common_metadata``. Passing any of those keys explicitly as well would
    # raise ``TypeError`` (multiple values for the same keyword argument).
    metadata = TaskMetadata(
        name="MIRACLRetrievalHardNegatives",
        description="MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval dataset that focuses on search across 18 different languages. The hard negative version has been created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct.",
        dataset={
            "path": "mteb/miracl-hard-negatives",
            "revision": "95c8db7d4a6e9c1d8a60601afd63d553ae20a2eb",
            "trust_remote_code": True,
        },
        adapted_from=["MIRACLRetrieval"],
        **_common_metadata,
    )

    def load_data(self, **kwargs):
        """Populate ``corpus``, ``queries`` and ``relevant_docs`` once.

        Does nothing when ``self.data_loaded`` is already set. Otherwise
        delegates to ``_load_miracl_data_hard_negatives`` using the dataset
        path/revision from the task metadata and the selected language
        subsets, then marks the task as loaded.

        Args:
            **kwargs: Only ``cache_dir`` is read here (defaults to ``None``).
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = (
            _load_miracl_data_hard_negatives(
                path=self.metadata.dataset["path"],
                langs=self.hf_subsets,
                splits=self.metadata_dict["eval_splits"],
                cache_dir=kwargs.get("cache_dir", None),
                revision=self.metadata.dataset["revision"],
                # Remote code is only trusted when the metadata opts in.
                trust_remote_code=self.metadata.dataset.get("trust_remote_code", False),
            )
        )

        self.data_loaded = True


class MIRACLRetrievalHardNegativesV2(MultilingualTask, AbsTaskRetrieval):
# Task metadata for the v2 hard-negatives task. It uses the same dataset path
# and revision as ``MIRACLRetrievalHardNegatives`` but sets an explicit query
# prompt. Shared fields come from ``_common_metadata``.
metadata = TaskMetadata(
    name="MIRACLRetrievalHardNegatives.v2",
    # Adjacent string literals are concatenated verbatim, so every fragment
    # except the last must end with a space to keep the sentences separated.
    description=(
        "MIRACL (Multilingual Information Retrieval Across a Continuum of Languages) is a multilingual retrieval "
        "dataset that focuses on search across 18 different languages. The hard negative version has been "
        "created by pooling the 250 top documents per query from BM25, e5-multilingual-large and e5-mistral-instruct. "
        "V2 uses a more appropriate prompt rather than the default prompt for retrieval."
    ),
    dataset={
        "path": "mteb/miracl-hard-negatives",
        "revision": "95c8db7d4a6e9c1d8a60601afd63d553ae20a2eb",
        "trust_remote_code": True,
    },
    prompt={
        "query": "Given a question, retrieve Wikipedia passages that answer the question",
    },
    adapted_from=["MIRACLRetrieval"],
    **_common_metadata,
)

def load_data(self, **kwargs):
Expand Down