diff --git a/mteb/benchmarks/benchmarks.py b/mteb/benchmarks/benchmarks.py index ccb266aacb..c40766045c 100644 --- a/mteb/benchmarks/benchmarks.py +++ b/mteb/benchmarks/benchmarks.py @@ -687,3 +687,37 @@ def __getitem__(self, index): reference=None, citation=None, ) + +MTEB_JPN = Benchmark( + name="MTEB(jpn)", + tasks=get_tasks( + languages=["jpn"], + tasks=[ + # clustering + "LivedoorNewsClustering.v2", + "MewsC16JaClustering", + # classification + "AmazonReviewsClassification", + "AmazonCounterfactualClassification", + "MassiveIntentClassification", + "MassiveScenarioClassification", + # STS + "JSTS", + "JSICK", + # pair classification + "PawsXPairClassification", + # retrieval + "JaqketRetrieval", + "MrTidyRetrieval", + "JaGovFaqsRetrieval", + "NLPJournalTitleAbsRetrieval", + "NLPJournalAbsIntroRetrieval", + "NLPJournalTitleIntroRetrieval", + # reranking + "ESCIReranking", + ], + ), + description="Main Japanese benchmarks from MTEB", + reference="https://github.com/sbintuitions/JMTEB", + citation=None, +) diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index f96985d458..a4b302a17f 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -8,6 +8,7 @@ from .fra.AlloprofReranking import * from .fra.SyntecReranking import * from .jpn.MMarcoReranking import * +from .multilingual.ESCIReranking import * from .multilingual.MIRACLReranking import * from .multilingual.WikipediaRerankingMultilingual import * from .rus.RuBQReranking import * diff --git a/mteb/tasks/Reranking/multilingual/ESCIReranking.py b/mteb/tasks/Reranking/multilingual/ESCIReranking.py new file mode 100644 index 0000000000..c3597c2fdf --- /dev/null +++ b/mteb/tasks/Reranking/multilingual/ESCIReranking.py @@ -0,0 +1,86 @@ +from __future__ import annotations + +import logging + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import 
TaskMetadata + +logger = logging.getLogger(__name__) + +_EVAL_SPLIT = "test" +_LANGUAGES = { + "us": ["eng-Latn"], + "es": ["spa-Latn"], + "jp": ["jpn-Jpan"], +} + +_CITATION = """@article{reddy2022shopping, + title={Shopping Queries Dataset: A Large-Scale {ESCI} Benchmark for Improving Product Search}, + author={Chandan K. Reddy and Lluís Màrquez and Fran Valero and Nikhil Rao and Hugo Zaragoza and Sambaran Bandyopadhyay and Arnab Biswas and Anlu Xing and Karthik Subbian}, + year={2022}, + eprint={2206.06588}, + archivePrefix={arXiv} +}""" + + +class ESCIReranking(MultilingualTask, AbsTaskReranking): + metadata = TaskMetadata( + name="ESCIReranking", + description="", + reference="https://github.com/amazon-science/esci-data/", + dataset={ + "path": "mteb/esci", + "revision": "237f74be0503482b4e8bc1b83778c7a87ea93fd8", + }, + type="Reranking", + category="s2p", + modalities=["text"], + eval_splits=[_EVAL_SPLIT], + eval_langs=_LANGUAGES, + main_score="map", + date=("2022-06-14", "2022-06-14"), + domains=["Written"], + task_subtypes=[], + license="apache-2.0", + annotations_creators="derived", + dialect=[], + sample_creation="created", + bibtex_citation=_CITATION, + descriptive_stats={ + "test": { + "num_samples": 29285, + "num_positive": 29285, + "num_negative": 29285, + "avg_query_len": 19.691890046098685, + "avg_positive_len": 9.268089465596722, + "avg_negative_len": 1.5105002561038074, + "hf_subset_descriptive_stats": { + "us": { + "num_samples": 21296, + "num_positive": 21296, + "num_negative": 21296, + "avg_query_len": 21.440833959429, + "avg_positive_len": 8.892515026296017, + "avg_negative_len": 1.1956705484598047, + }, + "es": { + "num_samples": 3703, + "num_positive": 3703, + "num_negative": 3703, + "avg_query_len": 20.681609505806104, + "avg_positive_len": 10.561706724277613, + "avg_negative_len": 2.749932487172563, + }, + "jp": { + "num_samples": 4286, + "num_positive": 4286, + "num_negative": 4286, + "avg_query_len": 10.146756882874476, + 
"avg_positive_len": 10.016565562295847, + "avg_negative_len": 2.003966402239851, + }, + }, + } + }, + ) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 3975cd9bd3..a25eec33b4 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -94,6 +94,7 @@ from .fra.SyntecRetrieval import * from .hun.HunSum2 import * from .jpn.JaGovFaqsRetrieval import * +from .jpn.JaqketRetrieval import * from .jpn.JaQuADRetrieval import * from .jpn.NLPJournalAbsIntroRetrieval import * from .jpn.NLPJournalTitleAbsRetrieval import * @@ -107,6 +108,7 @@ from .multilingual.MintakaRetrieval import * from .multilingual.MIRACLRetrieval import * from .multilingual.MLQARetrieval import * +from .multilingual.MrTidyRetrieval import * from .multilingual.MultiLongDocRetrieval import * from .multilingual.NeuCLIR2022Retrieval import * from .multilingual.NeuCLIR2023Retrieval import * diff --git a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py index 2c9dc8df41..01b955b19d 100644 --- a/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py +++ b/mteb/tasks/Retrieval/eng/Touche2020Retrieval.py @@ -1,11 +1,12 @@ from __future__ import annotations +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval from mteb.abstasks.TaskMetadata import TaskMetadata -from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval - class Touche2020(AbsTaskRetrieval): + superseded_by = "Touche2020Retrieval.v3" + metadata = TaskMetadata( name="Touche2020", description="Touché Task 1: Argument Retrieval for Controversial Questions", @@ -20,13 +21,13 @@ class Touche2020(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["eng-Latn"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2020-09-23", "2020-09-23"), + domains=["Academic"], + task_subtypes=["Question answering"], + 
license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@dataset{potthast_2022_6862281, author = {Potthast, Martin and Gienapp, Lukas and @@ -57,3 +58,44 @@ class Touche2020(AbsTaskRetrieval): }, }, ) + + +class Touche2020v3Retrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020Retrieval.v3", + description="Touché Task 1: Argument Retrieval for Controversial Questions", + reference="https://github.com/castorini/touche-error-analysis", + dataset={ + "path": "mteb/webis-touche2020-v3", + "revision": "431886eaecc48f067a3975b70d0949ea2862463c", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["eng-Latn"], + main_score="ndcg_at_10", + date=("2020-09-23", "2020-09-23"), + domains=["Academic"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@INPROCEEDINGS{Thakur_etal_SIGIR2024, + author = "Nandan Thakur and Luiz Bonifacio and Maik {Fr\"{o}be} and Alexander Bondarenko and Ehsan Kamalloo and Martin Potthast and Matthias Hagen and Jimmy Lin", + title = "Systematic Evaluation of Neural Retrieval Models on the {Touch\'{e}} 2020 Argument Retrieval Subset of {BEIR}", + booktitle = "Proceedings of the 47th International ACM SIGIR Conference on Research and Development in Information Retrieval", + year = 2024, + address_ = "Washington, D.C." 
+}""", + descriptive_stats={ + "test": { + "average_document_length": 2096.391812518931, + "average_query_length": 43.42857142857143, + "num_documents": 303732, + "num_queries": 49, + "average_relevant_docs_per_query": 34.93877551020408, + } + }, + ) diff --git a/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py new file mode 100644 index 0000000000..0af7d06772 --- /dev/null +++ b/mteb/tasks/Retrieval/jpn/JaqketRetrieval.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class JaqketRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="JaqketRetrieval", + dataset={ + "path": "mteb/jaqket", + "revision": "3a5b92dad489a61e664c05ed2175bc9220230199", + }, + description="JAQKET (JApanese Questions on Knowledge of EnTities) is a QA dataset that is created based on quiz questions.", + reference="https://github.com/kumapo/JAQKET-dataset", + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["jpn-Jpan"], + main_score="ndcg_at_10", + date=("2023-10-09", "2023-10-09"), + domains=["Encyclopaedic", "Non-fiction", "Written"], + task_subtypes=["Question answering"], + license="cc-by-sa-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation="""@InProceedings{Kurihara_nlp2020, +author = "鈴木正敏 and 鈴木潤 and 松田耕史 and ⻄田京介 and 井之上直也", +title = "JAQKET: クイズを題材にした日本語 QA データセットの構築", +booktitle = "言語処理学会第26回年次大会", +year = "2020", +url = "https://www.anlp.jp/proceedings/annual_meeting/2020/pdf_dir/P2-24.pdf" +note= "in Japanese" +}""", + descriptive_stats={ + "test": { + "average_document_length": 3747.995228882333, + "average_query_length": 50.70611835506519, + "num_documents": 114229, + "num_queries": 997, + "average_relevant_docs_per_query": 1.0, + } + }, + ) diff --git 
a/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py
new file mode 100644
index 0000000000..f7bf5f9dc8
--- /dev/null
+++ b/mteb/tasks/Retrieval/multilingual/MrTidyRetrieval.py
@@ -0,0 +1,130 @@
+from __future__ import annotations
+
+import logging
+
+import datasets
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.MultilingualTask import MultilingualTask
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+_EVAL_LANGS = {
+    "bengali": ["ben-Beng"],
+    "english": ["eng-Latn"],
+    "finnish": ["fin-Latn"],
+    "russian": ["rus-Cyrl"],
+    "korean": ["kor-Kore"],
+    "japanese": ["jpn-Jpan"],
+    "telugu": ["tel-Telu"],
+    "thai": ["tha-Thai"],
+    "swahili": ["swa-Latn"],
+    "arabic": ["ara-Arab"],
+    "indonesian": ["ind-Latn"],
+}
+_EVAL_SPLIT = "test"
+
+logger = logging.getLogger(__name__)
+
+
+def _load_data_retrieval(
+    path: str, langs: list[str], splits: list[str], cache_dir: str | None = None, revision: str | None = None
+):
+    corpus = {lang: {split: {} for split in splits} for lang in langs}
+    queries = {lang: {split: {} for split in splits} for lang in langs}
+    relevant_docs = {lang: {split: {} for split in splits} for lang in langs}
+
+    split = _EVAL_SPLIT
+
+    for lang in langs:
+        qrels_data = datasets.load_dataset(
+            path,
+            name=f"{lang}-qrels",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )[split]
+
+        for row in qrels_data:
+            query_id = row["query-id"]
+            doc_id = row["corpus-id"]
+            score = row["score"]
+            if query_id not in relevant_docs[lang][split]:
+                relevant_docs[lang][split][query_id] = {}
+            relevant_docs[lang][split][query_id][doc_id] = score
+
+        corpus_data = datasets.load_dataset(
+            path,
+            name=f"{lang}-corpus",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )["train"]
+
+        for row in corpus_data:
+            doc_id = row["_id"]
+            doc_title = row["title"]
+            doc_text = row["text"]
+            corpus[lang][split][doc_id] = {"title": doc_title, "text": doc_text}
+
+        queries_data = datasets.load_dataset(
+            path,
+            name=f"{lang}-queries",
+            cache_dir=cache_dir,
+            revision=revision,
+            trust_remote_code=True,
+        )[split]
+
+        for row in queries_data:
+            query_id = row["_id"]
+            query_text = row["text"]
+            queries[lang][split][query_id] = query_text
+
+    logger.info("Loaded %s queries for %d languages.", split.upper(), len(queries))
+
+    return corpus, queries, relevant_docs
+
+
+class MrTidyRetrieval(MultilingualTask, AbsTaskRetrieval):
+    metadata = TaskMetadata(
+        name="MrTidyRetrieval",
+        description="Mr. TyDi is a multi-lingual benchmark dataset built on TyDi, covering eleven typologically diverse languages. It is designed for monolingual retrieval, specifically to evaluate ranking with learned dense representations.",
+        reference="https://huggingface.co/datasets/castorini/mr-tydi",
+        dataset={
+            "path": "mteb/mrtidy",
+            "revision": "fc24a3ce8f09746410daee3d5cd823ff7a0675b7",
+        },
+        type="Retrieval",
+        category="s2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=_EVAL_LANGS,
+        main_score="ndcg_at_10",
+        date=("2023-11-01", "2024-05-15"),
+        domains=["Encyclopaedic", "Written"],
+        task_subtypes=[],
+        license="cc-by-sa-3.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="""@article{mrtydi,
+      title={{Mr. TyDi}: A Multi-lingual Benchmark for Dense Retrieval},
+      author={Xinyu Zhang and Xueguang Ma and Peng Shi and Jimmy Lin},
+      year={2021},
+      journal={arXiv:2108.08787},
+    }""",
+        descriptive_stats={},
+    )
+
+    def load_data(self, **kwargs):
+        if self.data_loaded:
+            return
+
+        self.corpus, self.queries, self.relevant_docs = _load_data_retrieval(
+            path=self.metadata_dict["dataset"]["path"],
+            langs=self.hf_subsets,
+            splits=self.metadata_dict["eval_splits"],
+            cache_dir=kwargs.get("cache_dir", None),
+            revision=self.metadata_dict["dataset"]["revision"],
+        )
+
+        self.data_loaded = True