diff --git a/mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json b/mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json new file mode 100644 index 0000000000..b193ad5f0d --- /dev/null +++ b/mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json @@ -0,0 +1,116 @@ +{ + "test": { + "num_samples": 30300, + "number_of_characters": 17320243, + "documents_text_statistics": { + "total_text_length": 17276572, + "min_text_length": 316, + "average_text_length": 575.8857333333333, + "max_text_length": 1008, + "unique_texts": 28361 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 43671, + "min_text_length": 67, + "average_text_length": 145.57, + "max_text_length": 345, + "unique_texts": 300 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 300, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 300 + }, + "top_ranked_statistics": null, + "hf_subset_descriptive_stats": { + "en": { + "num_samples": 10100, + "number_of_characters": 5517678, + "documents_text_statistics": { + "total_text_length": 5503635, + "min_text_length": 316, + "average_text_length": 550.3635, + "max_text_length": 726, + "unique_texts": 9422 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 14043, + "min_text_length": 68, + "average_text_length": 140.43, + "max_text_length": 305, + "unique_texts": 100 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 100, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 100 + }, + "top_ranked_statistics": null + }, + "fi": { + "num_samples": 10100, + "number_of_characters": 5953462, + "documents_text_statistics": { + "total_text_length": 5938809, + "min_text_length": 326, + "average_text_length": 593.8809, + "max_text_length": 1008, + "unique_texts": 9422 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 14653, + "min_text_length": 67, + "average_text_length": 146.53, + "max_text_length": 345, + "unique_texts": 100 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 100, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 100 + }, + "top_ranked_statistics": null + }, + "pt": { + "num_samples": 10100, + "number_of_characters": 5849103, + "documents_text_statistics": { + "total_text_length": 5834128, + "min_text_length": 325, + "average_text_length": 583.4128, + "max_text_length": 774, + "unique_texts": 9517 + }, + "documents_image_statistics": null, + "queries_text_statistics": { + "total_text_length": 14975, + "min_text_length": 69, + "average_text_length": 149.75, + "max_text_length": 320, + "unique_texts": 100 + }, + "queries_image_statistics": null, + "relevant_docs_statistics": { + "num_relevant_docs": 100, + "min_relevant_docs_per_query": 1, + "average_relevant_docs_per_query": 1.0, + "max_relevant_docs_per_query": 1, + "unique_relevant_docs": 100 + }, + "top_ranked_statistics": null + } + } + } +} diff --git a/mteb/tasks/retrieval/multilingual/__init__.py b/mteb/tasks/retrieval/multilingual/__init__.py index 9586991986..6b0be6c254 100644 --- a/mteb/tasks/retrieval/multilingual/__init__.py +++ b/mteb/tasks/retrieval/multilingual/__init__.py @@ -6,6 +6,7 @@ CrossLingualSemanticDiscriminationWMT21, ) from .cur_ev1_retrieval import CUREv1Retrieval +from .euro_pirq_retrieval import EuroPIRQRetrieval from .indic_qa_retrieval import IndicQARetrieval from .jina_vdr_bench_retrieval import ( JinaVDRAirbnbSyntheticRetrieval, @@ -107,6 +108,7 @@ "CUREv1Retrieval", "CrossLingualSemanticDiscriminationWMT19", "CrossLingualSemanticDiscriminationWMT21", + "EuroPIRQRetrieval", "IndicQARetrieval", "JinaVDRAirbnbSyntheticRetrieval", "JinaVDRArabicChartQARetrieval", diff --git a/mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py b/mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py new file mode 100644 index 0000000000..bdfe4b0950 --- /dev/null +++ b/mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py @@ -0,0 +1,43 @@ +from mteb.abstasks.retrieval import AbsTaskRetrieval +from mteb.abstasks.task_metadata import TaskMetadata + +_LANGUAGES = { + "en": ["eng-Latn"], + "fi": ["fin-Latn"], + "pt": ["por-Latn"], +} + + +class EuroPIRQRetrieval(AbsTaskRetrieval): + metadata = TaskMetadata( + name="EuroPIRQRetrieval", + description="The EuroPIRQ retrieval dataset is a multilingual collection designed for evaluating retrieval and cross-lingual retrieval tasks. Dataset contains 10,000 parallel passages & 100 parallel queries (synthetic) in three languages: English, Portuguese, and Finnish, constructed from the European Union's DGT-Acquis corpus.", + reference="https://huggingface.co/datasets/eherra/EuroPIRQ-retrieval", + dataset={ + "path": "eherra/EuroPIRQ-retrieval", + "revision": "59225ed25fbcea2185e1acbc8c3c80f1a8cd8341", + }, + type="Retrieval", + category="t2t", + modalities=["text"], + eval_splits=["test"], + eval_langs=_LANGUAGES, + main_score="ndcg_at_10", + date=("2025-12-01", "2025-12-31"), + domains=["Legal"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated and reviewed", + dialect=[], + sample_creation="found", + is_public=True, + bibtex_citation=r""" +@misc{eherra_2025_europirq, + author = { {Elias Herranen} }, + publisher = { Hugging Face }, + title = { EuroPIRQ: European Parallel Information Retrieval Queries }, + url = { https://huggingface.co/datasets/eherra/EuroPIRQ-retrieval }, + year = {2025}, +} +""", + )