Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 116 additions & 0 deletions mteb/descriptive_stats/Retrieval/EuroPIRQRetrieval.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
{
"test": {
"num_samples": 30300,
"number_of_characters": 17320243,
"documents_text_statistics": {
"total_text_length": 17276572,
"min_text_length": 316,
"average_text_length": 575.8857333333333,
"max_text_length": 1008,
"unique_texts": 28361
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 43671,
"min_text_length": 67,
"average_text_length": 145.57,
"max_text_length": 345,
"unique_texts": 300
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 300,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 300
},
"top_ranked_statistics": null,
"hf_subset_descriptive_stats": {
"en": {
"num_samples": 10100,
"number_of_characters": 5517678,
"documents_text_statistics": {
"total_text_length": 5503635,
"min_text_length": 316,
"average_text_length": 550.3635,
"max_text_length": 726,
"unique_texts": 9422
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 14043,
"min_text_length": 68,
"average_text_length": 140.43,
"max_text_length": 305,
"unique_texts": 100
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 100,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 100
},
"top_ranked_statistics": null
},
"fi": {
"num_samples": 10100,
"number_of_characters": 5953462,
"documents_text_statistics": {
"total_text_length": 5938809,
"min_text_length": 326,
"average_text_length": 593.8809,
"max_text_length": 1008,
"unique_texts": 9422
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 14653,
"min_text_length": 67,
"average_text_length": 146.53,
"max_text_length": 345,
"unique_texts": 100
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 100,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 100
},
"top_ranked_statistics": null
},
"pt": {
"num_samples": 10100,
"number_of_characters": 5849103,
"documents_text_statistics": {
"total_text_length": 5834128,
"min_text_length": 325,
"average_text_length": 583.4128,
"max_text_length": 774,
"unique_texts": 9517
},
"documents_image_statistics": null,
"queries_text_statistics": {
"total_text_length": 14975,
"min_text_length": 69,
"average_text_length": 149.75,
"max_text_length": 320,
"unique_texts": 100
},
"queries_image_statistics": null,
"relevant_docs_statistics": {
"num_relevant_docs": 100,
"min_relevant_docs_per_query": 1,
"average_relevant_docs_per_query": 1.0,
"max_relevant_docs_per_query": 1,
"unique_relevant_docs": 100
},
"top_ranked_statistics": null
}
}
}
}
2 changes: 2 additions & 0 deletions mteb/tasks/retrieval/multilingual/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
CrossLingualSemanticDiscriminationWMT21,
)
from .cur_ev1_retrieval import CUREv1Retrieval
from .euro_pirq_retrieval import EuroPIRQRetrieval
from .indic_qa_retrieval import IndicQARetrieval
from .jina_vdr_bench_retrieval import (
JinaVDRAirbnbSyntheticRetrieval,
Expand Down Expand Up @@ -107,6 +108,7 @@
"CUREv1Retrieval",
"CrossLingualSemanticDiscriminationWMT19",
"CrossLingualSemanticDiscriminationWMT21",
"EuroPIRQRetrieval",
"IndicQARetrieval",
"JinaVDRAirbnbSyntheticRetrieval",
"JinaVDRArabicChartQARetrieval",
Expand Down
43 changes: 43 additions & 0 deletions mteb/tasks/retrieval/multilingual/euro_pirq_retrieval.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
from mteb.abstasks.retrieval import AbsTaskRetrieval
from mteb.abstasks.task_metadata import TaskMetadata

_LANGUAGES = {
"en": ["eng-Latn"],
"fi": ["fin-Latn"],
"pt": ["por-Latn"],
}


class EuroPIRQRetrieval(AbsTaskRetrieval):
    """Text-to-text retrieval task over the EuroPIRQ dataset (en / fi / pt subsets)."""

    metadata = TaskMetadata(
        # --- Identity -------------------------------------------------------
        name="EuroPIRQRetrieval",
        description=(
            "The EuroPIRQ retrieval dataset is a multilingual collection designed "
            "for evaluating retrieval and cross-lingual retrieval tasks. "
            "Dataset contains 10,000 parallel passages & 100 parallel queries "
            "(synthetic) in three languages: English, Portuguese, and Finnish, "
            "constructed from the European Union's DGT-Acquis corpus."
        ),
        reference="https://huggingface.co/datasets/eherra/EuroPIRQ-retrieval",
        # --- Data source (revision pinned to a fixed commit hash) -----------
        dataset={
            "path": "eherra/EuroPIRQ-retrieval",
            "revision": "59225ed25fbcea2185e1acbc8c3c80f1a8cd8341",
        },
        is_public=True,
        # --- Task shape and evaluation setup --------------------------------
        type="Retrieval",
        category="t2t",
        modalities=["text"],
        task_subtypes=[],
        eval_splits=["test"],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_10",
        # --- Provenance -----------------------------------------------------
        date=("2025-12-01", "2025-12-31"),
        domains=["Legal"],
        dialect=[],
        license="not specified",
        annotations_creators="LM-generated and reviewed",
        sample_creation="found",
        # --- Citation -------------------------------------------------------
        bibtex_citation=r"""
@misc{eherra_2025_europirq,
author = { {Elias Herranen} },
publisher = { Hugging Face },
title = { EuroPIRQ: European Parallel Information Retrieval Queries },
url = { https://huggingface.co/datasets/eherra/EuroPIRQ-retrieval },
year = {2025},
}
""",
    )