From 79fb490c9152e5352b89ffa5d577b93400b0ad6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Kara=C5=9B?= Date: Sun, 20 Apr 2025 19:39:02 +0200 Subject: [PATCH 1/2] Add metadata for GermanDPR and GermanQuAD --- mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py | 14 +++++++------- mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py | 16 ++++++++-------- pyproject.toml | 2 +- tests/test_TaskMetadata.py | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py index a3118b8f73..a55ae4dffb 100644 --- a/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanDPRRetrieval.py @@ -25,13 +25,13 @@ class GermanDPR(AbsTaskRetrieval): eval_splits=[_EVAL_SPLIT], eval_langs=["deu-Latn"], main_score="ndcg_at_10", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2020-05-19", "2021-04-26"), + domains=["Written", "Non-fiction", "Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""@misc{möller2021germanquad, title={GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, author={Timo Möller and Julian Risch and Malte Pietsch}, diff --git a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py index ba6a21e96e..2de3c2441b 100644 --- a/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py +++ b/mteb/tasks/Retrieval/deu/GermanQuADRetrieval.py @@ -31,7 +31,7 @@ class GermanQuADRetrieval(AbsTaskRetrieval): metadata = TaskMetadata( name="GermanQuAD-Retrieval", description="Context Retrieval for German Question Answering", - reference="https://www.kaggle.com/datasets/GermanQuAD", + reference="https://huggingface.co/datasets/deepset/germanquad", dataset={ "path": "mteb/germanquad-retrieval", "revision": "f5c87ae5a2e7a5106606314eef45255f03151bb3", @@ -42,13 +42,13 @@ class GermanQuADRetrieval(AbsTaskRetrieval): eval_splits=["test"], eval_langs=["deu-Latn"], main_score="mrr_at_5", - date=None, - domains=None, - task_subtypes=None, - license=None, - annotations_creators=None, - dialect=None, - sample_creation=None, + date=("2020-05-19", "2021-04-26"), + domains=["Written", "Non-fiction", "Web"], + task_subtypes=["Question answering"], + license="cc-by-4.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", bibtex_citation="""misc{möller2021germanquad, title={GermanQuAD and GermanDPR: Improving Non-English Question Answering and Passage Retrieval}, author={Timo Möller and Julian Risch and Malte Pietsch}, diff --git a/pyproject.toml b/pyproject.toml index 22357a45a2..c54c15473a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.1" +version = "1.38.2" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 75f3095a48..24cb80d1cb 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -53,7 +53,7 @@ "TwitterHjerneRetrieval", "GerDaLIR", "GerDaLIRSmall", - "GermanDPR", + # "GermanDPR", "GermanQuAD-Retrieval", "LegalQuAD", "AILACasedocs", From 9bfbef8962b98901b391df965391412bb3334d8f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Kara=C5=9B?= Date: Sun, 20 Apr 2025 20:10:52 +0200 Subject: [PATCH 2/2] PR improvements --- pyproject.toml | 2 +- tests/test_TaskMetadata.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c54c15473a..22357a45a2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "mteb" -version = "1.38.2" +version = "1.38.1" description = "Massive Text Embedding Benchmark" readme = "README.md" authors = [ diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py index 24cb80d1cb..3b5aa1e158 100644 --- a/tests/test_TaskMetadata.py +++ b/tests/test_TaskMetadata.py @@ -53,8 +53,6 @@ "TwitterHjerneRetrieval", "GerDaLIR", "GerDaLIRSmall", - # "GermanDPR", - "GermanQuAD-Retrieval", "LegalQuAD", "AILACasedocs", "AILAStatutes",