From 6812bf0dd6c7714956c41f68d415130ce788fcf6 Mon Sep 17 00:00:00 2001
From: Imene Kerboua
Date: Fri, 2 May 2025 11:14:39 +0200
Subject: [PATCH] Add talemaader pair classification task

---
 mteb/tasks/PairClassification/__init__.py     |  1 +
 .../PairClassification/dan/TalemaaderPC.py    | 72 +++++++++++++++++++
 2 files changed, 73 insertions(+)
 create mode 100644 mteb/tasks/PairClassification/dan/TalemaaderPC.py

diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py
index 6cd75ea144..f562879bd8 100644
--- a/mteb/tasks/PairClassification/__init__.py
+++ b/mteb/tasks/PairClassification/__init__.py
@@ -2,6 +2,7 @@
 
 from .ara.ArEntail import *
 from .ces.CTKFactsNLI import *
+from .dan.TalemaaderPC import *
 from .deu.FalseFriendsDeEnPC import *
 from .eng.LegalBenchPC import *
 from .eng.PubChemAISentenceParaphrasePC import *
diff --git a/mteb/tasks/PairClassification/dan/TalemaaderPC.py b/mteb/tasks/PairClassification/dan/TalemaaderPC.py
new file mode 100644
index 0000000000..7a35c8415b
--- /dev/null
+++ b/mteb/tasks/PairClassification/dan/TalemaaderPC.py
@@ -0,0 +1,72 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class TalemaaderPC(AbsTaskPairClassification):
+    """Pair-classification task built on the "1000 danske talemåder" dataset."""
+
+    metadata = TaskMetadata(
+        name="TalemaaderPC",
+        description="""\
+The Danish Language and Literature Society has developed a dataset for evaluating language models in Danish.
+The dataset contains a total of 1000 Danish idioms and fixed expressions with transferred meanings based on the Danish Dictionary's collection of fixed expressions with associated definitions.
+For each of the 1000 idioms and fixed expressions, three false definitions have also been prepared.
+The dataset can be used to test the performance of language models in identifying correct definitions for Danish idioms and fixed expressions.
+""",
+        reference="https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet",
+        dataset={
+            "path": "mteb/talemaader_pc",
+            "revision": "e714d53c059ca83d56c41d22f800da8245bb87fc",
+        },
+        type="PairClassification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["dan-Latn"],
+        main_score="max_accuracy",
+        date=("2024-11-20", "2024-11-20"),
+        domains=["Academic", "Written"],
+        task_subtypes=[],
+        license="cc-by-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="created",
+        # Raw string: the entry contains "\&", which is an invalid escape
+        # sequence in a regular (non-raw) string literal.
+        bibtex_citation=r"""
+@misc{DSLDK1000Talemader,
+  title = {1000 danske talemåder - evalueringsdatasæt},
+  author = {{Det Danske Sprog- og Litteraturselskab}},
+  year = {2024},
+  howpublished = {Sprogteknologi.dk},
+  url = {https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet},
+  note = {CC-BY licensed dataset of 1000 Danish sayings and expressions},
+  publisher = {Digitaliseringsstyrelsen \& Det Danske Sprog- og Litteraturselskab},
+  language = {Danish}
+}
+""",
+    )
+
+    def dataset_transform(self) -> None:
+        """Collapse each evaluation split into one column-wise record.
+
+        For every split named in ``metadata.eval_splits``, the ``sentence1``,
+        ``sentence2`` and ``label`` columns of ``self.dataset[split]`` are
+        placed in a one-element list holding a dict keyed ``sentence1``,
+        ``sentence2`` and ``labels``. ``self.dataset`` is then rebound to the
+        resulting ``{split: [record]}`` mapping.
+        """
+        _dataset = {}
+        for split in self.metadata.eval_splits:
+            hf_dataset = self.dataset[split]
+            _dataset[split] = [
+                {
+                    "sentence1": hf_dataset["sentence1"],
+                    "sentence2": hf_dataset["sentence2"],
+                    # The source column is "label"; the output key is "labels".
+                    "labels": hf_dataset["label"],
+                }
+            ]
+        self.dataset = _dataset