Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from .ara.ArEntail import *
from .ces.CTKFactsNLI import *
from .dan.TalemaaderPC import *
from .deu.FalseFriendsDeEnPC import *
from .eng.LegalBenchPC import *
from .eng.PubChemAISentenceParaphrasePC import *
Expand Down
59 changes: 59 additions & 0 deletions mteb/tasks/PairClassification/dan/TalemaaderPC.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class TalemaaderPC(AbsTaskPairClassification):
    """Pair-classification task built on the "1000 danske talemåder" dataset.

    The task metadata below describes the data source; ``dataset_transform``
    reshapes each evaluation split into a single-record list of column lists.
    """

    metadata = TaskMetadata(
        name="TalemaaderPC",
        description="""\
The Danish Language and Literature Society has developed a dataset for evaluating language models in Danish.
The dataset contains a total of 1000 Danish idioms and fixed expressions with transferred meanings based on the Danish Dictionary's collection of fixed expressions with associated definitions.
For each of the 1000 idioms and fixed expressions, three false definitions have also been prepared.
The dataset can be used to test the performance of language models in identifying correct definitions for Danish idioms and fixed expressions.
""",
        reference="https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet",
        dataset={
            "path": "mteb/talemaader_pc",
            "revision": "e714d53c059ca83d56c41d22f800da8245bb87fc",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["dan-Latn"],
        main_score="max_accuracy",
        date=("2024-11-20", "2024-11-20"),
        domains=["Academic", "Written"],
        task_subtypes=[],
        license="cc-by-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="created",
        # Raw string: the BibTeX entry contains ``\&``, which is an invalid
        # escape sequence in a regular string literal and triggers a
        # SyntaxWarning on recent Python versions. The resulting text is
        # unchanged.
        bibtex_citation=r"""
@misc{DSLDK1000Talemader,
title = {1000 danske talemåder - evalueringsdatasæt},
author = {{Det Danske Sprog- og Litteraturselskab}},
year = {2024},
howpublished = {Sprogteknologi.dk},
url = {https://sprogteknologi.dk/dataset/1000-talemader-evalueringsdatasaet},
note = {CC-BY licensed dataset of 1000 Danish sayings and expressions},
publisher = {Digitaliseringsstyrelsen \& Det Danske Sprog- og Litteraturselskab},
language = {Danish}
}
""",
    )

    def dataset_transform(self) -> None:
        """Repackage every evaluation split as a one-element list of columns.

        For each split named in ``self.metadata.eval_splits`` the
        ``sentence1``, ``sentence2`` and ``label`` columns are read from
        ``self.dataset[split]`` and stored in a single dict (with ``label``
        exposed under the key ``labels``), wrapped in a list. ``self.dataset``
        is then replaced by the resulting ``{split: [record]}`` mapping.

        NOTE(review): the one-record list layout is presumably what the
        pair-classification evaluator expects (legacy format) — confirm
        against the evaluator before simplifying this to a column rename.
        """
        transformed = {}
        for split in self.metadata.eval_splits:
            split_data = self.dataset[split]
            record = {
                "sentence1": split_data["sentence1"],
                "sentence2": split_data["sentence2"],
                # Source column is "label"; the output key is plural.
                "labels": split_data["label"],
            }
            transformed[split] = [record]
        self.dataset = transformed
Comment on lines +48 to +59
Copy link
Member

@Samoed Samoed May 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def dataset_transform(self):
_dataset = {}
for split in self.metadata.eval_splits:
hf_dataset = self.dataset[split]
_dataset[split] = [
{
"sentence1": hf_dataset["sentence1"],
"sentence2": hf_dataset["sentence2"],
"labels": hf_dataset["label"],
}
]
self.dataset = _dataset
def dataset_transform(self):
self.dataset = self.dataset.rename_column("label", "labels")

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't remove this because PCEvaluator expects each column to be a list of samples (it's due to legacy PC formatting within mteb, we never changed it):

self.dataset[split] before transform: <class 'datasets.arrow_dataset.Dataset'>
self.dataset[split] after transform: <class 'list'>

Removing this leads to error and the task not running.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, maybe then we can change it like it's done in v2?

data_split = dataset[0] if len(dataset) == 1 else dataset

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be great just to use the columns - but that might be a v2 thing

Also, I believe it should be possible just to specify the columns of the labels (instead of reformatting)

Copy link
Contributor Author

@imenelydiaker imenelydiaker May 2, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, if we have this line we won't need this function. But it's in v2, right? We're completing issue #2608 on main, no?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It wouldn't break anything, so I think we can add it directly to main