Merged
2 changes: 2 additions & 0 deletions mteb/tasks/Classification/__init__.py
@@ -71,8 +71,10 @@
from .hrv.FrenkHrClassification import *
from .ind.IndonesianIdClickbaitClassification import *
from .ind.IndonesianMongabayConservationClassification import *
from .ita.DadoEvalCoarseClassification import *
from .ita.ItaCaseholdClassification import *
from .ita.ItalianLinguistAcceptabilityClassification import *
from .ita.SardiStanceClassification import *
from .jav.JavaneseIMDBClassification import *
from .jpn.WRIMEClassification import *
from .kan.KannadaNewsClassification import *
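The star imports above are what make the two new Italian classification tasks visible to mteb's task registry. A quick sanity check, as a minimal sketch that assumes the mteb.get_task helper available in recent mteb releases:

import mteb

# Both new tasks should resolve by name once the imports above are in place.
task = mteb.get_task("DadoEvalCoarseClassification")
print(task.metadata.name, task.metadata.main_score)  # DadoEvalCoarseClassification accuracy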
48 changes: 48 additions & 0 deletions mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py
@@ -0,0 +1,48 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class DadoEvalCoarseClassification(AbsTaskClassification):
    metadata = TaskMetadata(
        name="DadoEvalCoarseClassification",
        dataset={
            "path": "MattiaSangermano/DaDoEval",
            "revision": "7a78eb7cc137fdd1c5826be1a9e9813177706509",
        },
        description="The DaDoEval dataset is a curated collection of 2,759 documents authored by Alcide De Gasperi, spanning the period from 1901 to 1954. Each document in the dataset is manually tagged with its date of issue.",
        reference="https://github.com/dhfbk/DaDoEval",
        type="Classification",
        date=("1901-01-01", "1954-12-31"),
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["ita-Latn"],
        main_score="accuracy",
        domains=["Written"],
        task_subtypes=[],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{menini2020dadoeval,
  author = {Menini, Stefano and Moretti, Giovanni and Sprugnoli, Rachele and Tonelli, Sara and others},
  booktitle = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian. Final Workshop (EVALITA 2020)},
  organization = {Accademia University Press},
  pages = {391--397},
  title = {DaDoEval@ EVALITA 2020: Same-genre and cross-genre dating of historical documents},
  year = {2020},
}
""",
    )

    def dataset_transform(self):
        self.dataset = self.dataset.rename_column("class", "label")
        unused_cols = [
            col
            for col in self.dataset["test"].column_names
            if col not in ["text", "label"]
        ]
        self.dataset = self.dataset.remove_columns(unused_cols)
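The dataset_transform above only renames the "class" column to "label" and drops every other column. A toy sketch of that pruning with made-up data (the real DaDoEval schema may contain different extra columns):

from datasets import Dataset, DatasetDict

# Miniature made-up split; only the column-pruning logic is illustrated here.
toy = DatasetDict(
    {
        "test": Dataset.from_dict(
            {
                "text": ["Discorso di esempio"],
                "label": [3],
                "doc_id": ["d-001"],  # hypothetical extra column
            }
        )
    }
)
unused_cols = [c for c in toy["test"].column_names if c not in ["text", "label"]]
print(toy.remove_columns(unused_cols)["test"].column_names)  # ['text', 'label']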
47 changes: 47 additions & 0 deletions mteb/tasks/Classification/ita/SardiStanceClassification.py
@@ -0,0 +1,47 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class SardiStanceClassification(AbsTaskClassification):
    metadata = TaskMetadata(
        name="SardiStanceClassification",
        dataset={
            "path": "MattiaSangermano/SardiStance",
            "revision": "e25d91e6f6a28ebef42212128f0d5e275b676233",
        },
        description="SardiStance is a unique dataset designed for the task of stance detection in Italian tweets. It consists of tweets related to the Sardines movement, providing a valuable resource for researchers and practitioners in the field of NLP.",
        reference="https://github.com/mirkolai/evalita-sardistance",
        type="Classification",
        category="s2s",
        date=("2019-11-01", "2020-01-31"),
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["ita-Latn"],
        main_score="accuracy",
        domains=["Social"],
        task_subtypes=["Political classification"],
        license="cc-by-nc-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{cignarella2020sardistance,
  author = {Cignarella, Alessandra Teresa and Lai, Mirko and Bosco, Cristina and Patti, Viviana and Rosso, Paolo and others},
  booktitle = {CEUR WORKSHOP PROCEEDINGS},
  organization = {Ceur},
  pages = {1--10},
  title = {Sardistance@ evalita2020: Overview of the task on stance detection in italian tweets},
  year = {2020},
}
""",
    )

    def dataset_transform(self):
        unused_cols = [
            col
            for col in self.dataset["test"].column_names
            if col not in ["text", "label"]
        ]
        self.dataset = self.dataset.remove_columns(unused_cols)
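For context, running either of the new classification tasks end to end could look like the following minimal sketch; it assumes the standard MTEB runner API, and the model checkpoint is only an example of an Italian-capable encoder:

from mteb import MTEB
from sentence_transformers import SentenceTransformer

# Any multilingual or Italian sentence encoder can be substituted here.
model = SentenceTransformer("intfloat/multilingual-e5-small")
evaluation = MTEB(tasks=["SardiStanceClassification"])
results = evaluation.run(model, output_folder="results/sardistance")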
1 change: 1 addition & 0 deletions mteb/tasks/MultiLabelClassification/__init__.py
@@ -1,5 +1,6 @@
from __future__ import annotations

from .ita.EmitClassification import *
from .kor.KorHateSpeechMLClassification import *
from .mlt.MalteseNewsClassification import *
from .multilingual.MultiEURLEXMultilabelClassification import *
55 changes: 55 additions & 0 deletions mteb/tasks/MultiLabelClassification/ita/EmitClassification.py
@@ -0,0 +1,55 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskMultilabelClassification import (
    AbsTaskMultilabelClassification,
)
from mteb.abstasks.TaskMetadata import TaskMetadata


class EmitClassification(AbsTaskMultilabelClassification):
    metadata = TaskMetadata(
        name="EmitClassification",
        description="""The EMit dataset is a comprehensive resource for the detection of emotions in Italian social media texts.
        The EMit dataset consists of social media messages about TV shows, TV series, music videos, and advertisements.
        Each message is annotated with one or more of the 8 primary emotions defined by Plutchik
        (anger, anticipation, disgust, fear, joy, sadness, surprise, trust), as well as an additional label “love.”
        """,
        reference="https://github.com/oaraque/emit",
        dataset={
            "path": "MattiaSangermano/emit",
            "revision": "b0ceff2da0ca463d5c8c97a4e1c6e40545a1c3a6",
        },
        type="MultilabelClassification",
        category="s2s",
        modalities=["text"],
        date=("2022-01-01", "2022-12-31"),
        eval_splits=["test"],
        eval_langs=["ita-Latn"],
        main_score="accuracy",
        domains=["Social", "Written"],
        task_subtypes=["Sentiment/Hate speech"],
        license="cc-by-sa-4.0",
        annotations_creators="expert-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{araque2023emit,
  author = {Araque, O and Frenda, S and Sprugnoli, R and Nozza, D and Patti, V and others},
  booktitle = {CEUR WORKSHOP PROCEEDINGS},
  organization = {CEUR-WS},
  pages = {1--8},
  title = {EMit at EVALITA 2023: Overview of the Categorical Emotion Detection in Italian Social Media Task},
  volume = {3473},
  year = {2023},
}
""",
    )

    def dataset_transform(self):
        self.dataset = self.dataset.rename_columns({"emotion_labels": "label"})
        unused_cols = [
            col
            for col in self.dataset["test"].column_names
            if col not in ["text", "label"]
        ]
        self.dataset = self.dataset.remove_columns(unused_cols)
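The transform above renames emotion_labels to label, so each example is expected to carry one or more emotions per message. A purely illustrative record shape follows; the concrete label encoding in the Hugging Face dataset may differ:

# Plutchik's eight primary emotions plus the extra "love" label from the task description.
EMIT_LABELS = {
    "anger", "anticipation", "disgust", "fear",
    "joy", "sadness", "surprise", "trust", "love",
}

# Hypothetical post-transform example; real label values may be encoded differently.
example = {"text": "Che emozione questa serie TV!", "label": ["joy", "anticipation"]}
assert set(example["label"]) <= EMIT_LABELS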
Empty file.
1 change: 1 addition & 0 deletions mteb/tasks/PairClassification/__init__.py
@@ -16,6 +16,7 @@
from .fas.FarsTail import *
from .hye.ArmenianParaphrasePC import *
from .ind.IndoNLI import *
from .ita.DisCoTexPairClassification import *
from .kor.KlueNLI import *
from .multilingual.OpusparcusPC import *
from .multilingual.PawsXPairClassification import *
55 changes: 55 additions & 0 deletions mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py
@@ -0,0 +1,55 @@
from __future__ import annotations

from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
from mteb.abstasks.TaskMetadata import TaskMetadata


class DisCoTexPairClassification(AbsTaskPairClassification):
    metadata = TaskMetadata(
        name="DisCoTexPairClassification",
        description="The DisCoTEX dataset aims at assessing discourse coherence in Italian texts. This dataset focuses on Italian real-world texts and provides resources to model coherence in natural language.",
        reference="https://github.com/davidecolla/DisCoTex",
        dataset={
            "path": "MattiaSangermano/DisCoTex-last-sentence",
            "revision": "ab9ea43f8e54c8b24b12cd1b77d6eb462385a30b",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        date=("2023-01-01", "2023-12-31"),
        eval_splits=["test"],
        eval_langs=["ita-Latn"],
        main_score="max_ap",
        domains=["Social", "Written"],
        task_subtypes=[],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@inproceedings{brunato2023discotex,
  author = {Brunato, Dominique and Colla, Davide and Dell'Orletta, Felice and Dini, Irene and Radicioni, Daniele Paolo and Ravelli, Andrea Amelio and others},
  booktitle = {CEUR WORKSHOP PROCEEDINGS},
  organization = {CEUR},
  pages = {1--8},
  title = {DisCoTex at EVALITA 2023: overview of the assessing discourse coherence in Italian texts task},
  volume = {3473},
  year = {2023},
}
""",
    )

    def dataset_transform(self):
        self.dataset = self.dataset.remove_columns(["id", "source"])
        self.dataset = self.dataset.map(
            lambda x: {
                "prompt": [x["prompt"]],
                "target": [x["target"]],
                "class": [x["class"]],
            },
            batched=True,
            batch_size=len(self.dataset["train"]),
        )
        self.dataset = self.dataset.rename_column("prompt", "sentence1")
        self.dataset = self.dataset.rename_column("target", "sentence2")
        self.dataset = self.dataset.rename_column("class", "labels")
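The map call above uses batched=True with batch_size equal to the split length, which collapses each split into a single row whose fields are full-split lists, matching the sentence1/sentence2/labels layout that mteb pair classification tasks consume. A self-contained toy sketch of that reshaping, with made-up data:

from datasets import Dataset

# Toy split with the original DisCoTex column names; contents are invented.
split = Dataset.from_dict(
    {"prompt": ["a", "b"], "target": ["c", "d"], "class": [0, 1]}
)
collapsed = split.map(
    lambda x: {"prompt": [x["prompt"]], "target": [x["target"]], "class": [x["class"]]},
    batched=True,
    batch_size=len(split),
)
print(len(collapsed))          # 1 row left
print(collapsed[0]["prompt"])  # ['a', 'b']: the whole split as one list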
Empty file.
1 change: 1 addition & 0 deletions mteb/tasks/Reranking/__init__.py
@@ -13,5 +13,6 @@
from .multilingual.ESCIReranking import *
from .multilingual.MIRACLReranking import *
from .multilingual.WikipediaRerankingMultilingual import *
from .multilingual.XGlueWPRReranking import *
from .rus.RuBQReranking import *
from .zho.CMTEBReranking import *