diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index 36592d1e0d..a51da8ea42 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -16,6 +16,7 @@
from .ces.CzechSubjectivityClassification import *
from .dan.AngryTweetsClassification import *
from .dan.DanishPoliticalCommentsClassification import *
+from .dan.DdiscoCohesionClassification import *
from .dan.DKHateClassification import *
from .dan.LccSentimentClassification import *
from .deu.GermanPoliticiansTwitterSentimentClassification import *
@@ -109,6 +110,7 @@
from .multilingual.NusaParagraphEmotionClassification import *
from .multilingual.NusaParagraphTopicClassification import *
from .multilingual.NusaXSenti import *
+from .multilingual.ru_nlu_intent_classification import *
from .multilingual.ScalaClassification import *
from .multilingual.ScandiSentClassification import *
from .multilingual.SIB200Classification import *
@@ -132,7 +134,6 @@
from .rus.HeadlineClassification import *
from .rus.InappropriatenessClassification import *
from .rus.KinopoiskClassification import *
-from .rus.ru_nlu_intent_classification import *
from .rus.ru_toixic_classification_okmlcup import *
from .rus.RuReviewsClassification import *
from .rus.RuSciBenchGRNTIClassification import *
diff --git a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py
index 9b1f68f0a3..ef3cea7172 100644
--- a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "CSFDCZMovieReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="CSFDCZMovieReviewSentimentClassification",
description="The dataset contains 30k user reviews from csfd.cz in Czech.",
@@ -49,3 +50,49 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
)
+
+
+class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="CSFDCZMovieReviewSentimentClassification.v2",
+ description="""The dataset contains 30k user reviews from csfd.cz in Czech.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2304.01922",
+ dataset={
+ "path": "mteb/csfdcz_movie_review_sentiment",
+ "revision": "bda232f79c949fd881572f7e1b9ad59fd04a6c7c",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2002-06-28", "2020-03-13"),
+ eval_splits=["test"],
+ eval_langs=["ces-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{štefánik2023resources,
+ archiveprefix = {arXiv},
+ author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka},
+ eprint = {2304.01922},
+ primaryclass = {cs.CL},
+ title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages},
+ year = {2023},
+}
+""",
+ adapted_from=["CSFDCZMovieReviewSentimentClassification"],
+ )
+ # Increase the samples_per_label in order to improve baseline performance
+ samples_per_label = 20
+
+ def dataset_transform(self):
+ N_SAMPLES = 2048
+
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
+ )
diff --git a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py
index cd29ac4353..0bd1a85dd6 100644
--- a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class CzechProductReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "CzechProductReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="CzechProductReviewSentimentClassification",
description="User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative)",
@@ -54,3 +55,54 @@ def dataset_transform(self) -> None:
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class CzechProductReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="CzechProductReviewSentimentClassification.v2",
+ description="""User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/W13-1609/",
+ dataset={
+ "path": "mteb/czech_product_review_sentiment",
+ "revision": "1a3fb305bde30eec7067ab15ad2db9f61b115ca2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ces-Latn"],
+ main_score="accuracy",
+ date=("2013-01-01", "2013-06-01"),
+ dialect=[],
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{habernal-etal-2013-sentiment,
+ address = {Atlanta, Georgia},
+ author = {Habernal, Ivan and
+Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and
+Steinberger, Josef},
+ booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis},
+ editor = {Balahur, Alexandra and
+van der Goot, Erik and
+Montoyo, Andres},
+ month = jun,
+ pages = {65--74},
+ publisher = {Association for Computational Linguistics},
+ title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning},
+ url = {https://aclanthology.org/W13-1609},
+ year = {2013},
+}
+""",
+ adapted_from=["CzechProductReviewSentimentClassification"],
+ )
+ samples_per_label = 16
+
+ def dataset_transform(self) -> None:
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py
index 333cd3aa4a..a2052eee43 100644
--- a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py
+++ b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py
@@ -5,6 +5,7 @@
class CzechSoMeSentimentClassification(AbsTaskClassification):
+ superseded_by = "CzechSoMeSentimentClassification.v2"
metadata = TaskMetadata(
name="CzechSoMeSentimentClassification",
description="User comments on Facebook",
@@ -51,3 +52,49 @@ def dataset_transform(self) -> None:
self.dataset = self.dataset.rename_columns(
{"comment": "text", "sentiment_int": "label"}
)
+
+
+class CzechSoMeSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="CzechSoMeSentimentClassification.v2",
+ description="""User comments on Facebook
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/W13-1609/",
+ dataset={
+ "path": "mteb/czech_so_me_sentiment",
+ "revision": "a12152e40ff9857bf3c83694528f40ec5c02aafc",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ces-Latn"],
+ main_score="accuracy",
+ date=("2013-01-01", "2013-06-01"),
+ dialect=[],
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{habernal-etal-2013-sentiment,
+ address = {Atlanta, Georgia},
+ author = {Habernal, Ivan and
+Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}} and
+Steinberger, Josef},
+ booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis},
+ editor = {Balahur, Alexandra and
+van der Goot, Erik and
+Montoyo, Andres},
+ month = jun,
+ pages = {65--74},
+ publisher = {Association for Computational Linguistics},
+ title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning},
+ url = {https://aclanthology.org/W13-1609},
+ year = {2013},
+}
+""",
+ adapted_from=["CzechSoMeSentimentClassification"],
+ )
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/dan/AngryTweetsClassification.py b/mteb/tasks/Classification/dan/AngryTweetsClassification.py
index 886612db48..08251acc5c 100644
--- a/mteb/tasks/Classification/dan/AngryTweetsClassification.py
+++ b/mteb/tasks/Classification/dan/AngryTweetsClassification.py
@@ -5,6 +5,7 @@
class AngryTweetsClassification(AbsTaskClassification):
+ superseded_by = "AngryTweetsClassification.v2"
metadata = TaskMetadata(
name="AngryTweetsClassification",
dataset={
@@ -39,3 +40,42 @@ class AngryTweetsClassification(AbsTaskClassification):
)
samples_per_label = 16
+
+
+class AngryTweetsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="AngryTweetsClassification.v2",
+ dataset={
+ "path": "mteb/angry_tweets",
+ "revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77",
+ },
+ description="""A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2021.nodalida-main.53/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["dan-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2021-12-31"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{pauli2021danlp,
+ author = {Pauli, Amalie Brogaard and Barrett, Maria and Lacroix, Oph{\'e}lie and Hvingelby, Rasmus},
+ booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)},
+ pages = {460--466},
+ title = {DaNLP: An open-source toolkit for Danish Natural Language Processing},
+ year = {2021},
+}
+""",
+ prompt="Classify Danish tweets by sentiment. (positive, negative, neutral).",
+ adapted_from=["AngryTweetsClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/dan/DKHateClassification.py b/mteb/tasks/Classification/dan/DKHateClassification.py
index e67e3ebee0..e101be40b1 100644
--- a/mteb/tasks/Classification/dan/DKHateClassification.py
+++ b/mteb/tasks/Classification/dan/DKHateClassification.py
@@ -5,6 +5,7 @@
class DKHateClassification(AbsTaskClassification):
+ superseded_by = "DKHateClassification.v2"
metadata = TaskMetadata(
name="DKHateClassification",
dataset={
@@ -69,3 +70,64 @@ def dataset_transform(self):
self.dataset = self.dataset.map(
lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
)
+
+
+class DKHateClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DKHateClassification.v2",
+ dataset={
+ "path": "mteb/dk_hate",
+ "revision": "0468ff11393992d8347cf4282fb706fe970608d4",
+ },
+ description="""Danish Tweets annotated for Hate Speech either being Offensive or not
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2020.lrec-1.430/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["dan-Latn"],
+ main_score="accuracy",
+ date=("2018-01-01", "2018-12-31"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{sigurbergsson-derczynski-2020-offensive,
+ abstract = {The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. We construct a Danish dataset DKhate containing user-generated comments from various social media platforms, and to our knowledge, the first of its kind, annotated for various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of 0.74, and the best performing system for Danish achieves a macro averaged F1-score of 0.70. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of 0.62, while the best performing system for Danish achieves a macro averaged F1-score of 0.73. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of 0.56, and the best performing system for Danish achieves a macro averaged F1-score of 0.63. Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying.},
+ address = {Marseille, France},
+ author = {Sigurbergsson, Gudbjartur Ingi and
+Derczynski, Leon},
+ booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
+ editor = {Calzolari, Nicoletta and
+B{\'e}chet, Fr{\'e}d{\'e}ric and
+Blache, Philippe and
+Choukri, Khalid and
+Cieri, Christopher and
+Declerck, Thierry and
+Goggi, Sara and
+Isahara, Hitoshi and
+Maegaard, Bente and
+Mariani, Joseph and
+Mazo, H{\'e}l{\`e}ne and
+Moreno, Asuncion and
+Odijk, Jan and
+Piperidis, Stelios},
+ isbn = {979-10-95546-34-4},
+ language = {English},
+ month = may,
+ pages = {3498--3508},
+ publisher = {European Language Resources Association},
+ title = {Offensive Language and Hate Speech Detection for {D}anish},
+ url = {https://aclanthology.org/2020.lrec-1.430},
+ year = {2020},
+}
+""",
+ prompt="Classify Danish tweets based on offensiveness (offensive, not offensive)",
+ adapted_from=["DKHateClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py
index c0bac1528e..7846b73870 100644
--- a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py
+++ b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py
@@ -5,6 +5,7 @@
class DanishPoliticalCommentsClassification(AbsTaskClassification):
+ superseded_by = "DanishPoliticalCommentsClassification.v2"
metadata = TaskMetadata(
name="DanishPoliticalCommentsClassification",
dataset={
@@ -49,3 +50,44 @@ def dataset_transform(self):
# create train and test splits
self.dataset = self.dataset["train"].train_test_split(0.2, seed=self.seed)
+
+
+class DanishPoliticalCommentsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DanishPoliticalCommentsClassification.v2",
+ dataset={
+ "path": "mteb/danish_political_comments",
+ "revision": "476a9e7327aba70ad3e97a169d7310b86be9b245",
+ },
+ description="""A dataset of Danish political comments rated for sentiment
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/danish_political_comments",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["dan-Latn"],
+ main_score="accuracy",
+ date=(
+ "2000-01-01",
+ "2022-12-31",
+ ), # Estimated range for the collection of comments
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@techreport{SAMsentiment,
+ author = {Mads Guldborg Kjeldgaard Kongsbak and Steffan Eybye Christensen and Lucas Høyberg Puvis~de~Chavannes and Peter Due Jensen},
+ institution = {IT University of Copenhagen},
+ title = {Sentiment Analysis Multitool, SAM},
+ year = {2019},
+}
+""",
+ prompt="Classify Danish political comments for sentiment",
+ adapted_from=["DanishPoliticalCommentsClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py
index c1eb16d190..3a0b229517 100644
--- a/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py
+++ b/mteb/tasks/Classification/dan/DdiscoCohesionClassification.py
@@ -5,6 +5,7 @@
class DdiscoCohesionClassification(AbsTaskClassification):
+ superseded_by = "Ddisco.v2"
metadata = TaskMetadata(
name="Ddisco",
dataset={
@@ -62,3 +63,60 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"rating": "label"}).remove_columns(
["domain"]
)
+
+
+class DdiscoCohesionClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="Ddisco.v2",
+ dataset={
+ "path": "mteb/ddisco_cohesion",
+ "revision": "b5a05bdecdfc6efc14eebc8f7a86e0986edaf5ff",
+ },
+ description="""A Danish Discourse dataset with values for coherence and source (Wikipedia or Reddit)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2022.lrec-1.260/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["dan-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2022-06-25"),
+ domains=["Non-fiction", "Social", "Written"],
+ dialect=[],
+ task_subtypes=["Discourse coherence"],
+ license="cc-by-sa-3.0",
+ annotations_creators="expert-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{flansmose-mikkelsen-etal-2022-ddisco,
+ abstract = {To date, there has been no resource for studying discourse coherence on real-world Danish texts. Discourse coherence has mostly been approached with the assumption that incoherent texts can be represented by coherent texts in which sentences have been shuffled. However, incoherent real-world texts rarely resemble that. We thus present DDisCo, a dataset including text from the Danish Wikipedia and Reddit annotated for discourse coherence. We choose to annotate real-world texts instead of relying on artificially incoherent text for training and testing models. Then, we evaluate the performance of several methods, including neural networks, on the dataset.},
+ address = {Marseille, France},
+ author = {Flansmose Mikkelsen, Linea and
+Kinch, Oliver and
+Jess Pedersen, Anders and
+Lacroix, Oph{\'e}lie},
+ booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
+ editor = {Calzolari, Nicoletta and
+B{\'e}chet, Fr{\'e}d{\'e}ric and
+Blache, Philippe and
+Choukri, Khalid and
+Cieri, Christopher and
+Declerck, Thierry and
+Goggi, Sara and
+Isahara, Hitoshi and
+Maegaard, Bente and
+Mariani, Joseph and
+Mazo, H{\'e}l{\`e}ne and
+Odijk, Jan and
+Piperidis, Stelios},
+ month = jun,
+ pages = {2440--2445},
+ publisher = {European Language Resources Association},
+ title = {{DD}is{C}o: A Discourse Coherence Dataset for {D}anish},
+ url = {https://aclanthology.org/2022.lrec-1.260},
+ year = {2022},
+}
+""",
+ adapted_from=["DdiscoCohesionClassification"],
+ )
diff --git a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py
index 979a70c707..fa61715eea 100644
--- a/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py
+++ b/mteb/tasks/Classification/deu/GermanPoliticiansTwitterSentimentClassification.py
@@ -5,6 +5,7 @@
class GermanPoliticiansTwitterSentimentClassification(AbsTaskClassification):
+ superseded_by = "GermanPoliticiansTwitterSentimentClassification.v2"
metadata = TaskMetadata(
name="GermanPoliticiansTwitterSentimentClassification",
description="GermanPoliticiansTwitterSentiment is a dataset of German tweets categorized with their sentiment (3 classes).",
@@ -52,3 +53,52 @@ class GermanPoliticiansTwitterSentimentClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_column("majority_sentiment", "label")
+
+
+class GermanPoliticiansTwitterSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="GermanPoliticiansTwitterSentimentClassification.v2",
+ description="""GermanPoliticiansTwitterSentiment is a dataset of German tweets categorized with their sentiment (3 classes).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2022.konvens-1.9",
+ dataset={
+ "path": "mteb/german_politicians_twitter_sentiment",
+ "revision": "aeb7e9cd08a0c77856ec5396bb82c32f309276d0",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["deu-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2021-12-31"),
+ domains=["Social", "Government", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{schmidt-etal-2022-sentiment,
+ address = {Potsdam, Germany},
+ author = {Schmidt, Thomas and
+Fehle, Jakob and
+Weissenbacher, Maximilian and
+Richter, Jonathan and
+Gottschalk, Philipp and
+Wolff, Christian},
+ booktitle = {Proceedings of the 18th Conference on Natural Language Processing (KONVENS 2022)},
+ editor = {Schaefer, Robin and
+Bai, Xiaoyu and
+Stede, Manfred and
+Zesch, Torsten},
+ month = {12--15 } # sep,
+ pages = {74--87},
+ publisher = {KONVENS 2022 Organizers},
+ title = {Sentiment Analysis on {T}witter for the Major {G}erman Parties during the 2021 {G}erman Federal Election},
+ url = {https://aclanthology.org/2022.konvens-1.9},
+ year = {2022},
+}
+""",
+ adapted_from=["GermanPoliticiansTwitterSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/deu/TenKGnadClassification.py b/mteb/tasks/Classification/deu/TenKGnadClassification.py
index e9b2316d2d..ca91bf4715 100644
--- a/mteb/tasks/Classification/deu/TenKGnadClassification.py
+++ b/mteb/tasks/Classification/deu/TenKGnadClassification.py
@@ -5,6 +5,7 @@
class TenKGnadClassification(AbsTaskClassification):
+ superseded_by = "TenKGnadClassification.v2"
metadata = TaskMetadata(
name="TenKGnadClassification",
description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).",
@@ -39,3 +40,42 @@ class TenKGnadClassification(AbsTaskClassification):
}
""",
)
+
+
+class TenKGnadClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TenKGnadClassification.v2",
+ description="""10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://tblock.github.io/10kGNAD/",
+ dataset={
+ "path": "mteb/ten_k_gnad",
+ "revision": "fc6825fe0d813e7fc92f05fe63ac4bb3ee191c4d",
+ },
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["deu-Latn"],
+ main_score="accuracy",
+ date=("2015-06-01", "2016-05-31"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Schabus2017,
+ address = {Tokyo, Japan},
+ author = {Dietmar Schabus and Marcin Skowron and Martin Trapp},
+ booktitle = {Proceedings of the 40th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR)},
+ doi = {10.1145/3077136.3080711},
+ month = aug,
+ pages = {1241--1244},
+ title = {One Million Posts: A Data Set of German Online Discussions},
+ year = {2017},
+}
+""",
+ adapted_from=["TenKGnadClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py
index 3c5b1350f1..7333e4c2dc 100644
--- a/mteb/tasks/Classification/eng/AmazonPolarityClassification.py
+++ b/mteb/tasks/Classification/eng/AmazonPolarityClassification.py
@@ -5,6 +5,7 @@
class AmazonPolarityClassification(AbsTaskClassification):
+ superseded_by = "AmazonPolarityClassification.v2"
metadata = TaskMetadata(
name="AmazonPolarityClassification",
description="Amazon Polarity Classification Dataset.",
@@ -40,3 +41,43 @@ class AmazonPolarityClassification(AbsTaskClassification):
""",
prompt="Classify Amazon reviews into positive or negative sentiment",
)
+
+
+class AmazonPolarityClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="AmazonPolarityClassification.v2",
+ description="""Amazon Polarity Classification Dataset.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/amazon_polarity",
+ dataset={
+ "path": "mteb/amazon_polarity",
+ "revision": "ec149c1fe36043668a50804214d4597804001f6f",
+ },
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2012-01-01",
+ "2015-12-31",
+ ), # Estimated range for the collection of reviews
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ annotations_creators="derived",
+ license="apache-2.0",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{McAuley2013HiddenFA,
+ author = {Julian McAuley and Jure Leskovec},
+ journal = {Proceedings of the 7th ACM conference on Recommender systems},
+ title = {Hidden factors and hidden topics: understanding rating dimensions with review text},
+ url = {https://api.semanticscholar.org/CorpusID:6440341},
+ year = {2013},
+}
+""",
+ prompt="Classify Amazon reviews into positive or negative sentiment",
+ adapted_from=["AmazonPolarityClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/ArxivClassification.py b/mteb/tasks/Classification/eng/ArxivClassification.py
index d046bd3bd0..438b018ff2 100644
--- a/mteb/tasks/Classification/eng/ArxivClassification.py
+++ b/mteb/tasks/Classification/eng/ArxivClassification.py
@@ -5,6 +5,7 @@
class ArxivClassification(AbsTaskClassification):
+ superseded_by = "ArxivClassification.v2"
metadata = TaskMetadata(
name="ArxivClassification",
description="Classification Dataset of Arxiv Papers",
@@ -39,3 +40,42 @@ class ArxivClassification(AbsTaskClassification):
}
""",
)
+
+
+class ArxivClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="ArxivClassification.v2",
+ description="""Classification Dataset of Arxiv Papers
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/arxiv",
+ "revision": "202e10e9a5d37a5068397b48184d0728346a7b4a",
+ },
+ reference="https://ieeexplore.ieee.org/document/8675939",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("1998-11-11", "2019-03-28"),
+ domains=["Academic", "Written"],
+ task_subtypes=["Topic classification"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{8675939,
+ author = {He, Jun and Wang, Liqun and Liu, Liu and Feng, Jiao and Wu, Hao},
+ doi = {10.1109/ACCESS.2019.2907992},
+ journal = {IEEE Access},
+ number = {},
+ pages = {40707-40718},
+ title = {Long Document Classification From Local Word Glimpses via Recurrent Attention Learning},
+ volume = {7},
+ year = {2019},
+}
+""",
+ adapted_from=["ArxivClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/Banking77Classification.py b/mteb/tasks/Classification/eng/Banking77Classification.py
index 5581df7fb0..f3543eec87 100644
--- a/mteb/tasks/Classification/eng/Banking77Classification.py
+++ b/mteb/tasks/Classification/eng/Banking77Classification.py
@@ -5,6 +5,7 @@
class Banking77Classification(AbsTaskClassification):
+ superseded_by = "Banking77Classification.v2"
metadata = TaskMetadata(
name="Banking77Classification",
description="Dataset composed of online banking queries annotated with their corresponding intents.",
@@ -57,3 +58,60 @@ class Banking77Classification(AbsTaskClassification):
""",
prompt="Given a online banking query, find the corresponding intents",
)
+
+
+class Banking77ClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="Banking77Classification.v2",
+ description="""Dataset composed of online banking queries annotated with their corresponding intents.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2003.04807",
+ dataset={
+ "path": "mteb/banking77",
+ "revision": "18072d2685ea682290f7b8924d94c62acc19c0b2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2019-01-01",
+ "2019-12-31",
+ ), # Estimated range for the collection of queries
+ domains=["Written"],
+ task_subtypes=[],
+ license="mit",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{casanueva-etal-2020-efficient,
+ address = {Online},
+ author = {Casanueva, I{\~n}igo and
+Tem{\v{c}}inas, Tadas and
+Gerz, Daniela and
+Henderson, Matthew and
+Vuli{\'c}, Ivan},
+ booktitle = {Proceedings of the 2nd Workshop on Natural Language Processing for Conversational AI},
+ doi = {10.18653/v1/2020.nlp4convai-1.5},
+ editor = {Wen, Tsung-Hsien and
+Celikyilmaz, Asli and
+Yu, Zhou and
+Papangelis, Alexandros and
+Eric, Mihail and
+Kumar, Anuj and
+Casanueva, I{\~n}igo and
+Shah, Rushin},
+ month = jul,
+ pages = {38--45},
+ publisher = {Association for Computational Linguistics},
+ title = {Efficient Intent Detection with Dual Sentence Encoders},
+ url = {https://aclanthology.org/2020.nlp4convai-1.5},
+ year = {2020},
+}
+""",
+ prompt="Given a online banking query, find the corresponding intents",
+ adapted_from=["Banking77Classification"],
+ )
diff --git a/mteb/tasks/Classification/eng/DBpediaClassification.py b/mteb/tasks/Classification/eng/DBpediaClassification.py
index 51904a4c08..6764e7934c 100644
--- a/mteb/tasks/Classification/eng/DBpediaClassification.py
+++ b/mteb/tasks/Classification/eng/DBpediaClassification.py
@@ -5,6 +5,7 @@
class DBpediaClassification(AbsTaskClassification):
+ superseded_by = "DBpediaClassification.v2"
metadata = TaskMetadata(
name="DBpediaClassification",
description="DBpedia14 is a dataset of English texts from Wikipedia articles, categorized into 14 non-overlapping classes based on their DBpedia ontology.",
@@ -46,3 +47,48 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train", "test"]
)
+
+
+class DBpediaClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DBpediaClassification.v2",
+ description="""DBpedia14 is a dataset of English texts from Wikipedia articles, categorized into 14 non-overlapping classes based on their DBpedia ontology.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/1509.01626",
+ dataset={
+ "path": "mteb/d_bpedia",
+ "revision": "e45aab5cbb44baba43d8a0640d809d2aa0a0a770",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2022-01-25", "2022-01-25"),
+ domains=["Encyclopaedic", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-sa-3.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{NIPS2015_250cf8b5,
+ author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Character-level Convolutional Networks for Text Classification},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf},
+ volume = {28},
+ year = {2015},
+}
+""",
+ adapted_from=["DBpediaClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train", "test"]
+ )
diff --git a/mteb/tasks/Classification/eng/EmotionClassification.py b/mteb/tasks/Classification/eng/EmotionClassification.py
index d04d8143a6..fcee2a86a7 100644
--- a/mteb/tasks/Classification/eng/EmotionClassification.py
+++ b/mteb/tasks/Classification/eng/EmotionClassification.py
@@ -5,6 +5,7 @@
class EmotionClassification(AbsTaskClassification):
+ superseded_by = "EmotionClassification.v2"
metadata = TaskMetadata(
name="EmotionClassification",
description="Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
@@ -56,3 +57,59 @@ class EmotionClassification(AbsTaskClassification):
)
samples_per_label = 16
+
+
+class EmotionClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="EmotionClassification.v2",
+ description="""Emotion is a dataset of English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.aclweb.org/anthology/D18-1404",
+ dataset={
+ "path": "mteb/emotion",
+ "revision": "13535ec7ed83ac3920c40db3c3fd4133af55cc06",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2017-01-01",
+ "2018-12-31",
+ ), # Estimated range for the collection of Twitter messages
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{saravia-etal-2018-carer,
+ abstract = {Emotions are expressed in nuanced ways, which varies by collective or individual experiences, knowledge, and beliefs. Therefore, to understand emotion, as conveyed through text, a robust mechanism capable of capturing and modeling different linguistic nuances and phenomena is needed. We propose a semi-supervised, graph-based algorithm to produce rich structural descriptors which serve as the building blocks for constructing contextualized affect representations from text. The pattern-based representations are further enriched with word embeddings and evaluated through several emotion recognition tasks. Our experimental results demonstrate that the proposed method outperforms state-of-the-art techniques on emotion recognition tasks.},
+ address = {Brussels, Belgium},
+ author = {Saravia, Elvis and
+Liu, Hsien-Chi Toby and
+Huang, Yen-Hao and
+Wu, Junlin and
+Chen, Yi-Shin},
+ booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
+ doi = {10.18653/v1/D18-1404},
+ editor = {Riloff, Ellen and
+Chiang, David and
+Hockenmaier, Julia and
+Tsujii, Jun{'}ichi},
+ month = oct # {-} # nov,
+ pages = {3687--3697},
+ publisher = {Association for Computational Linguistics},
+ title = {{CARER}: Contextualized Affect Representations for Emotion Recognition},
+ url = {https://aclanthology.org/D18-1404},
+ year = {2018},
+}
+""",
+ prompt="Classify the emotion expressed in the given Twitter message into one of the six emotions: anger, fear, joy, love, sadness, and surprise",
+ adapted_from=["EmotionClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py
index 6d3f672f9d..1d3580e6e3 100644
--- a/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py
+++ b/mteb/tasks/Classification/eng/FinancialPhrasebankClassification.py
@@ -5,6 +5,7 @@
class FinancialPhrasebankClassification(AbsTaskClassification):
+ superseded_by = "FinancialPhrasebankClassification.v2"
metadata = TaskMetadata(
name="FinancialPhrasebankClassification",
description="Polar sentiment dataset of sentences from financial news, categorized by sentiment into positive, negative, or neutral.",
@@ -41,3 +42,40 @@ class FinancialPhrasebankClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_column("sentence", "text")
+
+
+class FinancialPhrasebankClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FinancialPhrasebankClassification.v2",
+ description="""Polar sentiment dataset of sentences from financial news, categorized by sentiment into positive, negative, or neutral.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/1307.5336",
+ dataset={
+ "path": "mteb/financial_phrasebank",
+ "revision": "9349ecd31615a97081c245f5d7dbc0f4c6a1a656",
+ "name": "sentences_allagree",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2013-11-01", "2013-11-01"),
+ domains=["News", "Written", "Financial"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-3.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{Malo2014GoodDO,
+ author = {P. Malo and A. Sinha and P. Korhonen and J. Wallenius and P. Takala},
+ journal = {Journal of the Association for Information Science and Technology},
+ title = {Good debt or bad debt: Detecting semantic orientations in economic texts},
+ volume = {65},
+ year = {2014},
+}
+""",
+ adapted_from=["FinancialPhrasebankClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/FrenkEnClassification.py b/mteb/tasks/Classification/eng/FrenkEnClassification.py
index b9de110e20..db7a0e25d1 100644
--- a/mteb/tasks/Classification/eng/FrenkEnClassification.py
+++ b/mteb/tasks/Classification/eng/FrenkEnClassification.py
@@ -5,6 +5,7 @@
class FrenkEnClassification(AbsTaskClassification):
+ superseded_by = "FrenkEnClassification.v2"
metadata = TaskMetadata(
name="FrenkEnClassification",
description="English subset of the FRENK dataset",
@@ -39,3 +40,41 @@ class FrenkEnClassification(AbsTaskClassification):
}
""",
)
+
+
+class FrenkEnClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FrenkEnClassification.v2",
+ description="""English subset of the FRENK dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/frenk_en",
+ "revision": "630d941b6e0879a7238da89af6bfe1b1eb27ca0f",
+ },
+ reference="https://arxiv.org/abs/1906.02045",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2021-05-28", "2021-05-28"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{ljubešić2019frenk,
+ archiveprefix = {arXiv},
+ author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec},
+ eprint = {1906.02045},
+ primaryclass = {cs.CL},
+ title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English},
+ url = {https://arxiv.org/abs/1906.02045},
+ year = {2019},
+}
+""",
+ adapted_from=["FrenkEnClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/ImdbClassification.py b/mteb/tasks/Classification/eng/ImdbClassification.py
index df2ac734ed..2318fcb26c 100644
--- a/mteb/tasks/Classification/eng/ImdbClassification.py
+++ b/mteb/tasks/Classification/eng/ImdbClassification.py
@@ -5,6 +5,7 @@
class ImdbClassification(AbsTaskClassification):
+ superseded_by = "ImdbClassification.v2"
metadata = TaskMetadata(
name="ImdbClassification",
description="Large Movie Review Dataset",
@@ -52,3 +53,55 @@ class ImdbClassification(AbsTaskClassification):
""",
prompt="Classify the sentiment expressed in the given movie review text from the IMDB dataset",
)
+
+
+class ImdbClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="ImdbClassification.v2",
+ description="""Large Movie Review Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/imdb",
+ "revision": "d05f0155defa7991dad75bc68c5ccb6774b1fdc5",
+ },
+ reference="http://www.aclweb.org/anthology/P11-1015",
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2000-01-01",
+ "2010-12-31",
+ ), # Estimated range for the collection of movie reviews
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{maas-etal-2011-learning,
+ address = {Portland, Oregon, USA},
+ author = {Maas, Andrew L. and
+Daly, Raymond E. and
+Pham, Peter T. and
+Huang, Dan and
+Ng, Andrew Y. and
+Potts, Christopher},
+ booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
+ editor = {Lin, Dekang and
+Matsumoto, Yuji and
+Mihalcea, Rada},
+ month = jun,
+ pages = {142--150},
+ publisher = {Association for Computational Linguistics},
+ title = {Learning Word Vectors for Sentiment Analysis},
+ url = {https://aclanthology.org/P11-1015},
+ year = {2011},
+}
+""",
+ prompt="Classify the sentiment expressed in the given movie review text from the IMDB dataset",
+ adapted_from=["ImdbClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/LegalBenchClassification.py b/mteb/tasks/Classification/eng/LegalBenchClassification.py
index d19df22a19..96862494a5 100644
--- a/mteb/tasks/Classification/eng/LegalBenchClassification.py
+++ b/mteb/tasks/Classification/eng/LegalBenchClassification.py
@@ -3447,6 +3447,7 @@ def dataset_transform(self):
class JCrewBlockerLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "JCrewBlockerLegalBenchClassification.v2"
metadata = TaskMetadata(
name="JCrewBlockerLegalBenchClassification",
description="The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of detemining whether the J.Crew Blocker is present in the document.",
@@ -3491,6 +3492,44 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("answer", "label")
+class JCrewBlockerLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="JCrewBlockerLegalBenchClassification.v2",
+ description="""The J.Crew Blocker, also known as the J.Crew Protection, is a provision included in leveraged loan documents to prevent companies from removing security by transferring intellectual property (IP) into new subsidiaries and raising additional debt. The task consists of determining whether the J.Crew Blocker is present in the document.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/j_crew_blocker_legal_bench",
+ "name": "jcrew_blocker",
+ "revision": "692cc80266711eaa41d03c9fb168bff60807ee8a",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2016-01-01", "2023-08-23"), # best guess
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+""",
+ adapted_from=["JCrewBlockerLegalBenchClassification"],
+ )
+
+
class LearnedHandsBenefitsLegalBenchClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="LearnedHandsBenefitsLegalBenchClassification",
@@ -4362,6 +4401,7 @@ def dataset_transform(self):
class LegalReasoningCausalityLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "LegalReasoningCausalityLegalBenchClassification.v2"
metadata = TaskMetadata(
name="LegalReasoningCausalityLegalBenchClassification",
description="Given an excerpt from a district court opinion, classify if it relies on statistical evidence in its reasoning.",
@@ -4406,6 +4446,44 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("answer", "label")
+class LegalReasoningCausalityLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="LegalReasoningCausalityLegalBenchClassification.v2",
+ description="""Given an excerpt from a district court opinion, classify if it relies on statistical evidence in its reasoning.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/legal_reasoning_causality_legal_bench",
+ "name": "legal_reasoning_causality",
+ "revision": "563c52ea5216784b608912e67049226ae8cdf702",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2020-01-01", "2023-08-23"), # best guess
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+""",
+ adapted_from=["LegalReasoningCausalityLegalBenchClassification"],
+ )
+
+
_MAUD_DATASET_MAP = [
{
"name": "maud_ability_to_consummate_concept_is_subject_to_mae_carveouts",
@@ -4549,6 +4627,7 @@ def dataset_transform(self):
class MAUDLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "MAUDLegalBenchClassification.v2"
metadata = TaskMetadata(
name="MAUDLegalBenchClassification",
description="""This task was constructed from the MAUD dataset, which consists of over 47,000 labels across 152 merger agreements annotated to identify 92 questions in each agreement used by the 2021 American Bar Association (ABA) Public Target Deal Points Study. Each dataset is formatted as a series of multiple-choice questions, where given a segment of the merger agreement and a Deal Point question, the model is to choose the answer that best characterizes the agreement as response.
@@ -4692,6 +4771,93 @@ def dataset_transform(self):
)
+class MAUDLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MAUDLegalBenchClassification.v2",
+ description="""This task was constructed from the MAUD dataset, which consists of over 47,000 labels across 152 merger agreements annotated to identify 92 questions in each agreement used by the 2021 American Bar Association (ABA) Public Target Deal Points Study. Each dataset is formatted as a series of multiple-choice questions, where given a segment of the merger agreement and a Deal Point question, the model is to choose the answer that best characterizes the agreement as response.
+
+ This is a combination of all 34 of the MAUD Legal Bench datasets:
+ 1. MAUD Ability To Consummate Concept Is Subject To MAE Carveouts: Given an excerpt from a merger agreement and the task is to answer: is the “ability to consummate” concept subject to Material Adverse Effect (MAE) carveouts? amongst the multiple choice options.
+ 2. MAUD Accuracy Of Fundamental Target RWS Bringdown Standard: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options.
+ 3. MAUD Accuracy Of Target Capitalization RW Outstanding Shares Bringdown Standard Answer: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options.
+ 4. MAUD Accuracy Of Target General RW Bringdown Timing Answer: Given an excerpt from a merger agreement and the task is to answer: how accurate must the fundamental representations and warranties be according to the bring down provision, amongst the multiple choice options.
+ 5. MAUD Additional Matching Rights Period For Modifications Cor: Given an excerpt from a merger agreement and the task is to answer: how long is the additional matching rights period for modifications in case the board changes its recommendation, amongst the multiple choice options.
+ 6. MAUD Application Of Buyer Consent Requirement Negative Interim Covenant: Given an excerpt from a merger agreement and the task is to answer: what negative covenants does the requirement of Buyer consent apply to, amongst the multiple choice options.
+ 7. MAUD Buyer Consent Requirement Ordinary Course: Given an excerpt from a merger agreement and the task is to answer: in case the Buyer's consent for the acquired company's ordinary business operations is required, are there any limitations on the Buyer's right to condition, withhold, or delay their consent, amongst the multiple choice options.
+ 8. MAUD Change In Law Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes in law that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options.
+ 9. MAUD Changes In GAAP Or Other Accounting Principles Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes in GAAP or other accounting principles that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options.
+ 10. MAUD COR Permitted In Response To Intervening Event: Given an excerpt from a merger agreement and the task is to answer: is Change of Recommendation permitted in response to an intervening event, amongst the multiple choice options.
+ 11. MAUD COR Permitted With Board Fiduciary Determination Only: Given an excerpt from a merger agreement and the task is to answer: is Change of Recommendation permitted as long as the board determines that such change is required to fulfill its fiduciary obligations, amongst the multiple choice options.
+ 12. MAUD COR Standard Intervening Event: Given an excerpt from a merger agreement and the task is to answer: what standard should the board follow when determining whether to change its recommendation in response to an intervening event, amongst the multiple choice options.
+ 13. MAUD COR Standard Superior Offer: Given an excerpt from a merger agreement and the task is to answer: what standard should the board follow when determining whether to change its recommendation in connection with a superior offer, amongst the multiple choice options.
+ 14. MAUD Definition Contains Knowledge Requirement Answer: Given an excerpt from a merger agreement and the task is to answer: what is the knowledge requirement in the definition of “Intervening Event”, amongst the multiple choice options.
+ 15. MAUD Definition Includes Asset Deals: Given an excerpt from a merger agreement and the task is to answer: what qualifies as a superior offer in terms of asset deals, amongst the multiple choice options.
+ 16. MAUD Definition Includes Stock Deals: Given an excerpt from a merger agreement and the task is to answer: what qualifies as a superior offer in terms of stock deals, amongst the multiple choice options.
+ 17. MAUD Fiduciary Exception Board Determination Standard: Given an excerpt from a merger agreement and the task is to answer: under what circumstances could the Board take actions on a different acquisition proposal notwithstanding the no-shop provision, amongst the multiple choice options.
+ 18. MAUD Fiduciary Exception Board Determination Trigger No Shop: Given an excerpt from a merger agreement and the task is to answer: what type of offer could the Board take actions on notwithstanding the no-shop provision, amongst the multiple choice options.
+ 19. MAUD Financial Point Of View Is The Sole Consideration: Given an excerpt from a merger agreement and the task is to answer: is “financial point of view” the sole consideration when determining whether an offer is superior, amongst the multiple choice options.
+ 20. MAUD FLS MAE Standard: Given an excerpt from a merger agreement and the task is to answer: what is the Forward Looking Standard (FLS) with respect to Material Adverse Effect (MAE), amongst the multiple choice options.
+ 21. MAUD General Economic and Financial Conditions Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do changes caused by general economic and financial conditions that have disproportionate impact qualify for Material Adverse Effect (MAE), amongst the multiple choice options.
+ 22. MAUD Includes Consistent With Past Practice: Given an excerpt from a merger agreement and the task is to answer: does the wording of the Efforts Covenant clause include “consistent with past practice”, amongst the multiple choice options.
+ 23. MAUD Initial Matching Rights Period COR: Given an excerpt from a merger agreement and the task is to answer: how long is the initial matching rights period in case the board changes its recommendation, amongst the multiple choice options.
+ 24. MAUD Initial Matching Rights Period FTR: Given an excerpt from a merger agreement and the task is to answer: how long is the initial matching rights period in connection with the Fiduciary Termination Right (FTR), amongst the multiple choice options.
+ 25. MAUD Intervening Event Required To Occur After Signing Answer: Given an excerpt from a merger agreement and the task is to answer: is an “Intervening Event” required to occur after signing, amongst the multiple choice options.
+ 26. MAUD Knowledge Definition: Given an excerpt from a merger agreement and the task is to answer: what counts as Knowledge, amongst the multiple choice options.
+ 27. MAUD Liability Standard For No Shop Breach By Target Non-DO Representatives: Given an excerpt from a merger agreement and the task is to answer: what is the liability standard for no-shop breach by Target Non-D&O Representatives, amongst the multiple choice options.
+ 28. MAUD Ordinary Course Efforts Standard: Given an excerpt from a merger agreement and the task is to answer: what is the efforts standard, amongst the multiple choice options.
+ 29. MAUD Pandemic Or Other Public Health Event Subject To Disproportionate Impact Modifier: Given an excerpt from a merger agreement and the task is to answer: do pandemics or other public health events have to have disproportionate impact to qualify for Material Adverse Effect (MAE), amongst the multiple choice options.
+ 30. MAUD Pandemic Or Other Public Health Event Specific Reference To Pandemic Related Governmental Responses Or Measures: Given an excerpt from a merger agreement and the task is to answer: is there specific reference to pandemic-related governmental responses or measures in the clause that qualifies pandemics or other public health events for Material Adverse Effect (MAE), amongst the multiple choice options.
+ 31. MAUD Relational Language MAE Applies To: Given an excerpt from a merger agreement and the task is to answer: what carveouts pertaining to Material Adverse Effect (MAE) does the relational language apply to?, amongst the multiple choice options.
+ 32. MAUD Specific Performance: Given an excerpt from a merger agreement and the task is to answer: what is the wording of the Specific Performance clause regarding the parties' entitlement in the event of a contractual breach, amongst the multiple choice options.
+ 33. MAUD Tail Period Length: Given an excerpt from a merger agreement and the task is to answer: how long is the Tail Period, amongst the multiple choice options.
+ 34. MAUD Type Of Consideration: Given an excerpt from a merger agreement and the task is to answer: what type of consideration is specified in this agreement, amongst the multiple choice options.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/maud_legal_bench",
+ "revision": "655744e3745703e6f551e78b4c4cba1702774ce3",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2023-08-23"),
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+
+@article{wang2023maud,
+ author = {Wang, Steven H and Scardigli, Antoine and Tang, Leonard and Chen, Wei and Levkin, Dimitry and Chen, Anya and Ball, Spencer and Woodside, Thomas and Zhang, Oliver and Hendrycks, Dan},
+ journal = {arXiv preprint arXiv:2301.00876},
+ title = {MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement Understanding},
+ year = {2023},
+}
+""",
+ adapted_from=["MAUDLegalBenchClassification"],
+ )
+
+ def dataset_transform(self):
+ # The train split has one example in each dataset, so we combine it with the test split and resample
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
+
+
class NYSJudicialEthicsLegalBenchClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="NYSJudicialEthicsLegalBenchClassification",
@@ -4793,6 +4959,7 @@ def dataset_transform(self):
class OPP115DataSecurityLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "OPP115DataSecurityLegalBenchClassification.v2"
metadata = TaskMetadata(
name="OPP115DataSecurityLegalBenchClassification",
description="Given a clause from a privacy policy, classify if the clause describes how user information is protected.",
@@ -4845,7 +5012,54 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("answer", "label")
+class OPP115DataSecurityLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OPP115DataSecurityLegalBenchClassification.v2",
+ description="""Given a clause from a privacy policy, classify if the clause describes how user information is protected.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/opp115_data_security_legal_bench",
+ "name": "opp115_data_security",
+ "revision": "8596086d90fa4f2574b15d96a60cb6bc9889806b",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2015-01-01", "2023-08-23"),
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+
+@inproceedings{wilson2016creation,
+ author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others},
+ booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages = {1330--1340},
+ title = {The creation and analysis of a website privacy policy corpus},
+ year = {2016},
+}
+""",
+ adapted_from=["OPP115DataSecurityLegalBenchClassification"],
+ )
+
+
class OPP115DoNotTrackLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "OPP115DoNotTrackLegalBenchClassification.v2"
metadata = TaskMetadata(
name="OPP115DoNotTrackLegalBenchClassification",
description="Given a clause from a privacy policy, classify if the clause describes if and how Do Not Track signals for online tracking and advertising are honored.",
@@ -4898,6 +5112,52 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("answer", "label")
+class OPP115DoNotTrackLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OPP115DoNotTrackLegalBenchClassification.v2",
+ description="""Given a clause from a privacy policy, classify if the clause describes if and how Do Not Track signals for online tracking and advertising are honored.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/opp115_do_not_track_legal_bench",
+ "name": "opp115_do_not_track",
+ "revision": "3e2cc83cd3fc98dc6d76825c21ed4fbed86d560c",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2015-01-01", "2023-08-23"),
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+
+@inproceedings{wilson2016creation,
+ author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others},
+ booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages = {1330--1340},
+ title = {The creation and analysis of a website privacy policy corpus},
+ year = {2016},
+}
+""",
+ adapted_from=["OPP115DoNotTrackLegalBenchClassification"],
+ )
+
+
class OPP115FirstPartyCollectionUseLegalBenchClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="OPP115FirstPartyCollectionUseLegalBenchClassification",
@@ -5166,6 +5426,7 @@ def dataset_transform(self):
class OPP115UserChoiceControlLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "OPP115UserChoiceControlLegalBenchClassification.v2"
metadata = TaskMetadata(
name="OPP115UserChoiceControlLegalBenchClassification",
description="Given a clause fro ma privacy policy, classify if the clause describes the choices and control options available to users.",
@@ -5218,7 +5479,54 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("answer", "label")
+class OPP115UserChoiceControlLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OPP115UserChoiceControlLegalBenchClassification.v2",
+        description="""Given a clause from a privacy policy, classify if the clause describes the choices and control options available to users.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/opp115_user_choice_control_legal_bench",
+ "name": "opp115_user_choice_control",
+ "revision": "f308b16f8baee2080cf43e28ff01d93032d51eee",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2015-01-01", "2023-08-23"),
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+
+@inproceedings{wilson2016creation,
+ author = {Wilson, Shomir and Schaub, Florian and Dara, Aswarth Abhilash and Liu, Frederick and Cherivirala, Sushain and Leon, Pedro Giovanni and Andersen, Mads Schaarup and Zimmeck, Sebastian and Sathyendra, Kanthashree Mysore and Russell, N Cameron and others},
+ booktitle = {Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
+ pages = {1330--1340},
+ title = {The creation and analysis of a website privacy policy corpus},
+ year = {2016},
+}
+""",
+ adapted_from=["OPP115UserChoiceControlLegalBenchClassification"],
+ )
+
+
class OralArgumentQuestionPurposeLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "OralArgumentQuestionPurposeLegalBenchClassification.v2"
metadata = TaskMetadata(
name="OralArgumentQuestionPurposeLegalBenchClassification",
description="""This task classifies questions asked by Supreme Court justices at oral argument into seven categories:
@@ -5267,7 +5575,54 @@ def dataset_transform(self):
)
+class OralArgumentQuestionPurposeLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OralArgumentQuestionPurposeLegalBenchClassification.v2",
+ description="""This task classifies questions asked by Supreme Court justices at oral argument into seven categories:
+ 1. Background - questions seeking factual or procedural information that is missing or not clear in the briefing
+ 2. Clarification - questions seeking to get an advocate to clarify her position or the scope of the rule being advocated for
+ 3. Implications - questions about the limits of a rule or its implications for future cases
+ 4. Support - questions offering support for the advocate’s position
+ 5. Criticism - questions criticizing an advocate’s position
+ 6. Communicate - question designed primarily to communicate with other justices
+ 7. Humor - questions designed to interject humor into the argument and relieve tension
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/oral_argument_question_purpose_legal_bench",
+ "name": "oral_argument_question_purpose",
+ "revision": "cdc020e244cb846ce4e0325cb602cf04126c79d2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2023-08-23"), # best guess
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+""",
+ adapted_from=["OralArgumentQuestionPurposeLegalBenchClassification"],
+ )
+
+
class OverrulingLegalBenchClassification(AbsTaskClassification):
+ superseded_by = "OverrulingLegalBenchClassification.v2"
metadata = TaskMetadata(
name="OverrulingLegalBenchClassification",
description="""This task consists of classifying whether or not a particular sentence of case law overturns the decision of a previous case.""",
@@ -5323,6 +5678,57 @@ def dataset_transform(self):
)
+class OverrulingLegalBenchClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OverrulingLegalBenchClassification.v2",
+ description="""This task consists of classifying whether or not a particular sentence of case law overturns the decision of a previous case.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/nguha/legalbench",
+ dataset={
+ "path": "mteb/overruling_legal_bench",
+ "name": "overruling",
+ "revision": "fee708d1959b3258bc3e408afdd3e6c2051adf80",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("1965-01-01", "2023-08-23"),
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{guha2023legalbench,
+ archiveprefix = {arXiv},
+ author = {Neel Guha and Julian Nyarko and Daniel E. Ho and Christopher Ré and Adam Chilton and Aditya Narayana and Alex Chohlas-Wood and Austin Peters and Brandon Waldon and Daniel N. Rockmore and Diego Zambrano and Dmitry Talisman and Enam Hoque and Faiz Surani and Frank Fagan and Galit Sarfaty and Gregory M. Dickinson and Haggai Porat and Jason Hegland and Jessica Wu and Joe Nudell and Joel Niklaus and John Nay and Jonathan H. Choi and Kevin Tobia and Margaret Hagan and Megan Ma and Michael Livermore and Nikon Rasumov-Rahe and Nils Holzenberger and Noam Kolt and Peter Henderson and Sean Rehaag and Sharad Goel and Shang Gao and Spencer Williams and Sunny Gandhi and Tom Zur and Varun Iyer and Zehua Li},
+ eprint = {2308.11462},
+ primaryclass = {cs.CL},
+ title = {LegalBench: A Collaboratively Built Benchmark for Measuring Legal Reasoning in Large Language Models},
+ year = {2023},
+}
+
+@inproceedings{zheng2021does,
+ author = {Zheng, Lucia and Guha, Neel and Anderson, Brandon R and Henderson, Peter and Ho, Daniel E},
+ booktitle = {Proceedings of the eighteenth international conference on artificial intelligence and law},
+ pages = {159--168},
+ title = {When does pretraining help? assessing self-supervised learning for law and the casehold dataset of 53,000+ legal holdings},
+ year = {2021},
+}
+""",
+ adapted_from=["OverrulingLegalBenchClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
+
+
class PersonalJurisdictionLegalBenchClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="PersonalJurisdictionLegalBenchClassification",
diff --git a/mteb/tasks/Classification/eng/NewsClassification.py b/mteb/tasks/Classification/eng/NewsClassification.py
index aec198d5c1..3fba46de16 100644
--- a/mteb/tasks/Classification/eng/NewsClassification.py
+++ b/mteb/tasks/Classification/eng/NewsClassification.py
@@ -5,6 +5,7 @@
class NewsClassification(AbsTaskClassification):
+ superseded_by = "NewsClassification.v2"
metadata = TaskMetadata(
name="NewsClassification",
description="Large News Classification Dataset",
@@ -43,3 +44,46 @@ class NewsClassification(AbsTaskClassification):
}
""",
)
+
+
+class NewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="NewsClassification.v2",
+ description="""Large News Classification Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/news",
+ "revision": "7c1f485c1f43d6aef852c5df6db23b047991a8e7",
+ },
+ reference="https://arxiv.org/abs/1509.01626",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2004-01-01",
+ "2015-12-31",
+ ), # Estimated range for the collection of news articles
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="apache-2.0",
+ annotations_creators="expert-annotated",
+        dialect=["eng-Latn-US", "eng-Latn-GB", "eng-Latn-AU"],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{NIPS2015_250cf8b5,
+ author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Character-level Convolutional Networks for Text Classification},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf},
+ volume = {28},
+ year = {2015},
+}
+""",
+ adapted_from=["NewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py
index 270bde1698..0598118b0f 100644
--- a/mteb/tasks/Classification/eng/PatentClassification.py
+++ b/mteb/tasks/Classification/eng/PatentClassification.py
@@ -5,6 +5,7 @@
class PatentClassification(AbsTaskClassification):
+ superseded_by = "PatentClassification.v2"
metadata = TaskMetadata(
name="PatentClassification",
description="Classification Dataset of Patents and Abstract",
@@ -47,3 +48,49 @@ class PatentClassification(AbsTaskClassification):
}
""",
)
+
+
+class PatentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PatentClassification.v2",
+ description="""Classification Dataset of Patents and Abstract
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/patent",
+ "revision": "f5e5c81286448c68264300fe1e6f3de599922890",
+ },
+ reference="https://aclanthology.org/P19-1212.pdf",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2021-11-05", "2022-10-22"),
+ domains=["Legal", "Written"],
+ task_subtypes=["Topic classification"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{sharma-etal-2019-bigpatent,
+ abstract = {Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.},
+ address = {Florence, Italy},
+ author = {Sharma, Eva and
+Li, Chen and
+Wang, Lu},
+ booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
+ doi = {10.18653/v1/P19-1212},
+ editor = {Korhonen, Anna and
+Traum, David and
+M{\`a}rquez, Llu{\'\i}s},
+ month = jul,
+ pages = {2204--2213},
+ publisher = {Association for Computational Linguistics},
+ title = {{BIGPATENT}: A Large-Scale Dataset for Abstractive and Coherent Summarization},
+ url = {https://aclanthology.org/P19-1212},
+ year = {2019},
+}
+""",
+ )
diff --git a/mteb/tasks/Classification/eng/PoemSentimentClassification.py b/mteb/tasks/Classification/eng/PoemSentimentClassification.py
index 8671929fea..c330171c58 100644
--- a/mteb/tasks/Classification/eng/PoemSentimentClassification.py
+++ b/mteb/tasks/Classification/eng/PoemSentimentClassification.py
@@ -5,6 +5,7 @@
class PoemSentimentClassification(AbsTaskClassification):
+ superseded_by = "PoemSentimentClassification.v2"
metadata = TaskMetadata(
name="PoemSentimentClassification",
description="Poem Sentiment is a sentiment dataset of poem verses from Project Gutenberg.",
@@ -41,3 +42,40 @@ class PoemSentimentClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_column("verse_text", "text")
+
+
+class PoemSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PoemSentimentClassification.v2",
+ description="""Poem Sentiment is a sentiment dataset of poem verses from Project Gutenberg.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2011.02686",
+ dataset={
+ "path": "mteb/poem_sentiment",
+ "revision": "9fdc57b89ccc09a8d9256f376112d626878e51a7",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation", "test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("1700-01-01", "1900-01-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="human-annotated",
+        dialect=["eng-Latn-US", "eng-Latn-GB"],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{sheng2020investigating,
+ archiveprefix = {arXiv},
+ author = {Emily Sheng and David Uthus},
+ eprint = {2011.02686},
+ primaryclass = {cs.CL},
+ title = {Investigating Societal Biases in a Poetry Composition System},
+ year = {2020},
+}
+""",
+ adapted_from=["PoemSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py
index 5ae0df8602..5afbc2e13f 100644
--- a/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py
+++ b/mteb/tasks/Classification/eng/SDSEyeProtectionClassification.py
@@ -5,6 +5,7 @@
class SDSEyeProtectionClassification(AbsTaskClassification):
+ superseded_by = "SDSEyeProtectionClassification.v2"
metadata = TaskMetadata(
name="SDSEyeProtectionClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -43,3 +44,46 @@ class SDSEyeProtectionClassification(AbsTaskClassification):
}
""",
)
+
+
+class SDSEyeProtectionClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SDSEyeProtectionClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/sds_eye_protection",
+ "revision": "ead011d2286d5395fea054d2282ca0478ceb7cfb",
+ },
+ type="Classification",
+ category="s2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="LM-generated and reviewed",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+
+@inproceedings{pereira2020msds,
+ author = {Pereira, Eliseu},
+ booktitle = {15th Doctoral Symposium},
+ pages = {42},
+ title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets},
+ year = {2020},
+}
+""",
+ adapted_from=["SDSEyeProtectionClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/SDSGlovesClassification.py b/mteb/tasks/Classification/eng/SDSGlovesClassification.py
index 41b68096db..cc5c173bd0 100644
--- a/mteb/tasks/Classification/eng/SDSGlovesClassification.py
+++ b/mteb/tasks/Classification/eng/SDSGlovesClassification.py
@@ -5,6 +5,7 @@
class SDSGlovesClassification(AbsTaskClassification):
+ superseded_by = "SDSGlovesClassification.v2"
metadata = TaskMetadata(
name="SDSGlovesClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -43,3 +44,46 @@ class SDSGlovesClassification(AbsTaskClassification):
}
""",
)
+
+
+class SDSGlovesClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SDSGlovesClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/sds_gloves",
+ "revision": "09b09ee755ada02c68ad835971e22b1959d79448",
+ },
+ type="Classification",
+ category="s2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="LM-generated and reviewed",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+
+@inproceedings{pereira2020msds,
+ author = {Pereira, Eliseu},
+ booktitle = {15th Doctoral Symposium},
+ pages = {42},
+ title = {MSDS-OPP: Operator Procedures Prediction in Material Safety Data Sheets},
+ year = {2020},
+}
+""",
+ adapted_from=["SDSGlovesClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/ToxicChatClassification.py b/mteb/tasks/Classification/eng/ToxicChatClassification.py
index e189cd51a0..b591659ba7 100644
--- a/mteb/tasks/Classification/eng/ToxicChatClassification.py
+++ b/mteb/tasks/Classification/eng/ToxicChatClassification.py
@@ -7,6 +7,7 @@
class ToxicChatClassification(AbsTaskClassification):
+ superseded_by = "ToxicChatClassification.v2"
metadata = TaskMetadata(
name="ToxicChatClassification",
description="""This dataset contains toxicity annotations on 10K user
@@ -64,3 +65,54 @@ def dataset_transform(self):
# only use human-annotated data
self.dataset = self.dataset.filter(lambda x: x["human_annotation"])
self.dataset = self.dataset.remove_columns(remove_cols)
+
+
+class ToxicChatClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="ToxicChatClassification.v2",
+ description="""This dataset contains toxicity annotations on 10K user
+ prompts collected from the Vicuna online demo. We utilize a human-AI
+ collaborative annotation framework to guarantee the quality of annotation
+ while maintaining a feasible annotation workload. The details of data
+ collection, pre-processing, and annotation can be found in our paper.
+ We believe that ToxicChat can be a valuable resource to drive further
+ advancements toward building a safe and healthy environment for user-AI
+ interactions.
+ Only human annotated samples are selected here.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2023.findings-emnlp.311/",
+ dataset={
+ "path": "mteb/toxic_chat",
+ "name": "toxicchat0124",
+ "revision": "800fec53e44419d13668be291aca50a071ab5849",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=_EVAL_SPLITS,
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2023-10-26", "2024-01-31"),
+ domains=["Constructed", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{lin2023toxicchat,
+ archiveprefix = {arXiv},
+ author = {Zi Lin and Zihan Wang and Yongqi Tong and Yangkun Wang and Yuxin Guo and Yujia Wang and Jingbo Shang},
+ eprint = {2310.17389},
+ primaryclass = {cs.CL},
+ title = {ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World User-AI Conversation},
+ year = {2023},
+}
+""",
+ adapted_from=["ToxicChatClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py
index 439b19ba7c..c847c40ceb 100644
--- a/mteb/tasks/Classification/eng/ToxicConversationsClassification.py
+++ b/mteb/tasks/Classification/eng/ToxicConversationsClassification.py
@@ -5,6 +5,7 @@
class ToxicConversationsClassification(AbsTaskClassification):
+ superseded_by = "ToxicConversationsClassification.v2"
metadata = TaskMetadata(
name="ToxicConversationsClassification",
description="Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.",
@@ -47,3 +48,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class ToxicConversationsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="ToxicConversationsClassification.v2",
+ description="""Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview",
+ dataset={
+ "path": "mteb/toxic_conversations",
+ "revision": "7ae55309fbe51a11e13c24887ceed200153514e9",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2017-01-01",
+ "2018-12-31",
+ ), # Estimated range for the collection of comments
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{jigsaw-unintended-bias-in-toxicity-classification,
+ author = {cjadams and Daniel Borkan and inversion and Jeffrey Sorensen and Lucas Dixon and Lucy Vasserman and nithum},
+ publisher = {Kaggle},
+ title = {Jigsaw Unintended Bias in Toxicity Classification},
+ url = {https://kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification},
+ year = {2019},
+}
+""",
+ prompt="Classify the given comments as either toxic or not toxic",
+ adapted_from=["ToxicConversationsClassification"],
+ )
+
+ samples_per_label = 16
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py
index a4ab4b5c70..bddfff5221 100644
--- a/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py
+++ b/mteb/tasks/Classification/eng/TweetSentimentExtractionClassification.py
@@ -5,6 +5,7 @@
class TweetSentimentExtractionClassification(AbsTaskClassification):
+ superseded_by = "TweetSentimentExtractionClassification.v2"
metadata = TaskMetadata(
name="TweetSentimentExtractionClassification",
description="",
@@ -42,3 +43,45 @@ class TweetSentimentExtractionClassification(AbsTaskClassification):
)
samples_per_label = 32
+
+
+class TweetSentimentExtractionClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TweetSentimentExtractionClassification.v2",
+ description="""
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview",
+ dataset={
+ "path": "mteb/tweet_sentiment_extraction",
+ "revision": "7261898ee3b9a739595e8dbf41df6b2332f429bb",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=(
+ "2020-01-01",
+ "2020-12-31",
+ ), # Estimated range for the collection of tweets
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{tweet-sentiment-extraction,
+ author = {Maggie and Phil Culliton and Wei Chen},
+ publisher = {Kaggle},
+ title = {Tweet Sentiment Extraction},
+ url = {https://kaggle.com/competitions/tweet-sentiment-extraction},
+ year = {2020},
+}
+""",
+ prompt="Classify the sentiment of a given tweet as either positive, negative, or neutral",
+ adapted_from=["TweetSentimentExtractionClassification"],
+ )
+
+ samples_per_label = 32
diff --git a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py
index cad250dac0..6e3edb64ea 100644
--- a/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py
+++ b/mteb/tasks/Classification/eng/TweetTopicSingleClassification.py
@@ -5,6 +5,7 @@
class TweetTopicSingleClassification(AbsTaskClassification):
+ superseded_by = "TweetTopicSingleClassification.v2"
metadata = TaskMetadata(
name="TweetTopicSingleClassification",
description="""Topic classification dataset on Twitter with 6 labels. Each instance of
@@ -52,3 +53,51 @@ class TweetTopicSingleClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset["train"] = self.dataset["train_2021"]
+
+
+class TweetTopicSingleClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TweetTopicSingleClassification.v2",
+ description="""Topic classification dataset on Twitter with 6 labels. Each instance of
+ TweetTopic comes with a timestamp which distributes from September 2019 to August 2021.
+ Tweets were preprocessed before the annotation to normalize some artifacts, converting
+ URLs into a special token {{URL}} and non-verified usernames into {{USERNAME}}. For verified
+ usernames, we replace its display name (or account name) with symbols {@}.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/tweet_topic_single",
+ "revision": "a7904e26081f987da81ad2cc063e09e714e875d0",
+ },
+ reference="https://arxiv.org/abs/2209.09824",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test_2021"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2019-09-01", "2021-08-31"),
+ domains=["Social", "News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="not specified",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{dimosthenis-etal-2022-twitter,
+ address = {Gyeongju, Republic of Korea},
+ author = {Antypas, Dimosthenis and
+Ushio, Asahi and
+Camacho-Collados, Jose and
+Neves, Leonardo and
+Silva, Vitor and
+Barbieri, Francesco},
+ booktitle = {Proceedings of the 29th International Conference on Computational Linguistics},
+ month = oct,
+ publisher = {International Committee on Computational Linguistics},
+ title = {{T}witter {T}opic {C}lassification},
+ year = {2022},
+}
+""",
+ adapted_from=["TweetTopicSingleClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py
index c6ade13b66..746f68f4aa 100644
--- a/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py
+++ b/mteb/tasks/Classification/eng/WikipediaBioMetChemClassification.py
@@ -5,6 +5,7 @@
class WikipediaBioMetChemClassification(AbsTaskClassification):
+ superseded_by = "WikipediaBioMetChemClassification.v2"
metadata = TaskMetadata(
name="WikipediaBioMetChemClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -35,3 +36,38 @@ class WikipediaBioMetChemClassification(AbsTaskClassification):
}
""",
)
+
+
+class WikipediaBioMetChemClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WikipediaBioMetChemClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/wikipedia_bio_met_chem",
+ "revision": "5341bdf799e94949b5c2684bb54bfe462c528179",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+""",
+ adapted_from=["WikipediaBioMetChemClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py
index 940bcdc44e..133ea90055 100644
--- a/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py
+++ b/mteb/tasks/Classification/eng/WikipediaChemFieldsClassification.py
@@ -5,6 +5,7 @@
class WikipediaChemFieldsClassification(AbsTaskClassification):
+ superseded_by = "WikipediaChemFieldsClassification.v2"
metadata = TaskMetadata(
name="WikipediaChemFieldsClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -35,3 +36,38 @@ class WikipediaChemFieldsClassification(AbsTaskClassification):
}
""",
)
+
+
+class WikipediaChemFieldsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WikipediaChemFieldsClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/wikipedia_chem_fields",
+ "revision": "70ff36ec5157fcc54c59f3b85d2b1fb232f8feec",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+""",
+ adapted_from=["WikipediaChemFieldsClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py
index 8ee7c5b145..4f145eaa3c 100644
--- a/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py
+++ b/mteb/tasks/Classification/eng/WikipediaCompChemSpectroscopyClassification.py
@@ -5,6 +5,7 @@
class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification):
+ superseded_by = "WikipediaCompChemSpectroscopyClassification.v2"
metadata = TaskMetadata(
name="WikipediaCompChemSpectroscopyClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -35,3 +36,38 @@ class WikipediaCompChemSpectroscopyClassification(AbsTaskClassification):
}
""",
)
+
+
+class WikipediaCompChemSpectroscopyClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WikipediaCompChemSpectroscopyClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/wikipedia_comp_chem_spectroscopy",
+ "revision": "a1ef45291dc5304482c42b9c053a5f7801e1006b",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+""",
+ adapted_from=["WikipediaCompChemSpectroscopyClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py
index 9bc991261a..52c6136167 100644
--- a/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py
+++ b/mteb/tasks/Classification/eng/WikipediaCrystallographyAnalyticalClassification.py
@@ -5,6 +5,7 @@
class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification):
+ superseded_by = "WikipediaCrystallographyAnalyticalClassification.v2"
metadata = TaskMetadata(
name="WikipediaCrystallographyAnalyticalClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -35,3 +36,38 @@ class WikipediaCrystallographyAnalyticalClassification(AbsTaskClassification):
}
""",
)
+
+
+class WikipediaCrystallographyAnalyticalClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WikipediaCrystallographyAnalyticalClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/wikipedia_crystallography_analytical",
+ "revision": "b98f3205a68a9a50ab345abc85f01911089a93de",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+""",
+ adapted_from=["WikipediaCrystallographyAnalyticalClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py
index f1d97d3a70..9c476ce76f 100644
--- a/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py
+++ b/mteb/tasks/Classification/eng/WikipediaTheoreticalAppliedClassification.py
@@ -5,6 +5,7 @@
class WikipediaTheoreticalAppliedClassification(AbsTaskClassification):
+ superseded_by = "WikipediaTheoreticalAppliedClassification.v2"
metadata = TaskMetadata(
name="WikipediaTheoreticalAppliedClassification",
description="ChemTEB evaluates the performance of text embedding models on chemical domain data.",
@@ -35,3 +36,38 @@ class WikipediaTheoreticalAppliedClassification(AbsTaskClassification):
}
""",
)
+
+
+class WikipediaTheoreticalAppliedClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WikipediaTheoreticalAppliedClassification.v2",
+ description="""ChemTEB evaluates the performance of text embedding models on chemical domain data.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2412.00532",
+ dataset={
+ "path": "mteb/wikipedia_theoretical_applied",
+ "revision": "9b984e9591b6c2d9291370b1bb233c2465d5bd2f",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2024-06-01", "2024-11-30"),
+ domains=["Chemistry"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@article{kasmaee2024chemteb,
+ author = {Kasmaee, Ali Shiraee and Khodadad, Mohammad and Saloot, Mohammad Arshi and Sherck, Nick and Dokas, Stephen and Mahyar, Hamidreza and Samiee, Soheila},
+ journal = {arXiv preprint arXiv:2412.00532},
+ title = {ChemTEB: Chemical Text Embedding Benchmark, an Overview of Embedding Models Performance \& Efficiency on a Specific Domain},
+ year = {2024},
+}
+""",
+ adapted_from=["WikipediaTheoreticalAppliedClassification"],
+ )
diff --git a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py
index 7a699bfaaf..d00b732bca 100644
--- a/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py
+++ b/mteb/tasks/Classification/eng/YahooAnswersTopicsClassification.py
@@ -5,6 +5,7 @@
class YahooAnswersTopicsClassification(AbsTaskClassification):
+ superseded_by = "YahooAnswersTopicsClassification.v2"
metadata = TaskMetadata(
name="YahooAnswersTopicsClassification",
description="Dataset composed of questions and answers from Yahoo Answers, categorized into topics.",
@@ -56,3 +57,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train", "test"]
)
+
+
+class YahooAnswersTopicsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="YahooAnswersTopicsClassification.v2",
+ description="""Dataset composed of questions and answers from Yahoo Answers, categorized into topics.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/yahoo_answers_topics",
+ dataset={
+ "path": "mteb/yahoo_answers_topics",
+ "revision": "c4d89f9633025d50954ab98a4c2c2feb188f6279",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2022-01-25", "2022-01-25"),
+ domains=["Web", "Written"],
+ task_subtypes=["Topic classification"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{NIPS2015_250cf8b5,
+ author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Character-level Convolutional Networks for Text Classification},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf},
+ volume = {28},
+ year = {2015},
+}
+""",
+ adapted_from=["YahooAnswersTopicsClassification"],
+ )
+
+ samples_per_label = 32
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train", "test"]
+ )
diff --git a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py
index 2c088af31a..fb1ff45f89 100644
--- a/mteb/tasks/Classification/eng/YelpReviewFullClassification.py
+++ b/mteb/tasks/Classification/eng/YelpReviewFullClassification.py
@@ -5,6 +5,7 @@
class YelpReviewFullClassification(AbsTaskClassification):
+ superseded_by = "YelpReviewFullClassification.v2"
metadata = TaskMetadata(
name="YelpReviewFullClassification",
description="Yelp Review Full is a dataset for sentiment analysis, containing 5 classes corresponding to ratings 1-5.",
@@ -47,3 +48,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class YelpReviewFullClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="YelpReviewFullClassification.v2",
+ description="""Yelp Review Full is a dataset for sentiment analysis, containing 5 classes corresponding to ratings 1-5.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/1509.01626",
+ dataset={
+ "path": "mteb/yelp_review_full",
+ "revision": "49d71141934ae2e58733acd90908140e8ecaaee0",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["eng-Latn"],
+ main_score="accuracy",
+ date=("2015-01-01", "2015-12-31"), # reviews from 2015
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="https://huggingface.co/datasets/Yelp/yelp_review_full#licensing-information",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{NIPS2015_250cf8b5,
+ author = {Zhang, Xiang and Zhao, Junbo and LeCun, Yann},
+ booktitle = {Advances in Neural Information Processing Systems},
+ editor = {C. Cortes and N. Lawrence and D. Lee and M. Sugiyama and R. Garnett},
+ pages = {},
+ publisher = {Curran Associates, Inc.},
+ title = {Character-level Convolutional Networks for Text Classification},
+ url = {https://proceedings.neurips.cc/paper_files/paper/2015/file/250cf8b51c773f3f8dc8b4be867a9a02-Paper.pdf},
+ volume = {28},
+ year = {2015},
+}
+""",
+ adapted_from=["YelpReviewFullClassification"],
+ )
+
+ samples_per_label = 128
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/est/estonian_valence.py b/mteb/tasks/Classification/est/estonian_valence.py
index 11561aa385..ade49a0789 100644
--- a/mteb/tasks/Classification/est/estonian_valence.py
+++ b/mteb/tasks/Classification/est/estonian_valence.py
@@ -5,6 +5,7 @@
class EstonianValenceClassification(AbsTaskClassification):
+ superseded_by = "EstonianValenceClassification.v2"
metadata = TaskMetadata(
name="EstonianValenceClassification",
dataset={
@@ -53,3 +54,45 @@ def dataset_transform(self):
)
samples_per_label = 16
+
+
+class EstonianValenceClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="EstonianValenceClassification.v2",
+ dataset={
+ "path": "mteb/estonian_valence",
+ "revision": "8795961e2af5b83bcb8a6928636845ac2b92f92e",
+ },
+ description="""Dataset containing annotated Estonian news data from the Postimees and Õhtuleht newspapers.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["est-Latn"],
+ main_score="accuracy",
+ date=(
+ "1857-01-01", # Inception of Postimees
+ "2023-11-08", # dataset publication
+ ),
+ domains=["News", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ dialect=[],
+ license="cc-by-4.0",
+ annotations_creators="human-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{Pajupuu2023,
+ author = {Hille Pajupuu and Jaan Pajupuu and Rene Altrov and Kairi Tamuri},
+ doi = {10.6084/m9.figshare.24517054.v1},
+ month = {11},
+ title = {{Estonian Valence Corpus / Eesti valentsikorpus}},
+ url = {https://figshare.com/articles/dataset/Estonian_Valence_Corpus_Eesti_valentsikorpus/24517054},
+ year = {2023},
+}
+""",
+ adapted_from=["EstonianValenceClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/fas/FaMTEBClassification.py b/mteb/tasks/Classification/fas/FaMTEBClassification.py
index ad94db9185..3eefd40cfb 100644
--- a/mteb/tasks/Classification/fas/FaMTEBClassification.py
+++ b/mteb/tasks/Classification/fas/FaMTEBClassification.py
@@ -437,6 +437,7 @@ class SynPerChatbotToneUserClassification(AbsTaskClassification):
class SynPerTextToneClassification(AbsTaskClassification):
+ superseded_by = "SynPerTextToneClassification.v2"
metadata = TaskMetadata(
name="SynPerTextToneClassification",
description="Persian Text Tone",
@@ -463,7 +464,37 @@ class SynPerTextToneClassification(AbsTaskClassification):
samples_per_label = 32
+class SynPerTextToneClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SynPerTextToneClassification.v2",
+ description="""Persian Text Tone
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://mcinext.com/",
+ dataset={
+ "path": "mteb/syn_per_text_tone",
+ "revision": "0ed7459db7e905714dc02cbe25b4eac55e91021e",
+ },
+ type="Classification",
+ category="s2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=[],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="LM-generated",
+ dialect=[],
+ sample_creation="LM-generated and verified",
+ bibtex_citation=""" """,
+ adapted_from=["SynPerTextToneClassification"],
+ )
+ samples_per_label = 32
+
+
class SIDClassification(AbsTaskClassification):
+ superseded_by = "SIDClassification.v2"
metadata = TaskMetadata(
name="SIDClassification",
description="SID Classification",
@@ -490,7 +521,37 @@ class SIDClassification(AbsTaskClassification):
samples_per_label = 32
+class SIDClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SIDClassification.v2",
+ description="""SID Classification
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://mcinext.com/",
+ dataset={
+ "path": "mteb/sid",
+ "revision": "8234b2081bd9ca33bdbc7bf68f5f9540fe3fd480",
+ },
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=["Academic"],
+ task_subtypes=[],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=""" """,
+ adapted_from=["SIDClassification"],
+ )
+ samples_per_label = 32
+
+
class DeepSentiPers(AbsTaskClassification):
+ superseded_by = "DeepSentiPers.v2"
metadata = TaskMetadata(
name="DeepSentiPers",
description="Persian Sentiment Analysis Dataset",
@@ -520,7 +581,37 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("review", "text")
+class DeepSentiPersV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DeepSentiPers.v2",
+ description="""Persian Sentiment Analysis Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/JoyeBright/DeepSentiPers",
+ dataset={
+ "path": "mteb/deep_senti_pers",
+ "revision": "8d60d8315ac650ef0af32d68c4f92916ffc5cfb8",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=["Reviews"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=""" """,
+ adapted_from=["DeepSentiPers"],
+ )
+ samples_per_label = 32
+
+
class PersianTextEmotion(AbsTaskClassification):
+ superseded_by = "PersianTextEmotion.v2"
metadata = TaskMetadata(
name="PersianTextEmotion",
description="Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise.",
@@ -547,7 +638,37 @@ class PersianTextEmotion(AbsTaskClassification):
samples_per_label = 32
+class PersianTextEmotionV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PersianTextEmotion.v2",
+ description="""Emotion is a Persian dataset with six basic emotions: anger, fear, joy, love, sadness, and surprise.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/SeyedAli/Persian-Text-Emotion",
+ dataset={
+ "path": "mteb/persian_text_emotion",
+ "revision": "a45594021eca1d1577296edc030d972a92ff26b3",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=[],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=""" """,
+ adapted_from=["PersianTextEmotion"],
+ )
+ samples_per_label = 32
+
+
class SentimentDKSF(AbsTaskClassification):
+ superseded_by = "SentimentDKSF.v2"
metadata = TaskMetadata(
name="SentimentDKSF",
description="The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis.",
@@ -574,7 +695,37 @@ class SentimentDKSF(AbsTaskClassification):
samples_per_label = 32
+class SentimentDKSFV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SentimentDKSF.v2",
+ description="""The Sentiment DKSF (Digikala/Snappfood comments) is a dataset for sentiment analysis.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/hezarai/hezar",
+ dataset={
+ "path": "mteb/sentiment_dksf",
+ "revision": "05129fb229c8f68267d112cffa655f1312ec6575",
+ },
+ type="Classification",
+ category="s2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=["Reviews"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=""" """,
+ adapted_from=["SentimentDKSF"],
+ )
+ samples_per_label = 32
+
+
class NLPTwitterAnalysisClassification(AbsTaskClassification):
+ superseded_by = "NLPTwitterAnalysisClassification.v2"
metadata = TaskMetadata(
name="NLPTwitterAnalysisClassification",
description="Twitter Analysis Classification",
@@ -604,6 +755,35 @@ def dataset_transform(self):
self.dataset = self.dataset.rename_column("tweet", "text")
+class NLPTwitterAnalysisClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="NLPTwitterAnalysisClassification.v2",
+ description="""Twitter Analysis Classification
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main",
+ dataset={
+ "path": "mteb/nlp_twitter_analysis",
+ "revision": "41d85185019495609522fece20e93d11ab705301",
+ },
+ type="Classification",
+ category="s2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fas-Arab"],
+ main_score="accuracy",
+ date=("2024-09-01", "2024-12-31"),
+ domains=["Social"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=""" """,
+ adapted_from=["NLPTwitterAnalysisClassification"],
+ )
+ samples_per_label = 32
+
+
class DigikalamagClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="DigikalamagClassification",
diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
index 5f4239d2fa..ce31bd0ad8 100644
--- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
+++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
@@ -7,6 +7,7 @@
class FilipinoHateSpeechClassification(AbsTaskClassification):
+ superseded_by = "FilipinoHateSpeechClassification.v2"
metadata = TaskMetadata(
name="FilipinoHateSpeechClassification",
description="Filipino Twitter dataset for sentiment classification.",
@@ -40,3 +41,41 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
}
""",
)
+
+
+class FilipinoHateSpeechClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FilipinoHateSpeechClassification.v2",
+ description="""Filipino Twitter dataset for sentiment classification.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019",
+ dataset={
+ "path": "mteb/filipino_hate_speech",
+ "revision": "fdb7536c08d2f10d8b8b618c3ff00b0be286d844",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2019-08-01", "2019-08-01"),
+ eval_splits=["validation", "test"],
+ eval_langs=["fil-Latn"],
+ main_score="accuracy",
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{Cabasag-2019-hate-speech,
+ author = {Neil Vicente Cabasag and Vicente Raphael Chan and Sean Christian Lim and Mark Edward Gonzales and Charibeth Cheng},
+ journal = {Philippine Computing Journal},
+ month = {August},
+ number = {1},
+ title = {Hate speech in Philippine election-related tweets: Automatic detection and classification using natural language processing.},
+ volume = {XIV},
+ year = {2019},
+}
+""",
+ adapted_from=["FilipinoHateSpeechClassification"],
+ )
diff --git a/mteb/tasks/Classification/fin/FinToxicityClassification.py b/mteb/tasks/Classification/fin/FinToxicityClassification.py
index d847dac5a8..13e1206dce 100644
--- a/mteb/tasks/Classification/fin/FinToxicityClassification.py
+++ b/mteb/tasks/Classification/fin/FinToxicityClassification.py
@@ -5,6 +5,7 @@
class FinToxicityClassification(AbsTaskClassification):
+ superseded_by = "FinToxicityClassification.v2"
metadata = TaskMetadata(
name="FinToxicityClassification",
description="""
@@ -53,6 +54,47 @@ def dataset_transform(self):
if col not in ["text", "label"]
]
self.dataset = self.dataset.remove_columns(remove_cols)
- self.dataset = self.stratified_subsampling(
- self.dataset, seed=self.seed, splits=["train", "test"]
- )
+
+
+class FinToxicityClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FinToxicityClassification.v2",
+ description="""
+ This dataset is a DeepL -based machine translated version of the Jigsaw toxicity dataset for Finnish. The dataset is originally from a Kaggle competition https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data.
+ The original dataset poses a multi-label text classification problem and includes the labels identity_attack, insult, obscene, severe_toxicity, threat and toxicity.
+ Here adapted for toxicity classification, which is the most represented class.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/fin_toxicity",
+ "revision": "1deba6e874be1d5632a4ac0d1fb71f4bc3dea0d6",
+ },
+ reference="https://aclanthology.org/2023.nodalida-1.68",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fin-Latn"],
+ main_score="f1",
+ date=("2023-03-13", "2023-09-25"),
+ domains=["News", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated",
+ bibtex_citation=r"""
+@inproceedings{eskelinen-etal-2023-toxicity,
+ author = {Eskelinen, Anni and
+Silvala, Laura and
+Ginter, Filip and
+Pyysalo, Sampo and
+Laippala, Veronika},
+ booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
+ month = may,
+ title = {Toxicity Detection in {F}innish Using Machine Translation},
+ year = {2023},
+}
+""",
+ adapted_from=["FinToxicityClassification"],
+ )
diff --git a/mteb/tasks/Classification/fra/FrenchBookReviews.py b/mteb/tasks/Classification/fra/FrenchBookReviews.py
index cb9c7b37c9..0af4947a35 100644
--- a/mteb/tasks/Classification/fra/FrenchBookReviews.py
+++ b/mteb/tasks/Classification/fra/FrenchBookReviews.py
@@ -5,6 +5,7 @@
class FrenchBookReviews(AbsTaskClassification):
+ superseded_by = "FrenchBookReviews.v2"
metadata = TaskMetadata(
name="FrenchBookReviews",
dataset={
@@ -35,3 +36,37 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class FrenchBookReviewsV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FrenchBookReviews.v2",
+ dataset={
+ "path": "mteb/french_book_reviews",
+ "revision": "71d755fd76073533c3d0c262f6b542eb0fa7ce96",
+ },
+ description="""It is a French book reviews dataset containing a huge number of reader reviews on French books. Each review is pared with a rating that ranges from 0.5 to 5 (with 0.5 increment).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/Abirate/french_book_reviews",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["fra-Latn"],
+ main_score="accuracy",
+ date=("2022-01-01", "2023-01-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=[],
+ license="cc0-1.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ # No citation found for this dataset.
+ bibtex_citation="",
+ adapted_from=["FrenchBookReviews"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py
index b488661093..04ff73b78d 100644
--- a/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/fra/MovieReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class MovieReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "MovieReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="MovieReviewSentimentClassification",
dataset={
@@ -41,3 +42,43 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["validation", "test"]
)
+
+
+class MovieReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MovieReviewSentimentClassification.v2",
+ dataset={
+ "path": "mteb/movie_review_sentiment",
+ "revision": "4e182033cbfe75ae0556cd640d028986be82afd8",
+ },
+ description="""The Allociné dataset is a French-language dataset for sentiment analysis that contains movie reviews produced by the online community of the Allociné.fr website.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/TheophileBlard/french-sentiment-analysis-with-bert",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation", "test"],
+ eval_langs=["fra-Latn"],
+ main_score="accuracy",
+ date=("2006-01-01", "2020-01-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@software{blard2020,
+ author = {Théophile Blard},
+ title = {French sentiment analysis with BERT},
+ url = {https://github.com/TheophileBlard/french-sentiment-analysis-with-bert},
+ year = {2020},
+}
+""",
+ adapted_from=["MovieReviewSentimentClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["validation", "test"]
+ )
diff --git a/mteb/tasks/Classification/guj/GujaratiNewsClassification.py b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py
index 0c93c0dd21..e0714a2b00 100644
--- a/mteb/tasks/Classification/guj/GujaratiNewsClassification.py
+++ b/mteb/tasks/Classification/guj/GujaratiNewsClassification.py
@@ -5,6 +5,7 @@
class GujaratiNewsClassification(AbsTaskClassification):
+ superseded_by = "GujaratiNewsClassification.v2"
metadata = TaskMetadata(
name="GujaratiNewsClassification",
description="A Gujarati dataset for 3-class classification of Gujarati news articles",
@@ -31,3 +32,31 @@ class GujaratiNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_column("headline", "text")
+
+
+class GujaratiNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="GujaratiNewsClassification.v2",
+ description="""A Gujarati dataset for 3-class classification of Gujarati news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/goru001/nlp-for-gujarati",
+ dataset={
+ "path": "mteb/gujarati_news",
+ "revision": "a815d60b34dc8ee11910743d380b7d14a4e227cb",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["guj-Gujr"],
+ main_score="accuracy",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="",  # none found
+ adapted_from=["GujaratiNewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py
index 5f70955710..3bbdb698b3 100644
--- a/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py
+++ b/mteb/tasks/Classification/heb/HebrewSentimentAnalysis.py
@@ -7,6 +7,7 @@
class HebrewSentimentAnalysis(AbsTaskClassification):
+ superseded_by = "HebrewSentimentAnalysis.v2"
metadata = TaskMetadata(
name="HebrewSentimentAnalysis",
dataset={
@@ -49,3 +50,44 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class HebrewSentimentAnalysisV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="HebrewSentimentAnalysis.v2",
+ dataset={
+ "path": "mteb/hebrew_sentiment_analysis",
+ "revision": "7ecd049fc8ac0d6f0a0121c8ff9fe44ea5bd935b",
+ "name": "morph",
+ },
+ description="""HebrewSentiment is a data set consists of 12,804 user comments to posts on the official Facebook page of Israel’s president, Mr. Reuven Rivlin. In October 2015, we used the open software application Netvizz (Rieder, 2013) to scrape all the comments to all of the president’s posts in the period of June – August 2014, the first three months of Rivlin’s presidency.2 While the president’s posts aimed at reconciling tensions and called for tolerance and empathy, the sentiment expressed in the comments to the president’s posts was polarized between citizens who warmly thanked the president, and citizens that fiercely critiqued his policy.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/hebrew_sentiment",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["heb-Hebr"],
+ main_score="accuracy",
+ date=("2015-10-01", "2015-10-31"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{amram-etal-2018-representations,
+ address = {Santa Fe, New Mexico, USA},
+ author = {Amram, Adam and Ben David, Anat and Tsarfaty, Reut},
+ booktitle = {Proceedings of the 27th International Conference on Computational Linguistics},
+ month = aug,
+ pages = {2242--2252},
+ publisher = {Association for Computational Linguistics},
+ title = {Representations and Architectures in Neural Sentiment Analysis for Morphologically Rich Languages: A Case Study from {M}odern {H}ebrew},
+ url = {https://www.aclweb.org/anthology/C18-1190},
+ year = {2018},
+}
+""",
+ adapted_from=["HebrewSentimentAnalysis"],
+ )
diff --git a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py
index 52fc83a720..1b72ce3133 100644
--- a/mteb/tasks/Classification/hin/HindiDiscourseClassification.py
+++ b/mteb/tasks/Classification/hin/HindiDiscourseClassification.py
@@ -5,6 +5,7 @@
class HindiDiscourseClassification(AbsTaskClassification):
+ superseded_by = "HindiDiscourseClassification.v2"
metadata = TaskMetadata(
name="HindiDiscourseClassification",
dataset={
@@ -60,3 +61,59 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class HindiDiscourseClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="HindiDiscourseClassification.v2",
+ dataset={
+ "path": "mteb/hindi_discourse",
+ "revision": "9d10173a3df9858adc90711d8da9abf3df0a1259",
+ },
+ description="""A Hindi Discourse dataset in Hindi with values for coherence.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2020.lrec-1.149/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["hin-Deva"],
+ main_score="accuracy",
+ date=("2019-12-01", "2020-04-09"),
+ domains=["Fiction", "Social", "Written"],
+ dialect=[],
+ task_subtypes=["Discourse coherence"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{dhanwal-etal-2020-annotated,
+ address = {Marseille, France},
+ author = {Dhanwal, Swapnil and
+Dutta, Hritwik and
+Nankani, Hitesh and
+Shrivastava, Nilay and
+Kumar, Yaman and
+Li, Junyi Jessy and
+Mahata, Debanjan and
+Gosangi, Rakesh and
+Zhang, Haimin and
+Shah, Rajiv Ratn and
+Stent, Amanda},
+ booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
+ isbn = {979-10-95546-34-4},
+ language = {English},
+ month = may,
+ publisher = {European Language Resources Association},
+ title = {An Annotated Dataset of Discourse Modes in {H}indi Stories},
+ url = {https://www.aclweb.org/anthology/2020.lrec-1.149},
+ year = {2020},
+}
+""",
+ adapted_from=["HindiDiscourseClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py
index c922567b8f..27625ff4b9 100644
--- a/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py
+++ b/mteb/tasks/Classification/hin/SentimentAnalysisHindi.py
@@ -5,6 +5,7 @@
class SentimentAnalysisHindi(AbsTaskClassification):
+ superseded_by = "SentimentAnalysisHindi.v2"
metadata = TaskMetadata(
name="SentimentAnalysisHindi",
description="Hindi Sentiment Analysis Dataset",
@@ -42,3 +43,40 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class SentimentAnalysisHindiV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SentimentAnalysisHindi.v2",
+ description="""Hindi Sentiment Analysis Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/OdiaGenAI/sentiment_analysis_hindi",
+ dataset={
+ "path": "mteb/sentiment_analysis_hindi",
+ "revision": "27fc099ce1c5a7295b9231e53a37648cdef6cb79",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["hin-Deva"],
+ main_score="f1",
+ date=("2023-09-15", "2023-10-16"),
+ dialect=[],
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{OdiaGenAI,
+ author = {Shantipriya Parida and Sambit Sekhar and Soumendra Kumar Sahoo and Swateek Jena and Abhijeet Parida and Satya Ranjan Dash and Guneet Singh Kohli},
+ howpublished = {{https://huggingface.co/OdiaGenAI}},
+ journal = {Hugging Face repository},
+ publisher = {Hugging Face},
+ title = {OdiaGenAI: Generative AI and LLM Initiative for the Odia Language},
+ year = {2023},
+}
+""",
+ adapted_from=["SentimentAnalysisHindi"],
+ )
diff --git a/mteb/tasks/Classification/hrv/FrenkHrClassification.py b/mteb/tasks/Classification/hrv/FrenkHrClassification.py
index 440e0c90ad..700c18f1e3 100644
--- a/mteb/tasks/Classification/hrv/FrenkHrClassification.py
+++ b/mteb/tasks/Classification/hrv/FrenkHrClassification.py
@@ -5,6 +5,7 @@
class FrenkHrClassification(AbsTaskClassification):
+ superseded_by = "FrenkHrClassification.v2"
metadata = TaskMetadata(
name="FrenkHrClassification",
description="Croatian subset of the FRENK dataset",
@@ -39,3 +40,41 @@ class FrenkHrClassification(AbsTaskClassification):
}
""",
)
+
+
+class FrenkHrClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FrenkHrClassification.v2",
+ description="""Croatian subset of the FRENK dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/frenk_hr",
+ "revision": "09f90d0bee34d5e703caed26737166591a8f12b9",
+ },
+ reference="https://arxiv.org/abs/1906.02045",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["hrv-Latn"],
+ main_score="accuracy",
+ date=("2021-05-28", "2021-05-28"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{ljubesic2019frenk,
+ archiveprefix = {arXiv},
+ author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec},
+ eprint = {1906.02045},
+ primaryclass = {cs.CL},
+ title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English},
+ url = {https://arxiv.org/abs/1906.02045},
+ year = {2019},
+}
+""",
+ adapted_from=["FrenkHrClassification"],
+ )
diff --git a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py
index adcfbd57df..8977120251 100644
--- a/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py
+++ b/mteb/tasks/Classification/ind/IndonesianIdClickbaitClassification.py
@@ -5,6 +5,7 @@
class IndonesianIdClickbaitClassification(AbsTaskClassification):
+ superseded_by = "IndonesianIdClickbaitClassification.v2"
metadata = TaskMetadata(
name="IndonesianIdClickbaitClassification",
dataset={
@@ -50,3 +51,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class IndonesianIdClickbaitClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="IndonesianIdClickbaitClassification.v2",
+ dataset={
+ "path": "mteb/indonesian_id_clickbait",
+ "revision": "a54158a1b437a85e1982a70d0c57a69c69d0a5b8",
+ },
+ description="""The CLICK-ID dataset is a collection of Indonesian news headlines that was collected from 12 local online news publishers.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="http://www.sciencedirect.com/science/article/pii/S2352340920311252",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ind-Latn"],
+ main_score="f1",
+ date=("2020-10-01", "2020-10-01"),
+ domains=["News", "Written"],
+ dialect=[],
+ task_subtypes=["Claim verification"],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{WILLIAM2020106231,
+ abstract = {News analysis is a popular task in Natural Language Processing (NLP). In particular, the problem of clickbait in news analysis has gained attention in recent years [1, 2]. However, the majority of the tasks has been focused on English news, in which there is already a rich representative resource. For other languages, such as Indonesian, there is still a lack of resource for clickbait tasks. Therefore, we introduce the CLICK-ID dataset of Indonesian news headlines extracted from 12 Indonesian online news publishers. It is comprised of 15,000 annotated headlines with clickbait and non-clickbait labels. Using the CLICK-ID dataset, we then developed an Indonesian clickbait classification model achieving favourable performance. We believe that this corpus will be useful for replicable experiments in clickbait detection or other experiments in NLP areas.},
+ author = {Andika William and Yunita Sari},
+ doi = {https://doi.org/10.1016/j.dib.2020.106231},
+ issn = {2352-3409},
+ journal = {Data in Brief},
+ keywords = {Indonesian, Natural Language Processing, News articles, Clickbait, Text-classification},
+ pages = {106231},
+ title = {CLICK-ID: A novel dataset for Indonesian clickbait headlines},
+ url = {http://www.sciencedirect.com/science/article/pii/S2352340920311252},
+ volume = {32},
+ year = {2020},
+}
+""",
+ adapted_from=["IndonesianIdClickbaitClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py
index cef0f33fac..ac6e7002b8 100644
--- a/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py
+++ b/mteb/tasks/Classification/ind/IndonesianMongabayConservationClassification.py
@@ -10,6 +10,7 @@
class IndonesianMongabayConservationClassification(AbsTaskClassification):
+ superseded_by = "IndonesianMongabayConservationClassification.v2"
metadata = TaskMetadata(
name="IndonesianMongabayConservationClassification",
description="Conservation dataset that was collected from mongabay.co.id contains topic-classification task (multi-label format) and sentiment classification. This task only covers sentiment analysis (positive, neutral negative)",
@@ -100,3 +101,53 @@ def dataset_transform(self):
)
self.dataset = datasets.DatasetDict(ds)
+
+
+class IndonesianMongabayConservationClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="IndonesianMongabayConservationClassification.v2",
+ description="""Conservation dataset that was collected from mongabay.co.id contains topic-classification task (multi-label format) and sentiment classification. This task only covers sentiment analysis (positive, neutral negative)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2023.sealp-1.4/",
+ dataset={
+ "path": "mteb/indonesian_mongabay_conservation",
+ "revision": "04863a3b6885470071f649a4d4dcd7e9d8e98cf8",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2012-01-01", "2023-12-31"),
+ eval_splits=["validation", "test"],
+ eval_langs=["ind-Latn"],
+ main_score="f1",
+ domains=["Web", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{fransiska-etal-2023-utilizing,
+ address = {Nusa Dua, Bali, Indonesia},
+ author = {Fransiska, Mega and
+Pitaloka, Diah and
+Saripudin, Saripudin and
+Putra, Satrio and
+Sutawika*, Lintang},
+ booktitle = {Proceedings of the First Workshop in South East Asian Language Processing},
+ doi = {10.18653/v1/2023.sealp-1.4},
+ editor = {Wijaya, Derry and
+Aji, Alham Fikri and
+Vania, Clara and
+Winata, Genta Indra and
+Purwarianti, Ayu},
+ month = nov,
+ pages = {30--54},
+ publisher = {Association for Computational Linguistics},
+ title = {Utilizing Weak Supervision to Generate {I}ndonesian Conservation Datasets},
+ url = {https://aclanthology.org/2023.sealp-1.4},
+ year = {2023},
+}
+""",
+ adapted_from=["IndonesianMongabayConservationClassification"],
+ )
diff --git a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py
index db6f371494..66d34839f8 100644
--- a/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py
+++ b/mteb/tasks/Classification/ita/ItalianLinguistAcceptabilityClassification.py
@@ -5,6 +5,7 @@
class ItalianLinguisticAcceptabilityClassification(AbsTaskClassification):
+ superseded_by = "Itacola.v2"
metadata = TaskMetadata(
name="Itacola",
dataset={
@@ -52,3 +53,47 @@ def dataset_transform(self):
.rename_columns({"sentence": "text"})
.remove_columns(["unique_id", "source"])
)
+
+
+class ItalianLinguisticAcceptabilityClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="Itacola.v2",
+ dataset={
+ "path": "mteb/italian_linguistic_acceptability",
+ "revision": "4550151a0f0433e65df172c088427063e376ce81",
+ },
+ description="""An Italian Corpus of Linguistic Acceptability taken from linguistic literature with a binary annotation made by the original authors themselves.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2021.findings-emnlp.250/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ita-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2021-12-31"),
+ domains=["Non-fiction", "Spoken", "Written"],
+ dialect=[],
+ task_subtypes=["Linguistic acceptability"],
+ license="not specified",
+ annotations_creators="expert-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{trotta-etal-2021-monolingual-cross,
+ address = {Punta Cana, Dominican Republic},
+ author = {Trotta, Daniela and
+Guarasci, Raffaele and
+Leonardelli, Elisa and
+Tonelli, Sara},
+ booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2021},
+ doi = {10.18653/v1/2021.findings-emnlp.250},
+ month = nov,
+ pages = {2929--2940},
+ publisher = {Association for Computational Linguistics},
+ title = {Monolingual and Cross-Lingual Acceptability Judgments with the {I}talian {C}o{LA} corpus},
+ url = {https://aclanthology.org/2021.findings-emnlp.250},
+ year = {2021},
+}
+""",
+ adapted_from=["ItalianLinguisticAcceptabilityClassification"],
+ )
diff --git a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py
index b0fa0144bd..2f9e613701 100644
--- a/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py
+++ b/mteb/tasks/Classification/jav/JavaneseIMDBClassification.py
@@ -5,6 +5,7 @@
class JavaneseIMDBClassification(AbsTaskClassification):
+ superseded_by = "JavaneseIMDBClassification.v2"
metadata = TaskMetadata(
name="JavaneseIMDBClassification",
description="Large Movie Review Dataset translated to Javanese. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets.",
@@ -43,3 +44,45 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class JavaneseIMDBClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="JavaneseIMDBClassification.v2",
+ description="""Large Movie Review Dataset translated to Javanese. This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/w11wo/nlp-datasets#javanese-imdb",
+ dataset={
+ "path": "mteb/javanese_imdb",
+ "revision": "47aadc77049fa4e7b9001c69a255555814d026d9",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2021-06-24", "2021-06-24"),
+ eval_splits=["test"],
+ eval_langs=["jav-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{wongso2021causal,
+ author = {Wongso, Wilson and Setiawan, David Samuel and Suhartono, Derwin},
+ booktitle = {2021 International Conference on Advanced Computer Science and Information Systems (ICACSIS)},
+ organization = {IEEE},
+ pages = {1--7},
+ title = {Causal and Masked Language Modeling of Javanese Language using Transformer-based Architectures},
+ year = {2021},
+}
+""",
+ adapted_from=["JavaneseIMDBClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/jpn/WRIMEClassification.py b/mteb/tasks/Classification/jpn/WRIMEClassification.py
index 893b092167..e4c5d8ec50 100644
--- a/mteb/tasks/Classification/jpn/WRIMEClassification.py
+++ b/mteb/tasks/Classification/jpn/WRIMEClassification.py
@@ -5,6 +5,7 @@
class WRIMEClassification(AbsTaskClassification):
+ superseded_by = "WRIMEClassification.v2"
metadata = TaskMetadata(
name="WRIMEClassification",
dataset={
@@ -68,3 +69,64 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class WRIMEClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WRIMEClassification.v2",
+ dataset={
+ "path": "mteb/wrime",
+ "revision": "6687c3bd031a0b144189958bad57db0b95a48dec",
+ "name": "ver2",
+ },
+ description="""A dataset of Japanese social network rated for sentiment
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2021.naacl-main.169/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["jpn-Jpan"],
+ main_score="accuracy",
+ date=("2011-06-01", "2020-05-31"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="https://huggingface.co/datasets/shunk031/wrime#licensing-information",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{kajiwara-etal-2021-wrime,
+ abstract = {We annotate 17,000 SNS posts with both the writer{'}s subjective emotional intensity and the reader{'}s objective one to construct a Japanese emotion analysis dataset. In this study, we explore the difference between the emotional intensity of the writer and that of the readers with this dataset. We found that the reader cannot fully detect the emotions of the writer, especially anger and trust. In addition, experimental results in estimating the emotional intensity show that it is more difficult to estimate the writer{'}s subjective labels than the readers{'}. The large gap between the subjective and objective emotions imply the complexity of the mapping from a post to the subjective emotion intensities, which also leads to a lower performance with machine learning models.},
+ address = {Online},
+ author = {Kajiwara, Tomoyuki and
+Chu, Chenhui and
+Takemura, Noriko and
+Nakashima, Yuta and
+Nagahara, Hajime},
+ booktitle = {Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies},
+ doi = {10.18653/v1/2021.naacl-main.169},
+ editor = {Toutanova, Kristina and
+Rumshisky, Anna and
+Zettlemoyer, Luke and
+Hakkani-Tur, Dilek and
+Beltagy, Iz and
+Bethard, Steven and
+Cotterell, Ryan and
+Chakraborty, Tanmoy and
+Zhou, Yichao},
+ month = jun,
+ pages = {2095--2104},
+ publisher = {Association for Computational Linguistics},
+ title = {{WRIME}: A New Dataset for Emotional Intensity Estimation with Subjective and Objective Annotations},
+ url = {https://aclanthology.org/2021.naacl-main.169},
+ year = {2021},
+}
+""",
+ adapted_from=["WRIMEClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/kan/KannadaNewsClassification.py b/mteb/tasks/Classification/kan/KannadaNewsClassification.py
index 4d3edcc6ca..5350d63e4f 100644
--- a/mteb/tasks/Classification/kan/KannadaNewsClassification.py
+++ b/mteb/tasks/Classification/kan/KannadaNewsClassification.py
@@ -5,6 +5,7 @@
class KannadaNewsClassification(AbsTaskClassification):
+ superseded_by = "KannadaNewsClassification.v2"
metadata = TaskMetadata(
name="KannadaNewsClassification",
description="The Kannada news dataset contains only the headlines of news article in three categories: Entertainment, Tech, and Sports. The data set contains around 6300 news article headlines which are collected from Kannada news websites. The data set has been cleaned and contains train and test set using which can be used to benchmark topic classification models in Kannada.",
@@ -41,3 +42,43 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class KannadaNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="KannadaNewsClassification.v2",
+ description="""The Kannada news dataset contains only the headlines of news article in three categories: Entertainment, Tech, and Sports. The data set contains around 6300 news article headlines which are collected from Kannada news websites. The data set has been cleaned and contains train and test set using which can be used to benchmark topic classification models in Kannada.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/kannada_news",
+ "revision": "bf9c88b5bd4e5b349a39492e5298a928ab509a92",
+ },
+ reference="https://github.com/goru001/nlp-for-kannada",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["kan-Knda"],
+ main_score="accuracy",
+ date=("2019-03-17", "2020-08-06"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{kunchukuttan2020indicnlpcorpus,
+ author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
+ journal = {arXiv preprint arXiv:2005.00085},
+ title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages},
+ year = {2020},
+}
+""",
+ adapted_from=["KannadaNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/kor/KlueTC.py b/mteb/tasks/Classification/kor/KlueTC.py
index bf878570ac..8278f85684 100644
--- a/mteb/tasks/Classification/kor/KlueTC.py
+++ b/mteb/tasks/Classification/kor/KlueTC.py
@@ -5,6 +5,7 @@
class KlueTC(AbsTaskClassification):
+ superseded_by = "KLUE-TC.v2"
metadata = TaskMetadata(
name="KLUE-TC",
dataset={
@@ -53,3 +54,46 @@ def id2str(example):
{"title": "text", "label": "label_id"}
)
self.dataset = self.dataset.map(id2str)
+
+
+class KlueTCV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="KLUE-TC.v2",
+ dataset={
+ "path": "mteb/klue_tc",
+ "name": "ynat",
+ "revision": "c0e3d82ac01def9bfd92dffb1e7dde619b50d0a2",
+ },
+ description="""Topic classification dataset of human-annotated news headlines. Part of the Korean Language Understanding Evaluation (KLUE).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2105.09680",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation"],
+ eval_langs=["kor-Hang"],
+ main_score="accuracy",
+ date=("2016-01-01", "2020-12-31"), # from 2016 to 2020
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-sa-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{park2021klue,
+ archiveprefix = {arXiv},
+ author = {Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho},
+ eprint = {2105.09680},
+ primaryclass = {cs.CL},
+ title = {KLUE: Korean Language Understanding Evaluation},
+ year = {2021},
+}
+""",
+ adapted_from=["KlueTC"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["validation"]
+ )
diff --git a/mteb/tasks/Classification/kor/KorHateClassification.py b/mteb/tasks/Classification/kor/KorHateClassification.py
index a9ec38fdef..d26a238fd3 100644
--- a/mteb/tasks/Classification/kor/KorHateClassification.py
+++ b/mteb/tasks/Classification/kor/KorHateClassification.py
@@ -5,6 +5,7 @@
class KorHateClassification(AbsTaskClassification):
+ superseded_by = "KorHateClassification.v2"
metadata = TaskMetadata(
name="KorHateClassification",
description="""The dataset was created to provide the first human-labeled Korean corpus for
@@ -57,3 +58,52 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class KorHateClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="KorHateClassification.v2",
+ description="""The dataset was created to provide the first human-labeled Korean corpus for
+ toxic speech detection from a Korean online entertainment news aggregator. Recently,
+ two young Korean celebrities suffered from a series of tragic incidents that led to two
+ major Korean web portals to close the comments section on their platform. However, this only
+ serves as a temporary solution, and the fundamental issue has not been solved yet. This dataset
+ hopes to improve Korean hate speech detection. Annotation was performed by 32 annotators,
+ consisting of 29 annotators from the crowdsourcing platform DeepNatural AI and three NLP researchers.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/kor_hate",
+ "revision": "5d64e6dcbe9204c934e9a3852b1130a6f2d51ad4",
+ },
+ reference="https://paperswithcode.com/dataset/korean-hatespeech-dataset",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["kor-Hang"],
+ main_score="accuracy",
+ date=("2018-01-01", "2020-01-01"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{moon2020beep,
+ archiveprefix = {arXiv},
+ author = {Jihyung Moon and Won Ik Cho and Junbum Lee},
+ eprint = {2005.12503},
+ primaryclass = {cs.CL},
+ title = {BEEP! Korean Corpus of Online News Comments for Toxic Speech Detection},
+ year = {2020},
+}
+""",
+ adapted_from=["KorHateClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/kor/KorSarcasmClassification.py b/mteb/tasks/Classification/kor/KorSarcasmClassification.py
index abae7c8222..0461db81c6 100644
--- a/mteb/tasks/Classification/kor/KorSarcasmClassification.py
+++ b/mteb/tasks/Classification/kor/KorSarcasmClassification.py
@@ -5,6 +5,7 @@
class KorSarcasmClassification(AbsTaskClassification):
+ superseded_by = "KorSarcasmClassification.v2"
metadata = TaskMetadata(
name="KorSarcasmClassification",
description="""
@@ -51,3 +52,53 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class KorSarcasmClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="KorSarcasmClassification.v2",
+ description="""
+ The Korean Sarcasm Dataset was created to detect sarcasm in text, which can significantly alter the original
+ meaning of a sentence. 9319 tweets were collected from Twitter and labeled for sarcasm or not_sarcasm. These
+ tweets were gathered by querying for: irony sarcastic, and
+ sarcasm.
+ The dataset was created by gathering HTML data from Twitter. Queries for hashtags that include sarcasm
+ and variants of it were used to return tweets. It was preprocessed by removing the keyword
+ hashtag, urls and mentions of the user to preserve anonymity.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/kor_sarcasm",
+ "revision": "0e5e17b4dba569776e445f5639ba13dc406b2b0e",
+ },
+ reference="https://github.com/SpellOnYou/korean-sarcasm",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["kor-Hang"],
+ main_score="accuracy",
+ date=("2018-10-31", "2019-09-28"), # estimated based on git history
+ domains=["Social", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{kim2019kocasm,
+ author = {Kim, Jiwon and Cho, Won Ik},
+ howpublished = {https://github.com/SpellOnYou/korean-sarcasm},
+ journal = {GitHub repository},
+ publisher = {GitHub},
+ title = {Kocasm: Korean Automatic Sarcasm Detection},
+ year = {2019},
+}
+""",
+ adapted_from=["KorSarcasmClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py
index 876b7450fd..d9d4bac595 100644
--- a/mteb/tasks/Classification/kur/KurdishSentimentClassification.py
+++ b/mteb/tasks/Classification/kur/KurdishSentimentClassification.py
@@ -5,6 +5,7 @@
class KurdishSentimentClassification(AbsTaskClassification):
+ superseded_by = "KurdishSentimentClassification.v2"
metadata = TaskMetadata(
name="KurdishSentimentClassification",
description="Kurdish Sentiment Dataset",
@@ -38,3 +39,41 @@ class KurdishSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class KurdishSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="KurdishSentimentClassification.v2",
+ description="""Kurdish Sentiment Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://link.springer.com/article/10.1007/s10579-023-09716-6",
+ dataset={
+ "path": "mteb/kurdish_sentiment",
+ "revision": "f6b00b2a1fcbffd83f10a76c85f246ca750c83d2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["kur-Arab"],
+ main_score="accuracy",
+ date=("2023-01-01", "2024-01-02"),
+ domains=["Web", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=["Sorani"],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{article,
+ author = {Badawi, Soran and Kazemi, Arefeh and Rezaie, Vali},
+ doi = {10.1007/s10579-023-09716-6},
+ journal = {Language Resources and Evaluation},
+ month = {01},
+ pages = {1-20},
+ title = {KurdiSent: a corpus for kurdish sentiment analysis},
+ year = {2024},
+}
+""",
+ adapted_from=["KurdishSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py
index 689e7688ac..991311b5cb 100644
--- a/mteb/tasks/Classification/mal/MalayalamNewsClassification.py
+++ b/mteb/tasks/Classification/mal/MalayalamNewsClassification.py
@@ -5,6 +5,7 @@
class MalayalamNewsClassification(AbsTaskClassification):
+ superseded_by = "MalayalamNewsClassification.v2"
metadata = TaskMetadata(
name="MalayalamNewsClassification",
description="A Malayalam dataset for 3-class classification of Malayalam news articles",
@@ -38,3 +39,38 @@ class MalayalamNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"headings": "text"})
+
+
+class MalayalamNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MalayalamNewsClassification.v2",
+ description="""A Malayalam dataset for 3-class classification of Malayalam news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/goru001/nlp-for-malyalam",
+ dataset={
+ "path": "mteb/malayalam_news",
+ "revision": "2bb37780ab4a68cb0b28a902059463563b2dbab9",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["mal-Mlym"],
+ main_score="accuracy",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{kunchukuttan2020indicnlpcorpus,
+ author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
+ journal = {arXiv preprint arXiv:2005.00085},
+ title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages},
+ year = {2020},
+}
+""",
+ adapted_from=["MalayalamNewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/mar/MarathiNewsClassification.py b/mteb/tasks/Classification/mar/MarathiNewsClassification.py
index 4f652e2373..152561ed4e 100644
--- a/mteb/tasks/Classification/mar/MarathiNewsClassification.py
+++ b/mteb/tasks/Classification/mar/MarathiNewsClassification.py
@@ -5,6 +5,7 @@
class MarathiNewsClassification(AbsTaskClassification):
+ superseded_by = "MarathiNewsClassification.v2"
metadata = TaskMetadata(
name="MarathiNewsClassification",
description="A Marathi dataset for 3-class classification of Marathi news articles",
@@ -39,3 +40,38 @@ class MarathiNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"headline": "text"})
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
+
+
+class MarathiNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MarathiNewsClassification.v2",
+ description="""A Marathi dataset for 3-class classification of Marathi news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/goru001/nlp-for-marathi",
+ dataset={
+ "path": "mteb/marathi_news",
+ "revision": "97932a2f3b75d7bd9fae0d212975c1a1568935eb",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["mar-Deva"],
+ main_score="f1",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{kunchukuttan2020indicnlpcorpus,
+ author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
+ journal = {arXiv preprint arXiv:2005.00085},
+ title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages},
+ year = {2020},
+}
+""",
+ adapted_from=["MarathiNewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py
index 3eb6b2dc81..b2283b172c 100644
--- a/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py
+++ b/mteb/tasks/Classification/mkd/MacedonianTweetSentimentClassification.py
@@ -5,6 +5,7 @@
class MacedonianTweetSentimentClassification(AbsTaskClassification):
+ superseded_by = "MacedonianTweetSentimentClassification.v2"
metadata = TaskMetadata(
name="MacedonianTweetSentimentClassification",
description="An Macedonian dataset for tweet sentiment classification.",
@@ -45,3 +46,48 @@ class MacedonianTweetSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class MacedonianTweetSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MacedonianTweetSentimentClassification.v2",
+ description="""A Macedonian dataset for tweet sentiment classification.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/R15-1034/",
+ dataset={
+ "path": "mteb/macedonian_tweet_sentiment",
+ "revision": "3a8d98dc743809835255f727698d09814b699126",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-11-01", "2015-04-01"),
+ eval_splits=["test"],
+ eval_langs=["mkd-Cyrl"],
+ main_score="accuracy",
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-3.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{jovanoski-etal-2015-sentiment,
+ address = {Hissar, Bulgaria},
+ author = {Jovanoski, Dame and
+Pachovski, Veno and
+Nakov, Preslav},
+ booktitle = {Proceedings of the International Conference Recent Advances in Natural Language Processing},
+ editor = {Mitkov, Ruslan and
+Angelova, Galia and
+Bontcheva, Kalina},
+ month = sep,
+ pages = {249--257},
+ publisher = {INCOMA Ltd. Shoumen, BULGARIA},
+ title = {Sentiment Analysis in {T}witter for {M}acedonian},
+ url = {https://aclanthology.org/R15-1034},
+ year = {2015},
+}
+""",
+ adapted_from=["MacedonianTweetSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/rus/ru_nlu_intent_classification.py b/mteb/tasks/Classification/multilingual/ru_nlu_intent_classification.py
similarity index 100%
rename from mteb/tasks/Classification/rus/ru_nlu_intent_classification.py
rename to mteb/tasks/Classification/multilingual/ru_nlu_intent_classification.py
diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py
index f0aed9a7c2..6c67154477 100644
--- a/mteb/tasks/Classification/mya/MyanmarNews.py
+++ b/mteb/tasks/Classification/mya/MyanmarNews.py
@@ -5,6 +5,7 @@
class MyanmarNews(AbsTaskClassification):
+ superseded_by = "MyanmarNews.v2"
metadata = TaskMetadata(
name="MyanmarNews",
dataset={
@@ -37,3 +38,40 @@ class MyanmarNews(AbsTaskClassification):
}
""",
)
+
+
+class MyanmarNewsV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MyanmarNews.v2",
+ dataset={
+ "path": "mteb/myanmar_news",
+ "revision": "475b43ffbdb5138ad67a01a2c860bc7db502f3c5",
+ },
+ description="""The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categories, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/myanmar_news",
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["mya-Mymr"],
+ main_score="accuracy",
+ date=("2017-10-01", "2017-10-31"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="gpl-3.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Khine2017,
+ author = {A. H. Khine and K. T. Nwet and K. M. Soe},
+ booktitle = {15th Proceedings of International Conference on Computer Applications},
+ month = {February},
+ pages = {401--408},
+ title = {Automatic Myanmar News Classification},
+ year = {2017},
+}
+""",
+ adapted_from=["MyanmarNews"],
+ )
diff --git a/mteb/tasks/Classification/nep/NepaliNewsClassification.py b/mteb/tasks/Classification/nep/NepaliNewsClassification.py
index d266e38a6b..010d39b1c7 100644
--- a/mteb/tasks/Classification/nep/NepaliNewsClassification.py
+++ b/mteb/tasks/Classification/nep/NepaliNewsClassification.py
@@ -5,6 +5,7 @@
class NepaliNewsClassification(AbsTaskClassification):
+ superseded_by = "NepaliNewsClassification.v2"
metadata = TaskMetadata(
name="NepaliNewsClassification",
description="A Nepali dataset for 7500 news articles ",
@@ -54,3 +55,56 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class NepaliNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="NepaliNewsClassification.v2",
+ description="""A Nepali dataset for 7500 news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/goru001/nlp-for-nepali",
+ dataset={
+ "path": "mteb/nepali_news",
+ "revision": "1e5e6cd30972f05f0f21af38bd3a887714d41938",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2019-01-01", "2020-01-01"),
+ eval_splits=["test"],
+ eval_langs=["nep-Deva"],
+ main_score="accuracy",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{arora-2020-inltk,
+ abstract = {We present iNLTK, an open-source NLP library consisting of pre-trained language models and out-of-the-box support for Data Augmentation, Textual Similarity, Sentence Embeddings, Word Embeddings, Tokenization and Text Generation in 13 Indic Languages. By using pre-trained models from iNLTK for text classification on publicly available datasets, we significantly outperform previously reported results. On these datasets, we also show that by using pre-trained models and data augmentation from iNLTK, we can achieve more than 95{\%} of the previous best performance by using less than 10{\%} of the training data. iNLTK is already being widely used by the community and has 40,000+ downloads, 600+ stars and 100+ forks on GitHub.},
+ address = {Online},
+ author = {Arora, Gaurav},
+ booktitle = {Proceedings of Second Workshop for NLP Open Source Software (NLP-OSS)},
+ doi = {10.18653/v1/2020.nlposs-1.10},
+ editor = {Park, Eunjeong L. and
+Hagiwara, Masato and
+Milajevs, Dmitrijs and
+Liu, Nelson F. and
+Chauhan, Geeticka and
+Tan, Liling},
+ month = nov,
+ pages = {66--71},
+ publisher = {Association for Computational Linguistics},
+ title = {i{NLTK}: Natural Language Toolkit for Indic Languages},
+ url = {https://aclanthology.org/2020.nlposs-1.10},
+ year = {2020},
+}
+""",
+ adapted_from=["NepaliNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
index 08c233b1ec..ed9236be3a 100644
--- a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class DutchBookReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "DutchBookReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="DutchBookReviewSentimentClassification",
description="A Dutch book review for sentiment classification.",
@@ -44,3 +45,47 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class DutchBookReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DutchBookReviewSentimentClassification.v2",
+ description="""A Dutch book review for sentiment classification.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/benjaminvdb/DBRD",
+ dataset={
+ "path": "mteb/dutch_book_review_sentiment",
+ "revision": "73cffedb578b628588db03b8608880cf95688bb2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2019-10-04", "2019-10-04"),
+ eval_splits=["test"],
+ eval_langs=["nld-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{DBLP:journals/corr/abs-1910-00896,
+ archiveprefix = {arXiv},
+ author = {van der Burgh, Benjamin and
+Verberne, Suzan},
+ bibsource = {dblp computer science bibliography, https://dblp.org},
+ biburl = {https://dblp.org/rec/journals/corr/abs-1910-00896.bib},
+ eprint = {1910.00896},
+ journal = {CoRR},
+ timestamp = {Fri, 04 Oct 2019 12:28:06 +0200},
+ title = {The merits of Universal Language Model Fine-tuning for Small Datasets
+- a case with Dutch book reviews},
+ url = {http://arxiv.org/abs/1910.00896},
+ volume = {abs/1910.00896},
+ year = {2019},
+}
+""",
+ adapted_from=["DutchBookReviewSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/nob/NoRecClassification.py b/mteb/tasks/Classification/nob/NoRecClassification.py
index 3dfa084b0e..cdba137eb2 100644
--- a/mteb/tasks/Classification/nob/NoRecClassification.py
+++ b/mteb/tasks/Classification/nob/NoRecClassification.py
@@ -5,6 +5,7 @@
class NoRecClassification(AbsTaskClassification):
+ superseded_by = "NoRecClassification.v2"
metadata = TaskMetadata(
name="NoRecClassification",
description="A Norwegian dataset for sentiment classification on review",
@@ -60,3 +61,63 @@ class NoRecClassification(AbsTaskClassification):
""",
prompt="Classify Norwegian reviews by sentiment",
)
+
+
+class NoRecClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="NoRecClassification.v2",
+ description="""A Norwegian dataset for sentiment classification on review
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/L18-1661/",
+ dataset={
+ # using the mini version to keep results ~comparable to the ScandEval benchmark
+ "path": "mteb/no_rec",
+ "revision": "10aae1fb3fe2c19888bd4ea11695bbf19aa8bed3",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["nob-Latn"],
+ main_score="accuracy",
+ date=("1998-01-01", "2018-01-01"), # based on plot in paper
+ domains=["Written", "Reviews"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{velldal-etal-2018-norec,
+ address = {Miyazaki, Japan},
+ author = {Velldal, Erik and
+{\O}vrelid, Lilja and
+Bergem, Eivind Alexander and
+Stadsnes, Cathrine and
+Touileb, Samia and
+J{\o}rgensen, Fredrik},
+ booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation ({LREC} 2018)},
+ editor = {Calzolari, Nicoletta and
+Choukri, Khalid and
+Cieri, Christopher and
+Declerck, Thierry and
+Goggi, Sara and
+Hasida, Koiti and
+Isahara, Hitoshi and
+Maegaard, Bente and
+Mariani, Joseph and
+Mazo, H{\'e}l{\`e}ne and
+Moreno, Asuncion and
+Odijk, Jan and
+Piperidis, Stelios and
+Tokunaga, Takenobu},
+ month = may,
+ publisher = {European Language Resources Association (ELRA)},
+ title = {{N}o{R}e{C}: The {N}orwegian Review Corpus},
+ url = {https://aclanthology.org/L18-1661},
+ year = {2018},
+}
+""",
+ prompt="Classify Norwegian reviews by sentiment",
+ adapted_from=["NoRecClassification"],
+ )
diff --git a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py
index b91c704063..03fbf0c381 100644
--- a/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py
+++ b/mteb/tasks/Classification/nob/NorwegianParliamentClassification.py
@@ -5,6 +5,7 @@
class NorwegianParliamentClassification(AbsTaskClassification):
+ superseded_by = "NorwegianParliamentClassification.v2"
metadata = TaskMetadata(
name="NorwegianParliamentClassification",
description="Norwegian parliament speeches annotated for sentiment",
@@ -49,3 +50,51 @@ class NorwegianParliamentClassification(AbsTaskClassification):
""",
prompt="Classify parliament speeches in Norwegian based on political affiliation",
)
+
+
+class NorwegianParliamentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="NorwegianParliamentClassification.v2",
+ description="""Norwegian parliament speeches annotated for sentiment
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/NbAiLab/norwegian_parliament",
+ dataset={
+ "path": "mteb/norwegian_parliament",
+ "revision": "7f2012a878e67486ac871cb450d6ef0dc2ebed7f",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test", "validation"],
+ eval_langs=["nob-Latn"],
+ # assumed to be bokmål
+ main_score="accuracy",
+ date=("1999-01-01", "2016-01-01"), # based on dates within the dataset
+ domains=["Government", "Spoken"],
+ task_subtypes=["Political classification"],
+ license="cc-by-4.0",
+ annotations_creators="derived", # based on the speaker affiliation
+ dialect=[], # unknown
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{kummervold-etal-2021-operationalizing,
+ abstract = {In this work, we show the process of building a large-scale training set from digital and digitized collections at a national library. The resulting Bidirectional Encoder Representations from Transformers (BERT)-based language model for Norwegian outperforms multilingual BERT (mBERT) models in several token and sequence classification tasks for both Norwegian Bokm{\aa}l and Norwegian Nynorsk. Our model also improves the mBERT performance for other languages present in the corpus such as English, Swedish, and Danish. For languages not included in the corpus, the weights degrade moderately while keeping strong multilingual properties. Therefore, we show that building high-quality models within a memory institution using somewhat noisy optical character recognition (OCR) content is feasible, and we hope to pave the way for other memory institutions to follow.},
+ address = {Reykjavik, Iceland (Online)},
+ author = {Kummervold, Per E and
+De la Rosa, Javier and
+Wetjen, Freddy and
+Brygfjeld, Svein Arne},
+ booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)},
+ editor = {Dobnik, Simon and
+{\O}vrelid, Lilja},
+ month = may # { 31--2 } # jun,
+ pages = {20--29},
+ publisher = {Link{\"o}ping University Electronic Press, Sweden},
+ title = {Operationalizing a National Digital Library: The Case for a {N}orwegian Transformer Model},
+ url = {https://aclanthology.org/2021.nodalida-main.3},
+ year = {2021},
+}
+""",
+ prompt="Classify parliament speeches in Norwegian based on political affiliation",
+ adapted_from=["NorwegianParliamentClassification"],
+ )
diff --git a/mteb/tasks/Classification/ory/OdiaNewsClassification.py b/mteb/tasks/Classification/ory/OdiaNewsClassification.py
index 214b8f67f3..dab2f279a8 100644
--- a/mteb/tasks/Classification/ory/OdiaNewsClassification.py
+++ b/mteb/tasks/Classification/ory/OdiaNewsClassification.py
@@ -5,6 +5,7 @@
class OdiaNewsClassification(AbsTaskClassification):
+ superseded_by = "OdiaNewsClassification.v2"
metadata = TaskMetadata(
name="OdiaNewsClassification",
description="A Odia dataset for 3-class classification of Odia news articles",
@@ -39,3 +40,41 @@ class OdiaNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"headings": "text"})
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
+
+
+class OdiaNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="OdiaNewsClassification.v2",
+        description="""An Odia dataset for 3-class classification of Odia news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/goru001/nlp-for-odia",
+ dataset={
+ "path": "mteb/odia_news",
+ "revision": "a594be3144fdf15c1a7efcb0aa7484cbb9a8dba3",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["ory-Orya"],
+ main_score="f1",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{kunchukuttan2020indicnlpcorpus,
+ author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
+ journal = {arXiv preprint arXiv:2005.00085},
+ title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages},
+ year = {2020},
+}
+""",
+ adapted_from=["OdiaNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
diff --git a/mteb/tasks/Classification/pol/PolishClassification.py b/mteb/tasks/Classification/pol/PolishClassification.py
index d75a37a362..ad63a10da5 100644
--- a/mteb/tasks/Classification/pol/PolishClassification.py
+++ b/mteb/tasks/Classification/pol/PolishClassification.py
@@ -5,6 +5,7 @@
class CbdClassification(AbsTaskClassification):
+ superseded_by = "CBD.v2"
metadata = TaskMetadata(
name="CBD",
description="Polish Tweets annotated for cyberbullying detection.",
@@ -40,7 +41,46 @@ class CbdClassification(AbsTaskClassification):
)
+class CbdClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="CBD.v2",
+ description="""Polish Tweets annotated for cyberbullying detection.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="http://2019.poleval.pl/files/poleval2019.pdf",
+ dataset={
+ "path": "mteb/cbd",
+ "revision": "d962699e284a173179a05052b49d0a9001a25bc0",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["pol-Latn"],
+ main_score="accuracy",
+ date=("2019-01-01", "2019-12-31"), # best guess: based on publication date
+ domains=["Written", "Social"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="bsd-3-clause",
+ annotations_creators="human-annotated", # guess
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@proceedings{ogr:kob:19:poleval,
+ address = {Warsaw, Poland},
+ editor = {Maciej Ogrodniczuk and Łukasz Kobyliński},
+ isbn = {978-83-63159-28-3},
+ publisher = {Institute of Computer Science, Polish Academy of Sciences},
+ title = {{Proceedings of the PolEval 2019 Workshop}},
+ url = {http://2019.poleval.pl/files/poleval2019.pdf},
+ year = {2019},
+}
+""",
+ adapted_from=["CbdClassification"],
+ )
+
+
class PolEmo2InClassification(AbsTaskClassification):
+ superseded_by = "PolEmo2.0-IN.v2"
metadata = TaskMetadata(
name="PolEmo2.0-IN",
description="A collection of Polish online reviews from four domains: medicine, hotels, products and "
@@ -83,7 +123,52 @@ class PolEmo2InClassification(AbsTaskClassification):
)
+class PolEmo2InClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PolEmo2.0-IN.v2",
+ description="A collection of Polish online reviews from four domains: medicine, hotels, products and "
+ + "school. The PolEmo2.0-IN task is to predict the sentiment of in-domain (medicine and hotels) reviews.",
+ reference="https://aclanthology.org/K19-1092.pdf",
+ dataset={
+ "path": "mteb/pol_emo2_in",
+ "revision": "15f86f0432cd7c91437cf7c673993527e2f53fd8",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["pol-Latn"],
+ main_score="accuracy",
+ date=("2004-01-01", "2019-05-30"), # based on plot in paper
+ domains=["Written", "Social"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{kocon-etal-2019-multi,
+ abstract = {In this article we present an extended version of PolEmo {--} a corpus of consumer reviews from 4 domains: medicine, hotels, products and school. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and sentence was manually annotated with sentiment in 2+1 scheme, which gives a total of 197,046 annotations. We obtained a high value of Positive Specific Agreement, which is 0.91 for texts and 0.88 for sentences. PolEmo 2.0 is publicly available under a Creative Commons copyright license. We explored recent deep learning approaches for the recognition of sentiment, such as Bi-directional Long Short-Term Memory (BiLSTM) and Bidirectional Encoder Representations from Transformers (BERT).},
+ address = {Hong Kong, China},
+ author = {Koco{\'n}, Jan and
+Mi{\l}kowski, Piotr and
+Za{\'s}ko-Zieli{\'n}ska, Monika},
+ booktitle = {Proceedings of the 23rd Conference on Computational Natural Language Learning (CoNLL)},
+ doi = {10.18653/v1/K19-1092},
+ month = nov,
+ pages = {980--991},
+ publisher = {Association for Computational Linguistics},
+ title = {Multi-Level Sentiment Analysis of {P}ol{E}mo 2.0: Extended Corpus of Multi-Domain Consumer Reviews},
+ url = {https://aclanthology.org/K19-1092},
+ year = {2019},
+}
+""",
+ adapted_from=["PolEmo2InClassification"],
+ )
+
+
class PolEmo2OutClassification(AbsTaskClassification):
+ superseded_by = "PolEmo2.0-OUT.v2"
metadata = TaskMetadata(
name="PolEmo2.0-OUT",
description="A collection of Polish online reviews from four domains: medicine, hotels, products and "
@@ -111,7 +196,37 @@ class PolEmo2OutClassification(AbsTaskClassification):
)
+class PolEmo2OutClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PolEmo2.0-OUT.v2",
+ description="A collection of Polish online reviews from four domains: medicine, hotels, products and "
+ + "school. The PolEmo2.0-OUT task is to predict the sentiment of out-of-domain (products and "
+ + "school) reviews using models train on reviews from medicine and hotels domains.",
+ reference="https://aclanthology.org/K19-1092.pdf",
+ dataset={
+ "path": "mteb/pol_emo2_out",
+ "revision": "f7f3752b56dcbc4c84077274dfa687efa38476fb",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["pol-Latn"],
+ main_score="accuracy",
+ date=("2004-01-01", "2019-05-30"), # based on plot in paper
+ domains=["Written", "Social"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation="",
+ adapted_from=["PolEmo2OutClassification"],
+ )
+
+
class AllegroReviewsClassification(AbsTaskClassification):
+ superseded_by = "AllegroReviews.v2"
metadata = TaskMetadata(
name="AllegroReviews",
description="A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.",
@@ -161,7 +276,60 @@ class AllegroReviewsClassification(AbsTaskClassification):
)
+class AllegroReviewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="AllegroReviews.v2",
+ description="""A Polish dataset for sentiment classification on reviews from e-commerce marketplace Allegro.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2020.acl-main.111.pdf",
+ dataset={
+ "path": "mteb/allegro_reviews",
+ "revision": "5233456d195235bf93f45b8ef54d72f72957dbf1",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["pol-Latn"],
+ main_score="accuracy",
+ date=(
+ "2020-06-22",
+ "2020-07-07",
+ ), # best guess: based on commit dates in https://github.com/allegro/klejbenchmark-baselines
+ domains=["Reviews"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{rybak-etal-2020-klej,
+ abstract = {In recent years, a series of Transformer-based models unlocked major improvements in general natural language understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language, which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based models.},
+ address = {Online},
+ author = {Rybak, Piotr and
+Mroczkowski, Robert and
+Tracz, Janusz and
+Gawlik, Ireneusz},
+ booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
+ doi = {10.18653/v1/2020.acl-main.111},
+ editor = {Jurafsky, Dan and
+Chai, Joyce and
+Schluter, Natalie and
+Tetreault, Joel},
+ month = jul,
+ pages = {1191--1201},
+ publisher = {Association for Computational Linguistics},
+ title = {{KLEJ}: Comprehensive Benchmark for {P}olish Language Understanding},
+ url = {https://aclanthology.org/2020.acl-main.111/},
+ year = {2020},
+}
+""",
+ adapted_from=["AllegroReviewsClassification"],
+ )
+
+
class PacClassification(AbsTaskClassification):
+ superseded_by = "PAC.v2"
metadata = TaskMetadata(
name="PAC",
description="Polish Paraphrase Corpus",
@@ -196,3 +364,41 @@ class PacClassification(AbsTaskClassification):
}
""",
)
+
+
+class PacClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="PAC.v2",
+ description="""Polish Paraphrase Corpus
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/pdf/2211.13112.pdf",
+ dataset={
+ "path": "mteb/pac",
+ "revision": "53c98e6a9173c550f1b60f0da9152e67e9618897",
+ },
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["pol-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2021-12-31"), # best guess: based on publication date
+ domains=["Legal", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators=None,
+ dialect=[],
+ sample_creation=None,
+ bibtex_citation=r"""
+@misc{augustyniak2022waydesigningcompilinglepiszcze,
+ archiveprefix = {arXiv},
+ author = {Łukasz Augustyniak and Kamil Tagowski and Albert Sawczyn and Denis Janiak and Roman Bartusiak and Adrian Szymczak and Marcin Wątroba and Arkadiusz Janz and Piotr Szymański and Mikołaj Morzy and Tomasz Kajdanowicz and Maciej Piasecki},
+ eprint = {2211.13112},
+ primaryclass = {cs.CL},
+ title = {This is the way: designing and compiling LEPISZCZE, a comprehensive NLP benchmark for Polish},
+ url = {https://arxiv.org/abs/2211.13112},
+ year = {2022},
+}
+""",
+ adapted_from=["PacClassification"],
+ )
diff --git a/mteb/tasks/Classification/ron/Moroco.py b/mteb/tasks/Classification/ron/Moroco.py
index 479dd5a1d4..c8114a3c82 100644
--- a/mteb/tasks/Classification/ron/Moroco.py
+++ b/mteb/tasks/Classification/ron/Moroco.py
@@ -7,6 +7,7 @@
class Moroco(AbsTaskClassification):
+ superseded_by = "Moroco.v2"
metadata = TaskMetadata(
name="Moroco",
dataset={
@@ -50,3 +51,47 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class MorocoV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="Moroco.v2",
+ dataset={
+ "path": "mteb/moroco",
+ "revision": "6e70588dbd3d583da8b85989c1c3ab3d4bd2e7c4",
+ },
+ description="""The Moldavian and Romanian Dialectal Corpus. The MOROCO data set contains Moldavian and Romanian samples of text collected from the news domain. The samples belong to one of the following six topics: (0) culture, (1) finance, (2) politics, (3) science, (4) sports, (5) tech
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/moroco",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ron-Latn"],
+ main_score="accuracy",
+ date=("2017-10-01", "2017-10-31"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=[
+ "ron-Latn-ron",
+ "ron-Latn-mol",
+ ], # Moldavian, or the Romanian dialect used in Moldova, does not have an ISO 639-1 code assigned to it. However, it has been given the three-letter code "mol" under ISO 639-3
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Butnaru-ACL-2019,
+ author = {Andrei M. Butnaru and Radu Tudor Ionescu},
+ booktitle = {Proceedings of ACL},
+ pages = {688--698},
+ title = {{MOROCO: The Moldavian and Romanian Dialectal Corpus}},
+ year = {2019},
+}
+""",
+ adapted_from=["Moroco"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py
index be06de0fa5..fd39716cc7 100644
--- a/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py
+++ b/mteb/tasks/Classification/ron/RomanianReviewsSentiment.py
@@ -5,6 +5,7 @@
class RomanianReviewsSentiment(AbsTaskClassification):
+ superseded_by = "RomanianReviewsSentiment.v2"
metadata = TaskMetadata(
name="RomanianReviewsSentiment",
description="LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian",
@@ -44,3 +45,43 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class RomanianReviewsSentimentV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="RomanianReviewsSentiment.v2",
+ description="""LaRoSeDa (A Large Romanian Sentiment Data Set) contains 15,000 reviews written in Romanian
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2101.04197",
+ dataset={
+ "path": "mteb/romanian_reviews_sentiment",
+ "revision": "6b320d55fcf5fc184a9e7cc828debb34f7949432",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2020-01-01", "2021-01-11"),
+ eval_splits=["test"],
+ eval_langs=["ron-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{tache2101clustering,
+ author = {Anca Maria Tache and Mihaela Gaman and Radu Tudor Ionescu},
+ journal = {ArXiv},
+ title = {Clustering Word Embeddings with Self-Organizing Maps. Application on LaRoSeDa -- A Large Romanian Sentiment Data Set},
+ year = {2021},
+}
+""",
+ adapted_from=["RomanianReviewsSentiment"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py
index 3622620d50..9375a62dc3 100644
--- a/mteb/tasks/Classification/ron/RomanianSentimentClassification.py
+++ b/mteb/tasks/Classification/ron/RomanianSentimentClassification.py
@@ -7,6 +7,7 @@
class RomanianSentimentClassification(AbsTaskClassification):
+ superseded_by = "RomanianSentimentClassification.v2"
metadata = TaskMetadata(
name="RomanianSentimentClassification",
description="An Romanian dataset for sentiment classification.",
@@ -44,3 +45,43 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class RomanianSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="RomanianSentimentClassification.v2",
+        description="""A Romanian dataset for sentiment classification.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2009.08712",
+ dataset={
+ "path": "mteb/romanian_sentiment",
+ "revision": "bf545b83db13cf73ed402749b21a7777e0afdc6a",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2020-09-18", "2020-09-18"),
+ eval_splits=["test"],
+ eval_langs=["ron-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{dumitrescu2020birth,
+ author = {Dumitrescu, Stefan Daniel and Avram, Andrei-Marius and Pyysalo, Sampo},
+ journal = {arXiv preprint arXiv:2009.08712},
+ title = {The birth of Romanian BERT},
+ year = {2020},
+}
+""",
+ adapted_from=["RomanianSentimentClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/rus/GeoreviewClassification.py b/mteb/tasks/Classification/rus/GeoreviewClassification.py
index 3a9298ead2..f5f9efb1e3 100644
--- a/mteb/tasks/Classification/rus/GeoreviewClassification.py
+++ b/mteb/tasks/Classification/rus/GeoreviewClassification.py
@@ -5,6 +5,7 @@
class GeoreviewClassification(AbsTaskClassification):
+ superseded_by = "GeoreviewClassification.v2"
metadata = TaskMetadata(
name="GeoreviewClassification",
dataset={
@@ -35,3 +36,37 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)
+
+
+class GeoreviewClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="GeoreviewClassification.v2",
+ dataset={
+ "path": "mteb/georeview",
+ "revision": "5194395f82217bc31212fd6a275002fb405f9dfb",
+ },
+ description="""Review classification (5-point scale) based on Yandex Georeview dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/yandex/geo-reviews-dataset-2023",
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2023-01-01", "2023-08-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="",
+ prompt="Classify the organization rating based on the reviews",
+ adapted_from=["GeoreviewClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/rus/HeadlineClassification.py b/mteb/tasks/Classification/rus/HeadlineClassification.py
index 9def591d0c..2d4725f221 100644
--- a/mteb/tasks/Classification/rus/HeadlineClassification.py
+++ b/mteb/tasks/Classification/rus/HeadlineClassification.py
@@ -5,6 +5,7 @@
class HeadlineClassification(AbsTaskClassification):
+ superseded_by = "HeadlineClassification.v2"
metadata = TaskMetadata(
name="HeadlineClassification",
dataset={
@@ -59,3 +60,62 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)
+
+
+class HeadlineClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="HeadlineClassification.v2",
+ dataset={
+ "path": "mteb/headline",
+ "revision": "6bd88e7778ee2e3bd8d0ade1be3ad5b6d969145a",
+ },
+ description="""Headline rubric classification based on the paraphraser plus dataset.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2020.ngt-1.6/",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2009-01-01", "2020-01-01"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{gudkov-etal-2020-automatically,
+ abstract = {The article is focused on automatic development and ranking of a large corpus for Russian paraphrase generation which proves to be the first corpus of such type in Russian computational linguistics. Existing manually annotated paraphrase datasets for Russian are limited to small-sized ParaPhraser corpus and ParaPlag which are suitable for a set of NLP tasks, such as paraphrase and plagiarism detection, sentence similarity and relatedness estimation, etc. Due to size restrictions, these datasets can hardly be applied in end-to-end text generation solutions. Meanwhile, paraphrase generation requires a large amount of training data. In our study we propose a solution to the problem: we collect, rank and evaluate a new publicly available headline paraphrase corpus (ParaPhraser Plus), and then perform text generation experiments with manual evaluation on automatically ranked corpora using the Universal Transformer architecture.},
+ address = {Online},
+ author = {Gudkov, Vadim and
+Mitrofanova, Olga and
+Filippskikh, Elizaveta},
+ booktitle = {Proceedings of the Fourth Workshop on Neural Generation and Translation},
+ doi = {10.18653/v1/2020.ngt-1.6},
+ editor = {Birch, Alexandra and
+Finch, Andrew and
+Hayashi, Hiroaki and
+Heafield, Kenneth and
+Junczys-Dowmunt, Marcin and
+Konstas, Ioannis and
+Li, Xian and
+Neubig, Graham and
+Oda, Yusuke},
+ month = jul,
+ pages = {54--59},
+ publisher = {Association for Computational Linguistics},
+ title = {Automatically Ranked {R}ussian Paraphrase Corpus for Text Generation},
+ url = {https://aclanthology.org/2020.ngt-1.6},
+ year = {2020},
+}
+""",
+ prompt="Classify the topic or theme of the given news headline",
+ adapted_from=["HeadlineClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/rus/InappropriatenessClassification.py b/mteb/tasks/Classification/rus/InappropriatenessClassification.py
index 7ff0ed11b2..0fea5995ce 100644
--- a/mteb/tasks/Classification/rus/InappropriatenessClassification.py
+++ b/mteb/tasks/Classification/rus/InappropriatenessClassification.py
@@ -5,6 +5,7 @@
class InappropriatenessClassification(AbsTaskClassification):
+ superseded_by = "InappropriatenessClassification.v2"
metadata = TaskMetadata(
name="InappropriatenessClassification",
dataset={
@@ -65,6 +66,69 @@ def dataset_transform(self):
)
+class InappropriatenessClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="InappropriatenessClassification.v2",
+ dataset={
+ "path": "mteb/inappropriateness",
+ "revision": "2bdbb71d9b972709173f1477d7dd33c3d67f51ac",
+ },
+ description="""Inappropriateness identification in the form of binary classification
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2021.bsnlp-1.4",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2006-01-01", "2021-04-01"),
+ domains=["Web", "Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{babakov-etal-2021-detecting,
+ abstract = {Not all topics are equally {``}flammable{''} in terms of toxicity: a calm discussion of turtles or fishing less often fuels inappropriate toxic dialogues than a discussion of politics or sexual minorities. We define a set of sensitive topics that can yield inappropriate and toxic messages and describe the methodology of collecting and labelling a dataset for appropriateness. While toxicity in user-generated data is well-studied, we aim at defining a more fine-grained notion of inappropriateness. The core of inappropriateness is that it can harm the reputation of a speaker. This is different from toxicity in two respects: (i) inappropriateness is topic-related, and (ii) inappropriate message is not toxic but still unacceptable. We collect and release two datasets for Russian: a topic-labelled dataset and an appropriateness-labelled dataset. We also release pre-trained classification models trained on this data.},
+ address = {Kiyv, Ukraine},
+ author = {Babakov, Nikolay and
+Logacheva, Varvara and
+Kozlova, Olga and
+Semenov, Nikita and
+Panchenko, Alexander},
+ booktitle = {Proceedings of the 8th Workshop on Balto-Slavic Natural Language Processing},
+ editor = {Babych, Bogdan and
+Kanishcheva, Olga and
+Nakov, Preslav and
+Piskorski, Jakub and
+Pivovarova, Lidia and
+Starko, Vasyl and
+Steinberger, Josef and
+Yangarber, Roman and
+Marci{\'n}czuk, Micha{\l} and
+Pollak, Senja and
+P{\v{r}}ib{\'a}{\v{n}}, Pavel and
+Robnik-{\v{S}}ikonja, Marko},
+ month = apr,
+ pages = {26--36},
+ publisher = {Association for Computational Linguistics},
+ title = {Detecting Inappropriate Messages on Sensitive Topics that Could Harm a Company{'}s Reputation},
+ url = {https://aclanthology.org/2021.bsnlp-1.4},
+ year = {2021},
+}
+""",
+ prompt="Classify the given message as either sensitive topic or not",
+ adapted_from=["InappropriatenessClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
+ )
+
+
class InappropriatenessClassificationv2(AbsTaskClassification):
metadata = TaskMetadata(
name="InappropriatenessClassificationv2",
diff --git a/mteb/tasks/Classification/rus/RuReviewsClassification.py b/mteb/tasks/Classification/rus/RuReviewsClassification.py
index 37f9e83af3..6629869441 100644
--- a/mteb/tasks/Classification/rus/RuReviewsClassification.py
+++ b/mteb/tasks/Classification/rus/RuReviewsClassification.py
@@ -5,6 +5,7 @@
class RuReviewsClassification(AbsTaskClassification):
+ superseded_by = "RuReviewsClassification.v2"
metadata = TaskMetadata(
name="RuReviewsClassification",
dataset={
@@ -47,3 +48,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
)
+
+
+class RuReviewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="RuReviewsClassification.v2",
+ dataset={
+ "path": "mteb/ru_reviews",
+ "revision": "46d80ee5ac51be8234725558677e59050b9c418e",
+ },
+        description="""Product review classification (3-point scale) based on RuReviews dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/sismetanin/rureviews",
+ type="Classification",
+ category="p2p",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2000-01-01", "2020-01-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="apache-2.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Smetanin-SA-2019,
+ author = {Sergey Smetanin and Michail Komarov},
+ booktitle = {2019 IEEE 21st Conference on Business Informatics (CBI)},
+ doi = {10.1109/CBI.2019.00062},
+ issn = {2378-1963},
+ month = {July},
+ number = {},
+ pages = {482-486},
+ title = {Sentiment Analysis of Product Reviews in Russian using Convolutional Neural Networks},
+ volume = {01},
+ year = {2019},
+}
+""",
+ prompt="Classify product reviews into positive, negative or neutral sentiment",
+ adapted_from=["RuReviewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, n_samples=2048, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
index 8e511655e5..520114847f 100644
--- a/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
+++ b/mteb/tasks/Classification/rus/ru_toixic_classification_okmlcup.py
@@ -5,6 +5,7 @@
class RuToxicOKMLCUPClassification(AbsTaskClassification):
+ superseded_by = "RuToxicOKMLCUPClassification.v2"
metadata = TaskMetadata(
name="RuToxicOKMLCUPClassification",
dataset={
@@ -31,3 +32,31 @@ class RuToxicOKMLCUPClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_column("toxic", "label")
+
+
+class RuToxicOKMLCUPClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="RuToxicOKMLCUPClassification.v2",
+ dataset={
+ "path": "mteb/ru_toxic_okmlcup",
+ "revision": "729025d2cfa68fcbc587ea80014a42d569cd9048",
+ },
+ description="""On the Odnoklassniki social network, users post a huge number of comments of various directions and nature every day.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://cups.online/ru/contests/okmlcup2020",
+ type="Classification",
+ category="t2t",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2015-01-01", "2020-01-01"),
+ domains=[],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+        bibtex_citation="",
+ adapted_from=["RuToxicOKMLCUPClassification"],
+ )
diff --git a/mteb/tasks/Classification/rus/senti_ru_eval.py b/mteb/tasks/Classification/rus/senti_ru_eval.py
index a935dd8c76..a7c6fdbf11 100644
--- a/mteb/tasks/Classification/rus/senti_ru_eval.py
+++ b/mteb/tasks/Classification/rus/senti_ru_eval.py
@@ -5,6 +5,7 @@
class SentiRuEval2016Classification(AbsTaskClassification):
+ superseded_by = "SentiRuEval2016.v2"
metadata = TaskMetadata(
name="SentiRuEval2016",
dataset={
@@ -38,3 +39,39 @@ class SentiRuEval2016Classification(AbsTaskClassification):
}
""",
)
+
+
+class SentiRuEval2016ClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SentiRuEval2016.v2",
+ dataset={
+ "path": "mteb/senti_ru_eval2016",
+ "revision": "bfa4cbec1753ffed29a8244a4ec208cc9e6c09a0",
+ },
+ description="""Russian sentiment analysis evaluation SentiRuEval-2016 devoted to reputation monitoring of banks and telecom companies in Twitter. We describe the task, data, the procedure of data preparation, and participants’ results.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/mokoron/sentirueval",
+ type="Classification",
+ category="t2t",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["rus-Cyrl"],
+ main_score="accuracy",
+ date=("2015-01-01", "2016-01-01"),
+ domains=[],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{loukachevitch2016sentirueval,
+ author = {Loukachevitch, NV and Rubtsova, Yu V},
+ booktitle = {Computational Linguistics and Intellectual Technologies},
+ pages = {416--426},
+ title = {SentiRuEval-2016: overcoming time gap and data sparsity in tweet sentiment analysis},
+ year = {2016},
+}
+""",
+ adapted_from=["SentiRuEval2016Classification"],
+ )
diff --git a/mteb/tasks/Classification/san/SanskritShlokasClassification.py b/mteb/tasks/Classification/san/SanskritShlokasClassification.py
index 91b8436e8d..daff38224f 100644
--- a/mteb/tasks/Classification/san/SanskritShlokasClassification.py
+++ b/mteb/tasks/Classification/san/SanskritShlokasClassification.py
@@ -17,7 +17,7 @@ class SanskritShlokasClassification(AbsTaskClassification):
category="s2s",
modalities=["text"],
date=("2019-01-01", "2020-01-01"),
- eval_splits=["train", "validation"],
+ eval_splits=["validation"],
eval_langs=["san-Deva"],
main_score="accuracy",
domains=["Religious", "Written"],
diff --git a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py
index 4b8c54a184..3d050607f5 100644
--- a/mteb/tasks/Classification/sin/SinhalaNewsClassification.py
+++ b/mteb/tasks/Classification/sin/SinhalaNewsClassification.py
@@ -5,6 +5,7 @@
class SinhalaNewsClassification(AbsTaskClassification):
+ superseded_by = "SinhalaNewsClassification.v2"
metadata = TaskMetadata(
name="SinhalaNewsClassification",
description="This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015).",
@@ -50,3 +51,50 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class SinhalaNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SinhalaNewsClassification.v2",
+ description="""This file contains news texts (sentences) belonging to 5 different news categories (political, business, technology, sports and Entertainment). The original dataset was released by Nisansa de Silva (Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language, 2015).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/sinhala_news",
+ "revision": "e0b6e93ed5f086fe358595dff1aaad9eb877667a",
+ },
+ reference="https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Category-classification",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["sin-Sinh"],
+ main_score="accuracy",
+ date=("2019-03-17", "2020-08-06"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{deSilva2015,
+ author = {Nisansa de Silva},
+ journal = {Year of Publication},
+ title = {Sinhala Text Classification: Observations from the Perspective of a Resource Poor Language},
+ year = {2015},
+}
+
+@article{dhananjaya2022,
+ author = {Dhananjaya et al.},
+ journal = {Year of Publication},
+ title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification},
+ year = {2022},
+}
+""",
+ adapted_from=["SinhalaNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py
index a665d1c0c9..0cb1bd9cd8 100644
--- a/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py
+++ b/mteb/tasks/Classification/sin/SinhalaNewsSourceClassification.py
@@ -5,6 +5,7 @@
class SinhalaNewsSourceClassification(AbsTaskClassification):
+ superseded_by = "SinhalaNewsSourceClassification.v2"
metadata = TaskMetadata(
name="SinhalaNewsSourceClassification",
description="This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala).",
@@ -41,3 +42,43 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class SinhalaNewsSourceClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SinhalaNewsSourceClassification.v2",
+ description="""This dataset contains Sinhala news headlines extracted from 9 news sources (websites) (Sri Lanka Army, Dinamina, GossipLanka, Hiru, ITN, Lankapuwath, NewsLK, Newsfirst, World Socialist Web Site-Sinhala).
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/sinhala_news_source",
+ "revision": "6902767dbfa6189cbe5f5b5b56ee6300b1702d33",
+ },
+ reference="https://huggingface.co/datasets/NLPC-UOM/Sinhala-News-Source-classification",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["sin-Sinh"],
+ main_score="accuracy",
+ date=("2021-02-17", "2022-08-20"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{dhananjaya2022,
+ author = {Dhananjaya et al.},
+ journal = {Year of Publication},
+ title = {BERTifying Sinhala - A Comprehensive Analysis of Pre-trained Language Models for Sinhala Text Classification},
+ year = {2022},
+}
+""",
+ adapted_from=["SinhalaNewsSourceClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py
index 6577f7f315..ba3e09a50c 100644
--- a/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/slk/CSFDSKMovieReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class CSFDSKMovieReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "CSFDSKMovieReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="CSFDSKMovieReviewSentimentClassification",
description="The dataset contains 30k user reviews from csfd.cz in Slovak.",
@@ -50,3 +51,48 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
)
+
+
+class CSFDSKMovieReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="CSFDSKMovieReviewSentimentClassification.v2",
+ description="""The dataset contains 30k user reviews from csfd.cz in Slovak.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/abs/2304.01922",
+ dataset={
+ "path": "mteb/csfdsk_movie_review_sentiment",
+ "revision": "257ee340c1399ab5e038a3aea38877f67940774d",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2002-05-21", "2020-03-05"),
+ eval_splits=["test"],
+ eval_langs=["slk-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{štefánik2023resources,
+ archiveprefix = {arXiv},
+ author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka},
+ eprint = {2304.01922},
+ primaryclass = {cs.CL},
+ title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages},
+ year = {2023},
+}
+""",
+ adapted_from=["CSFDSKMovieReviewSentimentClassification"],
+ )
+
+ # Increase the samples_per_label in order to improve baseline performance
+ samples_per_label = 20
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
+ )
diff --git a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py
index bd131ece85..6a4823b6e9 100644
--- a/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py
+++ b/mteb/tasks/Classification/slk/SlovakHateSpeechClassification.py
@@ -5,6 +5,7 @@
class SlovakHateSpeechClassification(AbsTaskClassification):
+ superseded_by = "SlovakHateSpeechClassification.v2"
metadata = TaskMetadata(
name="SlovakHateSpeechClassification",
description="The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak.",
@@ -28,3 +29,31 @@ class SlovakHateSpeechClassification(AbsTaskClassification):
sample_creation="found",
bibtex_citation="",
)
+
+
+class SlovakHateSpeechClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SlovakHateSpeechClassification.v2",
+ description="""The dataset contains posts from a social network with human annotations for hateful or offensive language in Slovak.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/TUKE-KEMT/hate_speech_slovak",
+ dataset={
+ "path": "mteb/slovak_hate_speech",
+ "revision": "691fe861df0ffa25066cbf6da8e64ebd296af6ab",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2024-05-25", "2024-06-06"),
+ eval_splits=["test"],
+ eval_langs=["slk-Latn"],
+ main_score="accuracy",
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="",
+ adapted_from=["SlovakHateSpeechClassification"],
+ )
diff --git a/mteb/tasks/Classification/slv/FrenkSlClassification.py b/mteb/tasks/Classification/slv/FrenkSlClassification.py
index f88d4ff9ff..6a5c765a8d 100644
--- a/mteb/tasks/Classification/slv/FrenkSlClassification.py
+++ b/mteb/tasks/Classification/slv/FrenkSlClassification.py
@@ -5,6 +5,7 @@
class FrenkSlClassification(AbsTaskClassification):
+ superseded_by = "FrenkSlClassification.v2"
metadata = TaskMetadata(
name="FrenkSlClassification",
description="Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset.",
@@ -44,3 +45,46 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class FrenkSlClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="FrenkSlClassification.v2",
+ description="""Slovenian subset of the FRENK dataset. Also available on HuggingFace dataset hub: English subset, Croatian subset.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/frenk_sl",
+ "revision": "3b69facc14651fbd152fda173683a7ecf9125b82",
+ },
+ reference="https://arxiv.org/pdf/1906.02045",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["slv-Latn"],
+ main_score="accuracy",
+ date=("2021-05-28", "2021-05-28"),
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{ljubešić2019frenk,
+ archiveprefix = {arXiv},
+ author = {Nikola Ljubešić and Darja Fišer and Tomaž Erjavec},
+ eprint = {1906.02045},
+ primaryclass = {cs.CL},
+ title = {The FRENK Datasets of Socially Unacceptable Discourse in Slovene and English},
+ url = {https://arxiv.org/abs/1906.02045},
+ year = {2019},
+}
+""",
+ adapted_from=["FrenkSlClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/spa/SpanishNewsClassification.py b/mteb/tasks/Classification/spa/SpanishNewsClassification.py
index 59ac97ba20..5543bab43e 100644
--- a/mteb/tasks/Classification/spa/SpanishNewsClassification.py
+++ b/mteb/tasks/Classification/spa/SpanishNewsClassification.py
@@ -5,6 +5,7 @@
class SpanishNewsClassification(AbsTaskClassification):
+ superseded_by = "SpanishNewsClassification.v2"
metadata = TaskMetadata(
name="SpanishNewsClassification",
description="A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories.",
@@ -35,3 +36,37 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class SpanishNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SpanishNewsClassification.v2",
+ description="""A Spanish dataset for news classification. The dataset includes articles from reputable Spanish news sources spanning 12 different categories.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/MarcOrfilaCarreras/spanish-news",
+ dataset={
+ "path": "mteb/spanish_news",
+ "revision": "345aa68ec44052d28828c6f88e7a2aafaf74be5a",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2023-05-01", "2024-05-01"),
+ eval_splits=["test"],
+ eval_langs=["spa-Latn"],
+ main_score="accuracy",
+ domains=["News", "Written"],
+ task_subtypes=[],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="""
+ """,
+ adapted_from=["SpanishNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py
index 785b131bbc..49f88eff08 100644
--- a/mteb/tasks/Classification/spa/SpanishSentimentClassification.py
+++ b/mteb/tasks/Classification/spa/SpanishSentimentClassification.py
@@ -5,6 +5,7 @@
class SpanishSentimentClassification(AbsTaskClassification):
+ superseded_by = "SpanishSentimentClassification.v2"
metadata = TaskMetadata(
name="SpanishSentimentClassification",
description="A Spanish dataset for sentiment classification.",
@@ -52,3 +53,55 @@ class SpanishSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class SpanishSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SpanishSentimentClassification.v2",
+ description="""A Spanish dataset for sentiment classification.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/sepidmnorozy/Spanish_sentiment",
+ dataset={
+ "path": "mteb/spanish_sentiment",
+ "revision": "307dea211013736d7d146dad9d2f6330e44d29b8",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2022-08-16", "2022-08-16"),
+ eval_splits=["validation", "test"],
+ eval_langs=["spa-Latn"],
+ main_score="accuracy",
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{mollanorozy-etal-2023-cross,
+ address = {Dubrovnik, Croatia},
+ author = {Mollanorozy, Sepideh and
+Tanti, Marc and
+Nissim, Malvina},
+ booktitle = {Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP},
+ doi = {10.18653/v1/2023.sigtyp-1.9},
+ editor = {Beinborn, Lisa and
+Goswami, Koustava and
+Murado{\u{g}}lu, Saliha and
+Sorokin, Alexey and
+Kumar, Ritesh and
+Shcherbakov, Andreas and
+Ponti, Edoardo M. and
+Cotterell, Ryan and
+Vylomova, Ekaterina},
+ month = may,
+ pages = {89--95},
+ publisher = {Association for Computational Linguistics},
+    title = {Cross-lingual Transfer Learning with {P}ersian},
+ url = {https://aclanthology.org/2023.sigtyp-1.9},
+ year = {2023},
+}
+""",
+ adapted_from=["SpanishSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py
index e5b667e289..297eb94438 100644
--- a/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py
+++ b/mteb/tasks/Classification/ssw/SiswatiNewsClassification.py
@@ -5,6 +5,7 @@
class SiswatiNewsClassification(AbsTaskClassification):
+ superseded_by = "SiswatiNewsClassification.v2"
metadata = TaskMetadata(
name="SiswatiNewsClassification",
description="Siswati News Classification Dataset",
@@ -41,3 +42,41 @@ class SiswatiNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"title": "text"})
+
+
+class SiswatiNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SiswatiNewsClassification.v2",
+ description="""Siswati News Classification Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news",
+ dataset={
+ "path": "mteb/siswati_news",
+ "revision": "e316774d8bbaa9b43858d093ea0f1eb38c6a9b4c",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ssw-Latn"],
+ main_score="accuracy",
+ date=("2022-08-01", "2022-08-01"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="cc-by-sa-4.0",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{Madodonga_Marivate_Adendorff_2023,
+ author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew},
+ doi = {10.55492/dhasa.v4i01.4449},
+ month = {Jan.},
+ title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati},
+ url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449},
+ volume = {4},
+ year = {2023},
+}
+""",
+ adapted_from=["SiswatiNewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py
index 25df08775d..109a237052 100644
--- a/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/svk/SlovakMovieReviewSentimentClassification.py
@@ -5,6 +5,7 @@
class SlovakMovieReviewSentimentClassification(AbsTaskClassification):
+ superseded_by = "SlovakMovieReviewSentimentClassification.v2"
metadata = TaskMetadata(
name="SlovakMovieReviewSentimentClassification",
description="User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative)",
@@ -42,3 +43,43 @@ def dataset_transform(self) -> None:
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class SlovakMovieReviewSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SlovakMovieReviewSentimentClassification.v2",
+ description="""User reviews of movies on the CSFD movie database, with 2 sentiment classes (positive, negative)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://arxiv.org/pdf/2304.01922",
+ dataset={
+ "path": "mteb/slovak_movie_review_sentiment",
+ "revision": "29a7405aabcfd4860a51ae6f65a5650d63108f26",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["svk-Latn"],
+ main_score="accuracy",
+ date=("2002-05-21", "2020-03-05"),
+ dialect=[],
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{vstefanik2023resources,
+ author = {{\v{S}}tef{\'a}nik, Michal and Kadl{\v{c}}{\'\i}k, Marek and Gramacki, Piotr and Sojka, Petr},
+ journal = {arXiv preprint arXiv:2304.01922},
+ title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages},
+ year = {2023},
+}
+""",
+ adapted_from=["SlovakMovieReviewSentimentClassification"],
+ )
+
+ def dataset_transform(self) -> None:
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py
index 518b749de0..abf644702d 100644
--- a/mteb/tasks/Classification/swa/SwahiliNewsClassification.py
+++ b/mteb/tasks/Classification/swa/SwahiliNewsClassification.py
@@ -5,6 +5,7 @@
class SwahiliNewsClassification(AbsTaskClassification):
+ superseded_by = "SwahiliNewsClassification.v2"
metadata = TaskMetadata(
name="SwahiliNewsClassification",
description="Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets",
@@ -45,3 +46,45 @@ def dataset_transform(self) -> None:
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train"]
)
+
+
+class SwahiliNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SwahiliNewsClassification.v2",
+ description="""Dataset for Swahili News Classification, categorized with 6 domains (Local News (Kitaifa), International News (Kimataifa), Finance News (Uchumi), Health News (Afya), Sports News (Michezo), and Entertainment News (Burudani)). Building and Optimizing Swahili Language Models: Techniques, Embeddings, and Datasets
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/Mollel/SwahiliNewsClassification",
+ dataset={
+ "path": "mteb/swahili_news",
+ "revision": "d929055f41849d5bc3533c07d978fcfbc89d6a4e",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["swa-Latn"],
+ main_score="accuracy",
+ date=("2019-01-01", "2023-05-01"),
+ dialect=[],
+ domains=["News", "Written"],
+ task_subtypes=[],
+ license="cc-by-nc-sa-4.0",
+ annotations_creators="derived",
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{davis2020swahili,
+ author = {Davis, David},
+ doi = {10.5281/zenodo.5514203},
+ publisher = {Zenodo},
+ title = {Swahili: News Classification Dataset (0.2)},
+ url = {https://doi.org/10.5281/zenodo.5514203},
+ year = {2020},
+}
+""",
+ adapted_from=["SwahiliNewsClassification"],
+ )
+
+ def dataset_transform(self) -> None:
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train"]
+ )
diff --git a/mteb/tasks/Classification/swe/DalajClassification.py b/mteb/tasks/Classification/swe/DalajClassification.py
index 05983d0e4f..36e594e569 100644
--- a/mteb/tasks/Classification/swe/DalajClassification.py
+++ b/mteb/tasks/Classification/swe/DalajClassification.py
@@ -6,6 +6,7 @@
class DalajClassification(AbsTaskClassification):
+ superseded_by = "DalajClassification.v2"
metadata = TaskMetadata(
name="DalajClassification",
dataset={
@@ -67,3 +68,42 @@ def __convert_sample_to_classification(sample):
batched=True,
remove_columns=columns_to_keep,
)
+
+
+class DalajClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="DalajClassification.v2",
+ dataset={
+ "path": "mteb/dalaj",
+ "revision": "ecf6f2d83e8e85816ec3974896557a4aafce4f3e",
+ "name": "dalaj",
+ },
+ description="""A Swedish dataset for linguistic acceptability. Available as a part of Superlim.
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://spraakbanken.gu.se/en/resources/superlim",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["swe-Latn"],
+ main_score="accuracy",
+ date=("2017-01-01", "2020-12-31"),
+ domains=["Non-fiction", "Written"],
+ task_subtypes=["Linguistic acceptability"],
+ license="cc-by-4.0",
+ annotations_creators="expert-annotated",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@misc{2105.06681,
+ author = {Elena Volodina and Yousuf Ali Mohammed and Julia Klezl},
+ eprint = {arXiv:2105.06681},
+ title = {DaLAJ - a dataset for linguistic acceptability judgments for Swedish: Format, baseline, sharing},
+ year = {2021},
+}
+""",
+ prompt="Classify texts based on linguistic acceptability in Swedish",
+ adapted_from=["DalajClassification"],
+ )
+
+ samples_per_label = 16
diff --git a/mteb/tasks/Classification/swe/SweRecClassification.py b/mteb/tasks/Classification/swe/SweRecClassification.py
index 8cc7b8dff8..5de86af7bf 100644
--- a/mteb/tasks/Classification/swe/SweRecClassification.py
+++ b/mteb/tasks/Classification/swe/SweRecClassification.py
@@ -5,6 +5,7 @@
class SweRecClassification(AbsTaskClassification):
+ superseded_by = "SweRecClassification.v2"
metadata = TaskMetadata(
name="SweRecClassification",
description="A Swedish dataset for sentiment classification on review",
@@ -43,3 +44,46 @@ class SweRecClassification(AbsTaskClassification):
""",
prompt="Classify Swedish reviews by sentiment",
)
+
+
+class SweRecClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SweRecClassification.v2",
+ description="""A Swedish dataset for sentiment classification on review
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2023.nodalida-1.20/",
+ dataset={
+ "path": "mteb/swe_rec",
+ "revision": "2a18a4ccc6770319b7f717cda1800f7d5bd5cd1a",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["swe-Latn"],
+ main_score="accuracy",
+ date=("2023-01-01", "2023-12-31"), # based on the publication date
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{nielsen-2023-scandeval,
+ address = {T{\'o}rshavn, Faroe Islands},
+ author = {Nielsen, Dan},
+ booktitle = {Proceedings of the 24th Nordic Conference on Computational Linguistics (NoDaLiDa)},
+ editor = {Alum{\"a}e, Tanel and
+Fishel, Mark},
+ month = may,
+ pages = {185--201},
+ publisher = {University of Tartu Library},
+ title = {{S}cand{E}val: A Benchmark for {S}candinavian Natural Language Processing},
+ url = {https://aclanthology.org/2023.nodalida-1.20},
+ year = {2023},
+}
+""",
+ prompt="Classify Swedish reviews by sentiment",
+ adapted_from=["SweRecClassification"],
+ )
diff --git a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
index 149be829fc..4541b5622f 100644
--- a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
+++ b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
@@ -5,6 +5,7 @@
class SwedishSentimentClassification(AbsTaskClassification):
+ superseded_by = "SwedishSentimentClassification.v2"
metadata = TaskMetadata(
name="SwedishSentimentClassification",
description="Dataset of Swedish reviews scarped from various public available websites",
@@ -28,3 +29,31 @@ class SwedishSentimentClassification(AbsTaskClassification):
sample_creation="found",
bibtex_citation="",
)
+
+
+class SwedishSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="SwedishSentimentClassification.v2",
+        description="""Dataset of Swedish reviews scraped from various publicly available websites
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://huggingface.co/datasets/swedish_reviews",
+ dataset={
+ "path": "mteb/swedish_sentiment",
+ "revision": "f521560ac618eea57c85392c574c16b6c08c9487",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation", "test"],
+ eval_langs=["swe-Latn"],
+ main_score="accuracy",
+ date=("2021-01-01", "2022-01-01"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="",
+ adapted_from=["SwedishSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/tam/TamilNewsClassification.py b/mteb/tasks/Classification/tam/TamilNewsClassification.py
index 3f4505bce8..af5a2f2c71 100644
--- a/mteb/tasks/Classification/tam/TamilNewsClassification.py
+++ b/mteb/tasks/Classification/tam/TamilNewsClassification.py
@@ -5,6 +5,7 @@
class TamilNewsClassification(AbsTaskClassification):
+ superseded_by = "TamilNewsClassification.v2"
metadata = TaskMetadata(
name="TamilNewsClassification",
description="A Tamil dataset for 6-class classification of Tamil news articles",
@@ -41,3 +42,41 @@ def dataset_transform(self):
{"NewsInTamil": "text", "Category": "label"}
)
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
+
+
+class TamilNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TamilNewsClassification.v2",
+ description="""A Tamil dataset for 6-class classification of Tamil news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/vanangamudi/tamil-news-classification",
+ dataset={
+ "path": "mteb/tamil_news",
+ "revision": "b417dba1f5a3143f8325b6b6fb585ab4a57c03a0",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["tam-Taml"],
+ main_score="f1",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@article{kunchukuttan2020indicnlpcorpus,
+ author = {Anoop Kunchukuttan and Divyanshu Kakwani and Satish Golla and Gokul N.C. and Avik Bhattacharyya and Mitesh M. Khapra and Pratyush Kumar},
+ journal = {arXiv preprint arXiv:2005.00085},
+ title = {AI4Bharat-IndicNLP Corpus: Monolingual Corpora and Word Embeddings for Indic Languages},
+ year = {2020},
+}
+""",
+ adapted_from=["TamilNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
diff --git a/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py b/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py
index 3d07293c64..3daf1f54cf 100644
--- a/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py
+++ b/mteb/tasks/Classification/tel/TeluguAndhraJyotiNewsClassification.py
@@ -5,6 +5,7 @@
class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
+ superseded_by = "TeluguAndhraJyotiNewsClassification.v2"
metadata = TaskMetadata(
name="TeluguAndhraJyotiNewsClassification",
description="A Telugu dataset for 5-class classification of Telugu news articles",
@@ -32,3 +33,34 @@ class TeluguAndhraJyotiNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"body": "text", "topic": "label"})
self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
+
+
+class TeluguAndhraJyotiNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TeluguAndhraJyotiNewsClassification.v2",
+ description="""A Telugu dataset for 5-class classification of Telugu news articles
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/AnushaMotamarri/Telugu-Newspaper-Article-Dataset",
+ dataset={
+ "path": "mteb/telugu_andhra_jyoti_news",
+ "revision": "032752fb5f3a8c5b0814061c6502e0e8d58ec77c",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2014-01-01", "2018-01-01"),
+ eval_splits=["test"],
+ eval_langs=["tel-Telu"],
+ main_score="f1",
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation="",
+ adapted_from=["TeluguAndhraJyotiNewsClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(self.dataset, seed=self.seed)
diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
index 242249340a..b829c3b2c9 100644
--- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
+++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
@@ -5,6 +5,7 @@
class WisesightSentimentClassification(AbsTaskClassification):
+ superseded_by = "WisesightSentimentClassification.v2"
metadata = TaskMetadata(
name="WisesightSentimentClassification",
description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)",
@@ -42,3 +43,45 @@ class WisesightSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class WisesightSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="WisesightSentimentClassification.v2",
+ description="""Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/PyThaiNLP/wisesight-sentiment",
+ dataset={
+ "path": "mteb/wisesight_sentiment",
+ "revision": "aa2a5976a75df7f667215ac14353b3f5d07ba598",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["tha-Thai"],
+ main_score="f1",
+ date=("2019-05-24", "2021-09-16"),
+ dialect=[],
+ domains=["Social", "News", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="cc0-1.0",
+ annotations_creators="expert-annotated",
+ sample_creation="found",
+ bibtex_citation=r"""
+@software{bact_2019_3457447,
+ author = {Suriyawongkul, Arthit and
+Chuangsuwanich, Ekapol and
+Chormai, Pattarawat and
+Polpanumas, Charin},
+ doi = {10.5281/zenodo.3457447},
+ month = sep,
+ publisher = {Zenodo},
+ title = {PyThaiNLP/wisesight-sentiment: First release},
+ url = {https://doi.org/10.5281/zenodo.3457447},
+ version = {v1.0},
+ year = {2019},
+}
+""",
+ adapted_from=["WisesightSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py
index 9a51214759..94f23830c6 100644
--- a/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py
+++ b/mteb/tasks/Classification/tha/WongnaiReviewsClassification .py
@@ -6,7 +6,7 @@
class WongnaiReviewsClassification(AbsTaskClassification):
metadata = TaskMetadata(
- name="WongnaiReviewsClassification ",
+ name="WongnaiReviewsClassification",
description="Wongnai features over 200,000 restaurants, beauty salons, and spas across Thailand on its platform, with detailed information about each merchant and user reviews. In this dataset there are 5 classes corressponding each star rating",
reference="https://github.com/wongnai/wongnai-corpus",
dataset={
diff --git a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py
index e9095fd0d3..4d0fd118e2 100644
--- a/mteb/tasks/Classification/tsn/TswanaNewsClassification.py
+++ b/mteb/tasks/Classification/tsn/TswanaNewsClassification.py
@@ -5,6 +5,7 @@
class TswanaNewsClassification(AbsTaskClassification):
+ superseded_by = "TswanaNewsClassification.v2"
metadata = TaskMetadata(
name="TswanaNewsClassification",
description="Tswana News Classification Dataset",
@@ -39,3 +40,42 @@ class TswanaNewsClassification(AbsTaskClassification):
}
""",
)
+
+
+class TswanaNewsClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TswanaNewsClassification.v2",
+ description="""Tswana News Classification Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://link.springer.com/chapter/10.1007/978-3-031-49002-6_17",
+ dataset={
+ "path": "mteb/tswana_news",
+ "revision": "2bbd0687d1733ac419fba18378bd9d864aae081c",
+ },
+ type="Classification",
+ task_subtypes=["Topic classification"],
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["tsn-Latn"],
+ main_score="accuracy",
+ date=("2015-01-01", "2023-01-01"),
+ domains=["News", "Written"],
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{marivate2023puoberta,
+ author = {Vukosi Marivate and Moseli Mots'Oehli and Valencia Wagner and Richard Lastrucci and Isheanesu Dzingirai},
+ booktitle = {SACAIR 2023 (To Appear)},
+ dataset_url = {https://github.com/dsfsi/PuoBERTa},
+ keywords = {NLP},
+ preprint_url = {https://arxiv.org/abs/2310.09141},
+ software_url = {https://huggingface.co/dsfsi/PuoBERTa},
+ title = {PuoBERTa: Training and evaluation of a curated language model for Setswana},
+ year = {2023},
+}
+""",
+ adapted_from=["TswanaNewsClassification"],
+ )
diff --git a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py
index 680b52009b..e0f97dd325 100644
--- a/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py
+++ b/mteb/tasks/Classification/tur/TurkishMovieSentimentClassification.py
@@ -5,6 +5,7 @@
class TurkishMovieSentimentClassification(AbsTaskClassification):
+ superseded_by = "TurkishMovieSentimentClassification.v2"
metadata = TaskMetadata(
name="TurkishMovieSentimentClassification",
description="Turkish Movie Review Dataset",
@@ -41,3 +42,44 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class TurkishMovieSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TurkishMovieSentimentClassification.v2",
+ description="""Turkish Movie Review Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf",
+ dataset={
+ "path": "mteb/turkish_movie_sentiment",
+ "revision": "8ef5ce93ff2504de7fc46776317b78bdd8db47f2",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["tur-Latn"],
+ main_score="accuracy",
+ date=("2013-01-01", "2013-08-11"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Demirtas2013CrosslingualPD,
+ author = {Erkin Demirtas and Mykola Pechenizkiy},
+ booktitle = {wisdom},
+ title = {Cross-lingual polarity detection with machine translation},
+ url = {https://api.semanticscholar.org/CorpusID:3912960},
+ year = {2013},
+}
+""",
+ adapted_from=["TurkishMovieSentimentClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py
index 7bfb086d99..98f089565a 100644
--- a/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py
+++ b/mteb/tasks/Classification/tur/TurkishProductSentimentClassification.py
@@ -5,6 +5,7 @@
class TurkishProductSentimentClassification(AbsTaskClassification):
+ superseded_by = "TurkishProductSentimentClassification.v2"
metadata = TaskMetadata(
name="TurkishProductSentimentClassification",
description="Turkish Product Review Dataset",
@@ -36,3 +37,39 @@ class TurkishProductSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class TurkishProductSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TurkishProductSentimentClassification.v2",
+ description="""Turkish Product Review Dataset
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.win.tue.nl/~mpechen/publications/pubs/MT_WISDOM2013.pdf",
+ dataset={
+ "path": "mteb/turkish_product_sentiment",
+ "revision": "c846c08821e2ca649929a5562953c0466cd44736",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["tur-Latn"],
+ main_score="accuracy",
+ date=("2013-01-01", "2013-08-11"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{Demirtas2013CrosslingualPD,
+ author = {Erkin Demirtas and Mykola Pechenizkiy},
+ booktitle = {wisdom},
+ title = {Cross-lingual polarity detection with machine translation},
+ url = {https://api.semanticscholar.org/CorpusID:3912960},
+ year = {2013},
+}
+""",
+ adapted_from=["TurkishProductSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py
index fadc60edd8..aedd65910b 100644
--- a/mteb/tasks/Classification/ukr/UkrFormalityClassification.py
+++ b/mteb/tasks/Classification/ukr/UkrFormalityClassification.py
@@ -5,6 +5,7 @@
class UkrFormalityClassification(AbsTaskClassification):
+ superseded_by = "UkrFormalityClassification.v2"
metadata = TaskMetadata(
name="UkrFormalityClassification",
description="""
@@ -52,3 +53,53 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["train", "test"]
)
+
+
+class UkrFormalityClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="UkrFormalityClassification.v2",
+ description="""
+ This dataset contains Ukrainian Formality Classification dataset obtained by
+        translating English GYAFC data.
+ English data source: https://aclanthology.org/N18-1012/
+ Translation into Ukrainian language using model: https://huggingface.co/facebook/nllb-200-distilled-600M
+        Additionally, the dataset was balanced, with labels: 0 - informal, 1 - formal.
+
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ dataset={
+ "path": "mteb/ukr_formality",
+ "revision": "e0b2dfa57d505f207deb571e58b0bd0b81180bd4",
+ },
+ reference="https://huggingface.co/datasets/ukr-detect/ukr-formality-dataset-translated-gyafc",
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["ukr-Cyrl"],
+ main_score="accuracy",
+ date=("2018-04-11", "2018-06-20"),
+ domains=["News", "Written"],
+ task_subtypes=["Topic classification"],
+ license="openrail++",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated",
+ bibtex_citation=r"""
+@inproceedings{rao-tetreault-2018-dear,
+ author = {Rao, Sudha and
+Tetreault, Joel},
+ booktitle = {Proceedings of the 2018 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)},
+ month = jun,
+ publisher = {Association for Computational Linguistics},
+ title = {Dear Sir or Madam, May {I} Introduce the {GYAFC} Dataset: Corpus, Benchmarks and Metrics for Formality Style Transfer},
+ url = {https://aclanthology.org/N18-1012},
+ year = {2018},
+}
+""",
+ adapted_from=["UkrFormalityClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["train", "test"]
+ )
diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
index b6555c9f08..a3d116f807 100644
--- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
+++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
@@ -5,6 +5,7 @@
class UrduRomanSentimentClassification(AbsTaskClassification):
+ superseded_by = "UrduRomanSentimentClassification.v2"
metadata = TaskMetadata(
name="UrduRomanSentimentClassification",
description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)",
@@ -36,3 +37,39 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
}
""",
)
+
+
+class UrduRomanSentimentClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="UrduRomanSentimentClassification.v2",
+ description="""The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set",
+ dataset={
+ "path": "mteb/urdu_roman_sentiment",
+ "revision": "fe3ea6b93097e7a2eb1356ad3665fd01667ac6be",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ date=("2018-01-01", "2018-08-28"),
+ eval_splits=["test"],
+ eval_langs=["urd-Latn"],
+ main_score="f1",
+ domains=["Social", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@misc{misc_roman_urdu_data_set_458,
+ author = {Sharf,Zareen},
+ howpublished = {UCI Machine Learning Repository},
+ note = {{DOI}: https://doi.org/10.24432/C58325},
+ title = {{Roman Urdu Data Set}},
+ year = {2018},
+}
+""",
+ adapted_from=["UrduRomanSentimentClassification"],
+ )
diff --git a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py
index 8d40b89ff8..d4a2eee070 100644
--- a/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py
+++ b/mteb/tasks/Classification/vie/VieStudentFeedbackClassification.py
@@ -7,6 +7,7 @@
class VieStudentFeedbackClassification(AbsTaskClassification):
+ superseded_by = "VieStudentFeedbackClassification.v2"
metadata = TaskMetadata(
name="VieStudentFeedbackClassification",
description="A Vietnamese dataset for classification of student feedback",
@@ -50,3 +51,47 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class VieStudentFeedbackClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="VieStudentFeedbackClassification.v2",
+ description="""A Vietnamese dataset for classification of student feedback
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://ieeexplore.ieee.org/document/8573337",
+ dataset={
+ "path": "mteb/vie_student_feedback",
+ "revision": "9f9451c4aaaa5bf528a90fd430afa128fa748e45",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["vie-Latn"],
+ main_score="accuracy",
+ date=("2021-12-26", "2021-12-26"),
+ domains=["Reviews", "Written"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="mit",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="created",
+ bibtex_citation=r"""
+@inproceedings{8573337,
+ author = {Nguyen, Kiet Van and Nguyen, Vu Duc and Nguyen, Phu X. V. and Truong, Tham T. H. and Nguyen, Ngan Luu-Thuy},
+ booktitle = {2018 10th International Conference on Knowledge and Systems Engineering (KSE)},
+ doi = {10.1109/KSE.2018.8573337},
+ number = {},
+ pages = {19-24},
+ title = {UIT-VSFC: Vietnamese Students’ Feedback Corpus for Sentiment Analysis},
+ volume = {},
+ year = {2018},
+}
+""",
+ adapted_from=["VieStudentFeedbackClassification"],
+ )
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/zho/CMTEBClassification.py b/mteb/tasks/Classification/zho/CMTEBClassification.py
index 64fb95298a..58c42c3ed6 100644
--- a/mteb/tasks/Classification/zho/CMTEBClassification.py
+++ b/mteb/tasks/Classification/zho/CMTEBClassification.py
@@ -5,6 +5,7 @@
class TNews(AbsTaskClassification):
+ superseded_by = "TNews.v2"
metadata = TaskMetadata(
name="TNews",
description="Short Text Classification for News",
@@ -77,7 +78,83 @@ class TNews(AbsTaskClassification):
samples_per_label = 32
+class TNewsV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="TNews.v2",
+ description="""Short Text Classification for News
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.cluebenchmarks.com/introduce.html",
+ dataset={
+ "path": "mteb/t_news",
+ "revision": "0b80e40cb6a16956286e0dcbd4647a515cd277c4",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation"],
+ eval_langs=["cmn-Hans"],
+ main_score="accuracy",
+ date=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ annotations_creators=None,
+ dialect=None,
+ sample_creation=None,
+ bibtex_citation=r"""
+@inproceedings{xu-etal-2020-clue,
+ address = {Barcelona, Spain (Online)},
+ author = {Xu, Liang and
+Hu, Hai and
+Zhang, Xuanwei and
+Li, Lu and
+Cao, Chenjie and
+Li, Yudong and
+Xu, Yechen and
+Sun, Kai and
+Yu, Dian and
+Yu, Cong and
+Tian, Yin and
+Dong, Qianqian and
+Liu, Weitang and
+Shi, Bo and
+Cui, Yiming and
+Li, Junyi and
+Zeng, Jun and
+Wang, Rongzhao and
+Xie, Weijian and
+Li, Yanting and
+Patterson, Yina and
+Tian, Zuoyu and
+Zhang, Yiwen and
+Zhou, He and
+Liu, Shaoweihua and
+Zhao, Zhe and
+Zhao, Qipeng and
+Yue, Cong and
+Zhang, Xinrui and
+Yang, Zhengliang and
+Richardson, Kyle and
+Lan, Zhenzhong },
+ booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
+ doi = {10.18653/v1/2020.coling-main.419},
+ month = dec,
+ pages = {4762--4772},
+ publisher = {International Committee on Computational Linguistics},
+ title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark},
+ url = {https://aclanthology.org/2020.coling-main.419},
+ year = {2020},
+}
+""",
+ prompt="Classify the fine-grained category of the given news title",
+ adapted_from=["TNews"],
+ )
+
+ samples_per_label = 32
+
+
class IFlyTek(AbsTaskClassification):
+ superseded_by = "IFlyTek.v2"
metadata = TaskMetadata(
name="IFlyTek",
description="Long Text classification for the description of Apps",
@@ -157,7 +234,90 @@ def metadata_dict(self) -> dict[str, str]:
return metadata_dict
+class IFlyTekV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="IFlyTek.v2",
+ description="""Long Text classification for the description of Apps
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://www.cluebenchmarks.com/introduce.html",
+ dataset={
+ "path": "mteb/i_fly_tek",
+ "revision": "a435e336f513cbd9175503bf156bcdbbdaae7682",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation"],
+ eval_langs=["cmn-Hans"],
+ main_score="accuracy",
+ date=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ annotations_creators=None,
+ dialect=None,
+ sample_creation=None,
+ bibtex_citation=r"""
+@inproceedings{xu-etal-2020-clue,
+ abstract = {The advent of natural language understanding (NLU) benchmarks for English, such as GLUE and SuperGLUE allows new NLU models to be evaluated across a diverse set of tasks. These comprehensive benchmarks have facilitated a broad range of research and applications in natural language processing (NLP). The problem, however, is that most such benchmarks are limited to English, which has made it difficult to replicate many of the successes in English NLU for other languages. To help remedy this issue, we introduce the first large-scale Chinese Language Understanding Evaluation (CLUE) benchmark. CLUE is an open-ended, community-driven project that brings together 9 tasks spanning several well-established single-sentence/sentence-pair classification tasks, as well as machine reading comprehension, all on original Chinese text. To establish results on these tasks, we report scores using an exhaustive set of current state-of-the-art pre-trained Chinese models (9 in total). We also introduce a number of supplementary datasets and additional tools to help facilitate further progress on Chinese NLU. Our benchmark is released at https://www.cluebenchmarks.com},
+ address = {Barcelona, Spain (Online)},
+ author = {Xu, Liang and
+Hu, Hai and
+Zhang, Xuanwei and
+Li, Lu and
+Cao, Chenjie and
+Li, Yudong and
+Xu, Yechen and
+Sun, Kai and
+Yu, Dian and
+Yu, Cong and
+Tian, Yin and
+Dong, Qianqian and
+Liu, Weitang and
+Shi, Bo and
+Cui, Yiming and
+Li, Junyi and
+Zeng, Jun and
+Wang, Rongzhao and
+Xie, Weijian and
+Li, Yanting and
+Patterson, Yina and
+Tian, Zuoyu and
+Zhang, Yiwen and
+Zhou, He and
+Liu, Shaoweihua and
+Zhao, Zhe and
+Zhao, Qipeng and
+Yue, Cong and
+Zhang, Xinrui and
+Yang, Zhengliang and
+Richardson, Kyle and
+Lan, Zhenzhong },
+ booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
+ doi = {10.18653/v1/2020.coling-main.419},
+ month = dec,
+ pages = {4762--4772},
+ publisher = {International Committee on Computational Linguistics},
+ title = {{CLUE}: A {C}hinese Language Understanding Evaluation Benchmark},
+ url = {https://aclanthology.org/2020.coling-main.419},
+ year = {2020},
+}
+""",
+ prompt="Given an App description text, find the appropriate fine-grained category",
+ adapted_from=["IFlyTek"],
+ )
+
+ samples_per_label = 32
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["n_experiments"] = 5
+ return metadata_dict
+
+
class MultilingualSentiment(AbsTaskClassification):
+ superseded_by = "MultilingualSentiment.v2"
metadata = TaskMetadata(
name="MultilingualSentiment",
description="A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative",
@@ -186,7 +346,39 @@ class MultilingualSentiment(AbsTaskClassification):
samples_per_label = 32
+class MultilingualSentimentV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="MultilingualSentiment.v2",
+ description="""A collection of multilingual sentiments datasets grouped into 3 classes -- positive, neutral, negative
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/tyqiangz/multilingual-sentiment-datasets",
+ dataset={
+ "path": "mteb/multilingual_sentiment",
+ "revision": "0b29d56f2b01f431e809942450b8cb7c9a496b99",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["validation", "test"],
+ eval_langs=["cmn-Hans"],
+ main_score="accuracy",
+ date=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ annotations_creators=None,
+ dialect=None,
+ sample_creation=None,
+ bibtex_citation=None,
+ prompt="Classify sentiment of the customer review into positive, neutral, or negative",
+ adapted_from=["MultilingualSentiment"],
+ )
+
+ samples_per_label = 32
+
+
class JDReview(AbsTaskClassification):
+ superseded_by = "JDReview.v2"
metadata = TaskMetadata(
name="JDReview",
description="review for iphone",
@@ -222,6 +414,44 @@ class JDReview(AbsTaskClassification):
samples_per_label = 32
+class JDReviewV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="JDReview.v2",
+ description="""review for iphone
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2023.nodalida-1.20/",
+ dataset={
+ "path": "mteb/jd_review",
+ "revision": "43fcb7f8f4079c749f748e966e485634c65e6ae4",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["cmn-Hans"],
+ main_score="accuracy",
+ date=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ annotations_creators=None,
+ dialect=None,
+ sample_creation=None,
+ bibtex_citation=r"""
+@article{xiao2023c,
+ author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas},
+ journal = {arXiv preprint arXiv:2309.07597},
+ title = {C-pack: Packaged resources to advance general chinese embedding},
+ year = {2023},
+}
+""",
+ prompt="Classify the customer review for iPhone on e-commerce platform into positive or negative",
+ adapted_from=["JDReview"],
+ )
+
+ samples_per_label = 32
+
+
class OnlineShopping(AbsTaskClassification):
metadata = TaskMetadata(
name="OnlineShopping",
@@ -259,6 +489,7 @@ class OnlineShopping(AbsTaskClassification):
class Waimai(AbsTaskClassification):
+ superseded_by = "Waimai.v2"
metadata = TaskMetadata(
name="Waimai",
description="Sentiment Analysis of user reviews on takeaway platforms",
@@ -292,3 +523,41 @@ class Waimai(AbsTaskClassification):
)
samples_per_label = 32
+
+
+class WaimaiV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="Waimai.v2",
+ description="""Sentiment Analysis of user reviews on takeaway platforms
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://aclanthology.org/2023.nodalida-1.20/",
+ dataset={
+ "path": "mteb/waimai",
+ "revision": "29d99f78f6f1d577e1c28a097883c12a4dc88283",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["cmn-Hans"],
+ main_score="accuracy",
+ date=None,
+ domains=None,
+ task_subtypes=None,
+ license=None,
+ annotations_creators=None,
+ dialect=None,
+ sample_creation=None,
+ bibtex_citation=r"""
+@article{xiao2023c,
+ author = {Xiao, Shitao and Liu, Zheng and Zhang, Peitian and Muennighof, Niklas},
+ journal = {arXiv preprint arXiv:2309.07597},
+ title = {C-pack: Packaged resources to advance general chinese embedding},
+ year = {2023},
+}
+""",
+ prompt="Classify the customer review from a food takeaway platform into positive or negative",
+ adapted_from=["Waimai"],
+ )
+
+ samples_per_label = 32
diff --git a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py
index 7c6134a731..76f187f4a4 100644
--- a/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py
+++ b/mteb/tasks/Classification/zho/YueOpenriceReviewClassification.py
@@ -5,6 +5,7 @@
class YueOpenriceReviewClassification(AbsTaskClassification):
+ superseded_by = "YueOpenriceReviewClassification.v2"
metadata = TaskMetadata(
name="YueOpenriceReviewClassification",
description="A Cantonese dataset for review classification",
@@ -44,3 +45,47 @@ def dataset_transform(self):
self.dataset = self.stratified_subsampling(
self.dataset, seed=self.seed, splits=["test"]
)
+
+
+class YueOpenriceReviewClassificationV2(AbsTaskClassification):
+ metadata = TaskMetadata(
+ name="YueOpenriceReviewClassification.v2",
+ description="""A Cantonese dataset for review classification
+ This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+ reference="https://github.com/Christainx/Dataset_Cantonese_Openrice",
+ dataset={
+ "path": "mteb/yue_openrice_review",
+ "revision": "702b7ebe3b3ac712f1c31e87ab7171b1f1ca6b6b",
+ },
+ type="Classification",
+ category="s2s",
+ modalities=["text"],
+ eval_splits=["test"],
+ eval_langs=["yue-Hant"],
+ main_score="accuracy",
+ date=("2019-01-01", "2019-05-01"),
+ domains=["Reviews", "Spoken"],
+ task_subtypes=["Sentiment/Hate speech"],
+ license="not specified",
+ annotations_creators="human-annotated",
+ dialect=[],
+ sample_creation="found",
+ bibtex_citation=r"""
+@inproceedings{xiang2019sentiment,
+ author = {Xiang, Rong and Jiao, Ying and Lu, Qin},
+ booktitle = {Proceedings of the 8th KDD Workshop on Issues of Sentiment Discovery and Opinion Mining (WISDOM)},
+ organization = {KDD WISDOM},
+ pages = {1--9},
+ title = {Sentiment Augmented Attention Network for Cantonese Restaurant Review Analysis},
+ year = {2019},
+}
+""",
+ adapted_from=["YueOpenriceReviewClassification"],
+ )
+
+ samples_per_label = 32
+
+ def dataset_transform(self):
+ self.dataset = self.stratified_subsampling(
+ self.dataset, seed=self.seed, splits=["test"]
+ )
diff --git a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py
index f8ca8c8e36..60c1dc8736 100644
--- a/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py
+++ b/mteb/tasks/Classification/zul/IsiZuluNewsClassification.py
@@ -5,6 +5,7 @@
class IsiZuluNewsClassification(AbsTaskClassification):
+ superseded_by = "IsiZuluNewsClassification.v2"
metadata = TaskMetadata(
name="IsiZuluNewsClassification",
description="isiZulu News Classification Dataset",
@@ -41,3 +42,41 @@ class IsiZuluNewsClassification(AbsTaskClassification):
def dataset_transform(self):
self.dataset = self.dataset.rename_columns({"title": "text"})
+
+
class IsiZuluNewsClassificationV2(AbsTaskClassification):
    """Corrected (v2) release of IsiZuluNewsClassification.

    Same news-topic task, re-pointed at the cleaned dataset revision; the v1
    task is expected to set ``superseded_by`` to this one.
    """

    metadata = TaskMetadata(
        name="IsiZuluNewsClassification.v2",
        description="""isiZulu News Classification Dataset
    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
        reference="https://huggingface.co/datasets/dsfsi/za-isizulu-siswati-news",
        dataset={
            "path": "mteb/isi_zulu_news",
            "revision": "45708aaaf9c6133227ea5db5cf26571facb9ccdb",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["zul-Latn"],
        main_score="accuracy",
        date=("2022-08-01", "2022-08-01"),
        domains=["News", "Written"],
        task_subtypes=["Topic classification"],
        license="cc-by-sa-4.0",
        annotations_creators="human-annotated",
        dialect=[],
        sample_creation="found",
        bibtex_citation=r"""
@article{Madodonga_Marivate_Adendorff_2023,
  author = {Madodonga, Andani and Marivate, Vukosi and Adendorff, Matthew},
  doi = {10.55492/dhasa.v4i01.4449},
  month = {Jan.},
  title = {Izindaba-Tindzaba: Machine learning news categorisation for Long and Short Text for isiZulu and Siswati},
  url = {https://upjournals.up.ac.za/index.php/dhasa/article/view/4449},
  volume = {4},
  year = {2023},
}
""",
        adapted_from=["IsiZuluNewsClassification"],
    )
diff --git a/scripts/data/clean_and_update_tasks.py b/scripts/data/clean_and_update_tasks.py
new file mode 100644
index 0000000000..e74576b14b
--- /dev/null
+++ b/scripts/data/clean_and_update_tasks.py
@@ -0,0 +1,1216 @@
+from __future__ import annotations
+
+import ast
+import importlib.util
+import re
+import traceback
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal, Optional
+
+import datasets
+import orjson
+import pandas as pd
+import typer
+from datasets import Dataset, DatasetDict, load_dataset
+from huggingface_hub import HfApi
+from tqdm import tqdm
+
+"""
+This script is designed for data cleaning and the automatic creation of new task versions with updated data.
+Currently, it supports only monolingual classification tasks.
+
+The results of this script can be seen in the following PRs:
+- https://github.com/embeddings-benchmark/mteb/pull/2632
+- https://github.com/embeddings-benchmark/mteb/pull/2900
+"""
+
+app = typer.Typer()
+datasets.logging.set_verbosity_error()
+datasets.logging.disable_progress_bar()
+
+
@dataclass
class TaskMetadataInfo:
    """AST-extracted summary of one task class's TaskMetadata block."""

    class_name: str  # Python class name, e.g. "FooClassificationV2"
    name: str  # TaskMetadata ``name`` field, e.g. "FooClassification.v2"
    version: int  # parsed from the trailing "V<n>" of the class name (1 if absent)
    dataset: dict[str, str]  # TaskMetadata ``dataset`` dict (path, revision, ...)
    eval_splits: list[str]  # splits the task is evaluated on
    eval_langs: list[str]  # language codes, e.g. ["eng-Latn"]
+
+
def format_scores(
    scores_files: list[Path],
    filter_multiple_model_versions: bool = False,
    name_type: Literal["model", "parent"] = "model",
    filter_missing_scores: bool = False,
    use_all_subsets: bool = False,
) -> pd.DataFrame:
    """Flatten mteb per-task result JSON files into one scores DataFrame.

    Args:
        scores_files: paths to result JSONs; the model name is taken from a
            parent directory of each file.
        filter_multiple_model_versions: keep only the newest file (by mtime)
            per (model, task) pair.
        name_type: "model" takes the grandparent directory as the model name,
            "parent" the great-grandparent.
        filter_missing_scores: drop models with fewer rows than the modal row
            count across models.
        use_all_subsets: emit one row per (subset, score) instead of only the
            first of test/dev/train.
    """
    if filter_multiple_model_versions:
        # Keep only the most recently modified file per (model, task).
        latest_files: dict[tuple[str, str], Path] = {}
        for scores_file in scores_files:
            model_name = (
                scores_file.parent.parent.name
                if name_type == "model"
                else scores_file.parent.parent.parent.name
            )
            task_id = scores_file.name
            key = (model_name, task_id)
            if (
                key not in latest_files
                or scores_file.stat().st_mtime > latest_files[key].stat().st_mtime
            ):
                latest_files[key] = scores_file
        filtered_scores_files = list(latest_files.values())
    else:
        filtered_scores_files = scores_files

    scores_data = []
    for scores_file in filtered_scores_files:
        try:
            # Some result files contain bare NaN tokens, which orjson rejects.
            s = orjson.loads(scores_file.read_bytes().replace(b"NaN", b"null"))
            if use_all_subsets:
                for subset in s["scores"]:
                    scores_data.extend(
                        {
                            "task_name": s["task_name"],
                            "evaluation_time": s.get(
                                "evaluation_time", None
                            ),  # Handle missing eval time
                            "model_name": scores_file.parent.parent.name
                            if name_type == "model"
                            else scores_file.parent.parent.parent.name,
                            "subset": subset,
                            **score,
                        }
                        for score in s["scores"][subset]
                    )

            else:
                score_set = s["scores"].get(
                    "test", s["scores"].get("dev", s["scores"].get("train"))
                )
                if score_set is None:
                    warnings.warn(
                        f"No 'test' or 'dev' or 'train' scores found in {scores_file}",
                        stacklevel=2,
                    )
                    continue

                scores_data.extend(
                    {
                        "task_name": s["task_name"],
                        "evaluation_time": s.get("evaluation_time", None),
                        "model_name": scores_file.parent.parent.name
                        if name_type == "model"
                        else scores_file.parent.parent.parent.name,
                        **score,
                    }
                    for score in score_set
                )
        except Exception as e:
            warnings.warn(f"Error processing file {scores_file}: {e}", stacklevel=2)

    if not scores_data:
        return pd.DataFrame()

    scores = pd.DataFrame(scores_data)
    scores["languages"] = scores["languages"].apply(
        lambda x: ",".join(x) if isinstance(x, list) else x
    )
    # Result directories encode "/" in model ids as "__"; recover the original.
    scores["model_name_original"] = scores["model_name"].str.replace("__", "/")

    scores = scores.drop_duplicates(
        subset=["task_name", "model_name", "languages", "hf_subset", "main_score"]
    )
    scores = scores.sort_values(["task_name", "main_score"], ascending=[True, False])

    # Models with fewer rows than the modal count are likely missing task runs.
    mode_count = scores.groupby("model_name")["main_score"].count().mode().iloc[0]
    model_counts = scores.groupby("model_name")["main_score"].count()
    filtered_models = model_counts[model_counts < mode_count].index.tolist()

    if filtered_models:
        print(
            f"WARNING: The following models have fewer scores than the mode ({mode_count}):"
        )
        for model in filtered_models:
            print(f" - {model}: {model_counts[model]} scores")

    if filter_missing_scores:
        scores = scores[~scores["model_name"].isin(filtered_models)]

    return scores
+
+
def find_class_node(
    module: ast.Module, class_name: str
) -> tuple[ast.ClassDef | None, int]:
    """Locate the top-level class named *class_name* in *module*.

    Returns (node, index in module.body), or (None, -1) when absent.
    """
    matches = (
        (stmt, position)
        for position, stmt in enumerate(module.body)
        if isinstance(stmt, ast.ClassDef) and stmt.name == class_name
    )
    return next(matches, (None, -1))
+
+
def find_latest_class_name(module: ast.Module) -> str | None:
    """Return the base name of the class family with the highest version.

    Class names follow ``<Base>`` (implicitly v1) or ``<Base>V<n>``; the
    family whose newest member has the largest version number wins. Returns
    None when the module contains no classes.
    """
    # BUG FIX: the original pattern had anonymous "(?P" groups — invalid regex
    # syntax; m.group("base") / m.group("ver") require named groups.
    version_pattern = re.compile(r"^(?P<base>.+?)(?:V(?P<ver>\d+))?$")
    groups: dict[str, list[tuple[int, ast.ClassDef, int]]] = {}
    for idx, node in enumerate(module.body):
        if isinstance(node, ast.ClassDef):
            m = version_pattern.match(node.name)
            if not m:
                continue
            base = m.group("base")
            ver = int(m.group("ver")) if m.group("ver") else 1
            groups.setdefault(base, []).append((ver, node, idx))
    if not groups:
        return None
    base_class_name, _ = max(groups.items(), key=lambda kv: max(e[0] for e in kv[1]))
    return base_class_name
+
+
def read_lines(file_path: Path) -> list[str]:
    """Return the file's text split into lines, terminators included."""
    content = file_path.read_text()
    return content.splitlines(keepends=True)
+
+
def write_lines(file_path: Path, lines: list[str]) -> None:
    """Write *lines* (which already carry their terminators) back to disk."""
    joined = "".join(lines)
    file_path.write_text(joined)
+
+
def resolve_local_variable(
    var_name: str, func_node: ast.FunctionDef
) -> ast.expr | None:
    """Return the value expression of the LAST top-level assignment to
    *var_name* inside *func_node*, or None when it is never assigned."""
    for stmt in func_node.body[::-1]:
        if not isinstance(stmt, ast.Assign):
            continue
        assigns_var = any(
            isinstance(tgt, ast.Name) and tgt.id == var_name
            for tgt in stmt.targets
        )
        if assigns_var:
            return stmt.value
    return None
+
+
def update_class_header(
    new_block: list[str], old_name: str, base_name: str, new_suffix: str
) -> list[str]:
    """Rename the class on the block's first line to ``base_name + new_suffix``.

    Mutates *new_block* in place (only line 0) and returns it.
    """
    pattern = re.compile(r"^(\s*class\s+)" + re.escape(old_name) + r"(\b.*)$")
    replacement = r"\1" + base_name + new_suffix + r"\2"
    new_block[0] = pattern.sub(replacement, new_block[0])
    return new_block
+
+
def update_task_metadata(
    new_block: list[str], old_name: str, base_name: str, new_meta_suffix: str
) -> list[str]:
    """Rewrite the ``TaskMetadata(...)`` call inside a copied class block.

    Appends *new_meta_suffix* (e.g. ".v2") to the ``name=`` value and ensures
    ``adapted_from=["<old_name>"]`` is present, replacing an existing
    ``adapted_from`` line when there is one. Mutates and returns *new_block*.
    """
    within = False  # currently inside the TaskMetadata(...) call
    parens = 0  # open-parenthesis balance while inside the call
    metadata_start_index = -1
    metadata_end_index = -1
    indent = -1  # indentation of the call's keyword-argument lines
    adapted_from_index = -1  # line index of an existing adapted_from= keyword

    for i, line in enumerate(new_block):
        if not within and re.match(r"\s*metadata\s*=\s*TaskMetadata\s*\(", line):
            within = True
            metadata_start_index = i
            parens = line.count("(") - line.count(")")
            # Degenerate one-line call: "metadata = TaskMetadata()".
            if line.strip().endswith("()"):
                parens = 0
                metadata_end_index = i
                within = False
        elif within:
            if "adapted_from" in line:
                adapted_from_index = i
            parens += line.count("(") - line.count(")")
            # Append the version suffix to the name="..." keyword.
            m = re.match(r'^(\s*name\s*=\s*")([^"]*)(".*)$', line)
            if m:
                # NOTE: overrides the base_name parameter with the name parsed
                # from the block itself (any existing ".v<n>" suffix stripped).
                base_name = re.sub(r"\.v\d+$", "", m.group(2))
                new_block[i] = (
                    m.group(1) + base_name + new_meta_suffix + m.group(3) + "\n"
                )
            if indent == -1 and line.strip() and not line.strip().startswith("#"):
                indent = len(line) - len(line.lstrip(" "))
            if parens <= 0:
                within = False
                metadata_end_index = i

    if metadata_start_index != -1:
        if adapted_from_index != -1:
            # Replace the existing adapted_from=... line wholesale.
            line = new_block[adapted_from_index]
            line_indent = len(line) - len(line.lstrip(" "))
            new_block[adapted_from_index] = (
                f'{" " * line_indent}adapted_from=["{old_name}"],\n'
            )
        else:
            if indent == -1:
                line = new_block[metadata_end_index]
                indent = (len(line) - len(line.lstrip())) + 4

            line_to_insert_before_idx = metadata_end_index
            # handle case where last line is just ')'
            if new_block[line_to_insert_before_idx].strip() == ")":
                line_to_insert_before_idx -= 1

            # Ensure the preceding keyword line ends with a comma before we
            # insert a new keyword after it.
            last_line_idx = -1
            for i in range(line_to_insert_before_idx, -1, -1):
                if new_block[i].strip():
                    last_line_idx = i
                    break

            if last_line_idx != -1:
                if not new_block[last_line_idx].rstrip().endswith(","):
                    new_block[last_line_idx] = new_block[last_line_idx].rstrip() + ",\n"

            new_line = " " * indent + f'adapted_from=["{old_name}"],\n'
            new_block.insert(metadata_end_index, new_line)
    return new_block
+
+
def handle_dataset_transform(
    new_block: list[str], block: list[str], ds: DatasetDict
) -> list[str]:
    """Adjust ``dataset_transform`` in the copied v2 class block.

    The v2 dataset already has the cleaning baked in, so:
    - no stratified_subsampling in the original -> drop the whole method;
    - subsampling present but every targeted split of *ds* already fits within
      ``n_samples`` -> drop the whole method;
    - subsampling still needed -> keep ONLY the subsampling statement(s).

    Mutates and returns *new_block*; *block* is the untouched original whose
    line numbers match the re-extracted statements.
    """
    text = "".join(new_block)
    module = ast.parse(text)
    class_node = next(node for node in module.body if isinstance(node, ast.ClassDef))

    transform_node = None
    for node in class_node.body:
        if isinstance(node, ast.FunctionDef) and node.name == "dataset_transform":
            transform_node = node
            break

    if transform_node:
        subsampling_calls = []
        for node in ast.walk(transform_node):
            if (
                isinstance(node, ast.Call)
                and isinstance(node.func, ast.Attribute)
                and node.func.attr == "stratified_subsampling"
            ):
                subsampling_calls.append(node)

        if not subsampling_calls:
            # No subsampling: the method's work is baked into the v2 data.
            start = transform_node.lineno - 1
            end = transform_node.end_lineno
            del new_block[start:end]
        else:
            # Defaults used when the call does not spell out the arguments.
            n_samples = 2048
            splits = ["test"]
            for call in subsampling_calls:
                for kw in call.keywords:
                    if kw.arg == "n_samples":
                        value_node = kw.value
                        # n_samples may be a local variable; chase its last
                        # assignment inside the method.
                        if isinstance(value_node, ast.Name):
                            resolved_node = resolve_local_variable(
                                value_node.id, transform_node
                            )
                            if resolved_node:
                                value_node = resolved_node
                        try:
                            n_samples = ast.literal_eval(value_node)
                        except (ValueError, TypeError):
                            pass  # Keep default
                    elif kw.arg == "splits":
                        splits = ast.literal_eval(kw.value)

            needs_subsampling = any(
                len(ds[split]) > n_samples for split in splits if split in ds
            )

            if needs_subsampling:
                # Re-extract only the subsampling statements from the ORIGINAL
                # block, where statement line numbers are still valid.
                original_class_lines = "".join(block)
                original_class_module = ast.parse(original_class_lines)
                original_class_node = next(
                    node
                    for node in original_class_module.body
                    if isinstance(node, ast.ClassDef)
                )

                original_transform_node = None
                for node in original_class_node.body:
                    if (
                        isinstance(node, ast.FunctionDef)
                        and node.name == "dataset_transform"
                    ):
                        original_transform_node = node
                        break

                new_transform_body = []
                if original_transform_node:
                    for stmt in original_transform_node.body:
                        is_subsampling_stmt = False
                        for node in ast.walk(stmt):
                            if (
                                isinstance(node, ast.Call)
                                and isinstance(node.func, ast.Attribute)
                                and node.func.attr == "stratified_subsampling"
                            ):
                                is_subsampling_stmt = True
                                break
                        if is_subsampling_stmt:
                            stmt_start_line = stmt.lineno - 1
                            stmt_end_line = (
                                stmt.end_lineno
                                if stmt.end_lineno
                                else stmt_start_line + 1
                            )
                            new_transform_body.extend(
                                block[stmt_start_line:stmt_end_line]
                            )

                # Remove the method body (keeping the def line), then splice
                # the subsampling statements back in.
                start = (
                    transform_node.body[0].lineno - 1
                    if transform_node.body
                    else transform_node.lineno
                )
                end = transform_node.end_lineno
                del new_block[start:end]

                if new_transform_body:
                    indent = (
                        " " * (transform_node.body[0].col_offset)
                        if transform_node.body
                        else " " * (transform_node.col_offset + 4)
                    )
                    # NOTE(review): lstrip flattens continuation-line
                    # indentation of multi-line statements — confirm output
                    # still parses for such blocks.
                    new_transform_body_lines = [
                        f"{indent}{line.lstrip()}" for line in new_transform_body
                    ]
                    new_block.insert(start, "".join(new_transform_body_lines))

            else:
                start = transform_node.lineno - 1
                end = transform_node.end_lineno
                del new_block[start:end]
    return new_block
+
+
def get_v2_block(
    block: list[str],
    old_name: str,
    base_name: str,
    new_suffix: str,
    new_meta_suffix: str,
    ds: DatasetDict,
) -> list[str]:
    """Derive the new-version class block from the original class's lines:
    rename the class, patch its metadata, and prune dataset_transform."""
    candidate = block.copy()
    candidate = update_class_header(candidate, old_name, base_name, new_suffix)
    candidate = update_task_metadata(candidate, old_name, base_name, new_meta_suffix)
    return handle_dataset_transform(candidate, block, ds)
+
+
def deduplicate(dataset: Dataset) -> Dataset:
    """Drop rows whose stripped text duplicates that of an earlier row."""
    seen: set[str] = set()
    keep: list[int] = []
    for idx, raw in enumerate(dataset["text"]):
        stripped = raw.strip()
        if stripped in seen:
            continue
        seen.add(stripped)
        keep.append(idx)

    return dataset.select(keep)
+
+
def filter_empty(dataset: Dataset) -> Dataset:
    """Drop rows whose text is blank after stripping whitespace."""
    def _has_content(row):
        return len(row["text"].strip()) > 0

    return dataset.filter(_has_content)
+
+
def filter_leakage(train_dataset: Dataset, test_dataset: Dataset) -> Dataset:
    """Remove test rows whose exact text also appears in the train split."""
    seen_in_train = set(train_dataset["text"])
    keep = [
        idx
        for idx, text in enumerate(test_dataset["text"])
        if text not in seen_in_train
    ]
    return test_dataset.select(keep)
+
+
def filter_controversial(dataset_dict: DatasetDict) -> DatasetDict:
    """Drop texts that appear with more than one distinct label anywhere
    across the splits (matching case- and whitespace-insensitively)."""
    # Map normalized text -> set of labels observed for it across all splits.
    normalized: dict[str, set[str | tuple[str, ...]]] = {}
    for _, ds in dataset_dict.items():
        for text, label in zip(ds["text"], ds["label"]):
            key = text.strip().lower()
            # Lists are unhashable; fold multi-label values to tuples.
            normalized.setdefault(key, set()).add(
                label if isinstance(label, (str, int, float)) else tuple(label)
            )
    # Texts with conflicting annotations are removed from every split.
    bad_texts = {t for t, labels in normalized.items() if len(labels) > 1}
    return DatasetDict(
        {
            split: ds.filter(lambda x: x["text"].strip().lower() not in bad_texts)
            for split, ds in dataset_dict.items()
        }
    )
+
+
def filter_short(dataset: Dataset, min_words: int = 3) -> Dataset:
    """Keep only rows with at least *min_words* whitespace-separated tokens."""
    def _long_enough(row):
        return len(row["text"].strip().split()) >= min_words

    return dataset.filter(_long_enough)
+
+
def calculate_inner_indent(lines: list[str], node: ast.ClassDef) -> int:
    """Infer the indentation (in columns) of *node*'s body.

    Scans the lines after the class header for the first non-blank one; when
    the body is entirely blank, falls back to the header indent plus four.
    """
    header = lines[node.lineno - 1]
    header_indent = len(header) - len(header.lstrip(" "))
    for candidate in lines[node.lineno : node.end_lineno]:
        if candidate.strip():
            return len(candidate) - len(candidate.lstrip(" "))
    return header_indent + 4
+
+
def load_and_transform(file_path: Path, metadata: TaskMetadataInfo) -> DatasetDict:
    # NOTE: this calls the module-local ``load_dataset`` defined below (which
    # executes the task file with subsampling stubbed out), NOT the
    # ``datasets.load_dataset`` imported at the top of this module — the local
    # definition shadows it.
    return load_dataset(file_path, metadata.class_name)
+
+
def split_train_test(
    ds: DatasetDict, metadata: TaskMetadataInfo
) -> tuple[DatasetDict, bool, list[tuple[str, str, int]]]:
    """Create a test split for tasks evaluated only on "train".

    Labels with a single sample are dropped first (stratification needs at
    least two per class), then the train split is stratified-split into
    train/test and ``metadata.eval_splits`` is rewritten to ["test"].

    Returns (dataset, changed?, report rows of (filter, split, removed)).
    """
    report: list[tuple[str, str, int]] = []
    is_changed = False
    # BUG FIX: eval_splits is a list[str]; the original compared it against the
    # string "train", so this branch could never trigger.
    if "train" in ds and metadata.eval_splits == ["train"]:
        is_changed = True
        before = len(ds["train"])
        # ClassLabel is required for stratify_by_column below.
        ds["train"] = ds["train"].cast_column(
            "label", datasets.ClassLabel(names=list(set(ds["train"]["label"])))
        )
        label_counts = pd.Series(ds["train"]["label"]).value_counts()
        one_sample_labels = set(label_counts[label_counts == 1].index.tolist())

        if len(one_sample_labels) > 0:
            before_size = len(ds["train"])
            ds["train"] = ds["train"].filter(
                lambda x: x["label"] not in one_sample_labels
            )
            removed = before_size - len(ds["train"])
            if removed > 0:
                report.append(("filter_one_sample_labels", "train", removed))

        # NOTE: test_size is based on the pre-filter size, so it can be
        # slightly large when one-sample labels were removed above.
        splits = ds["train"].train_test_split(
            test_size=min(2048, before // 2), seed=42, stratify_by_column="label"
        )
        ds = DatasetDict({"train": splits["train"], "test": splits["test"]})
        report.append(("create_test_split", "train_to_test", before - len(ds["train"])))
        metadata.eval_splits = ["test"]
    return ds, is_changed, report
+
+
def clean_dataset(
    ds: DatasetDict,
    metadata: TaskMetadataInfo,
) -> tuple[DatasetDict, list[tuple[str, str, int]], bool]:
    """Run the standard cleaning pipeline on a classification dataset.

    Steps: drop empty texts, deduplicate, drop very short texts (skipped for
    languages written without word spaces), create a test split when needed,
    remove train->eval leakage, and drop texts with conflicting labels.

    Returns (cleaned dataset, report rows of (filter, split, removed), changed?).
    """
    report: list[tuple[str, str, int]] = []
    is_changed = False

    # Word-count filtering is meaningless for scripts without whitespace
    # word boundaries.
    skip_codes = {"zho", "jpn", "tha", "mya", "cmn"}
    apply_short = not any(
        lang.split("-")[0] in skip_codes for lang in metadata.eval_langs
    )

    transforms = [
        ("filter_empty", filter_empty),
        ("deduplicate", deduplicate),
    ]
    if apply_short:
        transforms.append(("filter_short", filter_short))

    for split in ["train", *metadata.eval_splits]:
        if split not in ds:
            continue
        for name, fn in transforms:
            before = len(ds[split])
            ds[split] = fn(ds[split])
            removed = before - len(ds[split])
            if removed > 0:
                is_changed = True
                report.append((name, split, removed))

    ds, is_changed_after_split, split_report = split_train_test(ds, metadata)
    report.extend(split_report)
    is_changed = is_changed or is_changed_after_split

    # Leakage can only exist relative to a train split.
    if "train" in ds:
        for split in metadata.eval_splits:
            if split == "train":
                continue
            before_eval = len(ds[split])
            # BUG FIX: the original assigned the filtered split to ds["test"]
            # regardless of which eval split was being processed (and then
            # measured the removal on ds[split], so it always reported 0).
            ds[split] = filter_leakage(ds["train"], ds[split])
            removed = before_eval - len(ds[split])
            if removed > 0:
                is_changed = True
                report.append(("filter_leakage", split, removed))

    orig = {split: len(ds[split]) for split in ds}
    ds = filter_controversial(ds)
    for split in ds:
        removed = orig[split] - len(ds[split])
        if removed > 0:
            is_changed = True
            report.append(("filter_controversial", split, removed))

    return ds, report, is_changed
+
+
def print_report(
    report_folder: Path,
    language: str,
    original_records: list[tuple[str, str, int]],
    filter_records: list[tuple[str, str, str, int]],
) -> None:
    """Write a markdown cleaning report to ``report_<language>.md``.

    Two tables: original per-split sizes, then per-filter removal counts.
    """
    size_section = [
        "## Original Sizes",
        "| Task | Split | Original Size |",
        "|------|:-----:|--------------:|",
    ]
    size_section += [f"| {task} | {split} | {size} |" for task, split, size in original_records]

    filter_section = [
        "",
        "## Cleaning Report",
        "| Task | Filter | Split | Removed |",
        "|------|--------|:-----:|--------:|",
    ]
    filter_section += [
        f"| {task} | {name} | {split} | {removed} |"
        for task, name, split, removed in filter_records
    ]

    content = "\n".join(size_section + filter_section)
    (report_folder / f"report_{language}.md").write_text(content)
+
+
def push_dataset(ds: DatasetDict, metadata: TaskMetadataInfo, username: str) -> str:
    """Upload the cleaned dataset to the Hub and return the target repo id.

    An existing ``mteb/...`` path keeps its repo name under *username*;
    otherwise the repo name is the snake-cased class name without the
    "Classification" suffix.
    """
    current_path = metadata.dataset.get("path", "")
    if current_path.startswith("mteb/") and username != "mteb":
        # Keep the existing repo name, just swap the namespace.
        target = current_path.replace("mteb/", f"{username}/")
    else:
        stem = metadata.class_name
        suffix = "Classification"
        if stem.endswith(suffix):
            stem = stem[: -len(suffix)]
        # CamelCase -> snake_case.
        snake = re.sub(r"(.)([A-Z][a-z]+)", r"\1_\2", stem)
        snake = re.sub(r"([a-z0-9])([A-Z])", r"\1_\2", snake).lower()
        target = f"{username}/{snake}"
    ds.push_to_hub(target, config_name=metadata.dataset.get("name", "default"))
    return target
+
+
def update_metadata(
    file_path: Path, class_name: str, new_ver: int, repo_id: str, pr_id: int
) -> None:
    """Pin the freshly created V<new_ver> class to the uploaded dataset.

    Performs a network call to the Hugging Face Hub to resolve the latest
    commit of *repo_id* (presumably the revision just pushed — confirm
    list_repo_commits ordering), then rewrites the class's metadata on disk.
    """
    api = HfApi()
    commit = api.list_repo_commits(repo_id=repo_id, repo_type="dataset")[0].commit_id
    update_v2_metadata_dataset(
        file_path, class_name + f"V{new_ver}", repo_id, commit, pr_id
    )
+
+
def parse_metadata_dataset(file_path: Path, class_name: str) -> dict[str, str]:
    """Extract the literal ``dataset={...}`` from *class_name*'s TaskMetadata.

    Returns an empty dict when the class has no metadata/dataset keyword.
    """
    module = ast.parse(file_path.read_text())
    class_node, _ = find_class_node(module, class_name)
    for stmt in class_node.body:
        if not isinstance(stmt, ast.Assign):
            continue
        first_target = stmt.targets[0]
        if not (isinstance(first_target, ast.Name) and first_target.id == "metadata"):
            continue
        for kw in stmt.value.keywords:
            if kw.arg == "dataset":
                return ast.literal_eval(kw.value)
    return {}
+
+
def get_transform_statements(file_path: Path, class_name: str) -> list[ast.stmt]:
    """Return ``dataset_transform``'s body statements for *class_name*,
    excluding ``<target> = self.stratified_subsampling(...)`` assignments."""
    module = ast.parse(file_path.read_text())
    class_node, _ = find_class_node(module, class_name)

    def _is_subsampling_assign(stmt: ast.stmt) -> bool:
        return (
            isinstance(stmt, ast.Assign)
            and isinstance(stmt.value, ast.Call)
            and isinstance(stmt.value.func, ast.Attribute)
            and stmt.value.func.attr == "stratified_subsampling"
        )

    for member in class_node.body:
        if isinstance(member, ast.FunctionDef) and member.name == "dataset_transform":
            return [s for s in member.body if not _is_subsampling_assign(s)]
    return []
+
+
def load_dataset(file_path: Path, class_name: str) -> DatasetDict:
    """Execute the task file with ``stratified_subsampling`` calls stubbed out
    and return the task's loaded dataset.

    The file is temporarily rewritten on disk, imported, then restored.

    NOTE: intentionally shadows ``datasets.load_dataset`` imported at the top
    of this module; callers here always want this local version.
    """
    original = file_path.read_text()
    lines = original.splitlines(keepends=True)
    filtered: list[str] = []
    skip = False
    parens = 0
    stub = "pass\n"

    for line in lines:
        if not skip and "stratified_subsampling" in line:
            # Replace the whole (possibly multi-line) statement with a "pass"
            # at the statement's own indentation.
            stub = line[: len(line) - len(line.lstrip())] + "pass\n"
            parens = line.count("(") - line.count(")")
            if parens <= 0:
                # BUG FIX: a single-line call used to enter skip mode anyway,
                # consuming one extra line of the file.
                filtered.append(stub)
            else:
                skip = True
            continue
        if skip:
            parens += line.count("(") - line.count(")")
            if parens <= 0:
                skip = False
                # BUG FIX: was `filtered.append(" pass")` — one space of indent
                # and no newline, which fused with the following line.
                filtered.append(stub)
            continue
        filtered.append(line)

    file_path.write_text("".join(filtered))

    try:
        spec = importlib.util.spec_from_file_location("task_module", str(file_path))
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        TaskClass = getattr(module, class_name)
        task = TaskClass()
        task.load_data()
        ds = task.dataset
    finally:
        # Always restore the original source, even if loading blows up.
        file_path.write_text(original)

    return ds
+
+
+def _find_metadata_assignment(class_node: ast.ClassDef) -> ast.Assign | None:
+ for stmt in class_node.body:
+ if (
+ isinstance(stmt, ast.Assign)
+ and isinstance(stmt.targets[0], ast.Name)
+ and stmt.targets[0].id == "metadata"
+ ):
+ return stmt
+ return None
+
+
+def _find_keyword(call_node: ast.Call, keyword_name: str) -> ast.keyword | None:
+ for kw in call_node.keywords:
+ if kw.arg == keyword_name:
+ return kw
+ return None
+
+
+def _get_indent(line: str) -> str:
+ return line[: len(line) - len(line.lstrip())]
+
+
def _update_description(
    lines: list[str], call_node: ast.Call, pr_id: int
) -> tuple[list[str], list[int]]:
    """Fold ``description=`` into one triple-quoted line appending the
    standard correction note that references PR *pr_id*.

    Returns (lines, indices of now-redundant continuation lines that the
    caller must delete).
    """
    desc_kw = _find_keyword(call_node, "description")
    # Only plain string constants are rewritten.
    if not desc_kw or not isinstance(desc_kw.value, ast.Constant):
        return lines, []

    value_node = desc_kw.value
    original_desc = value_node.value
    start_line_idx = desc_kw.lineno - 1
    end_line_idx = value_node.end_lineno - 1

    indent = _get_indent(lines[start_line_idx])
    new_desc_val = f'"""{original_desc}\n    This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/{pr_id})"""'
    lines[start_line_idx] = f"{indent}description={new_desc_val},\n"

    deleted_indices = list(range(start_line_idx + 1, end_line_idx + 1))
    return lines, deleted_indices
+
+
def _update_dataset_dict(
    lines: list[str], call_node: ast.Call, new_path: str, new_revision: str
) -> tuple[list[str], list[int]]:
    """Point the ``dataset={...}`` literal at *new_path*/*new_revision*.

    Also marks any ``trust_remote_code`` entry for deletion (the new repo is a
    plain data repo). Returns (lines, indices of lines to delete).
    """
    dataset_kw = _find_keyword(call_node, "dataset")
    if not dataset_kw or not isinstance(dataset_kw.value, ast.Dict):
        return lines, []

    dict_node = dataset_kw.value
    lines_to_delete = []

    for i, key_node in enumerate(dict_node.keys):
        if not isinstance(key_node, ast.Constant):
            continue

        value_node = dict_node.values[i]
        line_idx = key_node.lineno - 1
        indent = _get_indent(lines[line_idx])
        key = key_node.value

        # Assumes each dict entry starts on its own line, as produced by the
        # project's formatter.
        if key == "path":
            lines[line_idx] = f'{indent}"path": "{new_path}",\n'
        elif key == "revision":
            lines[line_idx] = f'{indent}"revision": "{new_revision}",\n'
        elif key == "trust_remote_code":
            lines_to_delete.extend(range(line_idx, value_node.end_lineno))

    return lines, lines_to_delete
+
+
def _update_eval_splits(
    lines: list[str], call_node: ast.Call, module: ast.Module
) -> list[str]:
    """Rewrite ``eval_splits=["train"]`` to ``["test"]`` on disk.

    The cleaning pipeline creates a dedicated test split for train-only tasks,
    so the new task version must evaluate on "test". A module-level variable
    used as the value is resolved before the check.
    """
    eval_splits_kw = _find_keyword(call_node, "eval_splits")
    if not eval_splits_kw:
        return lines

    value_node = eval_splits_kw.value
    if isinstance(value_node, ast.Name):
        resolved = _resolve_variable(value_node.id, module)
        if resolved:
            value_node = resolved

    is_train_split = (
        isinstance(value_node, ast.List)
        and len(value_node.elts) == 1
        and isinstance(value_node.elts[0], ast.Constant)
        and value_node.elts[0].value == "train"
    )

    if is_train_split:
        line_idx = eval_splits_kw.lineno - 1
        indent = _get_indent(lines[line_idx])
        lines[line_idx] = f'{indent}eval_splits=["test"],\n'

    return lines
+
+
def update_v2_metadata_dataset(
    file_path: Path, class_name: str, new_path: str, new_revision: str, pr_id: int
) -> None:
    """Rewrite *class_name*'s TaskMetadata in-place on disk.

    Updates the description (PR correction note), the dataset path/revision
    (dropping any trust_remote_code flag), and eval_splits, then removes
    lines made redundant by the single-line description rewrite.

    Raises:
        ValueError: when *class_name* is not found in the file.
    """
    lines = read_lines(file_path)
    module = ast.parse("".join(lines))

    class_node, _ = find_class_node(module, class_name)
    if not class_node:
        raise ValueError(f"Class {class_name} not found in {file_path}")

    metadata_node = _find_metadata_assignment(class_node)
    if not metadata_node or not isinstance(metadata_node.value, ast.Call):
        return

    call_node = metadata_node.value
    lines, desc_deleted = _update_description(lines, call_node, pr_id)
    lines, ds_deleted = _update_dataset_dict(lines, call_node, new_path, new_revision)
    lines = _update_eval_splits(lines, call_node, module)

    # Delete from the bottom up so earlier indices stay valid.
    all_deleted_indices = sorted(list(set(desc_deleted + ds_deleted)), reverse=True)
    for i in all_deleted_indices:
        del lines[i]

    write_lines(file_path, lines)
+
+
+def _resolve_variable(name: str, module: ast.Module) -> ast.expr | None:
+ for node in module.body:
+ if isinstance(node, ast.Assign):
+ for target in node.targets:
+ if isinstance(target, ast.Name) and target.id == name:
+ return node.value
+ return None
+
+
def parse_all_task_metadata(
    file_path: Path, latest_version: bool = True
) -> list[TaskMetadataInfo]:
    """Parse every task class in *file_path* into TaskMetadataInfo records.

    Module-level variables referenced in the TaskMetadata call are resolved;
    when *latest_version* is True, only the highest version of each task
    family is returned.
    """
    source = file_path.read_text()
    module = ast.parse(source)

    # BUG FIX: the original pattern had anonymous "(?P" groups, which is
    # invalid regex syntax; m.group("ver") requires a named group.
    version_pattern = re.compile(r"^(?P<base>.+?)(?:V(?P<ver>\d+))?$")
    all_tasks: list[TaskMetadataInfo] = []

    for node in module.body:
        if not isinstance(node, ast.ClassDef):
            continue

        m = version_pattern.match(node.name)
        if not m:
            continue
        ver = int(m.group("ver")) if m.group("ver") else 1

        name = ""
        dataset: dict[str, str] = {}
        eval_split: list[str] = ["test"]
        eval_langs: list[str] = []
        for stmt in node.body:
            if not (
                isinstance(stmt, ast.Assign)
                and isinstance(stmt.targets[0], ast.Name)
                and stmt.targets[0].id == "metadata"
            ):
                continue
            if not isinstance(stmt.value, ast.Call):
                continue
            call = stmt.value
            for kw in call.keywords:
                value_node = kw.value
                # Values may be module-level variables; chase the assignment.
                if isinstance(value_node, ast.Name):
                    resolved_node = _resolve_variable(value_node.id, module)
                    if resolved_node:
                        value_node = resolved_node
                try:
                    if kw.arg == "name" and isinstance(value_node, ast.Constant):
                        name = value_node.value
                    elif kw.arg == "dataset":
                        dataset = ast.literal_eval(value_node)
                    elif kw.arg == "eval_splits":
                        eval_split = ast.literal_eval(value_node) or ["test"]
                    elif kw.arg == "eval_langs":
                        eval_langs = ast.literal_eval(value_node)
                except (ValueError, SyntaxError):
                    pass  # Non-literal value: keep the default.
            break

        if not name:
            continue

        all_tasks.append(
            TaskMetadataInfo(node.name, name, ver, dataset, eval_split, eval_langs)
        )

    if not latest_version:
        return all_tasks

    latest: dict[str, TaskMetadataInfo] = {}
    for task in all_tasks:
        # BUG FIX: group by the CLASS name's family ("FooV2" -> "Foo"). The
        # original stripped "V\d+$" from the metadata NAME, whose version
        # suffix is spelled ".v2" and therefore never matched, so old and new
        # versions were both returned.
        base_name = re.sub(r"V\d+$", "", task.class_name)
        if base_name not in latest or task.version > latest[base_name].version:
            latest[base_name] = task

    return list(latest.values())
+
+
def parse_all_task_metadata_versions(file_path: Path) -> list[TaskMetadataInfo]:
    """Like parse_all_task_metadata, but keep every version of each task."""
    return parse_all_task_metadata(file_path, latest_version=False)
+
+
def bump_version_for_class(
    file_path: Path, base_class_name: str, ds: DatasetDict
) -> int:
    """Append a V<n+1> copy of the newest class in *base_class_name*'s family
    and mark that newest class with ``superseded_by``.

    Returns the new version number.

    Raises:
        ValueError: when no class of the family exists in the file.
    """
    lines = read_lines(file_path)
    module = ast.parse("".join(lines))

    # BUG FIX: the original pattern had an anonymous "(?P" group — invalid
    # regex syntax; m.group("ver") requires a named group.
    version_pattern = re.compile(rf"^{re.escape(base_class_name)}(?:V(?P<ver>\d+))?$")
    selected: tuple[int, ast.ClassDef] | None = None
    for node in module.body:
        if not isinstance(node, ast.ClassDef):
            continue
        m = version_pattern.match(node.name)
        if not m:
            continue
        ver = int(m.group("ver")) if m.group("ver") else 1
        if selected is None or ver > selected[0]:
            selected = (ver, node)
    if selected is None:
        raise ValueError(f"Class {base_class_name} not found in {file_path}")

    version, node = selected
    inner = calculate_inner_indent(lines, node)
    new_version = version + 1

    # Pull the TaskMetadata name= of the class being superseded.
    task_name = ""
    for stmt in node.body:
        if (
            isinstance(stmt, ast.Assign)
            and isinstance(stmt.targets[0], ast.Name)
            and stmt.targets[0].id == "metadata"
        ):
            call = stmt.value
            if (
                isinstance(call, ast.Call)
                and isinstance(call.func, ast.Name)
                and call.func.id == "TaskMetadata"
            ):
                for kw in call.keywords:
                    if kw.arg == "name" and isinstance(kw.value, ast.Constant):
                        task_name = kw.value.value
                        break
            break

    superseded = " " * inner + f'superseded_by = "{task_name}.v{new_version}"\n'
    block = lines[node.lineno - 1 : node.end_lineno]
    v2_block = get_v2_block(
        block, node.name, base_class_name, f"V{new_version}", f".v{new_version}", ds
    )

    new_lines: list[str] = []
    for i, l in enumerate(lines):
        new_lines.append(l)
        if i == node.lineno - 1:
            # Right after the class header: mark the old class as superseded.
            new_lines.append(superseded)
        if i == node.end_lineno - 1:
            # After the old class body: append the new version's block.
            new_lines.append("\n")
            new_lines.extend(v2_block)
    write_lines(file_path, new_lines)
    return new_version
+
+
def process_task(
    file_path: Path,
    metadata: TaskMetadataInfo,
    pr_id: int,
    username: str,
    verbose: bool,
) -> tuple[
    tuple[str, int] | None,
    list[tuple[str, str, int]],
    list[tuple[str, str, str, int]],
]:
    """Clean one task's data and, when anything changed, publish a new version.

    Loads the dataset, runs the cleaning pipeline, pushes the cleaned data to
    the Hub, appends a V<n+1> class to the task file, and pins its metadata to
    the pushed revision.

    Returns ((task_name, new_version) or None, original-size rows, filter rows).
    """
    if verbose:
        print(" task ->", metadata.class_name)
    try:
        ds = load_and_transform(file_path, metadata)
    except Exception:
        # Best-effort batch processing: log and skip tasks that fail to load.
        print(metadata.class_name, "dataset loading failed")
        traceback.print_exc()
        return None, [], []

    if verbose:
        print(ds)

    original_size = {split: len(ds[split]) for split in ds}
    ds_cleaned, report, is_changed = clean_dataset(ds.copy(), metadata)
    if verbose:
        print(f"is_changed: {is_changed}")

    if not is_changed:
        if verbose:
            print(f"{metadata.class_name} is unchanged")
        return None, [], []

    original_records = [
        (metadata.name, split, size) for split, size in original_size.items()
    ]
    filter_records = [
        (metadata.name, name, split, removed) for name, split, removed in report
    ]

    repo_id = push_dataset(ds_cleaned, metadata, username)
    base_name = re.sub(r"V\d+$", "", metadata.class_name)
    # NOTE(review): the UNCLEANED ds is passed here (it is only used for split
    # sizes when deciding whether subsampling is still needed) — confirm
    # ds_cleaned was not intended instead.
    new_ver = bump_version_for_class(file_path, base_name, ds)
    update_metadata(file_path, base_name, new_ver, repo_id, pr_id)

    return (metadata.name, new_ver), original_records, filter_records
+
+
@app.command()
def create_and_prepare(
    folder: Path = typer.Argument(..., exists=True, dir_okay=True),
    pr_id: int = typer.Argument(..., help="Pull request ID"),
    report_folder: Path = typer.Option(
        "scripts/data/cleaning_reports", exists=True, dir_okay=True
    ),
    username: str = "mteb",
    start_lang: Optional[str] = None,
    verbose: bool = typer.Option(False, "--verbose"),
) -> None:
    """Clean every task under *folder*, publish new dataset versions, write a
    per-language markdown report, and print the mteb commands needed to
    re-evaluate the changed tasks."""
    changed_tasks: list[tuple[str, int]] = []
    all_original_records: list[tuple[str, str, int]] = []
    all_filter_records: list[tuple[str, str, str, int]] = []

    files_to_process = sorted(
        p for p in folder.glob("**/*.py") if p.name != "__init__.py"
    )
    if start_lang:
        # Resume support: language directories are compared lexicographically.
        files_to_process = [p for p in files_to_process if p.parent.name >= start_lang]
    progress_bar = tqdm(files_to_process, desc="Processing files")

    try:
        for file_path in progress_bar:
            progress_bar.set_description(f"Processing {file_path.name}")
            if verbose:
                print("working on", file_path.name)

            for metadata in parse_all_task_metadata(file_path):
                changed_task, original_records, filter_records = process_task(
                    file_path, metadata, pr_id, username, verbose
                )
                if changed_task:
                    changed_tasks.append(changed_task)
                all_original_records.extend(original_records)
                all_filter_records.extend(filter_records)
    except Exception:
        # Deliberate best-effort: keep partial results and still emit a report.
        print(traceback.format_exc())

    if changed_tasks:
        print_report(
            report_folder, folder.name, all_original_records, all_filter_records
        )

    unique_changed = sorted(list(set(changed_tasks)))
    tasks_str = " ".join(
        f"{task_name} {task_name}.v{version}"
        for task_name, version in unique_changed
    )
    print(
        "mteb run -m intfloat/multilingual-e5-small -t"
        f" {tasks_str} && mteb run -m"
        " sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2 -t"
        f" {tasks_str}"
    )
+
+
+@app.command()
+def compare_results(
+ results_dir: Path = typer.Option(
+ "/home/admin/vatolin/experiments/mteb/results", exists=True, dir_okay=True
+ ),
+ tasks_file: Optional[Path] = typer.Option(
+ None,
+ "--tasks-file",
+ "-f",
+ exists=True,
+ file_okay=True,
+ dir_okay=False,
+ help="File with a list of tasks to compare. One task per line.",
+ ),
+) -> None:
+ models = [
+ "intfloat/multilingual-e5-small",
+ "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+ ]
+ scores_files = [
+ f for f in results_dir.glob("**/**/*.json") if f.stem not in {"model_meta"}
+ ]
+ df_all: pd.DataFrame = format_scores(scores_files, use_all_subsets=True)
+
+ required_tasks: set[str] | None = None
+ if tasks_file:
+ required_tasks = {
+ line.strip() for line in tasks_file.read_text().splitlines() if line.strip()
+ }
+
+ for model in models:
+ df_model = df_all[df_all["model_name"] == model.replace("/", "__")]
+
+ df_old = df_model[
+ ~df_model["task_name"].str.contains(r"\.v\d+$", regex=True)
+ ].copy()
+ df_new = df_model[
+ df_model["task_name"].str.contains(r"\.v\d+$", regex=True)
+ ].copy()
+
+ df_new["task_name"] = df_new["task_name"].str.replace(
+ r"\.v\d+$", "", regex=True
+ )
+
+ if required_tasks:
+ available_for_comparison = set(df_old["task_name"]).intersection(
+ set(df_new["task_name"])
+ )
+ missing_tasks = required_tasks - available_for_comparison
+ if missing_tasks:
+ print(f"**{model}**")
+ print(
+ f"Skipping due to missing tasks: {', '.join(sorted(missing_tasks))}"
+ )
+ print()
+ continue
+
+ df_old = df_old[df_old["task_name"].isin(required_tasks)]
+ df_new = df_new[df_new["task_name"].isin(required_tasks)]
+
+ old_duplicated = df_old.duplicated(subset=["task_name", "subset"])
+ if old_duplicated.sum() > 0:
+ print("Duplicated scores")
+ print(model)
+ print(df_old[old_duplicated][["task_name", "subset", "languages"]])
+ continue
+ new_duplicated = df_new.duplicated(subset=["task_name", "subset"])
+ if new_duplicated.sum() > 0:
+ print(model)
+ print(df_new[new_duplicated][["task_name", "subset", "languages"]])
+ continue
+ df_old = df_old.set_index(["task_name", "subset"])["main_score"]
+ df_new = df_new.set_index(["task_name", "subset"])["main_score"]
+ df_cmp = pd.DataFrame(
+ {
+ "main_score_old": df_old,
+ "main_score_new": df_new,
+ }
+ ).dropna()
+
+ if df_cmp.empty:
+ continue
+
+ df_cmp["delta_percent"] = (
+ (df_cmp["main_score_new"] - df_cmp["main_score_old"])
+ / df_cmp["main_score_old"]
+ * 100
+ ).round(2)
+ df_cmp = df_cmp.reset_index(drop=False).sort_values(["task_name", "subset"])
+
+ print(f"**{model}**")
+ print(
+ df_cmp[
+ [
+ "task_name",
+ "subset",
+ "main_score_old",
+ "main_score_new",
+ "delta_percent",
+ ]
+ ].to_markdown(index=False)
+ )
+ print()
+
+
+@app.command()
+def report_cleaning(
+ tasks: list[str] = typer.Argument(
+ ..., help="List of task names to generate cleaning reports for."
+ ),
+ folder: Path = typer.Option(
+ "mteb/tasks/Classification", exists=True, dir_okay=True
+ ),
+ report_folder: Path = typer.Option(
+ "scripts/data/cleaning_reports", exists=True, dir_okay=True
+ ),
+ verbose: bool = typer.Option(False, "--verbose"),
+) -> None:
+ all_original_records: list[tuple[str, str, int]] = []
+ all_filter_records: list[tuple[str, str, str, int]] = []
+
+ all_tasks_map: dict[str, tuple[TaskMetadataInfo, Path]] = {}
+ files_to_process = sorted(
+ p for p in folder.glob("**/*.py") if p.name != "__init__.py"
+ )
+
+ for file_path in files_to_process:
+ tasks_in_file = parse_all_task_metadata_versions(file_path)
+ for task_metadata in tasks_in_file:
+ all_tasks_map[task_metadata.name] = (task_metadata, file_path)
+
+ for task_name in tasks:
+ v2_task_name = f"{task_name}.v2"
+ if v2_task_name not in all_tasks_map:
+ if verbose:
+ print(f"Task {v2_task_name} not found, skipping.")
+ continue
+
+ if task_name not in all_tasks_map:
+ if verbose:
+ print(f"Base task {task_name} not found, skipping.")
+ continue
+
+ v1_metadata, file_path = all_tasks_map[task_name]
+ if verbose:
+ print(f"Processing {task_name} from {file_path}")
+
+ try:
+ ds = load_and_transform(file_path, v1_metadata)
+ print(ds)
+ except Exception:
+ print(f"Dataset loading failed for {v1_metadata.class_name}")
+ traceback.print_exc()
+ continue
+
+ original_size = {split: len(ds[split]) for split in ds}
+ ds_new, report, _ = clean_dataset(ds.copy(), v1_metadata)
+ print(ds_new)
+ print(report)
+
+ original_records = [
+ (v1_metadata.name, split, size) for split, size in original_size.items()
+ ]
+ filter_records = [
+ (v1_metadata.name, name, split, removed) for name, split, removed in report
+ ]
+
+ all_original_records.extend(original_records)
+ all_filter_records.extend(filter_records)
+
+ if all_filter_records:
+ print_report(
+ report_folder, folder.name, all_original_records, all_filter_records
+ )
+ print(f"Report generated in {report_folder}/report_{folder.name}.md")
+ else:
+ print("No tasks with v2 versions found or no changes after cleaning.")
+
+
+if __name__ == "__main__":
+ app()
diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py
index bcf9a1f83d..22bf88e1df 100644
--- a/tests/test_TaskMetadata.py
+++ b/tests/test_TaskMetadata.py
@@ -13,13 +13,20 @@
_HISTORIC_DATASETS = [
"PolEmo2.0-IN",
"PolEmo2.0-OUT",
+ "PolEmo2.0-OUT.v2",
"PAC",
+ "PAC.v2",
"TNews",
+ "TNews.v2",
"IFlyTek",
+ "IFlyTek.v2",
"MultilingualSentiment",
+ "MultilingualSentiment.v2",
"JDReview",
+ "JDReview.v2",
"OnlineShopping",
"Waimai",
+ "Waimai.v2",
"BlurbsClusteringP2P",
"BlurbsClusteringS2S",
"TenKGnadClusteringP2P",