embeddings-benchmark · isaac-chung · Jul 19, 2025 · Jul 13, 2025 · Jul 13, 2025 · Jul 13, 2025
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
@@ -16,6 +16,7 @@
 from .ces.CzechSubjectivityClassification import *
 from .dan.AngryTweetsClassification import *
 from .dan.DanishPoliticalCommentsClassification import *
+from .dan.DdiscoCohesionClassification import *
 from .dan.DKHateClassification import *
 from .dan.LccSentimentClassification import *
 from .deu.GermanPoliticiansTwitterSentimentClassification import *
@@ -109,6 +110,7 @@
 from .multilingual.NusaParagraphEmotionClassification import *
 from .multilingual.NusaParagraphTopicClassification import *
 from .multilingual.NusaXSenti import *
+from .multilingual.ru_nlu_intent_classification import *
 from .multilingual.ScalaClassification import *
 from .multilingual.ScandiSentClassification import *
 from .multilingual.SIB200Classification import *
@@ -132,7 +134,6 @@
 from .rus.HeadlineClassification import *
 from .rus.InappropriatenessClassification import *
 from .rus.KinopoiskClassification import *
-from .rus.ru_nlu_intent_classification import *
 from .rus.ru_toixic_classification_okmlcup import *
 from .rus.RuReviewsClassification import *
 from .rus.RuSciBenchGRNTIClassification import *

diff --git a/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CSFDCZMovieReviewSentimentClassification.py
@@ -5,6 +5,7 @@
 
 
 class CSFDCZMovieReviewSentimentClassification(AbsTaskClassification):
+    superseded_by = "CSFDCZMovieReviewSentimentClassification.v2"
     metadata = TaskMetadata(
         name="CSFDCZMovieReviewSentimentClassification",
         description="The dataset contains 30k user reviews from csfd.cz in Czech.",
@@ -49,3 +50,49 @@ def dataset_transform(self):
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
         )
+
+
+class CSFDCZMovieReviewSentimentClassificationV2(AbsTaskClassification):  # v2 task; the original class names it via superseded_by
+    metadata = TaskMetadata(
+        name="CSFDCZMovieReviewSentimentClassification.v2",
+        description="""The dataset contains 30k user reviews from csfd.cz in Czech.
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://arxiv.org/abs/2304.01922",
+        dataset={
+            "path": "mteb/csfdcz_movie_review_sentiment",
+            "revision": "bda232f79c949fd881572f7e1b9ad59fd04a6c7c",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        date=("2002-06-28", "2020-03-13"),  # (start, end) pair; presumably when the reviews were written — verify
+        eval_splits=["test"],  # the same split that dataset_transform subsamples below
+        eval_langs=["ces-Latn"],
+        main_score="accuracy",
+        domains=["Reviews", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@misc{štefánik2023resources,
+  archiveprefix = {arXiv},
+  author = {Michal Štefánik and Marek Kadlčík and Piotr Gramacki and Petr Sojka},
+  eprint = {2304.01922},
+  primaryclass = {cs.CL},
+  title = {Resources and Few-shot Learners for In-context Learning in Slavic Languages},
+  year = {2023},
+}
+""",
+        adapted_from=["CSFDCZMovieReviewSentimentClassification"],  # name of the task this version replaces
+    )
+    # Increase the samples_per_label in order to improve baseline performance
+    samples_per_label = 20  # the other v2 tasks added in this patch use 16
+
+    def dataset_transform(self):  # replaces self.dataset with a subsampled copy; returns nothing
+        N_SAMPLES = 2048  # forwarded to stratified_subsampling as n_samples
+
+        self.dataset = self.stratified_subsampling(  # seeded with self.seed; only "test" is listed in splits
+            self.dataset, seed=self.seed, splits=["test"], n_samples=N_SAMPLES
+        )
diff --git a/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py b/mteb/tasks/Classification/ces/CzechProductReviewSentimentClassification.py
@@ -5,6 +5,7 @@
 
 
 class CzechProductReviewSentimentClassification(AbsTaskClassification):
+    superseded_by = "CzechProductReviewSentimentClassification.v2"
     metadata = TaskMetadata(
         name="CzechProductReviewSentimentClassification",
         description="User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative)",
@@ -54,3 +55,54 @@ def dataset_transform(self) -> None:
         self.dataset = self.stratified_subsampling(
             self.dataset, seed=self.seed, splits=["test"]
         )
+
+
+class CzechProductReviewSentimentClassificationV2(AbsTaskClassification):  # v2 task; the original class names it via superseded_by
+    metadata = TaskMetadata(
+        name="CzechProductReviewSentimentClassification.v2",
+        description="""User reviews of products on Czech e-shop Mall.cz with 3 sentiment classes (positive, neutral, negative)
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://aclanthology.org/W13-1609/",
+        dataset={
+            "path": "mteb/czech_product_review_sentiment",
+            "revision": "1a3fb305bde30eec7067ab15ad2db9f61b115ca2",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],  # the same split that dataset_transform subsamples below
+        eval_langs=["ces-Latn"],
+        main_score="accuracy",
+        date=("2013-01-01", "2013-06-01"),  # (start, end) pair; end month matches the citation's `month = jun`, year 2013
+        dialect=[],
+        domains=["Reviews", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{habernal-etal-2013-sentiment,
+  address = {Atlanta, Georgia},
+  author = {Habernal, Ivan  and
+Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}}  and
+Steinberger, Josef},
+  booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis},
+  editor = {Balahur, Alexandra  and
+van der Goot, Erik  and
+Montoyo, Andres},
+  month = jun,
+  pages = {65--74},
+  publisher = {Association for Computational Linguistics},
+  title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning},
+  url = {https://aclanthology.org/W13-1609},
+  year = {2013},
+}
+""",
+        adapted_from=["CzechProductReviewSentimentClassification"],  # name of the task this version replaces
+    )
+    samples_per_label = 16  # presumably the per-label training sample count read by AbsTaskClassification — confirm
+
+    def dataset_transform(self) -> None:  # replaces self.dataset with a subsampled copy
+        self.dataset = self.stratified_subsampling(  # no n_samples passed, so the helper's own default size applies
+            self.dataset, seed=self.seed, splits=["test"]
+        )
diff --git a/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py b/mteb/tasks/Classification/ces/CzechSoMeSentimentClassification.py
@@ -5,6 +5,7 @@
 
 
 class CzechSoMeSentimentClassification(AbsTaskClassification):
+    superseded_by = "CzechSoMeSentimentClassification.v2"
     metadata = TaskMetadata(
         name="CzechSoMeSentimentClassification",
         description="User comments on Facebook",
@@ -51,3 +52,49 @@ def dataset_transform(self) -> None:
         self.dataset = self.dataset.rename_columns(
             {"comment": "text", "sentiment_int": "label"}
         )
+
+
+class CzechSoMeSentimentClassificationV2(AbsTaskClassification):  # v2 task; adds no dataset_transform here, unlike the original's column rename
+    metadata = TaskMetadata(
+        name="CzechSoMeSentimentClassification.v2",
+        description="""User comments on Facebook
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://aclanthology.org/W13-1609/",
+        dataset={
+            "path": "mteb/czech_so_me_sentiment",
+            "revision": "a12152e40ff9857bf3c83694528f40ec5c02aafc",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["ces-Latn"],
+        main_score="accuracy",
+        date=("2013-01-01", "2013-06-01"),  # (start, end) pair; end month matches the citation's `month = jun`, year 2013
+        dialect=[],
+        domains=["Reviews", "Written"],  # NOTE(review): description says Facebook comments, yet "Reviews" is listed — confirm intended
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-nc-sa-4.0",
+        annotations_creators="derived",
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{habernal-etal-2013-sentiment,
+  address = {Atlanta, Georgia},
+  author = {Habernal, Ivan  and
+Pt{\'a}{\v{c}}ek, Tom{\'a}{\v{s}}  and
+Steinberger, Josef},
+  booktitle = {Proceedings of the 4th Workshop on Computational Approaches to Subjectivity, Sentiment and Social Media Analysis},
+  editor = {Balahur, Alexandra  and
+van der Goot, Erik  and
+Montoyo, Andres},
+  month = jun,
+  pages = {65--74},
+  publisher = {Association for Computational Linguistics},
+  title = {Sentiment Analysis in {C}zech Social Media Using Supervised Machine Learning},
+  url = {https://aclanthology.org/W13-1609},
+  year = {2013},
+}
+""",
+        adapted_from=["CzechSoMeSentimentClassification"],  # name of the task this version replaces
+    )
+    samples_per_label = 16  # presumably the per-label training sample count read by AbsTaskClassification — confirm
diff --git a/mteb/tasks/Classification/dan/AngryTweetsClassification.py b/mteb/tasks/Classification/dan/AngryTweetsClassification.py
@@ -5,6 +5,7 @@
 
 
 class AngryTweetsClassification(AbsTaskClassification):
+    superseded_by = "AngryTweetsClassification.v2"
     metadata = TaskMetadata(
         name="AngryTweetsClassification",
         dataset={
@@ -39,3 +40,42 @@ class AngryTweetsClassification(AbsTaskClassification):
     )
 
     samples_per_label = 16
+
+
+class AngryTweetsClassificationV2(AbsTaskClassification):  # v2 task; the original class names it via superseded_by
+    metadata = TaskMetadata(
+        name="AngryTweetsClassification.v2",
+        dataset={
+            "path": "mteb/angry_tweets",
+            "revision": "b9475fb66a13befda4fa9871cd92343bb2c0eb77",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        description="""A sentiment dataset with 3 classes (positiv, negativ, neutral) for Danish tweets
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://aclanthology.org/2021.nodalida-main.53/",
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["dan-Latn"],
+        main_score="accuracy",
+        date=("2021-01-01", "2021-12-31"),  # (start, end) pair spanning the citation's publication year
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-4.0",
+        annotations_creators="human-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{pauli2021danlp,
+  author = {Pauli, Amalie Brogaard and Barrett, Maria and Lacroix, Oph{\'e}lie and Hvingelby, Rasmus},
+  booktitle = {Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)},
+  pages = {460--466},
+  title = {DaNLP: An open-source toolkit for Danish Natural Language Processing},
+  year = {2021},
+}
+""",
+        prompt="Classify Danish tweets by sentiment. (positive, negative, neutral).",  # NOTE(review): description spells labels "positiv, negativ" — confirm which is intended
+        adapted_from=["AngryTweetsClassification"],  # name of the task this version replaces
+    )
+
+    samples_per_label = 16  # presumably the per-label training sample count read by AbsTaskClassification — confirm
diff --git a/mteb/tasks/Classification/dan/DKHateClassification.py b/mteb/tasks/Classification/dan/DKHateClassification.py
@@ -5,6 +5,7 @@
 
 
 class DKHateClassification(AbsTaskClassification):
+    superseded_by = "DKHateClassification.v2"
     metadata = TaskMetadata(
         name="DKHateClassification",
         dataset={
@@ -69,3 +70,64 @@ def dataset_transform(self):
         self.dataset = self.dataset.map(
             lambda x: {"label": lab2idx[x["label"]]}, remove_columns=["label"]
         )
+
+
+class DKHateClassificationV2(AbsTaskClassification):  # v2 task; adds no dataset_transform here, unlike the original's lab2idx label mapping
+    metadata = TaskMetadata(
+        name="DKHateClassification.v2",
+        dataset={
+            "path": "mteb/dk_hate",
+            "revision": "0468ff11393992d8347cf4282fb706fe970608d4",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        description="""Danish Tweets annotated for Hate Speech either being Offensive or not
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://aclanthology.org/2020.lrec-1.430/",
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["dan-Latn"],
+        main_score="accuracy",
+        date=("2018-01-01", "2018-12-31"),  # (start, end) pair; presumably the data collection year — verify
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="cc-by-4.0",
+        annotations_creators="expert-annotated",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@inproceedings{sigurbergsson-derczynski-2020-offensive,
+  abstract = {The presence of offensive language on social media platforms and the implications this poses is becoming a major concern in modern society. Given the enormous amount of content created every day, automatic methods are required to detect and deal with this type of content. Until now, most of the research has focused on solving the problem for the English language, while the problem is multilingual. We construct a Danish dataset DKhate containing user-generated comments from various social media platforms, and to our knowledge, the first of its kind, annotated for various types and target of offensive language. We develop four automatic classification systems, each designed to work for both the English and the Danish language. In the detection of offensive language in English, the best performing system achieves a macro averaged F1-score of 0.74, and the best performing system for Danish achieves a macro averaged F1-score of 0.70. In the detection of whether or not an offensive post is targeted, the best performing system for English achieves a macro averaged F1-score of 0.62, while the best performing system for Danish achieves a macro averaged F1-score of 0.73. Finally, in the detection of the target type in a targeted offensive post, the best performing system for English achieves a macro averaged F1-score of 0.56, and the best performing system for Danish achieves a macro averaged F1-score of 0.63. Our work for both the English and the Danish language captures the type and targets of offensive language, and present automatic methods for detecting different kinds of offensive language such as hate speech and cyberbullying.},
+  address = {Marseille, France},
+  author = {Sigurbergsson, Gudbjartur Ingi  and
+Derczynski, Leon},
+  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
+  editor = {Calzolari, Nicoletta  and
+B{\'e}chet, Fr{\'e}d{\'e}ric  and
+Blache, Philippe  and
+Choukri, Khalid  and
+Cieri, Christopher  and
+Declerck, Thierry  and
+Goggi, Sara  and
+Isahara, Hitoshi  and
+Maegaard, Bente  and
+Mariani, Joseph  and
+Mazo, H{\'e}l{\`e}ne  and
+Moreno, Asuncion  and
+Odijk, Jan  and
+Piperidis, Stelios},
+  isbn = {979-10-95546-34-4},
+  language = {English},
+  month = may,
+  pages = {3498--3508},
+  publisher = {European Language Resources Association},
+  title = {Offensive Language and Hate Speech Detection for {D}anish},
+  url = {https://aclanthology.org/2020.lrec-1.430},
+  year = {2020},
+}
+""",
+        prompt="Classify Danish tweets based on offensiveness (offensive, not offensive)",
+        adapted_from=["DKHateClassification"],  # name of the task this version replaces
+    )
+
+    samples_per_label = 16  # presumably the per-label training sample count read by AbsTaskClassification — confirm
diff --git a/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py b/mteb/tasks/Classification/dan/DanishPoliticalCommentsClassification.py
@@ -5,6 +5,7 @@
 
 
 class DanishPoliticalCommentsClassification(AbsTaskClassification):
+    superseded_by = "DanishPoliticalCommentsClassification.v2"
     metadata = TaskMetadata(
         name="DanishPoliticalCommentsClassification",
         dataset={
@@ -49,3 +50,44 @@ def dataset_transform(self):
 
         # create train and test splits
         self.dataset = self.dataset["train"].train_test_split(0.2, seed=self.seed)
+
+
+class DanishPoliticalCommentsClassificationV2(AbsTaskClassification):  # v2 task; adds no dataset_transform here, unlike the original's train_test_split
+    metadata = TaskMetadata(
+        name="DanishPoliticalCommentsClassification.v2",
+        dataset={
+            "path": "mteb/danish_political_comments",
+            "revision": "476a9e7327aba70ad3e97a169d7310b86be9b245",  # presumably a dataset-repo commit hash — TODO confirm
+        },
+        description="""A dataset of Danish political comments rated for sentiment
+        This version corrects errors found in the original data. For details, see [pull request](https://github.com/embeddings-benchmark/mteb/pull/2900)""",
+        reference="https://huggingface.co/datasets/danish_political_comments",
+        type="Classification",
+        category="s2s",
+        modalities=["text"],
+        eval_splits=["test"],  # presumably the referenced dataset already ships a "test" split, as none is created here — verify
+        eval_langs=["dan-Latn"],
+        main_score="accuracy",
+        date=(
+            "2000-01-01",
+            "2022-12-31",
+        ),  # Estimated range for the collection of comments
+        domains=["Social", "Written"],
+        task_subtypes=["Sentiment/Hate speech"],
+        license="not specified",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="found",
+        bibtex_citation=r"""
+@techreport{SAMsentiment,
+  author = {Mads Guldborg Kjeldgaard Kongsbak and Steffan Eybye Christensen and Lucas Høyberg Puvis~de~Chavannes and Peter Due Jensen},
+  institution = {IT University of Copenhagen},
+  title = {Sentiment Analysis Multitool, SAM},
+  year = {2019},
+}
+""",
+        prompt="Classify Danish political comments for sentiment",
+        adapted_from=["DanishPoliticalCommentsClassification"],  # name of the task this version replaces
+    )
+
+    samples_per_label = 16  # presumably the per-label training sample count read by AbsTaskClassification — confirm