From e68836f73bd0e60925522a48bab840a0f10791a8 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 21 Dec 2024 15:21:56 +0300
Subject: [PATCH 1/5] use reuploaded mteb/* datasets without trust_remote_code

---
 .../ara/HotelReviewSentimentClassification.py | 13 +++--
 .../ara/TweetEmotionClassification.py         | 14 +++--
 .../deu/TenKGnadClassification.py             |  5 +-
 .../Classification/eng/ArxivClassification.py |  5 +-
 .../eng/PatentClassification.py               | 13 +++--
 .../fil/FilipinoHateSpeechClassification.py   | 13 +++--
 mteb/tasks/Classification/mya/MyanmarNews.py  | 14 +++--
 .../DutchBookReviewSentimentClassification.py |  5 +-
 .../swe/SwedishSentimentClassification.py     | 13 +++--
 .../tha/WisesightSentimentClassification.py   | 21 +++-----
 .../urd/UrduRomanSentimentClassification.py   | 16 +++---
 .../multilingual/IndicQARetrieval.py          | 52 +------------------
 mteb/tasks/STS/jpn/JSTS.py                    |  9 +---
 13 files changed, 60 insertions(+), 133 deletions(-)

diff --git a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
index 24b7bc33fc..26fe78b8de 100644
--- a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
@@ -8,9 +8,8 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
     metadata = TaskMetadata(
         name="HotelReviewSentimentClassification",
         dataset={
-            "path": "Elnagara/hard",
-            "revision": "b108d2c32ee4e1f4176ea233e1a5ac17bceb9ef9",
-            "trust_remote_code": True,
+            "path": "mteb/HotelReviewSentimentClassification",
+            "revision": "273d5105974460d3979149e29e88c06a8214c541",
         },
         description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website.",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3",
@@ -39,7 +38,7 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
 """,
     )
 
-    def dataset_transform(self):
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["train"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["train"]
+    #     )
diff --git a/mteb/tasks/Classification/ara/TweetEmotionClassification.py b/mteb/tasks/Classification/ara/TweetEmotionClassification.py
index e7fb8687ac..3cc183f92b 100644
--- a/mteb/tasks/Classification/ara/TweetEmotionClassification.py
+++ b/mteb/tasks/Classification/ara/TweetEmotionClassification.py
@@ -8,9 +8,8 @@ class TweetEmotionClassification(AbsTaskClassification):
     metadata = TaskMetadata(
         name="TweetEmotionClassification",
         dataset={
-            "path": "emotone-ar-cicling2017/emotone_ar",
-            "revision": "0ded8ff72cc68cbb7bb5c01b0a9157982b73ddaf",
-            "trust_remote_code": True,
+            "path": "mteb/TweetEmotionClassification",
+            "revision": "0d803980e91953cc67c21429f74b301b7b1b3f08",
         },
         description="A dataset of 10,000 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets.",
         reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8",
@@ -39,8 +38,7 @@ class TweetEmotionClassification(AbsTaskClassification):
 """,
     )
 
-    def dataset_transform(self):
-        self.dataset = self.dataset.rename_column("tweet", "text")
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["train"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["train"]
+    #     )
diff --git a/mteb/tasks/Classification/deu/TenKGnadClassification.py b/mteb/tasks/Classification/deu/TenKGnadClassification.py
index 592d66c983..f9bde63b6e 100644
--- a/mteb/tasks/Classification/deu/TenKGnadClassification.py
+++ b/mteb/tasks/Classification/deu/TenKGnadClassification.py
@@ -10,9 +10,8 @@ class TenKGnadClassification(AbsTaskClassification):
         description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).",
         reference="https://tblock.github.io/10kGNAD/",
         dataset={
-            "path": "community-datasets/gnad10",
-            "revision": "0798affe9b3f88cfda4267b6fbc50fac67046ee5",
-            "trust_remote_code": True,
+            "path": "mteb/TenKGnadClassification",
+            "revision": "ae9862bbcddc27b4bd93e2a7b463b7b5d05c6c55",
         },
         type="Classification",
         category="p2p",
diff --git a/mteb/tasks/Classification/eng/ArxivClassification.py b/mteb/tasks/Classification/eng/ArxivClassification.py
index 92bd473a74..d24b0b2a0c 100644
--- a/mteb/tasks/Classification/eng/ArxivClassification.py
+++ b/mteb/tasks/Classification/eng/ArxivClassification.py
@@ -9,9 +9,8 @@ class ArxivClassification(AbsTaskClassification):
         name="ArxivClassification",
         description="Classification Dataset of Arxiv Papers",
         dataset={
-            "path": "ccdv/arxiv-classification",
-            "revision": "f9bd92144ed76200d6eb3ce73a8bd4eba9ffdc85",
-            "trust_remote_code": True,
+            "path": "mteb/ArxivClassification",
+            "revision": "5e80893bf045abefbf8cbe5d713bddc91ae158d5",
         },
         reference="https://ieeexplore.ieee.org/document/8675939",
         type="Classification",
diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py
index 9f10a8a794..5c7360b994 100644
--- a/mteb/tasks/Classification/eng/PatentClassification.py
+++ b/mteb/tasks/Classification/eng/PatentClassification.py
@@ -9,9 +9,8 @@ class PatentClassification(AbsTaskClassification):
         name="PatentClassification",
         description="Classification Dataset of Patents and Abstract",
         dataset={
-            "path": "ccdv/patent-classification",
-            "revision": "2f38a1dfdecfacee0184d74eaeafd3c0fb49d2a6",
-            "trust_remote_code": True,
+            "path": "mteb/PatentClassification",
+            "revision": "6bd77eb030ab3bfbf1e6f7a2b069979daf167311",
         },
         reference="https://aclanthology.org/P19-1212.pdf",
         type="Classification",
@@ -47,7 +46,7 @@ class PatentClassification(AbsTaskClassification):
         }""",
     )
 
-    def dataset_transform(self):
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["test"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["test"]
+    #     )
diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
index df6205d427..bdf520ecb8 100644
--- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
+++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
@@ -12,9 +12,8 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
         description="Filipino Twitter dataset for sentiment classification.",
         reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019",
         dataset={
-            "path": "jcblaise/hatespeech_filipino",
-            "revision": "b01711587b073e55569de75ef04d7da4592a3618",
-            "trust_remote_code": True,
+            "path": "mteb/FilipinoHateSpeechClassification",
+            "revision": "087a17c0b7f9a78901c88aea00ad2892a319fdac",
         },
         type="Classification",
         category="s2s",
@@ -42,7 +41,7 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
         """,
     )
 
-    def dataset_transform(self):
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["validation", "test"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["validation", "test"]
+    #     )
diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py
index 8418e20533..be075d7fbe 100644
--- a/mteb/tasks/Classification/mya/MyanmarNews.py
+++ b/mteb/tasks/Classification/mya/MyanmarNews.py
@@ -8,9 +8,8 @@ class MyanmarNews(AbsTaskClassification):
     metadata = TaskMetadata(
         name="MyanmarNews",
         dataset={
-            "path": "ayehninnkhine/myanmar_news",
-            "revision": "b899ec06227db3679b0fe3c4188a6b48cc0b65eb",
-            "trust_remote_code": True,
+            "path": "mteb/MyanmarNews",
+            "revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
         },
         description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
         reference="https://huggingface.co/datasets/myanmar_news",
@@ -38,8 +37,7 @@ class MyanmarNews(AbsTaskClassification):
         }""",
     )
 
-    def dataset_transform(self):
-        self.dataset = self.dataset.rename_columns({"category": "label"})
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["train"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["train"]
+    #     )
diff --git a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
index f0ee1b07dc..9c85aca4df 100644
--- a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py
@@ -10,9 +10,8 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification):
         description="A Dutch book review for sentiment classification.",
         reference="https://github.com/benjaminvdb/DBRD",
         dataset={
-            "path": "benjaminvdb/dbrd",
-            "revision": "3f756ab4572e071eb53e887ab629f19fa747d39e",
-            "trust_remote_code": True,
+            "path": "mteb/DutchBookReviewSentimentClassification",
+            "revision": "1c2815ad38cf4794eb8d678fb08f569ea79392f6",
         },
         type="Classification",
         category="s2s",
diff --git a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
index 4c0fdc16cb..731258177a 100644
--- a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
+++ b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
@@ -10,9 +10,8 @@ class SwedishSentimentClassification(AbsTaskClassification):
         description="Dataset of Swedish reviews scarped from various public available websites",
         reference="https://huggingface.co/datasets/swedish_reviews",
         dataset={
-            "path": "timpal0l/swedish_reviews",
-            "revision": "105ba6b3cb99b9fd64880215be469d60ebf44a1b",
-            "trust_remote_code": True,
+            "path": "mteb/SwedishSentimentClassification",
+            "revision": "39e35f55d58338ebd602f8d83b52cfe027f5146a",
         },
         type="Classification",
         category="s2s",
@@ -30,7 +29,7 @@ class SwedishSentimentClassification(AbsTaskClassification):
         bibtex_citation="",
     )
 
-    def dataset_transform(self):
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["validation", "test"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["validation", "test"]
+    #     )
diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
index 3a76003d5b..04258f8710 100644
--- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
+++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
@@ -10,9 +10,8 @@ class WisesightSentimentClassification(AbsTaskClassification):
         description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)",
         reference="https://github.com/PyThaiNLP/wisesight-sentiment",
         dataset={
-            "path": "pythainlp/wisesight_sentiment",
-            "revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
-            "trust_remote_code": True,
+            "path": "mteb/WisesightSentimentClassification",
+            "revision": "727ea9bd253f9eedf16aebec6ac3f07791fb3db2",
         },
         type="Classification",
         category="s2s",
@@ -44,13 +43,9 @@ class WisesightSentimentClassification(AbsTaskClassification):
 """,
     )
 
-    def dataset_transform(self):
-        for split in self.dataset.keys():
-            self.dataset[split] = self.dataset[split].rename_column("texts", "text")
-            self.dataset[split] = self.dataset[split].rename_column("category", "label")
-
-        self.dataset = self.stratified_subsampling(
-            self.dataset,
-            seed=self.seed,
-            splits=["test"],
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset,
+    #         seed=self.seed,
+    #         splits=["test"],
+    #     )
diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
index 62440ef9c2..c58b0f4d89 100644
--- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
+++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
@@ -10,9 +10,8 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
         description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)",
         reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set",
         dataset={
-            "path": "community-datasets/roman_urdu",
-            "revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
-            "trust_remote_code": True,
+            "path": "mteb/UrduRomanSentimentClassification",
+            "revision": "905c1121c002c4b9adc4ebc5faaf4d6f50d1b1ee",
         },
         type="Classification",
         category="s2s",
@@ -38,10 +37,7 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
     """,
     )
 
-    def dataset_transform(self):
-        self.dataset = self.dataset.rename_columns(
-            {"sentence": "text", "sentiment": "label"}
-        )
-        self.dataset = self.stratified_subsampling(
-            self.dataset, seed=self.seed, splits=["train"]
-        )
+    # def dataset_transform(self):
+    #     self.dataset = self.stratified_subsampling(
+    #         self.dataset, seed=self.seed, splits=["train"]
+    #     )
diff --git a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
index 62a166f89c..71273b3d61 100644
--- a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
+++ b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
@@ -1,12 +1,7 @@
 from __future__ import annotations
 
-from hashlib import sha256
-
-import datasets
-
 from mteb.abstasks.MultilingualTask import MultilingualTask
 from mteb.abstasks.TaskMetadata import TaskMetadata
-
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 _LANGUAGES = {
@@ -28,9 +23,8 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
     metadata = TaskMetadata(
         name="IndicQARetrieval",
         dataset={
-            "path": "ai4bharat/IndicQA",
-            "revision": "570d90ae4f7b64fe4fdd5f42fc9f9279b8c9fd9d",
-            "trust_remote_code": True,
+            "path": "mteb/IndicQARetrieval",
+            "revision": "51e8b328988795d658f6f34acd34044e9346e2ee",
         },
         description="IndicQA is a manually curated cloze-style reading comprehension dataset that can be used for evaluating question-answering models in 11 Indic languages. It is repurposed retrieving relevant context for each question.",
         reference="https://arxiv.org/abs/2212.05409",
@@ -55,45 +49,3 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
   doi       = {10.18653/v1/2023.acl-long.693}
 }""",
     )
-
-    def load_data(self, **kwargs):
-        if self.data_loaded:
-            return
-
-        split = "test"
-        queries = {lang: {split: {}} for lang in self.hf_subsets}
-        corpus = {lang: {split: {}} for lang in self.hf_subsets}
-        relevant_docs = {lang: {split: {}} for lang in self.hf_subsets}
-
-        for lang in self.hf_subsets:
-            data = datasets.load_dataset(
-                name=f"indicqa.{lang}", **self.metadata_dict["dataset"]
-            )[split]
-            data = data.filter(lambda x: x["answers"]["text"] != "")
-
-            question_ids = {
-                question: sha256(question.encode("utf-8")).hexdigest()
-                for question in set(data["question"])
-            }
-            context_ids = {
-                context: sha256(context.encode("utf-8")).hexdigest()
-                for context in set(data["context"])
-            }
-
-            for row in data:
-                question = row["question"]
-                context = row["context"]
-                query_id = question_ids[question]
-                queries[lang][split][query_id] = question
-
-                doc_id = context_ids[context]
-                corpus[lang][split][doc_id] = {"text": context}
-                if query_id not in relevant_docs[lang][split]:
-                    relevant_docs[lang][split][query_id] = {}
-                relevant_docs[lang][split][query_id][doc_id] = 1
-
-        self.corpus = datasets.DatasetDict(corpus)
-        self.queries = datasets.DatasetDict(queries)
-        self.relevant_docs = datasets.DatasetDict(relevant_docs)
-
-        self.data_loaded = True
diff --git a/mteb/tasks/STS/jpn/JSTS.py b/mteb/tasks/STS/jpn/JSTS.py
index 4993359190..bdd031c865 100644
--- a/mteb/tasks/STS/jpn/JSTS.py
+++ b/mteb/tasks/STS/jpn/JSTS.py
@@ -9,10 +9,8 @@ class JSTS(AbsTaskSTS):
     metadata = TaskMetadata(
         name="JSTS",
         dataset={
-            "path": "shunk031/JGLUE",
-            "revision": "50e79c314a7603ebc92236b66a0973d51a00ed8c",
-            "name": "JSTS",
-            "trust_remote_code": True,
+            "path": "mteb/JSTS",
+            "revision": "5bac629e25799df4c9c80a6a5db983d6cba9e77d",
         },
         description="Japanese Semantic Textual Similarity Benchmark dataset construct from YJ Image Captions Dataset "
         + "(Miyazaki and Shimizu, 2016) and annotated by crowdsource annotators.",
@@ -65,6 +63,3 @@ def metadata_dict(self) -> dict[str, str]:
         metadata_dict["min_score"] = 0
         metadata_dict["max_score"] = 5
         return metadata_dict
-
-    def dataset_transform(self) -> None:
-        self.dataset = self.dataset.rename_column("label", "score")

From 1c6632cac4024dbe1056d4993b11fa69541c0f97 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 21 Dec 2024 22:15:27 +0300
Subject: [PATCH 2/5] fix IndicQARetrieval loader by reordering base classes

---
 mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
index 71273b3d61..7ad283d860 100644
--- a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
+++ b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
@@ -19,7 +19,7 @@
 }
 
 
-class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
+class IndicQARetrieval(AbsTaskRetrieval, MultilingualTask):
     metadata = TaskMetadata(
         name="IndicQARetrieval",
         dataset={

From 82559ac9120fa219d82f494efe4cc5499772ce9d Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 21 Dec 2024 22:52:35 +0300
Subject: [PATCH 3/5] remove commented code

---
 .../ara/HotelReviewSentimentClassification.py              | 5 -----
 .../tasks/Classification/ara/TweetEmotionClassification.py | 5 -----
 mteb/tasks/Classification/eng/PatentClassification.py      | 5 -----
 .../Classification/fil/FilipinoHateSpeechClassification.py | 5 -----
 mteb/tasks/Classification/mya/MyanmarNews.py               | 5 -----
 .../Classification/swe/SwedishSentimentClassification.py   | 5 -----
 .../Classification/tha/WisesightSentimentClassification.py | 7 -------
 .../Classification/urd/UrduRomanSentimentClassification.py | 5 -----
 8 files changed, 42 deletions(-)

diff --git a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
index 26fe78b8de..bb6ad6aa18 100644
--- a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
+++ b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py
@@ -37,8 +37,3 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
 }
 """,
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["train"]
-    #     )
diff --git a/mteb/tasks/Classification/ara/TweetEmotionClassification.py b/mteb/tasks/Classification/ara/TweetEmotionClassification.py
index 3cc183f92b..d79956731c 100644
--- a/mteb/tasks/Classification/ara/TweetEmotionClassification.py
+++ b/mteb/tasks/Classification/ara/TweetEmotionClassification.py
@@ -37,8 +37,3 @@ class TweetEmotionClassification(AbsTaskClassification):
 }
 """,
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["train"]
-    #     )
diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py
index 5c7360b994..f8cd3b49af 100644
--- a/mteb/tasks/Classification/eng/PatentClassification.py
+++ b/mteb/tasks/Classification/eng/PatentClassification.py
@@ -45,8 +45,3 @@ class PatentClassification(AbsTaskClassification):
             abstract = "Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.",
         }""",
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["test"]
-    #     )
diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
index bdf520ecb8..f5e8c1d66f 100644
--- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
+++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py
@@ -40,8 +40,3 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
         }
         """,
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["validation", "test"]
-    #     )
diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py
index be075d7fbe..2248dd0032 100644
--- a/mteb/tasks/Classification/mya/MyanmarNews.py
+++ b/mteb/tasks/Classification/mya/MyanmarNews.py
@@ -36,8 +36,3 @@ class MyanmarNews(AbsTaskClassification):
         pages     = {401--408}
         }""",
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["train"]
-    #     )
diff --git a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
index 731258177a..149be829fc 100644
--- a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
+++ b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py
@@ -28,8 +28,3 @@ class SwedishSentimentClassification(AbsTaskClassification):
         sample_creation="found",
         bibtex_citation="",
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["validation", "test"]
-    #     )
diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
index 04258f8710..1a142529c7 100644
--- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
+++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py
@@ -42,10 +42,3 @@ class WisesightSentimentClassification(AbsTaskClassification):
 
 """,
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset,
-    #         seed=self.seed,
-    #         splits=["test"],
-    #     )
diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
index c58b0f4d89..eb66927269 100644
--- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
+++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
@@ -36,8 +36,3 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
 }
     """,
     )
-
-    # def dataset_transform(self):
-    #     self.dataset = self.stratified_subsampling(
-    #         self.dataset, seed=self.seed, splits=["train"]
-    #     )

From 76febfcefeffe33ef59627a710d49b883ff13141 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 21 Dec 2024 22:53:21 +0300
Subject: [PATCH 4/5] lint

---
 mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
index 7ad283d860..c0e2ef4cc3 100644
--- a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
+++ b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
@@ -2,6 +2,7 @@
 
 from mteb.abstasks.MultilingualTask import MultilingualTask
 from mteb.abstasks.TaskMetadata import TaskMetadata
+
 from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval
 
 _LANGUAGES = {

From 31278e34eb6bcb61abcb6459cc17774edc40e229 Mon Sep 17 00:00:00 2001
From: Roman Solomatin <36135455+Samoed@users.noreply.github.com>
Date: Sat, 21 Dec 2024 22:54:18 +0300
Subject: [PATCH 5/5] drop datasets<3.0.0 upper bound in pyproject.toml

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 4ab7f1f88e..48a104fff6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -25,7 +25,7 @@ classifiers = [
 ]
 requires-python = ">=3.9"
 dependencies = [
-    "datasets>=2.19.0,<3.0.0",
+    "datasets>=2.19.0",
     "numpy>=1.0.0,<3.0.0",
     "requests>=2.26.0",
     "scikit_learn>=1.0.2",