diff --git a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py index 24b7bc33fc..bb6ad6aa18 100644 --- a/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py +++ b/mteb/tasks/Classification/ara/HotelReviewSentimentClassification.py @@ -8,9 +8,8 @@ class HotelReviewSentimentClassification(AbsTaskClassification): metadata = TaskMetadata( name="HotelReviewSentimentClassification", dataset={ - "path": "Elnagara/hard", - "revision": "b108d2c32ee4e1f4176ea233e1a5ac17bceb9ef9", - "trust_remote_code": True, + "path": "mteb/HotelReviewSentimentClassification", + "revision": "273d5105974460d3979149e29e88c06a8214c541", }, description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website.", reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3", @@ -38,8 +37,3 @@ class HotelReviewSentimentClassification(AbsTaskClassification): } """, ) - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train"] - ) diff --git a/mteb/tasks/Classification/ara/TweetEmotionClassification.py b/mteb/tasks/Classification/ara/TweetEmotionClassification.py index e7fb8687ac..d79956731c 100644 --- a/mteb/tasks/Classification/ara/TweetEmotionClassification.py +++ b/mteb/tasks/Classification/ara/TweetEmotionClassification.py @@ -8,9 +8,8 @@ class TweetEmotionClassification(AbsTaskClassification): metadata = TaskMetadata( name="TweetEmotionClassification", dataset={ - "path": "emotone-ar-cicling2017/emotone_ar", - "revision": "0ded8ff72cc68cbb7bb5c01b0a9157982b73ddaf", - "trust_remote_code": True, + "path": "mteb/TweetEmotionClassification", + "revision": "0d803980e91953cc67c21429f74b301b7b1b3f08", }, description="A dataset of 10,000 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets.", reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8", @@ -38,9 +37,3 @@ class TweetEmotionClassification(AbsTaskClassification): } """, ) - - def dataset_transform(self): - self.dataset = self.dataset.rename_column("tweet", "text") - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train"] - ) diff --git a/mteb/tasks/Classification/deu/TenKGnadClassification.py b/mteb/tasks/Classification/deu/TenKGnadClassification.py index 592d66c983..f9bde63b6e 100644 --- a/mteb/tasks/Classification/deu/TenKGnadClassification.py +++ b/mteb/tasks/Classification/deu/TenKGnadClassification.py @@ -10,9 +10,8 @@ class TenKGnadClassification(AbsTaskClassification): description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).", reference="https://tblock.github.io/10kGNAD/", dataset={ - "path": "community-datasets/gnad10", - "revision": "0798affe9b3f88cfda4267b6fbc50fac67046ee5", - "trust_remote_code": True, + "path": "mteb/TenKGnadClassification", + "revision": "ae9862bbcddc27b4bd93e2a7b463b7b5d05c6c55", }, type="Classification", category="p2p", diff --git a/mteb/tasks/Classification/eng/ArxivClassification.py b/mteb/tasks/Classification/eng/ArxivClassification.py index 92bd473a74..d24b0b2a0c 100644 --- a/mteb/tasks/Classification/eng/ArxivClassification.py +++ b/mteb/tasks/Classification/eng/ArxivClassification.py @@ -9,9 +9,8 @@ class ArxivClassification(AbsTaskClassification): name="ArxivClassification", description="Classification Dataset of Arxiv Papers", dataset={ - "path": "ccdv/arxiv-classification", - "revision": "f9bd92144ed76200d6eb3ce73a8bd4eba9ffdc85", - "trust_remote_code": True, + "path": "mteb/ArxivClassification", + "revision": "5e80893bf045abefbf8cbe5d713bddc91ae158d5", }, reference="https://ieeexplore.ieee.org/document/8675939", type="Classification", diff --git a/mteb/tasks/Classification/eng/PatentClassification.py b/mteb/tasks/Classification/eng/PatentClassification.py index 9f10a8a794..f8cd3b49af 100644 --- a/mteb/tasks/Classification/eng/PatentClassification.py +++ b/mteb/tasks/Classification/eng/PatentClassification.py @@ -9,9 +9,8 @@ class PatentClassification(AbsTaskClassification): name="PatentClassification", description="Classification Dataset of Patents and Abstract", dataset={ - "path": "ccdv/patent-classification", - "revision": "2f38a1dfdecfacee0184d74eaeafd3c0fb49d2a6", - "trust_remote_code": True, + "path": "mteb/PatentClassification", + "revision": "6bd77eb030ab3bfbf1e6f7a2b069979daf167311", }, reference="https://aclanthology.org/P19-1212.pdf", type="Classification", @@ -46,8 +45,3 @@ class PatentClassification(AbsTaskClassification): abstract = "Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.", }""", ) - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["test"] - ) diff --git a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py index df6205d427..f5e8c1d66f 100644 --- a/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py +++ b/mteb/tasks/Classification/fil/FilipinoHateSpeechClassification.py @@ -12,9 +12,8 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): description="Filipino Twitter dataset for sentiment classification.", reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019", dataset={ - "path": "jcblaise/hatespeech_filipino", - "revision": "b01711587b073e55569de75ef04d7da4592a3618", - "trust_remote_code": True, + "path": "mteb/FilipinoHateSpeechClassification", + "revision": "087a17c0b7f9a78901c88aea00ad2892a319fdac", }, type="Classification", category="s2s", @@ -41,8 +40,3 @@ class FilipinoHateSpeechClassification(AbsTaskClassification): } """, ) - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["validation", "test"] - ) diff --git a/mteb/tasks/Classification/mya/MyanmarNews.py b/mteb/tasks/Classification/mya/MyanmarNews.py index 8418e20533..2248dd0032 100644 --- a/mteb/tasks/Classification/mya/MyanmarNews.py +++ b/mteb/tasks/Classification/mya/MyanmarNews.py @@ -8,9 +8,8 @@ class MyanmarNews(AbsTaskClassification): metadata = TaskMetadata( name="MyanmarNews", dataset={ - "path": "ayehninnkhine/myanmar_news", - "revision": "b899ec06227db3679b0fe3c4188a6b48cc0b65eb", - "trust_remote_code": True, + "path": "mteb/MyanmarNews", + "revision": "644419f24bc820bbf8af24e0b4714a069812e0a3", }, description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.", reference="https://huggingface.co/datasets/myanmar_news", @@ -37,9 +36,3 @@ class MyanmarNews(AbsTaskClassification): pages = {401--408} }""", ) - - def dataset_transform(self): - self.dataset = self.dataset.rename_columns({"category": "label"}) - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train"] - ) diff --git a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py index f0ee1b07dc..9c85aca4df 100644 --- a/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py +++ b/mteb/tasks/Classification/nld/DutchBookReviewSentimentClassification.py @@ -10,9 +10,8 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification): description="A Dutch book review for sentiment classification.", reference="https://github.com/benjaminvdb/DBRD", dataset={ - "path": "benjaminvdb/dbrd", - "revision": "3f756ab4572e071eb53e887ab629f19fa747d39e", - "trust_remote_code": True, + "path": "mteb/DutchBookReviewSentimentClassification", + "revision": "1c2815ad38cf4794eb8d678fb08f569ea79392f6", }, type="Classification", category="s2s", diff --git a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py index 4c0fdc16cb..149be829fc 100644 --- a/mteb/tasks/Classification/swe/SwedishSentimentClassification.py +++ b/mteb/tasks/Classification/swe/SwedishSentimentClassification.py @@ -10,9 +10,8 @@ class SwedishSentimentClassification(AbsTaskClassification): description="Dataset of Swedish reviews scarped from various public available websites", reference="https://huggingface.co/datasets/swedish_reviews", dataset={ - "path": "timpal0l/swedish_reviews", - "revision": "105ba6b3cb99b9fd64880215be469d60ebf44a1b", - "trust_remote_code": True, + "path": "mteb/SwedishSentimentClassification", + "revision": "39e35f55d58338ebd602f8d83b52cfe027f5146a", }, type="Classification", category="s2s", @@ -29,8 +28,3 @@ class SwedishSentimentClassification(AbsTaskClassification): sample_creation="found", bibtex_citation="", ) - - def dataset_transform(self): - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["validation", "test"] - ) diff --git a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py index 3a76003d5b..1a142529c7 100644 --- a/mteb/tasks/Classification/tha/WisesightSentimentClassification.py +++ b/mteb/tasks/Classification/tha/WisesightSentimentClassification.py @@ -10,9 +10,8 @@ class WisesightSentimentClassification(AbsTaskClassification): description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)", reference="https://github.com/PyThaiNLP/wisesight-sentiment", dataset={ - "path": "pythainlp/wisesight_sentiment", - "revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae", - "trust_remote_code": True, + "path": "mteb/WisesightSentimentClassification", + "revision": "727ea9bd253f9eedf16aebec6ac3f07791fb3db2", }, type="Classification", category="s2s", @@ -43,14 +42,3 @@ class WisesightSentimentClassification(AbsTaskClassification): """, ) - - def dataset_transform(self): - for split in self.dataset.keys(): - self.dataset[split] = self.dataset[split].rename_column("texts", "text") - self.dataset[split] = self.dataset[split].rename_column("category", "label") - - self.dataset = self.stratified_subsampling( - self.dataset, - seed=self.seed, - splits=["test"], - ) diff --git a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py index 62440ef9c2..eb66927269 100644 --- a/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py +++ b/mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py @@ -10,9 +10,8 @@ class UrduRomanSentimentClassification(AbsTaskClassification): description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)", reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set", dataset={ - "path": "community-datasets/roman_urdu", - "revision": "566be6449bb30b9b9f2b59173391647fe0ca3224", - "trust_remote_code": True, + "path": "mteb/UrduRomanSentimentClassification", + "revision": "905c1121c002c4b9adc4ebc5faaf4d6f50d1b1ee", }, type="Classification", category="s2s", @@ -37,11 +36,3 @@ class UrduRomanSentimentClassification(AbsTaskClassification): } """, ) - - def dataset_transform(self): - self.dataset = self.dataset.rename_columns( - {"sentence": "text", "sentiment": "label"} - ) - self.dataset = self.stratified_subsampling( - self.dataset, seed=self.seed, splits=["train"] - ) diff --git a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py index 62a166f89c..c0e2ef4cc3 100644 --- a/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py +++ b/mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py @@ -1,9 +1,5 @@ from __future__ import annotations -from hashlib import sha256 - -import datasets - from mteb.abstasks.MultilingualTask import MultilingualTask from mteb.abstasks.TaskMetadata import TaskMetadata @@ -24,13 +20,12 @@ } -class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval): +class IndicQARetrieval(AbsTaskRetrieval, MultilingualTask): metadata = TaskMetadata( name="IndicQARetrieval", dataset={ - "path": "ai4bharat/IndicQA", - "revision": "570d90ae4f7b64fe4fdd5f42fc9f9279b8c9fd9d", - "trust_remote_code": True, + "path": "mteb/IndicQARetrieval", + "revision": "51e8b328988795d658f6f34acd34044e9346e2ee", }, description="IndicQA is a manually curated cloze-style reading comprehension dataset that can be used for evaluating question-answering models in 11 Indic languages. It is repurposed retrieving relevant context for each question.", reference="https://arxiv.org/abs/2212.05409", @@ -55,45 +50,3 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval): doi = {10.18653/v1/2023.acl-long.693} }""", ) - - def load_data(self, **kwargs): - if self.data_loaded: - return - - split = "test" - queries = {lang: {split: {}} for lang in self.hf_subsets} - corpus = {lang: {split: {}} for lang in self.hf_subsets} - relevant_docs = {lang: {split: {}} for lang in self.hf_subsets} - - for lang in self.hf_subsets: - data = datasets.load_dataset( - name=f"indicqa.{lang}", **self.metadata_dict["dataset"] - )[split] - data = data.filter(lambda x: x["answers"]["text"] != "") - - question_ids = { - question: sha256(question.encode("utf-8")).hexdigest() - for question in set(data["question"]) - } - context_ids = { - context: sha256(context.encode("utf-8")).hexdigest() - for context in set(data["context"]) - } - - for row in data: - question = row["question"] - context = row["context"] - query_id = question_ids[question] - queries[lang][split][query_id] = question - - doc_id = context_ids[context] - corpus[lang][split][doc_id] = {"text": context} - if query_id not in relevant_docs[lang][split]: - relevant_docs[lang][split][query_id] = {} - relevant_docs[lang][split][query_id][doc_id] = 1 - - self.corpus = datasets.DatasetDict(corpus) - self.queries = datasets.DatasetDict(queries) - self.relevant_docs = datasets.DatasetDict(relevant_docs) - - self.data_loaded = True diff --git a/mteb/tasks/STS/jpn/JSTS.py b/mteb/tasks/STS/jpn/JSTS.py index 4993359190..bdd031c865 100644 --- a/mteb/tasks/STS/jpn/JSTS.py +++ b/mteb/tasks/STS/jpn/JSTS.py @@ -9,10 +9,8 @@ class JSTS(AbsTaskSTS): metadata = TaskMetadata( name="JSTS", dataset={ - "path": "shunk031/JGLUE", - "revision": "50e79c314a7603ebc92236b66a0973d51a00ed8c", - "name": "JSTS", - "trust_remote_code": True, + "path": "mteb/JSTS", + "revision": "5bac629e25799df4c9c80a6a5db983d6cba9e77d", }, description="Japanese Semantic Textual Similarity Benchmark dataset construct from YJ Image Captions Dataset " + "(Miyazaki and Shimizu, 2016) and annotated by crowdsource annotators.", @@ -65,6 +63,3 @@ def metadata_dict(self) -> dict[str, str]: metadata_dict["min_score"] = 0 metadata_dict["max_score"] = 5 return metadata_dict - - def dataset_transform(self) -> None: - self.dataset = self.dataset.rename_column("label", "score") diff --git a/pyproject.toml b/pyproject.toml index 4ab7f1f88e..48a104fff6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -25,7 +25,7 @@ classifiers = [ ] requires-python = ">=3.9" dependencies = [ - "datasets>=2.19.0,<3.0.0", + "datasets>=2.19.0", "numpy>=1.0.0,<3.0.0", "requests>=2.26.0", "scikit_learn>=1.0.2",