Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="HotelReviewSentimentClassification",
dataset={
"path": "Elnagara/hard",
"revision": "b108d2c32ee4e1f4176ea233e1a5ac17bceb9ef9",
"trust_remote_code": True,
"path": "mteb/HotelReviewSentimentClassification",
"revision": "273d5105974460d3979149e29e88c06a8214c541",
},
description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website.",
reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3",
Expand Down Expand Up @@ -38,8 +37,3 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with a subsampled version of itself.

    Delegates to ``self.stratified_subsampling``, seeded with ``self.seed``
    and restricted to the ``train`` split.
    """
    target_splits = ["train"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
11 changes: 2 additions & 9 deletions mteb/tasks/Classification/ara/TweetEmotionClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class TweetEmotionClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="TweetEmotionClassification",
dataset={
"path": "emotone-ar-cicling2017/emotone_ar",
"revision": "0ded8ff72cc68cbb7bb5c01b0a9157982b73ddaf",
"trust_remote_code": True,
"path": "mteb/TweetEmotionClassification",
"revision": "0d803980e91953cc67c21429f74b301b7b1b3f08",
},
description="A dataset of 10,000 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets.",
reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8",
Expand Down Expand Up @@ -38,9 +37,3 @@ class TweetEmotionClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Rename the ``tweet`` column to ``text`` and subsample the train split.

    The rename is stored on ``self.dataset`` first; the renamed dataset is
    then passed to ``self.stratified_subsampling`` with ``self.seed``.
    """
    renamed = self.dataset.rename_column("tweet", "text")
    self.dataset = renamed

    target_splits = ["train"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
5 changes: 2 additions & 3 deletions mteb/tasks/Classification/deu/TenKGnadClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class TenKGnadClassification(AbsTaskClassification):
description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "community-datasets/gnad10",
"revision": "0798affe9b3f88cfda4267b6fbc50fac67046ee5",
"trust_remote_code": True,
"path": "mteb/TenKGnadClassification",
"revision": "ae9862bbcddc27b4bd93e2a7b463b7b5d05c6c55",
},
type="Classification",
category="p2p",
Expand Down
5 changes: 2 additions & 3 deletions mteb/tasks/Classification/eng/ArxivClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ class ArxivClassification(AbsTaskClassification):
name="ArxivClassification",
description="Classification Dataset of Arxiv Papers",
dataset={
"path": "ccdv/arxiv-classification",
"revision": "f9bd92144ed76200d6eb3ce73a8bd4eba9ffdc85",
"trust_remote_code": True,
"path": "mteb/ArxivClassification",
"revision": "5e80893bf045abefbf8cbe5d713bddc91ae158d5",
},
reference="https://ieeexplore.ieee.org/document/8675939",
type="Classification",
Expand Down
10 changes: 2 additions & 8 deletions mteb/tasks/Classification/eng/PatentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ class PatentClassification(AbsTaskClassification):
name="PatentClassification",
description="Classification Dataset of Patents and Abstract",
dataset={
"path": "ccdv/patent-classification",
"revision": "2f38a1dfdecfacee0184d74eaeafd3c0fb49d2a6",
"trust_remote_code": True,
"path": "mteb/PatentClassification",
"revision": "6bd77eb030ab3bfbf1e6f7a2b069979daf167311",
},
reference="https://aclanthology.org/P19-1212.pdf",
type="Classification",
Expand Down Expand Up @@ -48,8 +47,3 @@ class PatentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with a subsampled version of itself.

    Delegates to ``self.stratified_subsampling``, seeded with ``self.seed``
    and restricted to the ``test`` split.
    """
    target_splits = ["test"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
description="Filipino Twitter dataset for sentiment classification.",
reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019",
dataset={
"path": "legacy-datasets/hate_speech_filipino",
"revision": "1994e9bb7f3ec07518e3f0d9e870cb293e234686",
"trust_remote_code": True,
"path": "mteb/FilipinoHateSpeechClassification",
"revision": "087a17c0b7f9a78901c88aea00ad2892a319fdac",
},
type="Classification",
category="s2s",
Expand All @@ -41,8 +40,3 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with a subsampled version of itself.

    Delegates to ``self.stratified_subsampling``, seeded with ``self.seed``
    and restricted to the ``validation`` and ``test`` splits.
    """
    target_splits = ["validation", "test"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
11 changes: 2 additions & 9 deletions mteb/tasks/Classification/mya/MyanmarNews.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class MyanmarNews(AbsTaskClassification):
metadata = TaskMetadata(
name="MyanmarNews",
dataset={
"path": "ayehninnkhine/myanmar_news",
"revision": "b899ec06227db3679b0fe3c4188a6b48cc0b65eb",
"trust_remote_code": True,
"path": "mteb/MyanmarNews",
"revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
},
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
reference="https://huggingface.co/datasets/myanmar_news",
Expand Down Expand Up @@ -38,9 +37,3 @@ class MyanmarNews(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Rename the ``category`` column to ``label`` and subsample the train split.

    The rename is stored on ``self.dataset`` first; the renamed dataset is
    then passed to ``self.stratified_subsampling`` with ``self.seed``.
    """
    column_mapping = {"category": "label"}
    self.dataset = self.dataset.rename_columns(column_mapping)

    target_splits = ["train"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification):
description="A Dutch book review for sentiment classification.",
reference="https://github.com/benjaminvdb/DBRD",
dataset={
"path": "benjaminvdb/dbrd",
"revision": "3f756ab4572e071eb53e887ab629f19fa747d39e",
"trust_remote_code": True,
"path": "mteb/DutchBookReviewSentimentClassification",
"revision": "1c2815ad38cf4794eb8d678fb08f569ea79392f6",
},
type="Classification",
category="s2s",
Expand Down
10 changes: 2 additions & 8 deletions mteb/tasks/Classification/swe/SwedishSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class SwedishSentimentClassification(AbsTaskClassification):
description="Dataset of Swedish reviews scarped from various public available websites",
reference="https://huggingface.co/datasets/swedish_reviews",
dataset={
"path": "timpal0l/swedish_reviews",
"revision": "105ba6b3cb99b9fd64880215be469d60ebf44a1b",
"trust_remote_code": True,
"path": "mteb/SwedishSentimentClassification",
"revision": "39e35f55d58338ebd602f8d83b52cfe027f5146a",
},
type="Classification",
category="s2s",
Expand All @@ -29,8 +28,3 @@ class SwedishSentimentClassification(AbsTaskClassification):
sample_creation="found",
bibtex_citation="",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with a subsampled version of itself.

    Delegates to ``self.stratified_subsampling``, seeded with ``self.seed``
    and restricted to the ``validation`` and ``test`` splits.
    """
    target_splits = ["validation", "test"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
16 changes: 2 additions & 14 deletions mteb/tasks/Classification/tha/WisesightSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class WisesightSentimentClassification(AbsTaskClassification):
description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)",
reference="https://github.com/PyThaiNLP/wisesight-sentiment",
dataset={
"path": "pythainlp/wisesight_sentiment",
"revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
"trust_remote_code": True,
"path": "mteb/WisesightSentimentClassification",
"revision": "727ea9bd253f9eedf16aebec6ac3f07791fb3db2",
},
type="Classification",
category="s2s",
Expand Down Expand Up @@ -43,14 +42,3 @@ class WisesightSentimentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Normalise column names in every split, then subsample the test split.

    For each key of ``self.dataset`` the ``texts`` column is renamed to
    ``text`` and the ``category`` column to ``label``. Afterwards the whole
    dataset is passed to ``self.stratified_subsampling`` with ``self.seed``.
    """
    # Applied in this order, one rename at a time, for every split.
    column_renames = (("texts", "text"), ("category", "label"))

    for split_name in self.dataset.keys():
        for old_name, new_name in column_renames:
            self.dataset[split_name] = self.dataset[split_name].rename_column(
                old_name, new_name
            )

    target_splits = ["test"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
13 changes: 2 additions & 11 deletions mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)",
reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set",
dataset={
"path": "community-datasets/roman_urdu",
"revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
"trust_remote_code": True,
"path": "mteb/UrduRomanSentimentClassification",
"revision": "905c1121c002c4b9adc4ebc5faaf4d6f50d1b1ee",
},
type="Classification",
category="s2s",
Expand All @@ -37,11 +36,3 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Rename the raw columns and subsample the train split.

    ``sentence`` becomes ``text`` and ``sentiment`` becomes ``label``; the
    renamed dataset is then passed to ``self.stratified_subsampling`` with
    ``self.seed``.
    """
    column_mapping = {"sentence": "text", "sentiment": "label"}
    self.dataset = self.dataset.rename_columns(column_mapping)

    target_splits = ["train"]
    self.dataset = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
53 changes: 3 additions & 50 deletions mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
from __future__ import annotations

from hashlib import sha256

import datasets

from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

Expand All @@ -24,13 +20,12 @@
}


class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
class IndicQARetrieval(AbsTaskRetrieval, MultilingualTask):
metadata = TaskMetadata(
name="IndicQARetrieval",
dataset={
"path": "ai4bharat/IndicQA",
"revision": "570d90ae4f7b64fe4fdd5f42fc9f9279b8c9fd9d",
"trust_remote_code": True,
"path": "mteb/IndicQARetrieval",
"revision": "51e8b328988795d658f6f34acd34044e9346e2ee",
},
description="IndicQA is a manually curated cloze-style reading comprehension dataset that can be used for evaluating question-answering models in 11 Indic languages. It is repurposed retrieving relevant context for each question.",
reference="https://arxiv.org/abs/2212.05409",
Expand All @@ -57,45 +52,3 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
}
""",
)

def load_data(self, **kwargs):
    """Build ``self.corpus``, ``self.queries`` and ``self.relevant_docs``.

    Does nothing when ``self.data_loaded`` is already truthy. Otherwise, for
    every subset in ``self.hf_subsets``, the ``test`` split of the
    ``indicqa.<subset>`` configuration is loaded with the arguments from
    ``self.metadata_dict["dataset"]`` and rows whose ``answers["text"]``
    equals ``""`` are dropped. Queries and documents are keyed by the SHA-256
    hex digest of their UTF-8 encoded text, and each (question, context) row
    is recorded as a relevance judgement of 1.

    ``kwargs`` is accepted but not used.
    """
    if self.data_loaded:
        return

    split = "test"

    def _digest(text):
        # Content hash used as a stable identifier for a question or context.
        return sha256(text.encode("utf-8")).hexdigest()

    def _empty_by_subset():
        # One independent {split: {}} container per configured subset.
        return {subset: {split: {}} for subset in self.hf_subsets}

    queries = _empty_by_subset()
    corpus = _empty_by_subset()
    relevant_docs = _empty_by_subset()

    for subset in self.hf_subsets:
        rows = datasets.load_dataset(
            name=f"indicqa.{subset}", **self.metadata_dict["dataset"]
        )[split]
        rows = rows.filter(lambda x: x["answers"]["text"] != "")

        # Hash each distinct text once, then look the id up per row.
        question_ids = {text: _digest(text) for text in set(rows["question"])}
        context_ids = {text: _digest(text) for text in set(rows["context"])}

        subset_queries = queries[subset][split]
        subset_corpus = corpus[subset][split]
        subset_qrels = relevant_docs[subset][split]

        for row in rows:
            question = row["question"]
            context = row["context"]

            query_id = question_ids[question]
            doc_id = context_ids[context]

            subset_queries[query_id] = question
            subset_corpus[doc_id] = {"text": context}
            subset_qrels.setdefault(query_id, {})[doc_id] = 1

    self.corpus = datasets.DatasetDict(corpus)
    self.queries = datasets.DatasetDict(queries)
    self.relevant_docs = datasets.DatasetDict(relevant_docs)

    self.data_loaded = True
9 changes: 2 additions & 7 deletions mteb/tasks/STS/jpn/JSTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@ class JSTS(AbsTaskSTS):
metadata = TaskMetadata(
name="JSTS",
dataset={
"path": "shunk031/JGLUE",
"revision": "50e79c314a7603ebc92236b66a0973d51a00ed8c",
"name": "JSTS",
"trust_remote_code": True,
"path": "mteb/JSTS",
"revision": "5bac629e25799df4c9c80a6a5db983d6cba9e77d",
},
description="Japanese Semantic Textual Similarity Benchmark dataset construct from YJ Image Captions Dataset "
+ "(Miyazaki and Shimizu, 2016) and annotated by crowdsource annotators.",
Expand Down Expand Up @@ -67,6 +65,3 @@ def metadata_dict(self) -> dict[str, str]:
metadata_dict["min_score"] = 0
metadata_dict["max_score"] = 5
return metadata_dict

def dataset_transform(self) -> None:
    """Rename the ``label`` column of ``self.dataset`` to ``score``."""
    old_name, new_name = "label", "score"
    self.dataset = self.dataset.rename_column(old_name, new_name)
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ classifiers = [
]
requires-python = ">=3.9,<3.13"
dependencies = [
"datasets>=2.19.0,<3.0.0",
"datasets>=2.19.0",
"numpy>=1.0.0,<3.0.0",
"requests>=2.26.0",
"scikit_learn>=1.0.2",
Expand Down