Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="HotelReviewSentimentClassification",
dataset={
"path": "Elnagara/hard",
"revision": "b108d2c32ee4e1f4176ea233e1a5ac17bceb9ef9",
"trust_remote_code": True,
"path": "mteb/HotelReviewSentimentClassification",
"revision": "273d5105974460d3979149e29e88c06a8214c541",
},
description="HARD is a dataset of Arabic hotel reviews collected from the Booking.com website.",
reference="https://link.springer.com/chapter/10.1007/978-3-319-67056-0_3",
Expand Down Expand Up @@ -38,8 +37,3 @@ class HotelReviewSentimentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with the output of ``self.stratified_subsampling``.

    The helper is called on the current dataset with ``self.seed`` as the
    seed and is restricted to the ``train`` split.
    """
    target_splits = ["train"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
11 changes: 2 additions & 9 deletions mteb/tasks/Classification/ara/TweetEmotionClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class TweetEmotionClassification(AbsTaskClassification):
metadata = TaskMetadata(
name="TweetEmotionClassification",
dataset={
"path": "emotone-ar-cicling2017/emotone_ar",
"revision": "0ded8ff72cc68cbb7bb5c01b0a9157982b73ddaf",
"trust_remote_code": True,
"path": "mteb/TweetEmotionClassification",
"revision": "0d803980e91953cc67c21429f74b301b7b1b3f08",
},
description="A dataset of 10,000 tweets that was created with the aim of covering the most frequently used emotion categories in Arabic tweets.",
reference="https://link.springer.com/chapter/10.1007/978-3-319-77116-8_8",
Expand Down Expand Up @@ -38,9 +37,3 @@ class TweetEmotionClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Rename the ``tweet`` column to ``text``, then subsample ``train``.

    The renamed dataset is stored on ``self.dataset`` before it is handed to
    ``self.stratified_subsampling`` (seeded with ``self.seed``), whose result
    becomes the new ``self.dataset``.
    """
    self.dataset = self.dataset.rename_column("tweet", "text")

    target_splits = ["train"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
5 changes: 2 additions & 3 deletions mteb/tasks/Classification/deu/TenKGnadClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class TenKGnadClassification(AbsTaskClassification):
description="10k German News Articles Dataset (10kGNAD) contains news articles from the online Austrian newspaper website DER Standard with their topic classification (9 classes).",
reference="https://tblock.github.io/10kGNAD/",
dataset={
"path": "community-datasets/gnad10",
"revision": "0798affe9b3f88cfda4267b6fbc50fac67046ee5",
"trust_remote_code": True,
"path": "mteb/TenKGnadClassification",
"revision": "ae9862bbcddc27b4bd93e2a7b463b7b5d05c6c55",
},
type="Classification",
category="p2p",
Expand Down
5 changes: 2 additions & 3 deletions mteb/tasks/Classification/eng/ArxivClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ class ArxivClassification(AbsTaskClassification):
name="ArxivClassification",
description="Classification Dataset of Arxiv Papers",
dataset={
"path": "ccdv/arxiv-classification",
"revision": "f9bd92144ed76200d6eb3ce73a8bd4eba9ffdc85",
"trust_remote_code": True,
"path": "mteb/ArxivClassification",
"revision": "5e80893bf045abefbf8cbe5d713bddc91ae158d5",
},
reference="https://ieeexplore.ieee.org/document/8675939",
type="Classification",
Expand Down
10 changes: 2 additions & 8 deletions mteb/tasks/Classification/eng/PatentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,8 @@ class PatentClassification(AbsTaskClassification):
name="PatentClassification",
description="Classification Dataset of Patents and Abstract",
dataset={
"path": "ccdv/patent-classification",
"revision": "2f38a1dfdecfacee0184d74eaeafd3c0fb49d2a6",
"trust_remote_code": True,
"path": "mteb/PatentClassification",
"revision": "6bd77eb030ab3bfbf1e6f7a2b069979daf167311",
},
reference="https://aclanthology.org/P19-1212.pdf",
type="Classification",
Expand Down Expand Up @@ -46,8 +45,3 @@ class PatentClassification(AbsTaskClassification):
abstract = "Most existing text summarization datasets are compiled from the news domain, where summaries have a flattened discourse structure. In such datasets, summary-worthy content often appears in the beginning of input articles. Moreover, large segments from input articles are present verbatim in their respective summaries. These issues impede the learning and evaluation of systems that can understand an article{'}s global content structure as well as produce abstractive summaries with high compression ratio. In this work, we present a novel dataset, BIGPATENT, consisting of 1.3 million records of U.S. patent documents along with human written abstractive summaries. Compared to existing summarization datasets, BIGPATENT has the following properties: i) summaries contain a richer discourse structure with more recurring entities, ii) salient content is evenly distributed in the input, and iii) lesser and shorter extractive fragments are present in the summaries. Finally, we train and evaluate baselines and popular learning models on BIGPATENT to shed light on new challenges and motivate future directions for summarization research.",
}""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with the output of ``self.stratified_subsampling``.

    The helper is called on the current dataset with ``self.seed`` as the
    seed and is restricted to the ``test`` split.
    """
    target_splits = ["test"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,8 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
description="Filipino Twitter dataset for sentiment classification.",
reference="https://pcj.csp.org.ph/index.php/pcj/issue/download/29/PCJ%20V14%20N1%20pp1-14%202019",
dataset={
"path": "jcblaise/hatespeech_filipino",
"revision": "b01711587b073e55569de75ef04d7da4592a3618",
"trust_remote_code": True,
"path": "mteb/FilipinoHateSpeechClassification",
"revision": "087a17c0b7f9a78901c88aea00ad2892a319fdac",
},
type="Classification",
category="s2s",
Expand All @@ -41,8 +40,3 @@ class FilipinoHateSpeechClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with the output of ``self.stratified_subsampling``.

    The helper is called on the current dataset with ``self.seed`` as the
    seed and is restricted to the ``validation`` and ``test`` splits.
    """
    target_splits = ["validation", "test"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
11 changes: 2 additions & 9 deletions mteb/tasks/Classification/mya/MyanmarNews.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,8 @@ class MyanmarNews(AbsTaskClassification):
metadata = TaskMetadata(
name="MyanmarNews",
dataset={
"path": "ayehninnkhine/myanmar_news",
"revision": "b899ec06227db3679b0fe3c4188a6b48cc0b65eb",
"trust_remote_code": True,
"path": "mteb/MyanmarNews",
"revision": "644419f24bc820bbf8af24e0b4714a069812e0a3",
},
description="The Myanmar News dataset on Hugging Face contains news articles in Burmese. It is designed for tasks such as text classification, sentiment analysis, and language modeling. The dataset includes a variety of news topics in 4 categorie, providing a rich resource for natural language processing applications involving Burmese which is a low resource language.",
reference="https://huggingface.co/datasets/myanmar_news",
Expand All @@ -37,9 +36,3 @@ class MyanmarNews(AbsTaskClassification):
pages = {401--408}
}""",
)

def dataset_transform(self):
    """Rename the ``category`` column to ``label``, then subsample ``train``.

    The renamed dataset is stored on ``self.dataset`` before it is handed to
    ``self.stratified_subsampling`` (seeded with ``self.seed``), whose result
    becomes the new ``self.dataset``.
    """
    column_map = {"category": "label"}
    self.dataset = self.dataset.rename_columns(column_map)

    target_splits = ["train"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class DutchBookReviewSentimentClassification(AbsTaskClassification):
description="A Dutch book review for sentiment classification.",
reference="https://github.com/benjaminvdb/DBRD",
dataset={
"path": "benjaminvdb/dbrd",
"revision": "3f756ab4572e071eb53e887ab629f19fa747d39e",
"trust_remote_code": True,
"path": "mteb/DutchBookReviewSentimentClassification",
"revision": "1c2815ad38cf4794eb8d678fb08f569ea79392f6",
},
type="Classification",
category="s2s",
Expand Down
10 changes: 2 additions & 8 deletions mteb/tasks/Classification/swe/SwedishSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class SwedishSentimentClassification(AbsTaskClassification):
description="Dataset of Swedish reviews scarped from various public available websites",
reference="https://huggingface.co/datasets/swedish_reviews",
dataset={
"path": "timpal0l/swedish_reviews",
"revision": "105ba6b3cb99b9fd64880215be469d60ebf44a1b",
"trust_remote_code": True,
"path": "mteb/SwedishSentimentClassification",
"revision": "39e35f55d58338ebd602f8d83b52cfe027f5146a",
},
type="Classification",
category="s2s",
Expand All @@ -29,8 +28,3 @@ class SwedishSentimentClassification(AbsTaskClassification):
sample_creation="found",
bibtex_citation="",
)

def dataset_transform(self):
    """Replace ``self.dataset`` with the output of ``self.stratified_subsampling``.

    The helper is called on the current dataset with ``self.seed`` as the
    seed and is restricted to the ``validation`` and ``test`` splits.
    """
    target_splits = ["validation", "test"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
16 changes: 2 additions & 14 deletions mteb/tasks/Classification/tha/WisesightSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class WisesightSentimentClassification(AbsTaskClassification):
description="Wisesight Sentiment Corpus: Social media messages in Thai language with sentiment label (positive, neutral, negative, question)",
reference="https://github.com/PyThaiNLP/wisesight-sentiment",
dataset={
"path": "pythainlp/wisesight_sentiment",
"revision": "14aa5773afa135ba835cc5179bbc4a63657a42ae",
"trust_remote_code": True,
"path": "mteb/WisesightSentimentClassification",
"revision": "727ea9bd253f9eedf16aebec6ac3f07791fb3db2",
},
type="Classification",
category="s2s",
Expand Down Expand Up @@ -43,14 +42,3 @@ class WisesightSentimentClassification(AbsTaskClassification):

""",
)

def dataset_transform(self):
    """Rename columns in every split, then subsample the ``test`` split.

    For each key of ``self.dataset`` the ``texts`` column is renamed to
    ``text`` and ``category`` to ``label``, in that order. Afterwards
    ``self.dataset`` is replaced by the result of
    ``self.stratified_subsampling`` seeded with ``self.seed``.
    """
    # Applied in order; each rename is written back before the next one runs.
    column_renames = (("texts", "text"), ("category", "label"))

    for split_name in self.dataset.keys():
        for old_name, new_name in column_renames:
            self.dataset[split_name] = self.dataset[split_name].rename_column(
                old_name, new_name
            )

    subsampled = self.stratified_subsampling(
        self.dataset,
        seed=self.seed,
        splits=["test"],
    )
    self.dataset = subsampled
13 changes: 2 additions & 11 deletions mteb/tasks/Classification/urd/UrduRomanSentimentClassification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
description="The Roman Urdu dataset is a data corpus comprising of more than 20000 records tagged for sentiment (Positive, Negative, Neutral)",
reference="https://archive.ics.uci.edu/dataset/458/roman+urdu+data+set",
dataset={
"path": "community-datasets/roman_urdu",
"revision": "566be6449bb30b9b9f2b59173391647fe0ca3224",
"trust_remote_code": True,
"path": "mteb/UrduRomanSentimentClassification",
"revision": "905c1121c002c4b9adc4ebc5faaf4d6f50d1b1ee",
},
type="Classification",
category="s2s",
Expand All @@ -37,11 +36,3 @@ class UrduRomanSentimentClassification(AbsTaskClassification):
}
""",
)

def dataset_transform(self):
    """Rename ``sentence``/``sentiment`` to ``text``/``label``, then subsample.

    The renamed dataset is stored on ``self.dataset`` before it is handed to
    ``self.stratified_subsampling`` (seeded with ``self.seed``, ``train``
    split only), whose result becomes the new ``self.dataset``.
    """
    column_map = {"sentence": "text", "sentiment": "label"}
    self.dataset = self.dataset.rename_columns(column_map)

    target_splits = ["train"]
    subsampled = self.stratified_subsampling(
        self.dataset, seed=self.seed, splits=target_splits
    )
    self.dataset = subsampled
53 changes: 3 additions & 50 deletions mteb/tasks/Retrieval/multilingual/IndicQARetrieval.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,5 @@
from __future__ import annotations

from hashlib import sha256

import datasets

from mteb.abstasks.MultilingualTask import MultilingualTask
from mteb.abstasks.TaskMetadata import TaskMetadata

Expand All @@ -24,13 +20,12 @@
}


class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
class IndicQARetrieval(AbsTaskRetrieval, MultilingualTask):
metadata = TaskMetadata(
name="IndicQARetrieval",
dataset={
"path": "ai4bharat/IndicQA",
"revision": "570d90ae4f7b64fe4fdd5f42fc9f9279b8c9fd9d",
"trust_remote_code": True,
"path": "mteb/IndicQARetrieval",
"revision": "51e8b328988795d658f6f34acd34044e9346e2ee",
},
description="IndicQA is a manually curated cloze-style reading comprehension dataset that can be used for evaluating question-answering models in 11 Indic languages. It is repurposed retrieving relevant context for each question.",
reference="https://arxiv.org/abs/2212.05409",
Expand All @@ -55,45 +50,3 @@ class IndicQARetrieval(MultilingualTask, AbsTaskRetrieval):
doi = {10.18653/v1/2023.acl-long.693}
}""",
)

def load_data(self, **kwargs):
    """Build per-language queries, corpus and relevance judgements.

    Returns immediately when ``self.data_loaded`` is already truthy.
    Otherwise, for every language in ``self.hf_subsets``, the ``test`` split
    of the ``indicqa.<lang>`` configuration is loaded using the arguments in
    ``self.metadata_dict["dataset"]`` and filtered on its ``answers`` field.
    Questions and contexts are keyed by the SHA-256 hex digest of their
    UTF-8 text, and each question is marked relevant (score ``1``) to the
    context of the row it appears in.

    The results are stored on ``self.queries``, ``self.corpus`` and
    ``self.relevant_docs``; ``self.data_loaded`` is then set to ``True``.
    ``**kwargs`` is accepted but not used in this method.
    """
    if self.data_loaded:
        return

    def _digest(text):
        # Identifier derived from the text itself, so equal texts share an id.
        return sha256(text.encode("utf-8")).hexdigest()

    split = "test"
    queries_by_lang = {lang: {split: {}} for lang in self.hf_subsets}
    corpus_by_lang = {lang: {split: {}} for lang in self.hf_subsets}
    qrels_by_lang = {lang: {split: {}} for lang in self.hf_subsets}

    for lang in self.hf_subsets:
        loaded = datasets.load_dataset(
            name=f"indicqa.{lang}", **self.metadata_dict["dataset"]
        )
        rows = loaded[split].filter(
            lambda example: example["answers"]["text"] != ""
        )

        # Hash each distinct question/context once, up front.
        question_ids = {text: _digest(text) for text in set(rows["question"])}
        context_ids = {text: _digest(text) for text in set(rows["context"])}

        lang_queries = queries_by_lang[lang][split]
        lang_corpus = corpus_by_lang[lang][split]
        lang_qrels = qrels_by_lang[lang][split]

        for row in rows:
            question, context = row["question"], row["context"]

            query_id = question_ids[question]
            lang_queries[query_id] = question

            doc_id = context_ids[context]
            lang_corpus[doc_id] = {"text": context}

            # One question may be paired with several contexts.
            lang_qrels.setdefault(query_id, {})[doc_id] = 1

    self.corpus = datasets.DatasetDict(corpus_by_lang)
    self.queries = datasets.DatasetDict(queries_by_lang)
    self.relevant_docs = datasets.DatasetDict(qrels_by_lang)

    self.data_loaded = True
9 changes: 2 additions & 7 deletions mteb/tasks/STS/jpn/JSTS.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,8 @@ class JSTS(AbsTaskSTS):
metadata = TaskMetadata(
name="JSTS",
dataset={
"path": "shunk031/JGLUE",
"revision": "50e79c314a7603ebc92236b66a0973d51a00ed8c",
"name": "JSTS",
"trust_remote_code": True,
"path": "mteb/JSTS",
"revision": "5bac629e25799df4c9c80a6a5db983d6cba9e77d",
},
description="Japanese Semantic Textual Similarity Benchmark dataset construct from YJ Image Captions Dataset "
+ "(Miyazaki and Shimizu, 2016) and annotated by crowdsource annotators.",
Expand Down Expand Up @@ -65,6 +63,3 @@ def metadata_dict(self) -> dict[str, str]:
metadata_dict["min_score"] = 0
metadata_dict["max_score"] = 5
return metadata_dict

def dataset_transform(self) -> None:
    """Rename the ``label`` column of ``self.dataset`` to ``score``."""
    renamed = self.dataset.rename_column("label", "score")
    self.dataset = renamed
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ classifiers = [
]
requires-python = ">=3.9"
dependencies = [
"datasets>=2.19.0,<3.0.0",
"datasets>=2.19.0",
"numpy>=1.0.0,<3.0.0",
"requests>=2.26.0",
"scikit_learn>=1.0.2",
Expand Down