# FaMTEB (beta): the Persian (Farsi) benchmark. Tasks are requested from
# get_tasks with languages=["fas"] and are grouped below by task type.
#
# The spellings "HamshahriClustring" and "SIDClustring" are intentional here:
# they match the `name` declared in those tasks' TaskMetadata.
#
# NOTE(review): "SynPerChatbotConvSAClassification" is listed below, but the
# FaMTEBClassification module added together with this benchmark only declares
# the per-emotion SynPerChatbotConvSA* tasks (Anger, Satisfaction, Friendship,
# Fear, Jealousy, Surprise, Love, Sadness, Happiness) plus the two Tone tasks.
# Confirm that a task with this exact name is registered elsewhere; otherwise
# get_tasks presumably cannot resolve it.
FA_MTEB = Benchmark(
    name="FaMTEB(fas, beta)",
    tasks=get_tasks(
        languages=["fas"],
        tasks=[
            # Classification
            "PersianFoodSentimentClassification",
            "SynPerChatbotConvSAClassification",
            "SynPerChatbotConvSAToneChatbotClassification",
            "SynPerChatbotConvSAToneUserClassification",
            "SynPerChatbotSatisfactionLevelClassification",
            "SynPerChatbotRAGToneChatbotClassification",
            "SynPerChatbotRAGToneUserClassification",
            "SynPerChatbotToneChatbotClassification",
            "SynPerChatbotToneUserClassification",
            "PersianTextTone",
            "SIDClassification",
            "DeepSentiPers",
            "PersianTextEmotion",
            "SentimentDKSF",
            "NLPTwitterAnalysisClassification",
            "DigikalamagClassification",
            "MassiveIntentClassification",
            "MassiveScenarioClassification",
            # Clustering
            "BeytooteClustering",
            "DigikalamagClustering",
            "HamshahriClustring",
            "NLPTwitterAnalysisClustering",
            "SIDClustring",
            # PairClassification
            "FarsTail",
            "CExaPPC",
            "SynPerChatbotRAGFAQPC",
            "FarsiParaphraseDetection",
            "SynPerTextKeywordsPC",
            "SynPerQAPC",
            "ParsinluEntail",
            "ParsinluQueryParaphPC",
            # Reranking
            "MIRACLReranking",
            "WikipediaRerankingMultilingual",
            # Retrieval
            "SynPerQARetrieval",
            "SynPerChatbotTopicsRetrieval",
            "SynPerChatbotRAGTopicsRetrieval",
            "SynPerChatbotRAGFAQRetrieval",
            "PersianWebDocumentRetrieval",
            "WikipediaRetrievalMultilingual",
            "MIRACLRetrieval",
            "ClimateFEVER-Fa",
            "DBPedia-Fa",
            "HotpotQA-Fa",
            "MSMARCO-Fa",
            "NQ-Fa",
            "ArguAna-Fa",
            "CQADupstackRetrieval-Fa",
            "FiQA2018-Fa",
            "NFCorpus-Fa",
            "QuoraRetrieval-Fa",
            "SCIDOCS-Fa",
            "SciFact-Fa",
            "TRECCOVID-Fa",
            "Touche2020-Fa",
            # STS
            "Farsick",
            "SynPerSTS",
            "Query2Query",
            # SummaryRetrieval
            "SAMSumFa",
            "SynPerChatbotSumSRetrieval",
            "SynPerChatbotRAGSumSRetrieval",
        ],
    ),
    description="Main Persian (Farsi) benchmarks from MTEB",
    reference=None,
    citation=None,
    contacts=["mehran-sarmadi", "ERfun", "morteza20"],
)
class SynPerChatbotConvSASatisfaction(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot "satisfaction" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SynPerChatbotConvSASatisfaction",
        description="Synthetic Persian Chatbot Conversational Sentiment Analysis Satisfaction",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-satisfaction",
            "revision": "50fd9d5d09edd53af89af765636be5db6f983f0e",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class SynPerChatbotConvSAFear(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot "fear" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SynPerChatbotConvSAFear",
        description="Synthetic Persian Chatbot Conversational Sentiment Analysis Fear",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-fear",
            "revision": "3c22f7e6bf4e366c86d69293c9164bf9e9d80aac",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class SynPerChatbotConvSALove(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot "love" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SynPerChatbotConvSALove",
        description="Synthetic Persian Chatbot Conversational Sentiment Analysis Love",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-love",
            "revision": "0e000b2f73e9bb74ec8fc6da10011c52725b8469",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class SynPerChatbotConvSAHappiness(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot "happiness" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SynPerChatbotConvSAHappiness",
        description="Synthetic Persian Chatbot Conversational Sentiment Analysis Happiness",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/synthetic-persian-chatbot-conversational-sentiment-analysis-happiness",
            "revision": "e60893b7a8d01c9b8c12fadfe8f0a06e9d548a63",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class SynPerChatbotConvSAToneUserClassification(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot "tone user" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    #
    # NOTE(review): unlike the sibling SynPerChatbot* tasks, the dataset path
    # below has no "synthetic-persian-" prefix and the description does not end
    # in "Classification". Confirm the path is the intended repository name.
    metadata = TaskMetadata(
        name="SynPerChatbotConvSAToneUserClassification",
        description="Synthetic Persian Chatbot Conversational Sentiment Analysis Tone User",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/chatbot-conversational-sentiment-analysis-tone-user-classification",
            "revision": "dd0f76661bef69819cc38c8a455b10af86ac3571",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class SynPerChatbotRAGToneUserClassification(AbsTaskClassification):
    """Persian classification task on the synthetic chatbot RAG "tone user" dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SynPerChatbotRAGToneUserClassification",
        description="Synthetic Persian Chatbot RAG Tone User Classification",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/synthetic-persian-chatbot-rag-tone-user-classification",
            "revision": "f1f6ad83bb135dc94fbf1ca05c3ba164f5619369",
        },
        type="Classification",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Spoken"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class PersianTextTone(AbsTaskClassification):
    """Persian classification task on the ``MCINext/persian-text-tone`` dataset.

    The class only declares metadata and a sampling constant; it overrides no
    methods of ``AbsTaskClassification``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. No domain is declared (domains=[]).
    metadata = TaskMetadata(
        name="PersianTextTone",
        description="Persian Text Tone",
        reference="https://mcinext.com/",
        dataset={
            "path": "MCINext/persian-text-tone",
            "revision": "7144f4c6bdd77911df0dfc5a8bd44dba17e27e3a",
        },
        type="Classification",
        category="s2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=[],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="LM-generated",
        dialect=[],
        sample_creation="LM-generated and verified",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32
class DeepSentiPers(AbsTaskClassification):
    """Persian sentiment classification task on ``PartAI/DeepSentiPers``.

    Besides the metadata, the only customisation is renaming the dataset's
    ``review`` column to ``text``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="DeepSentiPers",
        description="Persian Sentiment Analysis Dataset",
        reference="https://github.com/JoyeBright/DeepSentiPers",
        dataset={
            "path": "PartAI/DeepSentiPers",
            "revision": "ee4f09f404051761cfe14d68127532c82de41cb3",
        },
        type="Classification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Reviews"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32

    def dataset_transform(self):
        """Rename the ``review`` column of the loaded dataset to ``text``."""
        # presumably "text" is the column name AbsTaskClassification reads —
        # confirm against the base class.
        renamed = self.dataset.rename_column("review", "text")
        self.dataset = renamed
class NLPTwitterAnalysisClassification(AbsTaskClassification):
    """Persian tweet classification task on ``hamedhf/nlp_twitter_analysis``.

    Besides the metadata, the only customisation is renaming the dataset's
    ``tweet`` column to ``text``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with accuracy as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="NLPTwitterAnalysisClassification",
        description="Twitter Analysis Classification",
        reference="https://huggingface.co/datasets/hamedhf/nlp_twitter_analysis/tree/main",
        dataset={
            "path": "hamedhf/nlp_twitter_analysis",
            "revision": "4ceb1312583fd2c7c73ad2d550b726124dcd39a0",
        },
        type="Classification",
        category="s2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="accuracy",
        date=("2024-09-01", "2024-12-31"),
        domains=["Social"],
        task_subtypes=["Sentiment/Hate speech"],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )
    # NOTE(review): presumably the number of training examples drawn per label
    # by AbsTaskClassification — confirm against the base class.
    samples_per_label = 32

    def dataset_transform(self):
        """Rename the ``tweet`` column of the loaded dataset to ``text``."""
        # presumably "text" is the column name AbsTaskClassification reads —
        # confirm against the base class.
        renamed = self.dataset.rename_column("tweet", "text")
        self.dataset = renamed
8,515 articles scraped from Digikala Online Magazine. This dataset includes seven different classes.", + reference="https://hooshvare.github.io/docs/datasets/tc", + dataset={ + "path": "PNLPhub/DigiMag", + "revision": "969b335c9f50eda5c384460be4eb2b55505c2c53", + "trust_remote_code": True, + }, + type="Classification", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="accuracy", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + samples_per_label = 32 + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("content", "text") diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index de27839290..65d8b01246 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -20,6 +20,7 @@ from .eng.WikiCitiesClustering import * from .eng.WikipediaChemistrySpecialtiesClustering import * from .eng.WikipediaChemistryTopicsClustering import * +from .fas.FaMTEBClustering import * from .fra.AlloProfClusteringP2P import * from .fra.AlloProfClusteringS2S import * from .fra.HALClusteringS2S import * diff --git a/mteb/tasks/Clustering/fas/FaMTEBClustering.py b/mteb/tasks/Clustering/fas/FaMTEBClustering.py new file mode 100644 index 0000000000..da0b8b53f3 --- /dev/null +++ b/mteb/tasks/Clustering/fas/FaMTEBClustering.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import numpy as np +from datasets import Dataset, DatasetDict + +from mteb.abstasks.AbsTaskClusteringFast import ( + AbsTaskClusteringFast, + check_label_distribution, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class BeytooteClustering(AbsTaskClusteringFast): + metadata = TaskMetadata( + name="BeytooteClustering", + description="Beytoote Website Articles Clustering", + reference="https://mcinext.com/", + 
class HamshahriClustring(AbsTaskClusteringFast):
    """Clustering task over the ``hamshahri`` split of ``community-datasets/farsi_news``.

    The class name keeps its historical spelling ("Clustring") because it is
    also the registered task name in ``metadata``.
    """

    # Evaluated on a "test" split that dataset_transform builds from the
    # source "hamshahri" split; v_measure is the main score.
    metadata = TaskMetadata(
        name="HamshahriClustring",
        description="These datasets have been extracted from the RSS feed of two Farsi news agency websites.",
        reference="https://github.com/mallahyari/Farsi-datasets",
        dataset={
            "path": "community-datasets/farsi_news",
            "revision": "ca93dc707cea06cdb2e3ede3b547a1092053aca6",
        },
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="v_measure",
        date=("2024-09-01", "2024-12-31"),
        domains=["News"],
        task_subtypes=[],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )

    def dataset_transform(self):
        """Turn the raw ``hamshahri`` split into the ``test`` clustering split.

        Steps:
          1. Build ``sentences`` from title and summary, and ``labels`` from
             the first entry of ``tags``.
          2. Expose the result as the only split, named ``test``.
          3. Drop every example whose label occurs exactly once.
          4. Apply ``stratified_subsampling`` on ``labels``.
        """
        # Only the "hamshahri" split is used, so derive the two columns for
        # that split alone (in one pass) rather than mapping over every split
        # of the source dataset and then discarding the others.
        hamshahri = self.dataset["hamshahri"].map(
            lambda x: {
                "sentences": f"{x['title']}\n: {x['summary']}",
                "labels": x["tags"][0],
            }
        )
        self.dataset = DatasetDict({"test": hamshahri})

        ds = {}
        for split in self.metadata.eval_splits:
            labels = self.dataset[split]["labels"]
            sentences = self.dataset[split]["sentences"]

            check_label_distribution(self.dataset[split])

            # Remove sentences and labels with only 1 label example.
            unique_labels, counts = np.unique(labels, return_counts=True)
            solo_labels = unique_labels[counts == 1]
            is_solo = np.isin(labels, solo_labels)
            # Rebuild the split with just the two columns the task needs.
            split_ds = Dataset.from_dict({"labels": labels, "sentences": sentences})
            if is_solo.any():
                # Keep the row indices whose label is NOT a singleton.
                split_ds = split_ds.select(np.nonzero(~is_solo)[0])
            ds[split] = split_ds
        self.dataset = DatasetDict(ds)

        self.dataset = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["test"],
            label="labels",
        )
class SIDClustring(AbsTaskClusteringFast):
    """Clustering task on the ``MCINext/sid-clustering`` dataset.

    The class name keeps its historical spelling ("Clustring") because it is
    also the registered task name in ``metadata``.
    """

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with v_measure as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="SIDClustring",
        # Fixed a missing space ("summariesfrom") in the human-readable text.
        description="Clustering of summaries from SIDClustring across categories.",
        reference="https://www.sid.com/",
        dataset={
            "path": "MCINext/sid-clustering",
            "revision": "d361bb18535d592e845aeaaa8ac47a239aa2f87a",
        },
        type="Clustering",
        category="p2p",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="v_measure",
        date=("2024-09-01", "2024-12-31"),
        domains=["Academic"],
        task_subtypes=[],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )

    def dataset_transform(self):
        """Replace the dataset with a stratified subsample of its ``test`` split.

        Subsampling is stratified on the ``labels`` column and seeded with
        ``self.seed``.
        """
        self.dataset = self.stratified_subsampling(
            self.dataset,
            seed=self.seed,
            splits=["test"],
            label="labels",
        )
class CExaPPC(AbsTaskPairClassification):
    """Persian paraphrase pair-classification task on ``PNLPhub/C-ExaPPC``."""

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with max_ap as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="CExaPPC",
        description="ExaPPC is a large paraphrase corpus consisting of monolingual sentence-level paraphrases using different sources.",
        reference="https://github.com/exaco/exappc",
        dataset={
            "path": "PNLPhub/C-ExaPPC",
            "revision": "68a0ff474739367a36c8066ee04802a65aefc117",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="max_ap",
        date=("2024-09-01", "2024-12-31"),
        domains=["Social", "Web"],
        task_subtypes=[],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )

    def dataset_transform(self):
        """Binarise the labels and repack each eval split.

        A ``label`` equal to ``"paraphrase"`` becomes 1, anything else 0.
        Each eval split is then stored as a one-element list holding a dict
        of whole columns (``sentence1``, ``sentence2``, ``labels``).
        """
        binarised = self.dataset.map(
            lambda row: {"label": int(row["label"] == "paraphrase")}
        )

        repacked = {}
        for split_name in self.metadata.eval_splits:
            split_data = binarised[split_name]
            columns = {
                "sentence1": split_data["sentence1"],
                "sentence2": split_data["sentence2"],
                "labels": split_data["label"],
            }
            repacked[split_name] = [columns]
        self.dataset = repacked
class FarsiParaphraseDetection(AbsTaskPairClassification):
    """Persian paraphrase pair-classification task on ``alighasemi/farsi_paraphrase_detection``."""

    # Evaluated on the "test" split, Persian in Arabic script ("fas-Arab"),
    # with max_ap as the main score. The dataset is pinned to a revision.
    metadata = TaskMetadata(
        name="FarsiParaphraseDetection",
        description="Farsi Paraphrase Detection",
        reference="https://huggingface.co/datasets/alighasemi/farsi_paraphrase_detection",
        dataset={
            "path": "alighasemi/farsi_paraphrase_detection",
            "revision": "c8129741af418d9ae43cfc1fc4f285704e26035f",
        },
        type="PairClassification",
        category="s2s",
        modalities=["text"],
        eval_splits=["test"],
        eval_langs=["fas-Arab"],
        main_score="max_ap",
        date=("2024-09-01", "2024-12-31"),
        domains=[],
        task_subtypes=[],
        license="not specified",
        annotations_creators="derived",
        dialect=[],
        sample_creation="found",
        bibtex_citation=""" """,
    )

    def dataset_transform(self):
        """Repack each eval split into the single-record pair format.

        Each eval split is stored as a one-element list holding a dict of
        whole columns: ``sentence1``, ``sentence2`` and ``labels`` (taken
        from the source ``label`` column).
        """
        source = self.dataset
        self.dataset = {
            split_name: [
                {
                    "sentence1": source[split_name]["sentence1"],
                    "sentence2": source[split_name]["sentence2"],
                    "labels": source[split_name]["label"],
                }
            ]
            for split_name in self.metadata.eval_splits
        }
+ "revision": "ea9a840cb163b415cc70b2f7adf2554feae159dc", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class SynPerQAPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SynPerQAPC", + description="Synthetic Persian QA Pair Classification", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-qa-pair-classification", + "revision": "d1b62ef31bebbb48ae01867993a1e583c2ce7d93", + }, + type="PairClassification", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"][0], + "sentence2": self.dataset[split]["sent2"][0], + "labels": self.dataset[split]["labels"][0], + } + ] + self.dataset = _dataset + + +class ParsinluEntail(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluEntail", + description="A Persian textual entailment task (deciding sent1 entails sent2). 
The questions are partially translated from the SNLI dataset and partially generated by expert annotators.", + reference="https://github.com/persiannlp/parsinlu", + dataset={ + "path": "persiannlp/parsinlu_entailment", + "revision": "c49b2d8fa0d6476520695c52207690b7ec854043", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.filter(lambda x: x["label"] != "n") + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "e" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["sent1"], + "sentence2": self.dataset[split]["sent2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset + + +class ParsinluQueryParaphPC(AbsTaskPairClassification): + metadata = TaskMetadata( + name="ParsinluQueryParaphPC", + description="A Persian query paraphrasing task (deciding whether two questions are paraphrases of each other). 
The questions are partially generated from Google auto-complete, and partially translated from the Quora paraphrasing dataset.", + reference="https://huggingface.co/datasets/persiannlp/parsinlu_query_paraphrasing", + dataset={ + "path": "persiannlp/parsinlu_query_paraphrasing", + "revision": "ec675bb3ac50c1a52317c101fe1d724b4601f47a", + "trust_remote_code": True, + }, + type="PairClassification", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="max_ap", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + _dataset = {} + self.dataset = self.dataset.map( + lambda example: {"label": 1 if example["label"] == "1" else 0} + ) + for split in self.metadata.eval_splits: + _dataset[split] = [ + { + "sentence1": self.dataset[split]["q1"], + "sentence2": self.dataset[split]["q2"], + "labels": self.dataset[split]["label"], + } + ] + self.dataset = _dataset diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 6c146379ea..291dd983c3 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -103,6 +103,8 @@ from .eng.TRECCOVIDRetrieval import * from .eng.WinoGrandeRetrieval import * from .est.estqa import * +from .fas.BEIRFa import * +from .fas.FaMTEBRetrieval import * from .fra.AlloprofRetrieval import * from .fra.BSARDRetrieval import * from .fra.FQuADRetrieval import * diff --git a/mteb/tasks/Retrieval/fas/BEIRFa.py b/mteb/tasks/Retrieval/fas/BEIRFa.py new file mode 100644 index 0000000000..0952eefff9 --- /dev/null +++ b/mteb/tasks/Retrieval/fas/BEIRFa.py @@ -0,0 +1,662 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class ArguAnaFa(AbsTaskRetrieval): + 
ignore_identical_ids = True + metadata = TaskMetadata( + name="ArguAna-Fa", + description="ArguAna-Fa", + reference="https://huggingface.co/datasets/MCINext/arguana-fa", + dataset={ + "path": "MCINext/arguana-fa", + "revision": "fa97814be356fe4d18caadb457b4469bd34019ca", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Blog"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class ClimateFEVERFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ClimateFEVER-Fa", + description="ClimateFEVER-Fa", + reference="https://huggingface.co/datasets/MCINext/climate-fever-fa", + dataset={ + "path": "MCINext/climate-fever-fa", + "revision": "45d9176b6fcba33abc58494ee82f18ee7e8ddbae", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackAndroidRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackAndroidRetrieval-Fa", + description="CQADupstackAndroidRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-android-fa", + dataset={ + "path": "MCINext/cqadupstack-android-fa", + "revision": "bcdaf4e30477eea9b9b17ecbb153ca403e5c3ebd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + 
dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackEnglishRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackEnglishRetrieval-Fa", + description="CQADupstackEnglishRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-english-fa", + dataset={ + "path": "MCINext/cqadupstack-english-fa", + "revision": "029a2e69e7d9e68b6bdc471073606104417a5be7", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackGamingRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGamingRetrieval-Fa", + description="CQADupstackGamingRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gaming-fa", + dataset={ + "path": "MCINext/cqadupstack-gaming-fa", + "revision": "e9c7ad03f29a55ab14eae730146961b8cdc14227", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackGisRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGisRetrieval-Fa", + description="CQADupstackGisRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-gis-fa", + dataset={ + "path": "MCINext/cqadupstack-gis-fa", + "revision": "e907c4144dc27bc8a035d78d69e15f39c56623a9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackMathematicaRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematicaRetrieval-Fa", + description="CQADupstackMathematicaRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-mathematica-fa", + dataset={ + "path": "MCINext/cqadupstack-mathematica-fa", + "revision": "b92e24fab42ab599535a19ee744de5485ec92f64", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackPhysicsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysicsRetrieval-Fa", + description="CQADupstackPhysicsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-physics-fa", + dataset={ + "path": "MCINext/cqadupstack-physics-fa", + "revision": "816ad7473d6813f77a0ca5e72b1ff7a52752d370", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackProgrammersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackProgrammersRetrieval-Fa", + description="CQADupstackProgrammersRetrieval-Fa", + 
reference="https://huggingface.co/datasets/MCINext/cqadupstack-programmers-fa", + dataset={ + "path": "MCINext/cqadupstack-programmers-fa", + "revision": "be6460df57ab7c1b2c9fe295a31660dbd077ecf0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackStatsRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStatsRetrieval-Fa", + description="CQADupstackStatsRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-stats-fa", + dataset={ + "path": "MCINext/cqadupstack-stats-fa", + "revision": "c6e2c8b6153958118ec04352dd82a30ea2e2cad5", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackTexRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTexRetrieval-Fa", + description="CQADupstackTexRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-tex-fa", + dataset={ + "path": "MCINext/cqadupstack-tex-fa", + "revision": "860d152c86fda27229270b6bf4e832ff374ac01b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + 
bibtex_citation=""" """, + ) + + +class CQADupstackUnixRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnixRetrieval-Fa", + description="CQADupstackUnixRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-unix-fa", + dataset={ + "path": "MCINext/cqadupstack-unix-fa", + "revision": "c2a326387954aad66ff00d324a9278067b1e3bb6", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWebmastersRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmastersRetrieval-Fa", + description="CQADupstackWebmastersRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-webmasters-fa", + dataset={ + "path": "MCINext/cqadupstack-webmasters-fa", + "revision": "506f29f8ce59648efe99afee736b0b158eced516", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class CQADupstackWordpressRetrievalFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpressRetrieval-Fa", + description="CQADupstackWordpressRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/cqadupstack-wordpress-fa", + dataset={ + "path": "MCINext/cqadupstack-wordpress-fa", + "revision": "7f755e88647b4023df52da04d4e3d31a7de5fcb0", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class DBPediaFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-Fa", + description="DBPedia-Fa", + reference="https://huggingface.co/datasets/MCINext/dbpedia-fa", + dataset={ + "path": "MCINext/dbpedia-fa", + "revision": "13529e6e301e9d72f86def882cfbca04791d83f9", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class FiQA2018Fa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="FiQA2018-Fa", + description="FiQA2018-Fa", + reference="https://huggingface.co/datasets/MCINext/fiqa-fa", + dataset={ + "path": "MCINext/fiqa-fa", + "revision": "e683ce7ecd0b47edc3e29fda7cfd75335be4a997", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class HotpotQAFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HotpotQA-Fa", + description="HotpotQA-Fa", + reference="https://huggingface.co/datasets/MCINext/hotpotqa-fa", + dataset={ + "path": "MCINext/hotpotqa-fa", + "revision": "1cafde1306aa56b5dfdce0b14633ae9ee1a63ddb", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class MSMARCOFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="MSMARCO-Fa", + description="MSMARCO-Fa", + reference="https://huggingface.co/datasets/MCINext/msmarco-fa", + dataset={ + "path": "MCINext/msmarco-fa", + "revision": "88f90b0b04f91778ba5341095b0a9f1d7799ce10", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["dev"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class NFCorpusFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-Fa", + description="NFCorpus-Fa", + reference="https://huggingface.co/datasets/MCINext/nfcorpus-fa", + dataset={ + "path": "MCINext/nfcorpus-fa", + "revision": "70aa71825a791e87210c0355a01f538aa611feae", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class NQFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NQ-Fa", + description="NQ-Fa", + reference="https://huggingface.co/datasets/MCINext/nq-fa", + dataset={ + "path": "MCINext/nq-fa", + "revision": "d4ea898b644c8d5f608b60947cb637bebbf1ac66", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], 
+ main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Encyclopaedic"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class QuoraRetrievalFa(AbsTaskRetrieval): + ignore_identical_ids = True + + metadata = TaskMetadata( + name="QuoraRetrieval-Fa", + description="QuoraRetrieval-Fa", + reference="https://huggingface.co/datasets/MCINext/quora-fa", + dataset={ + "path": "MCINext/quora-fa", + "revision": "1a43f4f5dcd71e6b14b275ae82c3237cdd4fd5fd", + }, + type="Retrieval", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class SCIDOCSFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-Fa", + description="SCIDOCS-Fa", + reference="https://huggingface.co/datasets/MCINext/scidocs-fa", + dataset={ + "path": "MCINext/scidocs-fa", + "revision": "6611ebf4b4c1aaf8b021e4728440db2188291b8b", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class SciFactFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-Fa", + description="SciFact-Fa", + reference="https://huggingface.co/datasets/MCINext/scifact-fa", + dataset={ + "path": "MCINext/scifact-fa", + "revision": "7723397096199c4d6f367b445fccaf282c518abe", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + 
eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Academic"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class TRECCOVIDFa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-Fa", + description="TRECCOVID-Fa", + reference="https://huggingface.co/datasets/MCINext/trec-covid-fa", + dataset={ + "path": "MCINext/trec-covid-fa", + "revision": "98e6c2d33dfa166ee326e8b1bc7ea82c7e6898dd", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Medical"], + task_subtypes=["Article retrieval"], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + +class Touche2020Fa(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020-Fa", + description="Touche2020-Fa", + reference="https://huggingface.co/datasets/MCINext/touche2020-fa", + dataset={ + "path": "MCINext/touche2020-fa", + "revision": "0f464636f91641cc6ef6f6f8f249c73f4a609982", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) diff --git a/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py new file mode 100644 index 0000000000..875f7ea7db --- /dev/null +++ b/mteb/tasks/Retrieval/fas/FaMTEBRetrieval.py @@ -0,0 +1,140 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskRetrieval import AbsTaskRetrieval + + +class 
SynPerQARetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerQARetrieval", + description="Synthetic Persian QA Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-qa-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-qa-retrieval", + "revision": "e85114f13f42dc1edc456d58931cc38d44d697cf", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotTopicsRetrieval", + description="Synthetic Persian Chatbot Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-topics-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-topics-retrieval", + "revision": "086995ca4cea33f37a407c2fa5282f74913740ee", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGTopicsRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGTopicsRetrieval", + description="Synthetic Persian Chatbot RAG Topics Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + dataset={ + "path": 
"MCINext/synthetic-persian-chatbot-rag-topics-retrieval", + "revision": "da8f36a723da155738f5e3d8d84d543589bd5083", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class SynPerChatbotRAGFAQRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="SynPerChatbotRAGFAQRetrieval", + description="Synthetic Persian Chatbot RAG FAQ Retrieval", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-rag-faq-retrieval", + "revision": "9d32af6540970e2845028cbfffe6b0d0e8f52428", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=["Question answering"], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation="""""", + ) + + +class PersianWebDocumentRetrieval(AbsTaskRetrieval): + ignore_identical_ids = True + metadata = TaskMetadata( + name="PersianWebDocumentRetrieval", + description="Persian dataset designed specifically for the task of text information retrieval through the web.", + reference="https://ieeexplore.ieee.org/document/10553090", + dataset={ + "path": "MCINext/persian-web-document-retrieval", + "revision": "b3dc818368a867b30ccb55a42ff287d253512c36", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="ndcg_at_10", + date=("2024-09-01", "2024-12-31"), + domains=["Web"], + 
task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation="""""", + ) diff --git a/mteb/tasks/Retrieval/fas/__init__.py b/mteb/tasks/Retrieval/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index b61b79b293..471789f1c9 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -10,6 +10,7 @@ from .eng.STS16STS import * from .eng.STSBenchmarkSTS import * from .fao.FaroeseSTS import * +from .fas.FaMTEBSTS import * from .fin.FinParaSTS import * from .fra.SickFrSTS import * from .jpn.JSICK import * diff --git a/mteb/tasks/STS/fas/FaMTEBSTS.py b/mteb/tasks/STS/fas/FaMTEBSTS.py new file mode 100644 index 0000000000..2ce9522cd4 --- /dev/null +++ b/mteb/tasks/STS/fas/FaMTEBSTS.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +from mteb.abstasks.TaskMetadata import TaskMetadata + +from ....abstasks.AbsTaskSTS import AbsTaskSTS + + +class Farsick(AbsTaskSTS): + metadata = TaskMetadata( + name="Farsick", + description="A Persian Semantic Textual Similarity And Natural Language Inference Dataset", + reference="https://github.com/ZahraGhasemi-AI/FarSick", + dataset={ + "path": "MCINext/farsick-sts", + "revision": "f8b8d630f631c6c16b7bc3cb924bdf62a51bed06", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class SynPerSTS(AbsTaskSTS): + metadata = TaskMetadata( + name="SynPerSTS", + description="Synthetic 
Persian Semantic Textual Similarity Dataset", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/synthetic-persian-sts", + "revision": "914047db08928b5326d8b106583dc563b73d1ecf", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=["Web", "News", "Religious", "Blog"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="LM-generated and verified", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 1 + metadata_dict["max_score"] = 5 + return metadata_dict + + +class Query2Query(AbsTaskSTS): + metadata = TaskMetadata( + name="Query2Query", + description="Query to Query Datasets.", + reference="https://mcinext.com/", + dataset={ + "path": "MCINext/query-to-query-sts", + "revision": "52602079f9032fcf181775a310d79d2f197534e4", + }, + type="STS", + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="cosine_spearman", + date=("2024-09-01", "2024-12-31"), + domains=[], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 2 + return metadata_dict diff --git a/mteb/tasks/STS/fas/__init__.py b/mteb/tasks/STS/fas/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/SummaryRetrieval/__init__.py b/mteb/tasks/SummaryRetrieval/__init__.py new file mode 100644 index 0000000000..d000983be9 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/__init__.py @@ -0,0 +1,3 @@ +from __future__ import annotations + +from 
.fas.FaMTEBSummaryRetrieval import * diff --git a/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py new file mode 100644 index 0000000000..cf3f9dbe52 --- /dev/null +++ b/mteb/tasks/SummaryRetrieval/fas/FaMTEBSummaryRetrieval.py @@ -0,0 +1,97 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskBitextMining import AbsTaskBitextMining +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SAMSumFa(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SAMSumFa", + description="Translated Version of SAMSum Dataset", + reference="https://huggingface.co/datasets/MCINext/samsum-fa", + dataset={ + "path": "MCINext/samsum-fa", + "revision": "fd981d78a0ab82c20d2e693a8b3929c5d71b0743", + }, + type="SummaryRetrieval", + category="s2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + sample_creation="found", + bibtex_citation=""" """, + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns( + {"text": "sentence1", "summary": "sentence2"} + ) + + +class SynPerChatbotSumSRetrieval(AbsTaskBitextMining): + metadata = TaskMetadata( + name="SynPerChatbotSumSRetrieval", + description="Synthetic Persian Chatbot Summary Dataset", + reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-summary-retrieval", + dataset={ + "path": "MCINext/synthetic-persian-chatbot-summary-retrieval", + "revision": "9002f5e9de4ef61f1f5c34831d2a5ed855bac0ae", + }, + type="SummaryRetrieval", + category="p2p", + modalities=["text"], + eval_splits=["test"], + eval_langs=["fas-Arab"], + main_score="f1", + date=("2024-09-01", "2024-12-31"), + domains=["Spoken"], + task_subtypes=[], + license="not specified", + annotations_creators="LM-generated", + dialect=[], + 
sample_creation="LM-generated and verified",
+        bibtex_citation=""" """,
+    )
+
+    def dataset_transform(self):
+        """Rename columns: text -> sentence1, summary -> sentence2."""
+        column_map = {"text": "sentence1", "summary": "sentence2"}
+        self.dataset = self.dataset.rename_columns(column_map)
+
+
+class SynPerChatbotRAGSumSRetrieval(AbsTaskBitextMining):
+    metadata = TaskMetadata(
+        name="SynPerChatbotRAGSumSRetrieval",
+        description="Synthetic Persian Chatbot RAG Summary Dataset",
+        reference="https://huggingface.co/datasets/MCINext/synthetic-persian-chatbot-rag-summary-retrieval",
+        dataset={
+            "path": "MCINext/synthetic-persian-chatbot-rag-summary-retrieval",
+            "revision": "f77746f286bbf2177ee7b5a803da8be440d5d4c1",
+        },
+        type="SummaryRetrieval",
+        category="p2p",
+        modalities=["text"],
+        eval_splits=["test"],
+        eval_langs=["fas-Arab"],
+        main_score="f1",
+        date=("2024-09-01", "2024-12-31"),
+        domains=["Spoken"],
+        task_subtypes=[],
+        license="not specified",
+        annotations_creators="LM-generated",
+        dialect=[],
+        sample_creation="LM-generated and verified",
+        bibtex_citation=""" """,
+    )
+
+    def dataset_transform(self):
+        """Rename columns: text -> sentence1, summary -> sentence2."""
+        column_map = {"text": "sentence1", "summary": "sentence2"}
+        self.dataset = self.dataset.rename_columns(column_map)
diff --git a/mteb/tasks/SummaryRetrieval/fas/__init__.py b/mteb/tasks/SummaryRetrieval/fas/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mteb/tasks/__init__.py b/mteb/tasks/__init__.py
index 8a53eb0d7d..8877da707c 100644
--- a/mteb/tasks/__init__.py
+++ b/mteb/tasks/__init__.py
@@ -12,3 +12,4 @@
 from .SpeedTask import *
 from .STS import *
 from .Summarization import *
+from .SummaryRetrieval import *
diff --git a/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py
new file mode 100644
index 0000000000..6a60f4b000
--- /dev/null
+++ b/mteb/tasks/aggregated_tasks/CQADupStackRetrievalFa.py
@@ -0,0 +1,46 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate,
AggregateTaskMetadata
+from mteb.tasks.Retrieval import (
+    CQADupstackAndroidRetrievalFa,
+    CQADupstackEnglishRetrievalFa,
+    CQADupstackGamingRetrievalFa,
+    CQADupstackGisRetrievalFa,
+    CQADupstackMathematicaRetrievalFa,
+    CQADupstackPhysicsRetrievalFa,
+    CQADupstackProgrammersRetrievalFa,
+    CQADupstackStatsRetrievalFa,
+    CQADupstackTexRetrievalFa,
+    CQADupstackUnixRetrievalFa,
+    CQADupstackWebmastersRetrievalFa,
+    CQADupstackWordpressRetrievalFa,
+)
+
+task_list_cqa: list[AbsTask] = [  # one instance per imported CQADupstack*RetrievalFa task
+    CQADupstackAndroidRetrievalFa(),
+    CQADupstackEnglishRetrievalFa(),
+    CQADupstackGamingRetrievalFa(),
+    CQADupstackGisRetrievalFa(),
+    CQADupstackMathematicaRetrievalFa(),
+    CQADupstackPhysicsRetrievalFa(),
+    CQADupstackProgrammersRetrievalFa(),
+    CQADupstackStatsRetrievalFa(),
+    CQADupstackTexRetrievalFa(),
+    CQADupstackUnixRetrievalFa(),
+    CQADupstackWebmastersRetrievalFa(),
+    CQADupstackWordpressRetrievalFa(),
+]
+
+
+class CQADupstackRetrievalFa(AbsTaskAggregate):  # aggregate task built from task_list_cqa
+    metadata = AggregateTaskMetadata(
+        name="CQADupstackRetrieval-Fa",
+        description="CQADupstackRetrieval-Fa",
+        reference="",
+        tasks=task_list_cqa,  # the twelve CQADupstack*RetrievalFa instances created above
+        main_score="ndcg_at_10",
+        type="Retrieval",  # every sub-task is retrieval; otherwise this would be "Aggregated"
+        eval_splits=["test"],
+        bibtex_citation=""" """,
+    )
diff --git a/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py
new file mode 100644
index 0000000000..46c6ed9600
--- /dev/null
+++ b/mteb/tasks/aggregated_tasks/SynPerChatbotConvSAClassification.py
@@ -0,0 +1,40 @@
+from __future__ import annotations
+
+from mteb.abstasks import AbsTask
+from mteb.abstasks.aggregated_task import AbsTaskAggregate, AggregateTaskMetadata
+from mteb.tasks.Classification import (
+    SynPerChatbotConvSAAnger,
+    SynPerChatbotConvSAFear,
+    SynPerChatbotConvSAFriendship,
+    SynPerChatbotConvSAHappiness,
+    SynPerChatbotConvSAJealousy,
+    SynPerChatbotConvSALove,
+    SynPerChatbotConvSASadness,
+
SynPerChatbotConvSASatisfaction,
+    SynPerChatbotConvSASurprise,
+)
+
+task_list_cqa: list[AbsTask] = [  # NOTE(review): holds ConvSA tasks despite the "cqa" name
+    SynPerChatbotConvSAAnger(),
+    SynPerChatbotConvSASatisfaction(),
+    SynPerChatbotConvSAFriendship(),
+    SynPerChatbotConvSAFear(),
+    SynPerChatbotConvSAJealousy(),
+    SynPerChatbotConvSASurprise(),
+    SynPerChatbotConvSALove(),
+    SynPerChatbotConvSASadness(),
+    SynPerChatbotConvSAHappiness(),
+]
+
+
+class SynPerChatbotConvSAClassification(AbsTaskAggregate):  # aggregate task built from task_list_cqa
+    metadata = AggregateTaskMetadata(
+        name="SynPerChatbotConvSAClassification",
+        description="SynPerChatbotConvSAClassification",
+        reference="",
+        tasks=task_list_cqa,  # the nine SynPerChatbotConvSA* instances created above
+        main_score="accuracy",
+        type="Classification",
+        eval_splits=["test"],
+        bibtex_citation=""" """,
+    )
diff --git a/mteb/tasks/aggregated_tasks/__init__.py b/mteb/tasks/aggregated_tasks/__init__.py
index 15025208cc..5333db7916 100644
--- a/mteb/tasks/aggregated_tasks/__init__.py
+++ b/mteb/tasks/aggregated_tasks/__init__.py
@@ -1,5 +1,11 @@
 from __future__ import annotations
 
 from .CQADupStackRetrieval import CQADupstackRetrieval
+from .CQADupStackRetrievalFa import CQADupstackRetrievalFa
+from .SynPerChatbotConvSAClassification import SynPerChatbotConvSAClassification
 
-__all__ = ["CQADupstackRetrieval"]
+__all__ = [
+    "CQADupstackRetrieval",
+    "CQADupstackRetrievalFa",
+    "SynPerChatbotConvSAClassification",
+]
diff --git a/tests/test_TaskMetadata.py b/tests/test_TaskMetadata.py
index 09df66628a..f7ac92a697 100644
--- a/tests/test_TaskMetadata.py
+++ b/tests/test_TaskMetadata.py
@@ -179,6 +179,8 @@
     "TamilNewsClassification",
     "TenKGnadClusteringP2P.v2",
     "TenKGnadClusteringS2S.v2",
+    "SynPerChatbotConvSAClassification",
+    "CQADupstackRetrieval-Fa",
 ]