diff --git a/mteb/abstasks/TaskMetadata.py b/mteb/abstasks/TaskMetadata.py
index f2c631b3b2..1eb6f6bb63 100644
--- a/mteb/abstasks/TaskMetadata.py
+++ b/mteb/abstasks/TaskMetadata.py
@@ -92,6 +92,7 @@
     "machine-translated and verified",
     "machine-translated and localized",
     "LM-generated and verified",
+    "machine-translated and LM verified",
     "rendered",
     "multiple",
 ]
diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py
index 4392b280d8..d70c94944d 100644
--- a/mteb/tasks/Classification/__init__.py
+++ b/mteb/tasks/Classification/__init__.py
@@ -163,6 +163,18 @@
 from .tur.TurkishProductSentimentClassification import *
 from .ukr.UkrFormalityClassification import *
 from .urd.UrduRomanSentimentClassification import *
+from .vie.AmazonCounterfactualVNClassification import *
+from .vie.AmazonPolarityVNClassification import *
+from .vie.AmazonReviewsVNClassification import *
+from .vie.Banking77VNClassification import *
+from .vie.EmotionVNClassification import *
+from .vie.ImdbVNClassification import *
+from .vie.MassiveIntentVNClassification import *
+from .vie.MassiveScenarioVNClassification import *
+from .vie.MTOPDomainVNClassification import *
+from .vie.MTOPIntentVNClassification import *
+from .vie.ToxicConversationsVNClassification import *
+from .vie.TweetSentimentExtractionVNClassification import *
 from .vie.VieStudentFeedbackClassification import *
 from .zho.CMTEBClassification import *
 from .zho.YueOpenriceReviewClassification import (
diff --git a/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py b/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py
new file mode 100644
index 0000000000..540e5f2eda
--- /dev/null
+++ b/mteb/tasks/Classification/vie/AmazonCounterfactualVNClassification.py
@@ -0,0 +1,47 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class AmazonCounterfactualVNClassification(AbsTaskClassification):
+    num_samples = 32
+
+    metadata = TaskMetadata(
+        name="AmazonCounterfactualVNClassification",
+        dataset={
+            "path": "GreenNode/amazon-counterfactual-vn",
+            "revision": "b48bc27d383cfca5b6a47135a52390fa5f66b253",
+        },
+        description="""A collection of translated Amazon customer reviews annotated for counterfactual detection pair classification.
+        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+        - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+        - Applies advanced embedding models to filter the translations.
+        - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.
+ """, + reference="https://arxiv.org/abs/2104.06893", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Counterfactual Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonCounterfactualClassification"], + ) diff --git a/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py b/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py new file mode 100644 index 0000000000..e3beceda5c --- /dev/null +++ b/mteb/tasks/Classification/vie/AmazonPolarityVNClassification.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AmazonPolarityVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonPolarityVNClassification", + description="""A collection of translated Amazon customer reviews annotated for polarity classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria. 
+ """, + reference="https://huggingface.co/datasets/amazon_polarity", + dataset={ + "path": "GreenNode/amazon-polarity-vn", + "revision": "4e9a0d6e6bd97ab32f23c50c043d751eed2a5f8a", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonPolarityClassification"], + ) diff --git a/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py b/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py new file mode 100644 index 0000000000..25ad64744e --- /dev/null +++ b/mteb/tasks/Classification/vie/AmazonReviewsVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AmazonReviewsVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="AmazonReviewsVNClassification", + dataset={ + "path": "GreenNode/amazon-reviews-multi-vn", + "revision": "27da94deb6d4f44af789a3d70750fa506b79f189", + }, + description="""A collection of translated Amazon reviews specifically designed to aid research in multilingual text classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2010.02573", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Emotion classification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AmazonReviewsClassification"], + ) diff --git a/mteb/tasks/Classification/vie/Banking77VNClassification.py b/mteb/tasks/Classification/vie/Banking77VNClassification.py new file mode 100644 index 0000000000..a051965bca --- /dev/null +++ b/mteb/tasks/Classification/vie/Banking77VNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class Banking77VNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="Banking77VNClassification", + description="""A translated dataset composed of online banking queries annotated with their corresponding intents. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2003.04807", + dataset={ + "path": "GreenNode/banking77-vn", + "revision": "42541b07c25a49604be129fba6d70b752be229c1", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Banking77Classification"], + ) diff --git a/mteb/tasks/Classification/vie/EmotionVNClassification.py b/mteb/tasks/Classification/vie/EmotionVNClassification.py new file mode 100644 index 0000000000..d2a7b44e7a --- /dev/null +++ b/mteb/tasks/Classification/vie/EmotionVNClassification.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class EmotionVNClassification(AbsTaskClassification): + num_samples = 16 + + metadata = TaskMetadata( + name="EmotionVNClassification", + description="""Emotion is a translated dataset of Vietnamese from English Twitter messages with six basic emotions: anger, fear, joy, love, sadness, and surprise. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.aclweb.org/anthology/D18-1404", + dataset={ + "path": "GreenNode/emotion-vn", + "revision": "797a93c0dd755ebf5818fbf54d0e0a024df9216d", + }, + type="Classification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["EmotionClassification"], + ) diff --git a/mteb/tasks/Classification/vie/ImdbVNClassification.py b/mteb/tasks/Classification/vie/ImdbVNClassification.py new file mode 100644 index 0000000000..9d87ca3c98 --- /dev/null +++ b/mteb/tasks/Classification/vie/ImdbVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ImdbVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="ImdbVNClassification", + description="""A translated dataset of large movie reviews annotated for sentiment classification. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + dataset={ + "path": "GreenNode/imdb-vn", + "revision": "0dccb383ee26c90c99d03c8674cf40de642f099a", + }, + reference="http://www.aclweb.org/anthology/P11-1015", + type="Classification", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Reviews", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ImdbClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py b/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py new file mode 100644 index 0000000000..9050165762 --- /dev/null +++ b/mteb/tasks/Classification/vie/MTOPDomainVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MTOPDomainVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MTOPDomainVNClassification", + dataset={ + "path": "GreenNode/mtop-domain-vn", + "revision": "6e1ec8c54c018151c77472d94b1c0765230cf6ca", + }, + description="""A translated dataset from MTOP: Multilingual Task-Oriented Semantic Parsing + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+        - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+        reference="https://arxiv.org/pdf/2008.09335.pdf",
+        category="s2s",
+        type="Classification",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="accuracy",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Spoken"],
+        task_subtypes=[],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        adapted_from=["MTOPDomainClassification"],
+    )
diff --git a/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py b/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py
new file mode 100644
index 0000000000..d50b263722
--- /dev/null
+++ b/mteb/tasks/Classification/vie/MTOPIntentVNClassification.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MTOPIntentVNClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="MTOPIntentVNClassification",
+        dataset={
+            "path": "GreenNode/mtop-intent-vn",
+            "revision": "c4e81a5c9a813a0142d905e261e5a446cc6fbc4a",
+        },
+        description="""A translated dataset from MTOP: Multilingual Task-Oriented Semantic Parsing.
+        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+        - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+        - Applies advanced embedding models to filter the translations.
+        - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+        reference="https://arxiv.org/pdf/2008.09335.pdf",
+        category="s2s",
+        type="Classification",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="accuracy",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Spoken"],
+        task_subtypes=[],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        adapted_from=["MTOPIntentClassification"],
+    )
diff --git a/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py b/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py
new file mode 100644
index 0000000000..49e7fe0219
--- /dev/null
+++ b/mteb/tasks/Classification/vie/MassiveIntentVNClassification.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskClassification import AbsTaskClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class MassiveIntentVNClassification(AbsTaskClassification):
+    metadata = TaskMetadata(
+        name="MassiveIntentVNClassification",
+        dataset={
+            "path": "GreenNode/amazon-massive-intent-vn",
+            "revision": "35c7ced69f958dbbaa24f792db4a9250e461866d",
+        },
+        description="""A translated dataset from MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages.
+        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+        - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+        - Applies advanced embedding models to filter the translations.
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MassiveIntentClassification"], + ) diff --git a/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py b/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py new file mode 100644 index 0000000000..5d214e9fa2 --- /dev/null +++ b/mteb/tasks/Classification/vie/MassiveScenarioVNClassification.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MassiveScenarioVNClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="MassiveScenarioVNClassification", + dataset={ + "path": "GreenNode/amazon-massive-scenario-vn", + "revision": "a82e282d9f5aec1a8cf7d868ce40f70669c16b89", + }, + description="""A translated dataset from MASSIVE: A 1M-Example Multilingual Natural Language Understanding Dataset with 51 Typologically-Diverse Languages + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2204.08582#:~:text=MASSIVE%20contains%201M%20realistic%2C%20parallel,diverse%20languages%20from%2029%20genera.", + category="s2s", + type="Classification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Spoken"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MassiveScenarioClassification"], + ) diff --git a/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py b/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py new file mode 100644 index 0000000000..a3fb2d6d13 --- /dev/null +++ b/mteb/tasks/Classification/vie/ToxicConversationsVNClassification.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ToxicConversationsVNClassification(AbsTaskClassification): + num_samples = 16 + + metadata = TaskMetadata( + name="ToxicConversationsVNClassification", + description="""A translated dataset from Collection of comments from the Civil Comments platform together with annotations if the comment is toxic or not. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.kaggle.com/competitions/jigsaw-unintended-bias-in-toxicity-classification/overview", + dataset={ + "path": "GreenNode/toxic-conversations-50k-vn", + "revision": "2cc697991407cbbe34e7ef7bc9564449a4a99132", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ToxicConversationsClassification"], + ) + + def dataset_transform(self): + self.dataset = self.stratified_subsampling( + self.dataset, seed=self.seed, splits=["test"] + ) diff --git a/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py b/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py new file mode 100644 index 0000000000..1645c182bd --- /dev/null +++ b/mteb/tasks/Classification/vie/TweetSentimentExtractionVNClassification.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TweetSentimentExtractionVNClassification(AbsTaskClassification): + num_samples = 32 + + metadata = TaskMetadata( + name="TweetSentimentExtractionVNClassification", + description="""A collection of translated tweets annotated for sentiment extraction. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.kaggle.com/competitions/tweet-sentiment-extraction/overview", + dataset={ + "path": "GreenNode/tweet-sentiment-extraction-vn", + "revision": "f453803eff1e91579eb235dc1d7c38d39b3f1340", + }, + type="Classification", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="accuracy", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TweetSentimentExtractionClassification"], + ) diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 0c86095000..3476ea491a 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -50,4 +50,9 @@ from .spa.SpanishNewsClusteringP2P import * from .swe.swedn_clustering import * from .swe.SwednClustering import * +from .vie.RedditClusteringP2PVN import * +from .vie.RedditClusteringVN import * +from .vie.StackExchangeClusteringP2PVN import * +from .vie.StackExchangeClusteringVN import * +from .vie.TwentyNewsgroupsClusteringVN import * from .zho.CMTEBClustering import * diff --git a/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py b/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py new file mode 100644 index 0000000000..6f200ca758 --- /dev/null +++ b/mteb/tasks/Clustering/vie/RedditClusteringP2PVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RedditClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClusteringP2P-VN", + description="""A translated dataset from Clustering of title+posts from reddit. Clustering of 10 sets of 50k paragraphs and 40 sets of 10k paragraphs. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-p2p-vn", + "revision": "841856dcb82496f1f2f59356e4798ce15baeb200", + }, + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClusteringP2P"], + ) diff --git a/mteb/tasks/Clustering/vie/RedditClusteringVN.py b/mteb/tasks/Clustering/vie/RedditClusteringVN.py new file mode 100644 index 0000000000..0bb1cf95ec --- /dev/null +++ b/mteb/tasks/Clustering/vie/RedditClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class RedditClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="RedditClustering-VN", + description="""A translated dataset from Clustering of titles from 199 subreddits. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/reddit-clustering-vn", + "revision": "7f7d4097979633181b2df3f73905218f74c4665d", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Social", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["RedditClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py b/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py new file mode 100644 index 0000000000..24e578deb9 --- /dev/null +++ b/mteb/tasks/Clustering/vie/StackExchangeClusteringP2PVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackExchangeClusteringP2PVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClusteringP2P-VN", + description="""A translated Clustering of title+body from stackexchange. Clustering of 5 sets of 10k paragraphs and 5 sets of 5k paragraphs. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-p2p-vn", + "revision": "8f154ee524a466850028531d21e1a62d958b8156", + }, + type="Clustering", + category="p2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClusteringP2P"], + ) diff --git a/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py b/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py new file mode 100644 index 0000000000..d476c41de1 --- /dev/null +++ b/mteb/tasks/Clustering/vie/StackExchangeClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackExchangeClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="StackExchangeClustering-VN", + description="""A translated dataset from Clustering of titles from 121 stackexchanges. Clustering of 25 sets, each with 10-50 classes, and each class with 100 - 1000 sentences. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://arxiv.org/abs/2104.07081", + dataset={ + "path": "GreenNode/stackexchange-clustering-vn", + "revision": "cf01db048f2bf705741675b51613dc48e0bb122b", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackExchangeClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py b/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py new file mode 100644 index 0000000000..d45be8112d --- /dev/null +++ b/mteb/tasks/Clustering/vie/TwentyNewsgroupsClusteringVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClustering import AbsTaskClustering +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TwentyNewsgroupsClusteringVN(AbsTaskClustering): + metadata = TaskMetadata( + name="TwentyNewsgroupsClustering-VN", + description="""A translated dataset from Clustering of the 20 Newsgroups dataset (subject only). + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://scikit-learn.org/0.19/datasets/twenty_newsgroups.html", + dataset={ + "path": "GreenNode/twentynewsgroups-clustering-vn", + "revision": "770e1b9029cd85c79410bc6df1528a43fc2b9ad1", + }, + type="Clustering", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="v_measure", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["News", "Written"], + task_subtypes=["Thematic clustering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwentyNewsgroupsClustering"], + ) diff --git a/mteb/tasks/Clustering/vie/__init__.py b/mteb/tasks/Clustering/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index d3ecd19272..62e9d8daa2 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -28,4 +28,7 @@ from .por.Assin2RTE import * from .por.SickBrPC import * from .rus.TERRa import * +from .vie.SprintDuplicateQuestionsPCVN import * +from .vie.TwitterSemEval2015PCVN import * +from .vie.TwitterURLCorpusPCVN import * from .zho.CMTEBPairClassification import * diff --git a/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py b/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py new file mode 100644 index 0000000000..13b5bd6d52 --- /dev/null +++ b/mteb/tasks/PairClassification/vie/SprintDuplicateQuestionsPCVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SprintDuplicateQuestionsPCVN(AbsTaskPairClassification): + metadata = TaskMetadata( + name="SprintDuplicateQuestions-VN", + description="""A translated dataset from Duplicate questions from the Sprint community. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.aclweb.org/anthology/D18-1131/", + dataset={ + "path": "GreenNode/sprintduplicatequestions-pairclassification-vn", + "revision": "2552beae0e4fe7fe05d088814f78a4c309ad2219", + }, + type="PairClassification", + category="s2s", + eval_splits=["validation", "test"], + eval_langs=["vie-Latn"], + main_score="ap", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Written"], + task_subtypes=["Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SprintDuplicateQuestions"], + ) diff --git a/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py b/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py new file mode 100644 index 0000000000..8587c22d78 --- /dev/null +++ b/mteb/tasks/PairClassification/vie/TwitterSemEval2015PCVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TwitterSemEval2015PCVN(AbsTaskPairClassification): + metadata = TaskMetadata( + name="TwitterSemEval2015-VN", + dataset={ + "path": "GreenNode/twittersemeval2015-pairclassification-vn", + "revision": "9215a3c954078fd15c2bbecca914477d53944de1", + }, + description="""A translated dataset from Paraphrase-Pairs of Tweets from the SemEval 2015 workshop. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+        - Uses LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+        reference="https://alt.qcri.org/semeval2015/task1/",
+        category="s2s",
+        type="PairClassification",
+        eval_splits=["test"],
+        eval_langs=["vie-Latn"],
+        main_score="ap",
+        date=("2025-07-29", "2025-07-30"),
+        license="cc-by-sa-4.0",
+        annotations_creators="derived",
+        dialect=[],
+        sample_creation="machine-translated and LM verified",
+        domains=["Social", "Written"],
+        task_subtypes=[],
+        bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+  archiveprefix = {arXiv},
+  author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+  eprint = {2507.21500},
+  primaryclass = {cs.CL},
+  title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+  url = {https://arxiv.org/abs/2507.21500},
+  year = {2025},
+}
+""",
+        adapted_from=["TwitterSemEval2015"],
+    )
diff --git a/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py b/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py
new file mode 100644
index 0000000000..39d91783ab
--- /dev/null
+++ b/mteb/tasks/PairClassification/vie/TwitterURLCorpusPCVN.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class TwitterURLCorpusPCVN(AbsTaskPairClassification):
+    metadata = TaskMetadata(
+        name="TwitterURLCorpus-VN",
+        dataset={
+            "path": "GreenNode/twitterurlcorpus-pairclassification-vn",
+            "revision": "6e6a40aaade2129f70432f2156a6d24b63d72be3",
+        },
+        description="""A translated dataset from Paraphrase-Pairs of Tweets.
+        The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+        - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+        - Applies advanced embedding models to filter the translations.
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://languagenet.github.io/", + category="s2s", + type="PairClassification", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ap", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Social", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TwitterURLCorpus"], + ) diff --git a/mteb/tasks/PairClassification/vie/__init__.py b/mteb/tasks/PairClassification/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index ef4f8531c6..06ffc29141 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -17,4 +17,7 @@ from .multilingual.WikipediaRerankingMultilingual import * from .multilingual.XGlueWPRReranking import * from .rus.RuBQReranking import * +from .vie.AskUbuntuDupQuestionsVN import * +from .vie.SciDocsRerankingVN import * +from .vie.StackOverflowDupQuestionsVN import * from .zho.CMTEBReranking import * diff --git a/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py b/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py new file mode 100644 index 0000000000..a1eaa9485b --- /dev/null +++ b/mteb/tasks/Reranking/vie/AskUbuntuDupQuestionsVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class AskUbuntuDupQuestionsVN(AbsTaskReranking): + metadata = TaskMetadata( + name="AskUbuntuDupQuestions-VN", + description="""A translated dataset from AskUbuntu Question Dataset - Questions from AskUbuntu with manual annotations marking pairs of questions as similar or non-similar + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/taolei87/askubuntu", + dataset={ + "path": "GreenNode/askubuntudupquestions-reranking-vn", + "revision": "5cfaa5c07252d30c37302bfc056f0d85884971a1", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Web"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["AskUbuntuDupQuestions"], + ) diff --git a/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py b/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py new file mode 100644 index 0000000000..68b527dc07 --- /dev/null +++ b/mteb/tasks/Reranking/vie/SciDocsRerankingVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SciDocsRerankingVN(AbsTaskReranking): + metadata = TaskMetadata( + name="SciDocsRR-VN", + description="""A translated dataset from Ranking of related scientific papers based on their title. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://allenai.org/data/scidocs", + dataset={ + "path": "GreenNode/scidocs-reranking-vn", + "revision": "c9ab36ae6c75f754df6f1e043c09b5e0b5547cac", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SciDocsRR"], + ) diff --git a/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py b/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py new file mode 100644 index 0000000000..0fe2e3b7f9 --- /dev/null +++ b/mteb/tasks/Reranking/vie/StackOverflowDupQuestionsVN.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class StackOverflowDupQuestionsVN(AbsTaskReranking): + metadata = TaskMetadata( + name="StackOverflowDupQuestions-VN", + description="""A translated dataset from Stack Overflow Duplicate Questions Task for questions with the tags Java, JavaScript and Python + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.microsoft.com/en-us/research/uploads/prod/2019/03/nl4se18LinkSO.pdf", + dataset={ + "path": "GreenNode/stackoverflowdupquestions-reranking-vn", + "revision": "3ceb17db245f52beaf27a3720aa71e1cc5f06faf", + }, + type="Reranking", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="map", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Non-fiction", "Written"], + task_subtypes=["Scientific Reranking"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["StackOverflowDupQuestions"], + ) diff --git a/mteb/tasks/Reranking/vie/__init__.py b/mteb/tasks/Reranking/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 91a63662ae..6b19080b71 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -195,7 +195,31 @@ from .swe.SwednRetrieval import * from .swe.SweFaqRetrieval import * from .tur.TurHistQuad import * +from .vie.ArguAnaVNRetrieval import * +from .vie.ClimateFEVERVNRetrieval import * +from .vie.CQADupstackAndroidVNRetrieval import * +from .vie.CQADupstackGisVNRetrieval import * +from .vie.CQADupstackMathematicaVNRetrieval import * +from .vie.CQADupstackPhysicsVNRetrieval import * +from .vie.CQADupstackProgrammersVNRetrieval import * +from .vie.CQADupstackStatsVNRetrieval import * +from .vie.CQADupstackTexVNRetrieval import * +from .vie.CQADupstackUnixVNRetrieval import * +from .vie.CQADupstackWebmastersVNRetrieval import * +from .vie.CQADupstackWordpressVNRetrieval import * +from .vie.DBPediaVNRetrieval import * +from .vie.FEVERVNRetrieval import * +from .vie.FiQA2018VNRetrieval import * from .vie.GreenNodeTableMarkdownRetrieval import * +from .vie.HotpotQAVNRetrieval import * +from .vie.MSMARCOVNRetrieval import * +from .vie.NFCorpusVNRetrieval import * +from .vie.NQVNRetrieval import * +from .vie.QuoraVNRetrieval import * +from .vie.SCIDOCSVNRetrieval import * +from .vie.SciFactVNRetrieval import * +from .vie.Touche2020VNRetrieval import * +from .vie.TRECCOVIDVNRetrieval import * from .vie.VieQuADRetrieval import * from .vie.ZacLegalTextRetrieval import * from .zho.CMTEBRetrieval import * diff --git a/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py b/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py new file mode 100644 index 0000000000..4492d9b493 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/ArguAnaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ArguAnaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ArguAna-VN", + description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models 
(LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://argumentation.bplaced.net/arguana/data", + dataset={ + "path": "GreenNode/arguana-vn", + "revision": "2a5133a05d7430e6f353497b1624a6e73148105b", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical", "Written"], + task_subtypes=["Article retrieval"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ArguAna"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py new file mode 100644 index 0000000000..cb177bdf05 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackAndroidVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackAndroidVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackAndroid-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-android-vn", + "revision": "4a022e7213ccc05ee970a176abd0293b3a0a2da0", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Web", "Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackAndroid"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py new file mode 100644 index 0000000000..2b531eb7d8 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackGisVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackGisVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackGis-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-gis-vn", + "revision": "755156d548a8288efdb29b80bad302750ab33977", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackGis"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py new file mode 100644 index 0000000000..a12c8f1c29 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackMathematicaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackMathematicaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackMathematica-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-mathematica-vn", + "revision": "d0cc9b60ba66faa3fb21cb9a54ef969af548b312", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackMathematica"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py new file mode 100644 index 0000000000..e10955e22a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackPhysicsVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackPhysicsVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackPhysics-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Uses an LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+ reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/",
+ dataset={
+ "path": "GreenNode/cqadupstack-physics-vn",
+ "revision": "8b6b68b59933cc72985f674f76c80a678c27d6be",
+ },
+ type="Retrieval",
+ category="s2p",
+ eval_splits=["test"],
+ eval_langs=["vie-Latn"],
+ main_score="ndcg_at_10",
+ date=("2025-07-29", "2025-07-30"),
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated and LM verified",
+ domains=["Written", "Academic", "Non-fiction"],
+ task_subtypes=["Question answering", "Duplicate Detection"],
+ bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+ archiveprefix = {arXiv},
+ author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+ eprint = {2507.21500},
+ primaryclass = {cs.CL},
+ title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+ url = {https://arxiv.org/abs/2507.21500},
+ year = {2025},
+}
+""",
+ adapted_from=["CQADupstackPhysics"],
+ )
diff --git a/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py
new file mode 100644
index 0000000000..ebe78eeddc
--- /dev/null
+++ b/mteb/tasks/Retrieval/vie/CQADupstackProgrammersVNRetrieval.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class CQADupstackProgrammersVN(AbsTaskRetrieval):
+ metadata = TaskMetadata(
+ name="CQADupstackProgrammers-VN",
+ description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research
+ The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+ - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+ - Applies advanced embedding models to filter the translations.
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-programmers-vn", + "revision": "1a628c4e61f71ffdb7707d6d4024d25cfe68215a", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Programming", "Written", "Non-fiction"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackProgrammers"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py new file mode 100644 index 0000000000..339f9f43d4 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackStatsVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackStatsVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackStats-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-stats-vn", + "revision": "6b8164f3af61f3bb7728724229ba36213fb46c25", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Academic", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackStats"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py new file mode 100644 index 0000000000..ceb699bb88 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackTexVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackTexVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackTex-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-tex-vn", + "revision": "aec43e5ae40451526528b3fc80dd5983ec388e21", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Non-fiction"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackTex"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py new file mode 100644 index 0000000000..dc751d04a9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackUnixVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackUnixVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackUnix-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-unix-vn", + "revision": "f8b884697871cb38901139f2435c273135f83a3f", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering", "Duplicate Detection"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackUnix"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py new file mode 100644 index 0000000000..03751d00ae --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackWebmastersVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackWebmastersVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWebmasters-VN", + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + dataset={ + "path": "GreenNode/cqadupstack-webmasters-vn", + "revision": "482d6e560d977b137e435d33379c5a8049e70e8d", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackWebmasters"], + ) diff --git a/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py b/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py new file mode 100644 index 0000000000..a77659887b --- /dev/null +++ b/mteb/tasks/Retrieval/vie/CQADupstackWordpressVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class CQADupstackWordpressVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="CQADupstackWordpress-VN", + dataset={ + "path": "GreenNode/cqadupstack-wordpress-vn", + "revision": "2230f80e1baf42aa005731ca86577621c566fcd7", + }, + description="""A translated dataset from CQADupStack: A Benchmark Data Set for Community Question-Answering Research + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="http://nlp.cis.unimelb.edu.au/resources/cqadupstack/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Programming"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["CQADupstackWordpress"], + ) diff --git a/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py b/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py new file mode 100644 index 0000000000..9edb81001e --- /dev/null +++ b/mteb/tasks/Retrieval/vie/ClimateFEVERVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class ClimateFEVERVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="ClimateFEVER-VN", + description="""A translated dataset from CLIMATE-FEVER is a dataset adopting the FEVER methodology that consists of 1,535 real-world claims regarding climate-change. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://www.sustainablefinance.uzh.ch/en/research/climate-fever.html", + dataset={ + "path": "GreenNode/climate-fever-vn", + "revision": "42328bf787e17b1ad1a88be4f5e87ea9fb668511", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["ClimateFEVER"], + ) diff --git a/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py b/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py new file mode 100644 index 0000000000..ec01dc8abe --- /dev/null +++ b/mteb/tasks/Retrieval/vie/DBPediaVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class DBPediaVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="DBPedia-VN", + description="""A translated dataset from DBpedia-Entity is a standard test collection for entity search over the DBpedia knowledge base + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/iai-group/DBpedia-Entity/", + dataset={ + "path": "GreenNode/dbpedia-vn", + "revision": "c3e20179fbcee16217ef9461a14a54b7faca9b63", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Encyclopaedic"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["DBPedia"], + ) diff --git a/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py b/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py new file mode 100644 index 0000000000..49fdcf2fe6 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/FEVERVNRetrieval.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class FEVERVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FEVER-VN", + dataset={ + "path": "GreenNode/fever-vn", + "revision": "a543dd8b98aed3603110c01d26db05ba39b87d49", + }, + description="""A translated dataset from FEVER (Fact Extraction and VERification) consists of 185,445 claims generated by altering sentences + extracted from Wikipedia and subsequently verified without knowledge of the sentence they were + derived from. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://fever.ai/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Encyclopaedic", "Written"], + task_subtypes=["Claim verification"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["FEVER"], + ) diff --git a/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py b/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py new file mode 100644 index 0000000000..a039aa53c9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/FiQA2018VNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class FiQA2018VN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="FiQA2018-VN", + description="""A translated dataset from Financial Opinion Mining and Question Answering + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://sites.google.com/view/fiqa/", + dataset={ + "path": "GreenNode/fiqa-vn", + "revision": "6c3cdf6f102151dbbbbc1d2cf999b305eba44dae", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Financial"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["FiQA2018"], + ) diff --git a/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py b/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py new file mode 100644 index 0000000000..13de52d87a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/HotpotQAVNRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class HotpotQAVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="HotpotQA-VN", + dataset={ + "path": "GreenNode/hotpotqa-vn", + "revision": "8a5220c7af5084f0d5d2afeb74f9c2b41b759ff0", + }, + description="""A translated dataset from HotpotQA is a question answering dataset featuring natural, multi-hop questions, with strong + supervision for supporting facts to enable more 
explainable question answering systems. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://hotpotqa.github.io/", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["HotpotQA"], + ) diff --git a/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py b/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py new file mode 100644 index 0000000000..2fbb5140f4 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/MSMARCOVNRetrieval.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class MSMARCOVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="MSMARCO-VN", + dataset={ + "path": "GreenNode/msmarco-vn", + "revision": "85d1ad4cc9070b8d019d65f5af1631a2ab91e294", + }, + description="""A translated dataset from MS MARCO is a collection of datasets focused on deep learning in search + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://microsoft.github.io/msmarco/", + type="Retrieval", + category="s2p", + eval_splits=["dev"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=[ + "Encyclopaedic", + "Academic", + "Blog", + "News", + "Medical", + "Government", + "Reviews", + "Non-fiction", + "Social", + "Web", + ], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["MSMARCO"], + ) diff --git a/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py b/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py new file mode 100644 index 0000000000..7086eb5b7a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/NFCorpusVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class NFCorpusVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="NFCorpus-VN", + dataset={ + "path": "GreenNode/nfcorpus-vn", + "revision": "a13d72fbb859be3dc19ab669d1ec9510407d2dcd", + }, + description="""A translated dataset from NFCorpus: A Full-Text Learning to Rank Dataset for Medical Information Retrieval + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Uses an LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+ reference="https://www.cl.uni-heidelberg.de/statnlpgroup/nfcorpus/",
+ type="Retrieval",
+ category="s2p",
+ eval_splits=["test"],
+ eval_langs=["vie-Latn"],
+ main_score="ndcg_at_10",
+ date=("2025-07-29", "2025-07-30"),
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated and LM verified",
+ domains=["Medical", "Academic", "Written"],
+ task_subtypes=["Article retrieval"],
+ bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+ archiveprefix = {arXiv},
+ author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+ eprint = {2507.21500},
+ primaryclass = {cs.CL},
+ title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+ url = {https://arxiv.org/abs/2507.21500},
+ year = {2025},
+}
+""",
+ adapted_from=["NFCorpus"],
+ )
diff --git a/mteb/tasks/Retrieval/vie/NQVNRetrieval.py b/mteb/tasks/Retrieval/vie/NQVNRetrieval.py
new file mode 100644
index 0000000000..5bb940c167
--- /dev/null
+++ b/mteb/tasks/Retrieval/vie/NQVNRetrieval.py
@@ -0,0 +1,44 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class NQVN(AbsTaskRetrieval):
+ metadata = TaskMetadata(
+ name="NQ-VN",
+ dataset={
+ "path": "GreenNode/nq-vn",
+ "revision": "40a6d7f343b9c9f4855a426d8c431ad5f8aaf56b",
+ },
+ description="""A translated dataset from NQ (Natural Questions), a question answering benchmark built from real Google search queries with answers drawn from Wikipedia.
+ The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+ - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+ - Applies advanced embedding models to filter the translations.
+ - Uses an LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+ reference="https://ai.google.com/research/NaturalQuestions/",
+ type="Retrieval",
+ category="s2p",
+ eval_splits=["test"],
+ eval_langs=["vie-Latn"],
+ main_score="ndcg_at_10",
+ date=("2025-07-29", "2025-07-30"),
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated and LM verified",
+ domains=["Written", "Encyclopaedic"],
+ task_subtypes=["Question answering"],
+ bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+ archiveprefix = {arXiv},
+ author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+ eprint = {2507.21500},
+ primaryclass = {cs.CL},
+ title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+ url = {https://arxiv.org/abs/2507.21500},
+ year = {2025},
+}
+""",
+ adapted_from=["NQ"],
+ )
diff --git a/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py b/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py
new file mode 100644
index 0000000000..d3e357f871
--- /dev/null
+++ b/mteb/tasks/Retrieval/vie/QuoraVNRetrieval.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class QuoraVN(AbsTaskRetrieval):
+ metadata = TaskMetadata(
+ name="Quora-VN",
+ dataset={
+ "path": "GreenNode/quora-vn",
+ "revision": "3363d81e41b67c1032bf3b234882a03d271e2289",
+ },
+ description="""A translated dataset from QuoraRetrieval is based on questions that are marked as duplicates on the Quora platform.
Given a + question, find other (duplicate) questions. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs", + type="Retrieval", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Written", "Web", "Blog"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Quora"], + ) diff --git a/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py b/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py new file mode 100644 index 0000000000..93088401c9 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/SCIDOCSVNRetrieval.py @@ -0,0 +1,45 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SCIDOCSVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SCIDOCS-VN", + dataset={ + "path": "GreenNode/scidocs-vn", + "revision": "724cddfa9d328a193f303a0a9b7789468ac79f26", + }, + description="""A translated dataset from SciDocs, a new evaluation benchmark consisting of seven document-level tasks ranging from citation + prediction, to document classification and recommendation. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://allenai.org/data/scidocs", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Written", "Non-fiction"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SCIDOCS"], + ) diff --git a/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py b/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py new file mode 100644 index 0000000000..d2d5034742 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/SciFactVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SciFactVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="SciFact-VN", + dataset={ + "path": "GreenNode/scifact-vn", + "revision": "483a7cf890c523c954e7751d328c5bb65061dcff", + }, + description="""A translated dataset from SciFact verifies scientific claims using evidence from the research literature containing scientific paper abstracts. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://github.com/allenai/scifact", + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic", "Medical", "Written"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SciFact"], + ) diff --git a/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py b/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py new file mode 100644 index 0000000000..ef9f69224a --- /dev/null +++ b/mteb/tasks/Retrieval/vie/TRECCOVIDVNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class TRECCOVIDVN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="TRECCOVID-VN", + description="""A translated dataset from TRECCOVID is an ad-hoc search challenge based on the COVID-19 dataset containing scientific articles related to the COVID-19 pandemic. 
+ The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. + - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://ir.nist.gov/covidSubmit/index.html", + dataset={ + "path": "GreenNode/trec-covid-vn", + "revision": "54d73a1ea11ea0ae4ec0214ec519c93db79dee88", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical", "Academic", "Written"], + task_subtypes=["Article retrieval"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["TRECCOVID"], + ) diff --git a/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py b/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py new file mode 100644 index 0000000000..8011850d66 --- /dev/null +++ b/mteb/tasks/Retrieval/vie/Touche2020VNRetrieval.py @@ -0,0 +1,44 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class Touche2020VN(AbsTaskRetrieval): + metadata = TaskMetadata( + name="Touche2020-VN", + description="""A translated dataset from Touché Task 1: Argument Retrieval for Controversial Questions + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://webis.de/events/touche-20/shared-task-1.html", + dataset={ + "path": "GreenNode/webis-touche2020-vn", + "revision": "cd4389b182aec622c8121ee8db988359197159c1", + }, + type="Retrieval", + category="s2p", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="ndcg_at_10", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Academic"], + task_subtypes=["Question answering"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["Touche2020"], + ) diff --git a/mteb/tasks/STS/__init__.py b/mteb/tasks/STS/__init__.py index 471789f1c9..00ccf0d7ff 100644 --- a/mteb/tasks/STS/__init__.py +++ b/mteb/tasks/STS/__init__.py @@ -29,4 +29,7 @@ from .rus.RUParaPhraserSTS import * from .rus.RuSTSBenchmarkSTS import * from .spa.STSES import * +from .vie.BiossesSTSVN import * +from .vie.SickrSTSVN import * +from .vie.STSBenchmarkSTSVN import * from .zho.CMTEBSTS import * diff --git a/mteb/tasks/STS/vie/BiossesSTSVN.py b/mteb/tasks/STS/vie/BiossesSTSVN.py new file mode 100644 index 0000000000..dcc1819289 --- /dev/null +++ b/mteb/tasks/STS/vie/BiossesSTSVN.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class BiossesSTSVN(AbsTaskSTS): + metadata = TaskMetadata( + name="BIOSSES-VN", + dataset={ + "path": "GreenNode/biosses-sts-vn", + "revision": "1dae4a6df91c0852680cd4ab48c8c1d8a9ed49b2", + }, + description="""A translated dataset from Biomedical Semantic Similarity Estimation. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://tabilab.cmpe.boun.edu.tr/BIOSSES/DataSet.html", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="cosine_spearman", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Medical"], + task_subtypes=[], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["BIOSSES"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py b/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py new file mode 100644 index 0000000000..1c7879a01d --- /dev/null +++ b/mteb/tasks/STS/vie/STSBenchmarkSTSVN.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskSTS import AbsTaskSTS +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class STSBenchmarkSTSVN(AbsTaskSTS): + metadata = TaskMetadata( + name="STSBenchmark-VN", + dataset={ + "path": "GreenNode/stsbenchmark-sts-vn", + "revision": "f24d66738cda4a02138ada5af7689a92ce1fcad6", + }, + description="""A translated dataset from Semantic Textual Similarity Benchmark (STSbenchmark) dataset. + The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system: + - The system uses large language models (LLMs), specifically Coherence's Aya model, for translation. + - Applies advanced embedding models to filter the translations. 
+ - Uses an LLM-as-a-judge to score the quality of the samples based on multiple criteria.""",
+ reference="https://github.com/PhilipMay/stsb-multi-mt/",
+ type="STS",
+ category="s2s",
+ eval_splits=["test"],
+ eval_langs=["vie-Latn"],
+ main_score="cosine_spearman",
+ date=("2025-07-29", "2025-07-30"),
+ license="cc-by-sa-4.0",
+ annotations_creators="derived",
+ dialect=[],
+ sample_creation="machine-translated and LM verified",
+ domains=["Blog", "News", "Written"],
+ task_subtypes=[],
+ bibtex_citation=r"""
+@misc{pham2025vnmtebvietnamesemassivetext,
+ archiveprefix = {arXiv},
+ author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang},
+ eprint = {2507.21500},
+ primaryclass = {cs.CL},
+ title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark},
+ url = {https://arxiv.org/abs/2507.21500},
+ year = {2025},
+}
+""",
+ adapted_from=["STSBenchmark"],
+ )
+
+ @property
+ def metadata_dict(self) -> dict[str, str]:
+ metadata_dict = super().metadata_dict
+ metadata_dict["min_score"] = 0
+ metadata_dict["max_score"] = 5
+ return metadata_dict
diff --git a/mteb/tasks/STS/vie/SickrSTSVN.py b/mteb/tasks/STS/vie/SickrSTSVN.py
new file mode 100644
index 0000000000..93c5d585d2
--- /dev/null
+++ b/mteb/tasks/STS/vie/SickrSTSVN.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from mteb.abstasks.AbsTaskSTS import AbsTaskSTS
+from mteb.abstasks.TaskMetadata import TaskMetadata
+
+
+class SickrSTSVN(AbsTaskSTS):
+ metadata = TaskMetadata(
+ name="SICK-R-VN",
+ dataset={
+ "path": "GreenNode/sickr-sts-vn",
+ "revision": "bc89f0401983c456b609f7fb324278f346b2cccf",
+ },
+ description="""A translated dataset from the Semantic Textual Similarity SICK-R dataset.
+ The process of creating the VN-MTEB (Vietnamese Massive Text Embedding Benchmark) from English samples involves a new automated system:
+ - The system uses large language models (LLMs), specifically Cohere's Aya model, for translation.
+ - Applies advanced embedding models to filter the translations.
+ - Use LLM-as-a-judge to scoring the quality of the samples base on multiple criteria.""", + reference="https://aclanthology.org/2020.lrec-1.207", + type="STS", + category="s2s", + eval_splits=["test"], + eval_langs=["vie-Latn"], + main_score="cosine_spearman", + date=("2025-07-29", "2025-07-30"), + license="cc-by-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="machine-translated and LM verified", + domains=["Web", "Written"], + task_subtypes=["Textual Entailment"], + bibtex_citation=r""" +@misc{pham2025vnmtebvietnamesemassivetext, + archiveprefix = {arXiv}, + author = {Loc Pham and Tung Luu and Thu Vo and Minh Nguyen and Viet Hoang}, + eprint = {2507.21500}, + primaryclass = {cs.CL}, + title = {VN-MTEB: Vietnamese Massive Text Embedding Benchmark}, + url = {https://arxiv.org/abs/2507.21500}, + year = {2025}, +} +""", + adapted_from=["SICK-R"], + ) + + @property + def metadata_dict(self) -> dict[str, str]: + metadata_dict = super().metadata_dict + metadata_dict["min_score"] = 0 + metadata_dict["max_score"] = 5 + return metadata_dict diff --git a/mteb/tasks/STS/vie/__init__.py b/mteb/tasks/STS/vie/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/data/clean_and_update_tasks.py b/scripts/data/clean_and_update_tasks.py index e74576b14b..caeed2649a 100644 --- a/scripts/data/clean_and_update_tasks.py +++ b/scripts/data/clean_and_update_tasks.py @@ -7,13 +7,13 @@ import warnings from dataclasses import dataclass from pathlib import Path -from typing import Literal, Optional +from typing import Literal import datasets import orjson import pandas as pd import typer -from datasets import Dataset, DatasetDict, load_dataset +from datasets import Dataset, DatasetDict from huggingface_hub import HfApi from tqdm import tqdm @@ -777,7 +777,7 @@ def update_v2_metadata_dataset( lines, ds_deleted = _update_dataset_dict(lines, call_node, new_path, new_revision) lines = _update_eval_splits(lines, call_node, module) - all_deleted_indices = sorted(list(set(desc_deleted + ds_deleted)), reverse=True) + all_deleted_indices = sorted(set(desc_deleted + ds_deleted), reverse=True) for i in all_deleted_indices: del lines[i] @@ -984,7 +984,7 @@ def create_and_prepare( "scripts/data/cleaning_reports", exists=True, dir_okay=True ), username: str = "mteb", - start_lang: Optional[str] = None, + start_lang: str | None = None, verbose: bool = typer.Option(False, "--verbose"), ) -> None: changed_tasks: list[tuple[str, int]] = [] @@ -1020,7 +1020,7 @@ def create_and_prepare( report_folder, folder.name, all_original_records, all_filter_records ) - unique_changed = sorted(list(set(changed_tasks))) + unique_changed = sorted(set(changed_tasks)) tasks_str = " ".join( f"{task_name} {task_name}.v{version}" for task_name, version in unique_changed @@ -1038,7 +1038,7 @@ def compare_results( results_dir: Path = typer.Option( "/home/admin/vatolin/experiments/mteb/results", exists=True, dir_okay=True ), - tasks_file: Optional[Path] = typer.Option( + tasks_file: Path | None = typer.Option( None, "--tasks-file", "-f", diff --git a/tests/test_models/test_model_meta.py b/tests/test_models/test_model_meta.py index f0c54dd99c..3a2b214068 100644 --- a/tests/test_models/test_model_meta.py +++ b/tests/test_models/test_model_meta.py @@ -62,6 +62,7 @@ def test_model_similar_tasks(training_datasets): "Touche2020", "Touche2020-Fa", "Touche2020-NL", + "Touche2020-VN", "Touche2020Retrieval.v3", ] assert sorted(dummy_model_meta.get_training_datasets().keys()) == expected
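For reference, a minimal sketch of how the newly added Vietnamese tasks could be run with mteb once this patch is applied. The task names below are taken from the metadata in the diff; the embedding model is an arbitrary multilingual example and the output folder is hypothetical, not part of the patch.

import mteb
from sentence_transformers import SentenceTransformer

# Arbitrary multilingual example model; any SentenceTransformer-compatible encoder works.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Task names match the `name` fields defined in the new task files above.
tasks = mteb.get_tasks(tasks=["CQADupstackAndroid-VN", "SICK-R-VN"])
evaluation = mteb.MTEB(tasks=tasks)

# Hypothetical output folder; results are written as JSON per task.
results = evaluation.run(model, output_folder="results/vn-mteb-check")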