diff --git a/mteb/tasks/Clustering/__init__.py b/mteb/tasks/Clustering/__init__.py index 2ec2d69132..0c86095000 100644 --- a/mteb/tasks/Clustering/__init__.py +++ b/mteb/tasks/Clustering/__init__.py @@ -29,6 +29,8 @@ from .fra.HALClusteringS2S import * from .jpn.LivedoorNewsClustering import * from .jpn.MewsC16JaClustering import * +from .kor.KlueMrcDomainClustering import * +from .kor.KlueYnatMrcCategoryClustering import * from .multilingual.IndicReviewsClusteringP2P import * from .multilingual.MasakhaNEWSClusteringP2P import * from .multilingual.MasakhaNEWSClusteringS2S import * diff --git a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py index 0b94e671c6..fc2b27b884 100644 --- a/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py +++ b/mteb/tasks/Clustering/kor/KlueMrcDomainClustering.py @@ -9,7 +9,7 @@ class KlueMrcDomainClustering(AbsTaskClustering): metadata = TaskMetadata( name="KlueMrcDomainClustering", - description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education ", + description="this dataset is a processed and redistributed version of the KLUE-MRC dataset. Domain: Game / Media / Automotive / Finance / Real Estate / Education", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_context_domain", type="Clustering", category="p2p", @@ -22,13 +22,12 @@ class KlueMrcDomainClustering(AbsTaskClustering): "revision": "a814b5ef0b6814991785f2c31af8e38ef7bb3f0d", }, date=("2016-01-01", "2020-12-31"), - form="Written", domains=["News", "Written"], task_subtypes=[], license="cc-by-sa-4.0", annotations_creators="human-annotated", dialect=[], - text_creation="found", + sample_creation="found", bibtex_citation="""@misc{park2021klue, title={KLUE: Korean Language Understanding Evaluation}, author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, @@ -37,6 +36,7 @@ class KlueMrcDomainClustering(AbsTaskClustering): archivePrefix={arXiv}, primaryClass={cs.CL}, }""", + prompt="Identify the topic or theme of the given texts", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py index 86fc961f04..d31dd87add 100644 --- a/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py +++ b/mteb/tasks/Clustering/kor/KlueYnatMrcCategoryClustering.py @@ -9,10 +9,10 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): metadata = TaskMetadata( name="KlueYnatMrcCategoryClustering", - description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate ", + description="this dataset is a processed and redistributed version of the KLUE-Ynat & KLUE-MRC dataset. News_category: IT/Science, Sports, Media/Culture, Ecomomy/Finance, Real Estate", reference="https://huggingface.co/datasets/on-and-on/clustering_klue_mrc_ynat_title", type="Clustering", - category="p2p", + category="s2s", modalities=["text"], eval_splits=["test"], eval_langs=["kor-Hang"], @@ -22,13 +22,12 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): "revision": "5bbded98f39e3bf6e81e15aa79c6616008519e29", }, date=("2016-01-01", "2020-12-31"), - form="Written", domains=["News", "Written"], task_subtypes=[], license="cc-by-sa-4.0", annotations_creators="human-annotated", dialect=[], - text_creation="found", + sample_creation="found", bibtex_citation="""@misc{park2021klue, title={KLUE: Korean Language Understanding Evaluation}, author={Sungjoon Park and Jihyung Moon and Sungdong Kim and Won Ik Cho and Jiyoon Han and Jangwon Park and Chisung Song and Junseong Kim and Yongsook Song and Taehwan Oh and Joohong Lee and Juhyun Oh and Sungwon Lyu and Younghoon Jeong and Inkwon Lee and Sangwoo Seo and Dongjun Lee and Hyunwoo Kim and Myeonghwa Lee and Seongbo Jang and Seungwon Do and Sunkyoung Kim and Kyungtae Lim and Jongwon Lee and Kyumin Park and Jamin Shin and Seonghyun Kim and Lucy Park and Alice Oh and Jungwoo Ha and Kyunghyun Cho}, @@ -37,6 +36,7 @@ class KlueYnatMrcCategoryClustering(AbsTaskClustering): archivePrefix={arXiv}, primaryClass={cs.CL}, }""", + prompt="Identify the topic or theme of the given texts", ) def dataset_transform(self): diff --git a/mteb/tasks/Clustering/kor/__init__.py b/mteb/tasks/Clustering/kor/__init__.py new file mode 100644 index 0000000000..e69de29bb2