diff --git a/mteb/tasks/Classification/__init__.py b/mteb/tasks/Classification/__init__.py index eafd1a7109..36592d1e0d 100644 --- a/mteb/tasks/Classification/__init__.py +++ b/mteb/tasks/Classification/__init__.py @@ -71,8 +71,10 @@ from .hrv.FrenkHrClassification import * from .ind.IndonesianIdClickbaitClassification import * from .ind.IndonesianMongabayConservationClassification import * +from .ita.DadoEvalCoarseClassification import * from .ita.ItaCaseholdClassification import * from .ita.ItalianLinguistAcceptabilityClassification import * +from .ita.SardiStanceClassification import * from .jav.JavaneseIMDBClassification import * from .jpn.WRIMEClassification import * from .kan.KannadaNewsClassification import * diff --git a/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py b/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py new file mode 100644 index 0000000000..f5d71cdf3f --- /dev/null +++ b/mteb/tasks/Classification/ita/DadoEvalCoarseClassification.py @@ -0,0 +1,48 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class DadoEvalCoarseClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="DadoEvalCoarseClassification", + dataset={ + "path": "MattiaSangermano/DaDoEval", + "revision": "7a78eb7cc137fdd1c5826be1a9e9813177706509", + }, + description="The DaDoEval dataset is a curated collection of 2,759 documents authored by Alcide De Gasperi, spanning the period from 1901 to 1954. 
Each document in the dataset is manually tagged with its date of issue.", + reference="https://github.com/dhfbk/DaDoEval", + type="Classification", + date=("1901-01-01", "1954-12-31"), + category="s2s", + modalities=["text"], + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Written"], + task_subtypes=[], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{menini2020dadoeval, + author = {Menini, Stefano and Moretti, Giovanni and Sprugnoli, Rachele and Tonelli, Sara and others}, + booktitle = {Proceedings of the Seventh Evaluation Campaign of Natural Language Processing and Speech Tools for Italian. Final Workshop (EVALITA 2020)}, + organization = {Accademia University Press}, + pages = {391--397}, + title = {DaDoEval@ EVALITA 2020: Same-genre and cross-genre dating of historical documents}, + year = {2020}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_column("class", "label") + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/Classification/ita/SardiStanceClassification.py b/mteb/tasks/Classification/ita/SardiStanceClassification.py new file mode 100644 index 0000000000..b1ebf99f84 --- /dev/null +++ b/mteb/tasks/Classification/ita/SardiStanceClassification.py @@ -0,0 +1,47 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskClassification import AbsTaskClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class SardiStanceClassification(AbsTaskClassification): + metadata = TaskMetadata( + name="SardiStanceClassification", + dataset={ + "path": "MattiaSangermano/SardiStance", + "revision": "e25d91e6f6a28ebef42212128f0d5e275b676233", + }, + description="SardiStance is a unique dataset designed for the task of stance detection in 
Italian tweets. It consists of tweets related to the Sardines movement, providing a valuable resource for researchers and practitioners in the field of NLP.", + reference="https://github.com/mirkolai/evalita-sardistance", + type="Classification", + category="s2s", + date=("2019-11-01", "2020-01-31"), + modalities=["text"], + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Social"], + task_subtypes=["Political classification"], + license="cc-by-nc-sa-4.0", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{cignarella2020sardistance, + author = {Cignarella, Alessandra Teresa and Lai, Mirko and Bosco, Cristina and Patti, Viviana and Rosso, Paolo and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {Ceur}, + pages = {1--10}, + title = {Sardistance@ evalita2020: Overview of the task on stance detection in italian tweets}, + year = {2020}, +} +""", + ) + + def dataset_transform(self): + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/MultiLabelClassification/__init__.py b/mteb/tasks/MultiLabelClassification/__init__.py index 6b8ab6b7d4..096f96a880 100644 --- a/mteb/tasks/MultiLabelClassification/__init__.py +++ b/mteb/tasks/MultiLabelClassification/__init__.py @@ -1,5 +1,6 @@ from __future__ import annotations +from .ita.EmitClassification import * from .kor.KorHateSpeechMLClassification import * from .mlt.MalteseNewsClassification import * from .multilingual.MultiEURLEXMultilabelClassification import * diff --git a/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py b/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py new file mode 100644 index 0000000000..a6ca105c4b --- /dev/null +++ b/mteb/tasks/MultiLabelClassification/ita/EmitClassification.py @@ -0,0 +1,55 @@ +from __future__ import 
annotations + +from mteb.abstasks.AbsTaskMultilabelClassification import ( + AbsTaskMultilabelClassification, +) +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class EmitClassification(AbsTaskMultilabelClassification): + metadata = TaskMetadata( + name="EmitClassification", + description="""The EMit dataset is a comprehensive resource for the detection of emotions in Italian social media texts. + The EMit dataset consists of social media messages about TV shows, TV series, music videos, and advertisements. + Each message is annotated with one or more of the 8 primary emotions defined by Plutchik + (anger, anticipation, disgust, fear, joy, sadness, surprise, trust), as well as an additional label “love.” + """, + reference="https://github.com/oaraque/emit", + dataset={ + "path": "MattiaSangermano/emit", + "revision": "b0ceff2da0ca463d5c8c97a4e1c6e40545a1c3a6", + }, + type="MultilabelClassification", + category="s2s", + modalities=["text"], + date=("2022-01-01", "2022-12-31"), + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="accuracy", + domains=["Social", "Written"], + task_subtypes=["Sentiment/Hate speech"], + license="cc-by-sa-4.0", + annotations_creators="expert-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{araque2023emit, + author = {Araque, O and Frenda, S and Sprugnoli, R and Nozza, D and Patti, V and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {CEUR-WS}, + pages = {1--8}, + title = {EMit at EVALITA 2023: Overview of the Categorical Emotion Detection in Italian Social Media Task}, + volume = {3473}, + year = {2023}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.rename_columns({"emotion_labels": "label"}) + unused_cols = [ + col + for col in self.dataset["test"].column_names + if col not in ["text", "label"] + ] + self.dataset = self.dataset.remove_columns(unused_cols) diff --git a/mteb/tasks/MultiLabelClassification/ita/__init__.py 
b/mteb/tasks/MultiLabelClassification/ita/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/PairClassification/__init__.py b/mteb/tasks/PairClassification/__init__.py index f562879bd8..d3ecd19272 100644 --- a/mteb/tasks/PairClassification/__init__.py +++ b/mteb/tasks/PairClassification/__init__.py @@ -16,6 +16,7 @@ from .fas.FarsTail import * from .hye.ArmenianParaphrasePC import * from .ind.IndoNLI import * +from .ita.DisCoTexPairClassification import * from .kor.KlueNLI import * from .multilingual.OpusparcusPC import * from .multilingual.PawsXPairClassification import * diff --git a/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py b/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py new file mode 100644 index 0000000000..9204eb23a6 --- /dev/null +++ b/mteb/tasks/PairClassification/ita/DisCoTexPairClassification.py @@ -0,0 +1,55 @@ +from __future__ import annotations + +from mteb.abstasks.AbsTaskPairClassification import AbsTaskPairClassification +from mteb.abstasks.TaskMetadata import TaskMetadata + + +class DisCoTexPairClassification(AbsTaskPairClassification): + metadata = TaskMetadata( + name="DisCoTexPairClassification", + description="The DisCoTEX dataset aims at assessing discourse coherence in Italian texts. 
This dataset focuses on Italian real-world texts and provides resources to model coherence in natural language.", + reference="https://github.com/davidecolla/DisCoTex", + dataset={ + "path": "MattiaSangermano/DisCoTex-last-sentence", + "revision": "ab9ea43f8e54c8b24b12cd1b77d6eb462385a30b", + }, + type="PairClassification", + category="s2s", + modalities=["text"], + date=("2023-01-01", "2023-12-31"), + eval_splits=["test"], + eval_langs=["ita-Latn"], + main_score="max_ap", + domains=["Social", "Written"], + task_subtypes=[], + license="not specified", + annotations_creators="derived", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@inproceedings{brunato2023discotex, + author = {Brunato, Dominique and Colla, Davide and Dell'Orletta, Felice and Dini, Irene and Radicioni, Daniele Paolo and Ravelli, Andrea Amelio and others}, + booktitle = {CEUR WORKSHOP PROCEEDINGS}, + organization = {CEUR}, + pages = {1--8}, + title = {DisCoTex at EVALITA 2023: overview of the assessing discourse coherence in Italian texts task}, + volume = {3473}, + year = {2023}, +} +""", + ) + + def dataset_transform(self): + self.dataset = self.dataset.remove_columns(["id", "source"]) + self.dataset = self.dataset.map( + lambda x: { + "prompt": [x["prompt"]], + "target": [x["target"]], + "class": [x["class"]], + }, + batched=True, + batch_size=len(self.dataset["test"]), + ) + self.dataset = self.dataset.rename_column("prompt", "sentence1") + self.dataset = self.dataset.rename_column("target", "sentence2") + self.dataset = self.dataset.rename_column("class", "labels") diff --git a/mteb/tasks/PairClassification/ita/__init__.py b/mteb/tasks/PairClassification/ita/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mteb/tasks/Reranking/__init__.py b/mteb/tasks/Reranking/__init__.py index 4fa8ed73cf..1f992b48f5 100644 --- a/mteb/tasks/Reranking/__init__.py +++ b/mteb/tasks/Reranking/__init__.py @@ -13,5 +13,6 @@ from .multilingual.ESCIReranking import * from
.multilingual.MIRACLReranking import * from .multilingual.WikipediaRerankingMultilingual import * +from .multilingual.XGlueWPRReranking import * from .rus.RuBQReranking import * from .zho.CMTEBReranking import * diff --git a/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py b/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py new file mode 100644 index 0000000000..59c972f3e8 --- /dev/null +++ b/mteb/tasks/Reranking/multilingual/XGlueWPRReranking.py @@ -0,0 +1,145 @@ +from __future__ import annotations + +import logging + +import datasets +import pandas as pd + +from mteb.abstasks.AbsTaskReranking import AbsTaskReranking +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +logger = logging.getLogger(__name__) + + +_LANGUAGES = { + "de": ["deu-Latn"], + "en": ["eng-Latn"], + "es": ["spa-Latn"], + "fr": ["fra-Latn"], + "it": ["ita-Latn"], + "pt": ["por-Latn"], + "zh": ["zho-Hans"], +} + +_CITATION = r""" +@misc{11234/1-3105, + author = {Zeman, Daniel and Nivre, Joakim and Abrams, Mitchell and Aepli, No{\"e}mi and Agi{\'c}, {\v Z}eljko and Ahrenberg, Lars and Aleksandravi{\v c}i{\=u}t{\.e}, Gabriel{\.e} and Antonsen, Lene and Aplonova, Katya and Aranzabe, Maria Jesus and Arutie, Gashaw and Asahara, Masayuki and Ateyah, Luma and Attia, Mohammed and Atutxa, Aitziber and Augustinus, Liesbeth and Badmaeva, Elena and Ballesteros, Miguel and Banerjee, Esha and Bank, Sebastian and Barbu Mititelu, Verginica and Basmov, Victoria and Batchelor, Colin and Bauer, John and Bellato, Sandra and Bengoetxea, Kepa and Berzak, Yevgeni and Bhat, Irshad Ahmad and Bhat, Riyaz Ahmad and Biagetti, Erica and Bick, Eckhard and Bielinskien{\.e}, Agn{\.e} and Blokland, Rogier and Bobicev, Victoria and Boizou, Lo{\"{\i}}c and Borges V{\"o}lker, Emanuel and B{\"o}rstell, Carl and Bosco, Cristina and Bouma, Gosse and Bowman, Sam and Boyd, Adriane and Brokait{\.e}, Kristina and Burchardt, Aljoscha and Candito, Marie and Caron, 
Bernard and Caron, Gauthier and Cavalcanti, Tatiana and Cebiro{\u g}lu Eryi{\u g}it, G{\"u}l{\c s}en and Cecchini, Flavio Massimiliano and Celano, Giuseppe G. A. and {\v C}{\'e}pl{\"o}, Slavom{\'{\i}}r and Cetin, Savas and Chalub, Fabricio and Choi, Jinho and Cho, Yongseok and Chun, Jayeol and Cignarella, Alessandra T. and Cinkov{\'a}, Silvie and Collomb, Aur{\'e}lie and {\c C}{\"o}ltekin, {\c C}a{\u g}r{\i} and Connor, Miriam and Courtin, Marine and Davidson, Elizabeth and de Marneffe, Marie-Catherine and de Paiva, Valeria and de Souza, Elvis and Diaz de Ilarraza, Arantza and Dickerson, Carly and Dione, Bamba and Dirix, Peter and Dobrovoljc, Kaja and Dozat, Timothy and Droganova, Kira and Dwivedi, Puneet and Eckhoff, Hanne and Eli, Marhaba and Elkahky, Ali and Ephrem, Binyam and Erina, Olga and Erjavec, Toma{\v z} and Etienne, Aline and Evelyn, Wograine and Farkas, Rich{\'a}rd and Fernandez Alcalde, Hector and Foster, Jennifer and Freitas, Cl{\'a}udia and Fujita, Kazunori and Gajdo{\v s}ov{\'a}, Katar{\'{\i}}na and Galbraith, Daniel and Garcia, Marcos and G{\"a}rdenfors, Moa and Garza, Sebastian and Gerdes, Kim and Ginter, Filip and Goenaga, Iakes and Gojenola, Koldo and G{\"o}k{\i}rmak, Memduh and Goldberg, Yoav and G{\'o}mez Guinovart, Xavier and Gonz{\'a}lez Saavedra, Berta and Grici{\=u}t{\.e}, Bernadeta and Grioni, Matias and Gr{\=u}z{\={\i}}tis, Normunds and Guillaume, Bruno and Guillot-Barbance, C{\'e}line and Habash, Nizar and Haji{\v c}, Jan and Haji{\v c} jr., Jan and H{\"a}m{\"a}l{\"a}inen, Mika and H{\`a} M{\~y}, Linh and Han, Na-Rae and Harris, Kim and Haug, Dag and Heinecke, Johannes and Hennig, Felix and Hladk{\'a}, Barbora and Hlav{\'a}{\v c}ov{\'a}, Jaroslava and Hociung, Florinel and Hohle, Petter and Hwang, Jena and Ikeda, Takumi and Ion, Radu and Irimia, Elena and Ishola, {\d O}l{\'a}j{\'{\i}}d{\'e} and Jel{\'{\i}}nek, Tom{\'a}{\v s} and Johannsen, Anders and J{\o}rgensen, Fredrik and Juutinen, Markus and Ka{\c s}{\i}kara, H{\"u}ner and Kaasen, 
Andre and Kabaeva, Nadezhda and Kahane, Sylvain and Kanayama, Hiroshi and Kanerva, Jenna and Katz, Boris and Kayadelen, Tolga and Kenney, Jessica and Kettnerov{\'a}, V{\'a}clava and Kirchner, Jesse and Klementieva, Elena and K{\"o}hn, Arne and Kopacewicz, Kamil and Kotsyba, Natalia and Kovalevskait{\.e}, Jolanta and Krek, Simon and Kwak, Sookyoung and Laippala, Veronika and Lambertino, Lorenzo and Lam, Lucia and Lando, Tatiana and Larasati, Septina Dian and Lavrentiev, Alexei and Lee, John and L{\^e} H{\`{\^o}}ng, Phương and Lenci, Alessandro and Lertpradit, Saran and Leung, Herman and Li, Cheuk Ying and Li, Josie and Li, Keying and Lim, {KyungTae} and Liovina, Maria and Li, Yuan and Ljube{\v s}i{\'c}, Nikola and Loginova, Olga and Lyashevskaya, Olga and Lynn, Teresa and Macketanz, Vivien and Makazhanov, Aibek and Mandl, Michael and Manning, Christopher and Manurung, Ruli and M{\u a}r{\u a}nduc, C{\u a}t{\u a}lina and Mare{\v c}ek, David and Marheinecke, Katrin and Mart{\'{\i}}nez Alonso, H{\'e}ctor and Martins, Andr{\'e} and Ma{\v s}ek, Jan and Matsumoto, Yuji and {McDonald}, Ryan and {McGuinness}, Sarah and Mendon{\c c}a, Gustavo and Miekka, Niko and Misirpashayeva, Margarita and Missil{\"a}, Anna and Mititelu, C{\u a}t{\u a}lin and Mitrofan, Maria and Miyao, Yusuke and Montemagni, Simonetta and More, Amir and Moreno Romero, Laura and Mori, Keiko Sophie and Morioka, Tomohiko and Mori, Shinsuke and Moro, Shigeki and Mortensen, Bjartur and Moskalevskyi, Bohdan and Muischnek, Kadri and Munro, Robert and Murawaki, Yugo and M{\"u}{\"u}risep, Kaili and Nainwani, Pinkey and Navarro Hor{\~n}iacek, Juan Ignacio and Nedoluzhko, Anna and Ne{\v s}pore-B{\=e}rzkalne, Gunta and Nguy{\~{\^e}}n Th{\d i}, Lương and Nguy{\~{\^e}}n Th{\d i} Minh, Huy{\`{\^e}}n and Nikaido, Yoshihiro and Nikolaev, Vitaly and Nitisaroj, Rattima and Nurmi, Hanna and Ojala, Stina and Ojha, Atul Kr. 
and Ol{\'u}{\`o}kun, Ad{\'e}day{\d o}̀ and Omura, Mai and Osenova, Petya and {\"O}stling, Robert and {\O}vrelid, Lilja and Partanen, Niko and Pascual, Elena and Passarotti, Marco and Patejuk, Agnieszka and Paulino-Passos, Guilherme and Peljak-{\L}api{\'n}ska, Angelika and Peng, Siyao and Perez, Cenel-Augusto and Perrier, Guy and Petrova, Daria and Petrov, Slav and Phelan, Jason and Piitulainen, Jussi and Pirinen, Tommi A and Pitler, Emily and Plank, Barbara and Poibeau, Thierry and Ponomareva, Larisa and Popel, Martin and Pretkalni{\c n}a, Lauma and Pr{\'e}vost, Sophie and Prokopidis, Prokopis and Przepi{\'o}rkowski, Adam and Puolakainen, Tiina and Pyysalo, Sampo and Qi, Peng and R{\"a}{\"a}bis, Andriela and Rademaker, Alexandre and Ramasamy, Loganathan and Rama, Taraka and Ramisch, Carlos and Ravishankar, Vinit and Real, Livy and Reddy, Siva and Rehm, Georg and Riabov, Ivan and Rie{\ss}ler, Michael and Rimkut{\.e}, Erika and Rinaldi, Larissa and Rituma, Laura and Rocha, Luisa and Romanenko, Mykhailo and Rosa, Rudolf and Rovati, Davide and Roșca, Valentin and Rudina, Olga and Rueter, Jack and Sadde, Shoval and Sagot, Beno{\^{\i}}t and Saleh, Shadi and Salomoni, Alessio and Samard{\v z}i{\'c}, Tanja and Samson, Stephanie and Sanguinetti, Manuela and S{\"a}rg, Dage and Saul{\={\i}}te, Baiba and Sawanakunanon, Yanin and Schneider, Nathan and Schuster, Sebastian and Seddah, Djam{\'e} and Seeker, Wolfgang and Seraji, Mojgan and Shen, Mo and Shimada, Atsuko and Shirasu, Hiroyuki and Shohibussirri, Muh and Sichinava, Dmitry and Silveira, Aline and Silveira, Natalia and Simi, Maria and Simionescu, Radu and Simk{\'o}, Katalin and {\v S}imkov{\'a}, M{\'a}ria and Simov, Kiril and Smith, Aaron and Soares-Bastos, Isabela and Spadine, Carolyn and Stella, Antonio and Straka, Milan and Strnadov{\'a}, Jana and Suhr, Alane and Sulubacak, Umut and Suzuki, Shingo and Sz{\'a}nt{\'o}, Zsolt and Taji, Dima and Takahashi, Yuta and Tamburini, Fabio and Tanaka, Takaaki and Tellier, Isabelle 
and Thomas, Guillaume and Torga, Liisi and Trosterud, Trond and Trukhina, Anna and Tsarfaty, Reut and Tyers, Francis and Uematsu, Sumire and Ure{\v s}ov{\'a}, Zde{\v n}ka and Uria, Larraitz and Uszkoreit, Hans and Utka, Andrius and Vajjala, Sowmya and van Niekerk, Daniel and van Noord, Gertjan and Varga, Viktor and Villemonte de la Clergerie, Eric and Vincze, Veronika and Wallin, Lars and Walsh, Abigail and Wang, Jing Xian and Washington, Jonathan North and Wendt, Maximilan and Williams, Seyi and Wir{\'e}n, Mats and Wittern, Christian and Woldemariam, Tsegay and Wong, Tak-sum and Wr{\'o}blewska, Alina and Yako, Mary and Yamazaki, Naoki and Yan, Chunxiao and Yasuoka, Koichi and Yavrumyan, Marat M. and Yu, Zhuoran and {\v Z}abokrtsk{\'y}, Zden{\v e}k and Zeldes, Amir and Zhang, Manying and Zhu, Hanzhi}, + copyright = {Licence Universal Dependencies v2.5}, + note = {{LINDAT}/{CLARIAH}-{CZ} digital library at the Institute of Formal and Applied Linguistics ({{\'U}FAL}), Faculty of Mathematics and Physics, Charles University}, + title = {Universal Dependencies 2.5}, + url = {http://hdl.handle.net/11234/1-3105}, + year = {2019}, +} + +@inproceedings{Conneau2018XNLIEC, + author = {Alexis Conneau and Guillaume Lample and Ruty Rinott and Adina Williams and Samuel R. 
Bowman and Holger Schwenk and Veselin Stoyanov}, + booktitle = {EMNLP}, + title = {XNLI: Evaluating Cross-lingual Sentence Representations}, + year = {2018}, +} + +@article{Lewis2019MLQAEC, + author = {Patrick Lewis and Barlas Oguz and Ruty Rinott and Sebastian Riedel and Holger Schwenk}, + journal = {ArXiv}, + title = {MLQA: Evaluating Cross-lingual Extractive Question Answering}, + volume = {abs/1910.07475}, + year = {2019}, +} + +@article{Liang2020XGLUEAN, + author = {Yaobo Liang and Nan Duan and Yeyun Gong and Ning Wu and Fenfei Guo and Weizhen Qi and Ming Gong and Linjun Shou and Daxin Jiang and Guihong Cao and Xiaodong Fan and Ruofei Zhang and Rahul Agrawal and Edward Cui and Sining Wei and Taroon Bharti and Ying Qiao and Jiun-Hung Chen and Winnie Wu and Shuguang Liu and Fan Yang and Daniel Campos and Rangan Majumder and Ming Zhou}, + journal = {arXiv}, + title = {XGLUE: A New Benchmark Dataset for Cross-lingual Pre-training, Understanding and Generation}, + volume = {abs/2004.01401}, + year = {2020}, +} + +@article{Sang2002IntroductionTT, + author = {Erik F. Tjong Kim Sang}, + journal = {ArXiv}, + title = {Introduction to the CoNLL-2002 Shared Task: Language-Independent Named Entity Recognition}, + volume = {cs.CL/0209010}, + year = {2002}, +} + +@article{Sang2003IntroductionTT, + author = {Erik F. 
Tjong Kim Sang and Fien De Meulder}, + journal = {ArXiv}, + title = {Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition}, + volume = {cs.CL/0306050}, + year = {2003}, +} + +@article{Yang2019PAWSXAC, + author = {Yinfei Yang and Yuan Zhang and Chris Tar and Jason Baldridge}, + journal = {ArXiv}, + title = {PAWS-X: A Cross-lingual Adversarial Dataset for Paraphrase Identification}, + volume = {abs/1908.11828}, + year = {2019}, +} +""" + + +class XGlueWPRReranking(MultilingualTask, AbsTaskReranking): + metadata = TaskMetadata( + name="XGlueWPRReranking", + description="""XGLUE is a new benchmark dataset to evaluate the performance of cross-lingual pre-trained models + with respect to cross-lingual natural language understanding and generation. XGLUE is composed of 11 tasks spans 19 languages.""", + reference="https://github.com/microsoft/XGLUE", + dataset={ + "path": "forresty/xglue", + "revision": "833b866f2f71a28d7251569020f0ff82ee5fdbbb", + "name": "wpr", + "trust_remote_code": True, + }, + type="Reranking", + category="s2p", + date=("2019-01-01", "2020-12-31"), + modalities=["text"], + eval_splits=["validation", "test"], + eval_langs=_LANGUAGES, + main_score="map", + domains=["Written"], + task_subtypes=[], + license="http://hdl.handle.net/11234/1-3105", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=_CITATION, + ) + + def load_data(self, **kwargs): + def _aggregate_texts(group, neg_label): + return pd.Series( + { + "positive": group[group["relavance_label"] != neg_label][ + "text" + ].tolist(), + "negative": group[group["relavance_label"] == neg_label][ + "text" + ].tolist(), + } + ) + + self.dataset = {} + for lang in self.hf_subsets: + ds = {} + for eval_split in self.metadata.eval_splits: + ds[eval_split] = datasets.load_dataset( + split=f"{eval_split}.{lang}", **self.metadata.dataset + ).map(lambda x: {"text": x["web_page_title"] + x["web_page_snippet"]}) + + neg_label 
= ds[eval_split].features["relavance_label"].str2int("Bad") + + grouped_df = ( + ds[eval_split] + .to_pandas() + .groupby("query") + .apply(_aggregate_texts, neg_label=neg_label) + .reset_index() + ) + + ds[eval_split] = datasets.Dataset.from_pandas(grouped_df) + + self.dataset[lang] = datasets.DatasetDict(ds) diff --git a/mteb/tasks/Retrieval/__init__.py b/mteb/tasks/Retrieval/__init__.py index 39932c47ec..31613e4532 100644 --- a/mteb/tasks/Retrieval/__init__.py +++ b/mteb/tasks/Retrieval/__init__.py @@ -129,6 +129,7 @@ from .multilingual.IndicQARetrieval import * from .multilingual.MintakaRetrieval import * from .multilingual.MIRACLRetrieval import * +from .multilingual.MKQARetrieval import * from .multilingual.MLQARetrieval import * from .multilingual.MrTidyRetrieval import * from .multilingual.MultiLongDocRetrieval import * diff --git a/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py b/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py new file mode 100644 index 0000000000..d36ba40ec8 --- /dev/null +++ b/mteb/tasks/Retrieval/multilingual/MKQARetrieval.py @@ -0,0 +1,134 @@ +from __future__ import annotations + +import datasets + +from mteb.abstasks.AbsTaskRetrieval import AbsTaskRetrieval + +from mteb.abstasks.MultilingualTask import MultilingualTask +from mteb.abstasks.TaskMetadata import TaskMetadata + +_LANGUAGE_MAPPING = { + "ar": "ara-Arab", + "da": "dan-Latn", + "de": "deu-Latn", + "en": "eng-Latn", + "es": "spa-Latn", + "fi": "fin-Latn", + "fr": "fra-Latn", + "he": "heb-Hebr", + "hu": "hun-Latn", + "it": "ita-Latn", + "ja": "jpn-Jpan", + "ko": "kor-Kore", + "km": "khm-Khmr", + "ms": "msa-Latn", + "nl": "nld-Latn", + "no": "nor-Latn", + "pl": "pol-Latn", + "pt": "por-Latn", + "ru": "rus-Cyrl", + "sv": "swe-Latn", + "th": "tha-Thai", + "tr": "tur-Latn", + "vi": "vie-Latn", + "zh_cn": "zho-Hans", + "zh_hk": "zho-Hant", + "zh_tw": "zho-Hant", +} + + +_EVAL_LANGS = {k: [v] for k, v in _LANGUAGE_MAPPING.items()} + + +class MKQARetrieval(AbsTaskRetrieval,
MultilingualTask): + metadata = TaskMetadata( + name="MKQARetrieval", + description="""Multilingual Knowledge Questions & Answers (MKQA)contains 10,000 queries sampled from the Google Natural Questions dataset. + For each query we collect new passage-independent answers. These queries and answers are then human translated into 25 Non-English languages.""", + reference="https://github.com/apple/ml-mkqa", + dataset={ + "path": "apple/mkqa", + "revision": "325131889721ae0ed885b76ecb8011369d75abad", + "trust_remote_code": True, + "name": "mkqa", + }, + type="Retrieval", + category="s2p", + modalities=["text"], + date=("2020-01-01", "2020-12-31"), + eval_splits=["train"], + eval_langs=_EVAL_LANGS, + main_score="ndcg_at_10", + domains=["Written"], + task_subtypes=["Question answering"], + license="cc-by-3.0", + annotations_creators="human-annotated", + dialect=[], + sample_creation="found", + bibtex_citation=r""" +@misc{mkqa, + author = {Shayne Longpre and Yi Lu and Joachim Daiber}, + title = {MKQA: A Linguistically Diverse Benchmark for Multilingual Open Domain Question Answering}, + url = {https://arxiv.org/pdf/2007.15207.pdf}, + year = {2020}, +} + """, + ) + + def load_data(self, **kwargs): + """In this retrieval dataset, corpus and queries are in the same language.""" + if self.data_loaded: + return + + self.queries, self.corpus, self.relevant_docs = {}, {}, {} + + ds = datasets.load_dataset( + **self.metadata_dict["dataset"], + ) + + for lang in self.hf_subsets: + self.queries[lang] = {} + self.corpus[lang] = {} + self.relevant_docs[lang] = {} + + for eval_split in self.metadata.eval_splits: + self.queries[lang][eval_split] = {} + self.corpus[lang][eval_split] = {} + self.relevant_docs[lang][eval_split] = {} + + split_data = ds[eval_split] + + query_ids = { + query: f"Q{i}" + for i, query in enumerate( + {entry[lang] for entry in split_data["queries"]} + ) + } + + context_texts = { + hit["text"] + for entry in split_data["answers"] + for hit in entry[lang] + } + + 
context_ids = {text: f"C{i}" for i, text in enumerate(context_texts)} + + for row in split_data: + query = row["queries"][lang] + contexts = [entry["text"] for entry in row["answers"][lang]] + + if query is None or None in contexts: + continue + + query_id = query_ids[query] + for context in contexts: + context_id = context_ids[context] + self.queries[lang][eval_split][query_id] = query + self.corpus[lang][eval_split][context_id] = { + "title": "", + "text": context, + } + if query_id not in self.relevant_docs[lang][eval_split]: + self.relevant_docs[lang][eval_split][query_id] = {} + self.relevant_docs[lang][eval_split][query_id][context_id] = 1 + + self.data_loaded = True