diff --git a/docs/tasks.md b/docs/tasks.md index 706ada1068..35b2b89623 100644 --- a/docs/tasks.md +++ b/docs/tasks.md @@ -437,6 +437,7 @@ The following tables give you an overview of the tasks in MTEB. | [MIRACLReranking](https://project-miracl.github.io/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Reranking | s2s | [Encyclopaedic, Written] | None | None | | [MIRACLRetrieval](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MIRACLRetrievalHardNegatives](http://miracl.ai/) (Zhang et al., 2023) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | +| [MIRACLVisionRetrieval](https://arxiv.org/pdf/2407.01449) (Radek Osmulski, 2025) | ['ara', 'ben', 'deu', 'eng', 'fas', 'fin', 'fra', 'hin', 'ind', 'jpn', 'kor', 'rus', 'spa', 'swa', 'tel', 'tha', 'yor', 'zho'] | DocumentUnderstanding | t2i | [Encyclopaedic] | None | None | | [MLQARetrieval](https://huggingface.co/datasets/mlqa) (Lewis et al., 2019) | ['ara', 'deu', 'eng', 'hin', 'spa', 'vie', 'zho'] | Retrieval | s2p | [Encyclopaedic, Written] | None | None | | [MLQuestions](https://github.com/McGill-NLP/MLQuestions) (Kulshreshtha et al., 2021) | ['eng'] | Retrieval | s2p | [Academic, Encyclopaedic, Written] | None | None | | [MLSUMClusteringP2P.v2](https://huggingface.co/datasets/mteb/mlsum) (Scialom et al., 2020) | ['deu', 'fra', 'rus', 'spa'] | Clustering | p2p | [News, Written] | None | None | @@ -984,7 +985,7 @@ The following tables give you an overview of the tasks in MTEB. 
| apu | Apurinã | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apw | Western Apache | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | apz | Safeyoka | Angan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 36 | +| ara | Arabic | Unclassified | 0 | 2 | 0 | 4 | 12 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 10 | 2 | 0 | 0 | 0 | 0 | 2 | 0 | 37 | | arb | Standard Arabic | Afro-Asiatic | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | | are | Western Arrarnta | Pama-Nyungan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | arl | Arabela | Zaparoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1031,7 +1032,7 @@ The following tables give you an overview of the tasks in MTEB. 
| bef | Benabena | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | bel | Belarusian | Indo-European | 0 | 0 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | | bem | Bemba (Zambia) | Atlantic-Congo | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| ben | Bengali | Indo-European | 0 | 1 | 0 | 9 | 9 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 32 | +| ben | Bengali | Indo-European | 0 | 1 | 0 | 9 | 9 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 33 | | beo | Beami | Bosavi | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ber | Berber (Other) | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | beu | Blagar | Timor-Alor-Pantar | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1173,7 +1174,7 @@ The following tables give you an overview of the tasks in MTEB. 
| dah | Gwahatike | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dan | Danish | Indo-European | 0 | 2 | 0 | 8 | 10 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 31 | | ded | Dedua | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 72 | +| deu | German | Indo-European | 0 | 2 | 0 | 8 | 14 | 7 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 7 | 2 | 20 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 73 | | dgc | Casiguran Dumagat Agta | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgr | Dogrib | Athabaskan-Eyak-Tlingit | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | dgz | Daga | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1201,7 +1202,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ell | Modern Greek (1453-) | Indo-European | 0 | 2 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 2 | 0 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 21 | | emi | Mussau-Emira | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | emp | Northern Emberá | Chocoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| eng | English | Indo-European | 0 | 3 | 49 | 20 | 161 | 21 | 7 | 14 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 113 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 497 | +| eng | English | Indo-European | 0 | 3 | 49 | 20 | 161 | 21 | 7 | 15 | 22 | 5 | 0 | 3 | 1 | 13 | 9 | 113 | 13 | 2 | 1 | 6 | 7 | 3 | 24 | 498 | | enq | Enga | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | epo | Esperanto | Artificial Language | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | eri | Ogea | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1215,14 +1216,14 @@ The following tables give you an overview of the tasks in MTEB. 
| fai | Faiwol | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fao | Faroese | Indo-European | 0 | 0 | 0 | 3 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | far | Fataleka | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fas | Persian | Indo-European | 0 | 1 | 0 | 6 | 28 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 94 | +| fas | Persian | Indo-European | 0 | 1 | 0 | 6 | 28 | 5 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 8 | 2 | 41 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 95 | | ffm | Maasina Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fij | Fijian | Austronesian | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | fil | Filipino | Austronesian | 0 | 1 | 0 | 1 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | -| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 24 | +| fin | Finnish | Uralic | 0 | 1 | 0 | 5 | 6 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 2 | 6 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | | fon | Fon | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | | for | Fore | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 70 | +| fra | French | Indo-European | 0 | 1 | 0 | 9 | 13 | 8 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 6 | 3 | 17 | 4 | 0 | 1 | 0 | 0 | 4 | 0 | 71 | | fry | Western Frisian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fuc | Pulaar 
| Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | fue | Borgu Fulfulde | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1279,7 +1280,7 @@ The following tables give you an overview of the tasks in MTEB. | hch | Huichol | Uto-Aztecan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | heb | Hebrew | Afro-Asiatic | 0 | 1 | 0 | 6 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 15 | | heg | Helong | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | +| hin | Hindi | Indo-European | 0 | 1 | 0 | 11 | 12 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 2 | 11 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 44 | | hix | Hixkaryána | Cariban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hla | Halia | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | hlt | Matu Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1311,7 +1312,7 @@ The following tables give you an overview of the tasks in MTEB. 
| imo | Imbongu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ina | Interlingua (International Auxiliary Language Association) | Artificial Language | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | inb | Inga | Quechuan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| ind | Indonesian | Austronesian | 0 | 3 | 0 | 8 | 7 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | +| ind | Indonesian | Austronesian | 0 | 3 | 0 | 8 | 7 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 5 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 28 | | ino | Inoke-Yate | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | iou | Tuma-Irumu | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ipi | Ipili | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1328,7 +1329,7 @@ The following tables give you an overview of the tasks in MTEB. 
| jid | Bu (Kaduna State) | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jiv | Shuar | Chicham | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | jni | Janji | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 42 | +| jpn | Japanese | Japonic | 0 | 3 | 0 | 7 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 3 | 14 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 43 | | jvn | Caribbean Javanese | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kab | Kabyle | Afro-Asiatic | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | | kac | Kachin | Sino-Tibetan | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 4 | @@ -1384,7 +1385,7 @@ The following tables give you an overview of the tasks in MTEB. 
| knj | Western Kanjobal | Mayan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | knv | Tabo | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kon | Kongo | Unclassified | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | -| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 39 | +| kor | Korean | Koreanic | 0 | 2 | 0 | 6 | 8 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 3 | 1 | 10 | 3 | 0 | 0 | 0 | 0 | 2 | 0 | 40 | | kos | Kosraean | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpf | Komba | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | kpg | Kapingamarangi | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1709,7 +1710,7 @@ The following tables give you an overview of the tasks in MTEB. 
| ruf | Luguru | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | rug | Roviana | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | run | Rundi | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | -| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 17 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 64 | +| rus | Russian | Indo-European | 0 | 2 | 0 | 7 | 18 | 6 | 0 | 1 | 0 | 0 | 0 | 0 | 2 | 4 | 2 | 17 | 4 | 0 | 0 | 0 | 0 | 2 | 0 | 65 | | rwo | Rawa | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sab | Buglere | Chibchan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sag | Sango | Atlantic-Congo | 0 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 | @@ -1748,7 +1749,7 @@ The following tables give you an overview of the tasks in MTEB. 
| soq | Kanasi | Dagan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | sot | Southern Sotho | Atlantic-Congo | 0 | 0 | 0 | 1 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 5 | | soy | Miyobe | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 3 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 57 | +| spa | Spanish | Indo-European | 0 | 2 | 0 | 6 | 13 | 4 | 0 | 4 | 0 | 0 | 0 | 0 | 1 | 3 | 2 | 15 | 4 | 0 | 0 | 0 | 0 | 4 | 0 | 58 | | spl | Selepet | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spm | Akukem | Ramu | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | spp | Supyire Senoufo | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1772,7 +1773,7 @@ The following tables give you an overview of the tasks in MTEB. 
| sus | Susu | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | suz | Sunwar | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | svk | Slovakian Sign Language | Sign Language | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| swa | Swahili (macrolanguage) | Atlantic-Congo | 0 | 1 | 0 | 1 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | | swe | Swedish | Indo-European | 0 | 1 | 0 | 6 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | | swg | Swabian | Indo-European | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | swh | Swahili (individual language) | Atlantic-Congo | 0 | 0 | 0 | 3 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 6 | @@ -1797,7 +1798,7 @@ The following tables give you an overview of the tasks in MTEB. 
| tcz | Thado Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tdt | Tetun Dili | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tee | Huehuetla Tepehua | Totonacan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| tel | Telugu | Dravidian | 0 | 1 | 0 | 7 | 7 | 2 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 5 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | | ter | Tereno | Arawakan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tet | Tetum | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tew | Tewa (USA) | Kiowa-Tanoan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1806,7 +1807,7 @@ The following tables give you an overview of the tasks in MTEB. 
| tgl | Tagalog | Austronesian | 0 | 0 | 0 | 5 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 11 | | tgo | Sudest | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tgp | Tangoa | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 25 | +| tha | Thai | Tai-Kadai | 0 | 1 | 0 | 6 | 8 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 7 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | | tif | Tifal | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tim | Timbe | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | tir | Tigrinya | Afro-Asiatic | 0 | 0 | 0 | 2 | 2 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 8 | @@ -1930,7 +1931,7 @@ The following tables give you an overview of the tasks in MTEB. 
| yle | Yele | Unclassified | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | yml | Iamalele | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | yon | Yongkom | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 16 | +| yor | Yoruba | Atlantic-Congo | 0 | 0 | 0 | 4 | 5 | 3 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 3 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 17 | | yrb | Yareba | Yareban | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | yre | Yaouré | Mande | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | yss | Yessan-Mayo | Sepik | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1955,7 +1956,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zaw | Mitla Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zca | Coatecas Altas Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zga | Kinga | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 26 | +| zho | Chinese | Unclassified | 0 | 2 | 0 | 4 | 2 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 2 | 1 | 14 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 27 | | zia | Zia | Nuclear Trans New Guinea | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | ziw | Zigula | Atlantic-Congo | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zlm | Malay (individual language) | Austronesian | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | @@ -1974,7 +1975,7 @@ The following tables give you an overview of the tasks in MTEB. 
| zty | Yatee Zapotec | Otomanguean | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | | zul | Zulu | Atlantic-Congo | 0 | 0 | 0 | 2 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 7 | | zyp | Zyphe Chin | Sino-Tibetan | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | -| Total | None | None | None | 0 | 55 | 49 | 1494 | 846 | 316 | 7 | 23 | 22 | 5 | 0 | 3 | 28 | 92 | 56 | 593 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | +| Total | None | None | None | 0 | 55 | 49 | 1494 | 846 | 316 | 7 | 41 | 22 | 5 | 0 | 3 | 28 | 92 | 56 | 593 | 88 | 2 | 2 | 6 | 7 | 37 | 24 | diff --git a/docs/usage/usage.md b/docs/usage/usage.md index a5b9319311..d6c06a540b 100644 --- a/docs/usage/usage.md +++ b/docs/usage/usage.md @@ -456,12 +456,15 @@ There are times you may want to cache the embeddings so you can re-use them. Thi ```python # define your task(s) and model above as normal -... +task = mteb.get_task("LccSentimentClassification") +model = mteb.get_model("minishlab/M2V_base_glove_subword") +evaluation = mteb.MTEB(tasks=[task]) + # wrap the model with the cache wrapper from mteb.models.cache_wrapper import CachedEmbeddingWrapper -model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='') +model_with_cached_emb = CachedEmbeddingWrapper(model, cache_path='path_to_cache_dir') # run as normal -evaluation.run(model, ...) +evaluation.run(model_with_cached_emb) ``` If you want to directly access the cached embeddings (e.g. 
for subsequent analyses) follow this example: @@ -471,8 +474,8 @@ import numpy as np from mteb.models.cache_wrapper import TextVectorMap # Access the memory-mapped file and convert to array -vector_map = TextVectorMap("/AppsRetrieval") -vector_map.load(name="AppsRetrieval") +vector_map = TextVectorMap("path_to_cache_dir/LccSentimentClassification") +vector_map.load(name="LccSentimentClassification") vectors = np.asarray(vector_map.vectors) # Remove all "placeholders" in the embedding cache diff --git a/mteb/leaderboard/table.py b/mteb/leaderboard/table.py index fbc01496e8..5286680dc9 100644 --- a/mteb/leaderboard/table.py +++ b/mteb/leaderboard/table.py @@ -258,7 +258,7 @@ def apply_styling( joint_table[score_columns] = joint_table[score_columns].map(format_scores) joint_table_style = joint_table.style.format( { - **{column: "{:.2f}" for column in score_columns}, + **dict.fromkeys(score_columns, "{:.2f}"), "Rank (Borda)": "{:.0f}", }, na_rep="", diff --git a/mteb/models/cache_wrapper.py b/mteb/models/cache_wrapper.py index f27e58e48a..fe38ea6310 100644 --- a/mteb/models/cache_wrapper.py +++ b/mteb/models/cache_wrapper.py @@ -155,8 +155,8 @@ def load(self, name: str | None = None) -> None: self.vectors = np.memmap( self.vectors_file, dtype="float32", mode="r+" ) - self.vectors = self.vectors.reshape(-1, self.vector_dim) - logger.info(f"Loaded vectors file with shape: {self.vectors.shape}") + self.vectors = self.vectors.reshape(-1, self.vector_dim) # type: ignore + logger.info(f"Loaded vectors file with shape: {self.vectors.shape}") # type: ignore else: logger.warning( "Vector dimension not set. Unable to load vectors file." 
@@ -214,22 +214,30 @@ def __init__(self, model: Encoder, cache_path: str | Path): logger.info("Initialized CachedEmbeddingWrapper") def encode( - self, texts: list[str], batch_size: int = 32, task_name: str = None, **kwargs + self, + texts: list[str], + batch_size: int = 32, + task_name: str | None = None, + **kwargs, ) -> np.ndarray: """Encode texts using the wrapped model, with caching""" + _task_name = task_name or "no_task_name" + try: results = [] uncached_texts = [] uncached_indices = [] # Initialize cache - if task_name not in self.cache_dict: - self.cache_dict[task_name] = TextVectorMap(self.cache_path / task_name) - self.cache_dict[task_name].load(name=task_name) + if _task_name not in self.cache_dict: + self.cache_dict[_task_name] = TextVectorMap( + self.cache_path / _task_name + ) + self.cache_dict[_task_name].load(name=_task_name) # Check cache for each text for i, text in enumerate(texts): - vector = self.cache_dict[task_name].get_vector(text) + vector = self.cache_dict[_task_name].get_vector(text) if vector is not None: results.append(vector) else: @@ -240,16 +248,19 @@ def encode( if uncached_texts: logger.info(f"Encoding {len(uncached_texts)} new texts") new_vectors = self._model.encode( - uncached_texts, batch_size=batch_size, **kwargs + uncached_texts, + batch_size=batch_size, + task_name=task_name, # type: ignore + **kwargs, ) if isinstance(new_vectors, torch.Tensor): new_vectors = new_vectors.cpu().numpy() # Add new vectors to cache for text, vector in zip(uncached_texts, new_vectors): - self.cache_dict[task_name].add(text, vector) + self.cache_dict[_task_name].add(text, vector) results.extend(new_vectors) - self.cache_dict[task_name].save() + self.cache_dict[_task_name].save() else: logger.info("All texts found in cache") @@ -287,7 +298,7 @@ def __getattr__(self, name: str) -> Any: def __dir__(self) -> list[str]: """Return all attributes from both this class and the wrapped model""" - return list(set(super().__dir__() + dir(self._model))) + 
def instruction_template(
    instruction: str | dict[PromptType, str],
    prompt_type: PromptType | None = None,
) -> str:
    """Format a task instruction as the prefix placed before a query.

    Args:
        instruction: Either a single instruction string, or a mapping that is
            indexed by prompt type to obtain the instruction.
        prompt_type: Which side of the retrieval pair is being encoded.

    Returns:
        ``"Instruct: <instruction>\\nQuery:"``, or an empty string when there is
        no instruction (empty/``None``) or when ``prompt_type`` is
        ``PromptType.passage``.

    Raises:
        KeyError: If ``instruction`` is a mapping, ``prompt_type`` is given and
            the mapping has no entry for it.
    """
    if not instruction or prompt_type == PromptType.passage:
        return ""
    if isinstance(instruction, dict):
        if prompt_type is None:
            # NOTE: without a prompt type there is nothing to select on, so the
            # first instruction in insertion order is used. The mapping is
            # known to be non-empty here because of the guard above.
            instruction = next(iter(instruction.values()))
        else:
            instruction = instruction[prompt_type]
    return f"Instruct: {instruction}\nQuery:"
def q3e_instruct_loader(model_name_or_path: str, revision: str, **kwargs) -> Encoder:
    """Create the instruction-aware wrapper used for the Qwen3 embedding models.

    Args:
        model_name_or_path: Model identifier or path handed to
            ``InstructSentenceTransformerModel``.
        revision: Model revision handed to the wrapper.
        **kwargs: Forwarded unchanged to ``InstructSentenceTransformerModel``.

    Returns:
        The constructed wrapper. Instructions are not applied to passages, and
        the tokenizer is switched to left padding when the underlying model is
        configured for ``flash_attention_2``.
    """
    wrapper = InstructSentenceTransformerModel(
        model_name_or_path,
        revision=revision,
        instruction_template=instruction_template,
        apply_instruction_to_passages=False,
        **kwargs,
    )
    first_module = wrapper.model._first_module()
    attn_implementation = first_module.auto_model.config._attn_implementation
    if attn_implementation == "flash_attention_2":
        # The Qwen3 code only uses left padding in flash_attention_2 mode, so
        # the tokenizer has to be configured to match.
        first_module.tokenizer.padding_side = "left"
    return wrapper
def _load_miracl_data(
    path: str,
    langs: list[str],
    splits: list[str],
    cache_dir: str | None = None,
    revision: str | None = None,
    trust_remote_code: bool = False,
) -> tuple[datasets.DatasetDict, datasets.DatasetDict, datasets.DatasetDict]:
    """Load the image corpus, queries and relevance judgements per language.

    For every language the ``corpus-<lang>``, ``images-<lang>``,
    ``queries-<lang>`` and ``qrels-<lang>`` configurations of the dataset at
    ``path`` are loaded, in that order.

    Args:
        path: Dataset path passed to ``datasets.load_dataset``.
        langs: Language subsets to load.
        splits: Split names used to pre-populate each per-language mapping
            (with ``None``). Only the ``_EVAL_SPLIT`` split is actually filled.
        cache_dir: Optional cache directory forwarded to ``load_dataset``.
        revision: Optional dataset revision forwarded to ``load_dataset``.
        trust_remote_code: Forwarded to ``load_dataset``.

    Returns:
        ``(corpus, queries, relevant_docs)``, each a ``datasets.DatasetDict``
        keyed by language and then by split. ``relevant_docs`` maps
        ``query_id -> {doc_id: score}`` with string ids.
    """

    def _load_config(config_name: str):
        # Every configuration is fetched with the same loader settings.
        return datasets.load_dataset(
            path,
            config_name,
            cache_dir=cache_dir,
            revision=revision,
            trust_remote_code=trust_remote_code,
        )

    corpus = {lang: dict.fromkeys(splits) for lang in langs}
    queries = {lang: dict.fromkeys(splits) for lang in langs}
    relevant_docs = {lang: dict.fromkeys(splits) for lang in langs}

    split = _EVAL_SPLIT

    for lang in langs:
        # The corpus configuration is only used to map image ids to document
        # ids; the retrievable documents themselves are the images.
        corpus_data = _load_config(f"corpus-{lang}")
        images_data = _load_config(f"images-{lang}")

        imgid2docid = {
            str(ex["image_id"]): str(ex["_id"]) for ex in corpus_data[split]
        }

        images_data = images_data.map(
            lambda x: {
                "id": imgid2docid[str(x["file_name"])],
                "modality": "image",
                "text": None,
            },
            remove_columns=["file_name"],
        )
        corpus[lang][split] = images_data[split]

        # Queries are text-only, so the image field is explicitly empty.
        queries_data = _load_config(f"queries-{lang}").map(
            lambda x: {
                "id": str(x["_id"]),
                "text": x["text"],
                "modality": "text",
                "image": None,
            },
            remove_columns=["_id"],
        )
        queries[lang][split] = queries_data[split]

        # Relevance judgements: query_id -> {doc_id: score}.
        qrels_data = _load_config(f"qrels-{lang}")
        qrels: dict[str, dict[str, int]] = {}
        for row in qrels_data[split]:
            qrels.setdefault(str(row["query-id"]), {})[str(row["corpus-id"])] = row[
                "score"
            ]
        relevant_docs[lang][split] = qrels

    corpus = datasets.DatasetDict(corpus)
    queries = datasets.DatasetDict(queries)
    relevant_docs = datasets.DatasetDict(relevant_docs)

    return corpus, queries, relevant_docs


class MIRACLVisionRetrieval(MultilingualTask, AbsTaskAny2AnyRetrieval):
    """Multilingual text-to-image retrieval task over the MIRACL-VISION dataset."""

    metadata = TaskMetadata(
        name="MIRACLVisionRetrieval",
        description="Retrieve associated pages according to questions.",
        # Same paper as the one cited in `bibtex_citation` below.
        reference="https://arxiv.org/abs/2505.11651",
        dataset={
            "path": "nvidia/miracl-vision",
            "revision": "309e1696433408fbd555959cf1da968f3814f8b6",
        },
        type="DocumentUnderstanding",
        category="t2i",
        # The loader only fills `_EVAL_SPLIT`, so keep the two in sync.
        eval_splits=[_EVAL_SPLIT],
        eval_langs=_LANGUAGES,
        main_score="ndcg_at_5",
        date=("2025-03-01", "2025-06-01"),
        domains=["Encyclopaedic"],
        task_subtypes=["Image Text Retrieval"],
        license="cc-by-sa-4.0",
        annotations_creators="derived",
        dialect=[],
        modalities=["text", "image"],
        sample_creation="created",
        bibtex_citation=r"""
@article{osmulski2025miraclvisionlargemultilingualvisual,
  author = {Radek Osmulski and Gabriel de Souza P. Moreira and Ronay Ak and Mengyao Xu and Benedikt Schifferer and Even Oldridge},
  eprint = {2505.11651},
  journal = {arxiv},
  title = {{MIRACL-VISION: A Large, multilingual, visual document retrieval benchmark}},
  url = {https://arxiv.org/abs/2505.11651},
  year = {2025},
}
""",
        prompt={"query": "Find a screenshot that is relevant to the user's query."},
    )

    def load_data(self, **kwargs):
        """Populate ``corpus``, ``queries`` and ``relevant_docs`` once.

        Does nothing when ``self.data_loaded`` is already set. Only the
        ``cache_dir`` keyword argument is read from ``kwargs``.
        """
        if self.data_loaded:
            return

        self.corpus, self.queries, self.relevant_docs = _load_miracl_data(
            path=self.metadata_dict["dataset"]["path"],
            splits=self.metadata_dict["eval_splits"],
            langs=self.hf_subsets,
            cache_dir=kwargs.get("cache_dir"),
            revision=self.metadata_dict["dataset"]["revision"],
        )

        self.data_loaded = True
b/mteb/tasks/Image/ZeroShotClassification/eng/Caltech101.py @@ -13,7 +13,7 @@ class Caltech101ZeroShotClassification(AbsTaskZeroShotClassification): reference="https://ieeexplore.ieee.org/document/1384978", dataset={ "path": "mteb/Caltech101", - "revision": "52439cf6d4f6ebf563d8cdc7f2c5371d9efd2686", + "revision": "011e51e5fb01f0c820824734edb7a539ab8e6650", }, type="ZeroShotClassification", category="i2t",